mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/scripts.git
synced 2025-04-04 19:45:48 +02:00
74 lines
2.9 KiB
SQL
74 lines
2.9 KiB
SQL
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_requests_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
web_request String
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.web_requests_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.web_requests_data AS
|
|
SELECT
|
|
JSONExtractString(result, 'batch_id') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
web_request
|
|
FROM
|
|
${CH_DATABASE}.raw_results_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'web', 'details') as web_request;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_requests ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.web_requests_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, web_requests_mv);
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_info_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
scheme String,
|
|
tls_valid Nullable(Bool),
|
|
http_status_code Nullable(Int),
|
|
-- response_redirected: true if response is being redirected to new url
|
|
response_redirected Nullable(Bool),
|
|
-- domain_url: domain in url
|
|
domain_url String,
|
|
-- domain_redirected: true if domain in url is not subdomain of original domain
|
|
domain_redirected Bool,
|
|
final_tld String
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.web_info_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.web_info_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
JSONExtractString(web_request, 'scheme') as scheme,
|
|
JSONExtract(web_request, 'response', 'tls', 'certificate', 'is_valid', 'valid', 'Nullable(Bool)') as tls_valid,
|
|
JSONExtract(web_request, 'response', 'status', 'Nullable(Int)') as http_status_code,
|
|
-- response_redirected: true if response is being redirected to new url
|
|
JSONExtract(web_request, 'is_redirect', 'redirected', 'Nullable(Bool)') as response_redirected,
|
|
-- domain_url: domain in url
|
|
JSONExtractString(web_request, 'hostname') as domain_url,
|
|
-- domain_redirected: true if domain in url is not subdomain of original domain
|
|
domain_url != '' and not toBool(endsWith(splitByChar('.', idnaEncode(domain_url)), splitByChar('.', idnaEncode(domain)))) as domain_redirected,
|
|
splitByChar('.', domain_url)[-1] as final_tld
|
|
FROM
|
|
${CH_DATABASE}.web_requests_data;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_info ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.web_info_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, web_info_mv);
|