frcrawler-scripts/schemas/20-view_web.clickhouse.sql
Gaël Berthaud-Müller 38679a009a add all files
2024-03-06 14:49:11 +01:00

74 lines
2.9 KiB
SQL

CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_requests_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
web_request String
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.web_requests_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.web_requests_data AS
SELECT
JSONExtractString(result, 'batch_id') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
web_request
FROM
${CH_DATABASE}.raw_results_data
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'web', 'details') as web_request;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_requests ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.web_requests_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, web_requests_mv);
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_info_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
scheme String,
tls_valid Nullable(Bool),
http_status_code Nullable(Int),
-- response_redirected: true if response is being redirected to new url
response_redirected Nullable(Bool),
-- domain_url: domain in url
domain_url String,
-- domain_redirected: true if domain in url is not subdomain of original domain
domain_redirected Bool,
final_tld String
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.web_info_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.web_info_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
JSONExtractString(web_request, 'scheme') as scheme,
JSONExtract(web_request, 'response', 'tls', 'certificate', 'is_valid', 'valid', 'Nullable(Bool)') as tls_valid,
JSONExtract(web_request, 'response', 'status', 'Nullable(Int)') as http_status_code,
-- response_redirected: true if response is being redirected to new url
JSONExtract(web_request, 'is_redirect', 'redirected', 'Nullable(Bool)') as response_redirected,
-- domain_url: domain in url
JSONExtractString(web_request, 'hostname') as domain_url,
-- domain_redirected: true if domain in url is not subdomain of original domain
domain_url != '' and not toBool(endsWith(splitByChar('.', idnaEncode(domain_url)), splitByChar('.', idnaEncode(domain)))) as domain_redirected,
splitByChar('.', domain_url)[-1] as final_tld
FROM
${CH_DATABASE}.web_requests_data;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.web_info ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.web_info_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, web_info_mv);