frcrawler-scripts/schemas/20-view_dns.clickhouse.sql
Gaël Berthaud-Müller 38679a009a add all files
2024-03-06 14:49:11 +01:00

313 lines
10 KiB
SQL

-- Nameservers views
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.nameservers_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
ns_name String,
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
ipv4 String,
ipv6 String,
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.nameservers_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.nameservers_data AS
SELECT
JSONExtractString(result, 'batch_id') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtractString(nameserver, 'name') as ns_name,
JSONExtractString(nameserver, 'ipv4') as ipv4,
JSONExtractString(nameserver, 'ipv6') as ipv6
FROM
${CH_DATABASE}.raw_results_data
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'ns', 'nameservers') as nameserver;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.nameservers ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.nameservers_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, nameservers_data);
--
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
ns_name String,
ip String,
asn Nullable(UInt32),
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.ns_per_domain_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
ns_name,
JSONExtractString(ns_ipv6, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
null
) as asn
FROM ${CH_DATABASE}.nameservers_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as ns_ipv6
WHERE ip != '' and asn is not null;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.ns_per_domain_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
ns_name,
JSONExtractString(ns_ipv4, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
null
) as asn
FROM ${CH_DATABASE}.nameservers_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as ns_ipv4
WHERE ip != '' and asn is not null;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.ns_per_domain ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.ns_per_domain_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, ns_per_domain_data);
-- Addresses views
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.addresses_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
name String,
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
ipv4 String,
ipv6 String,
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.addresses_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.addresses_data AS
SELECT
JSONExtractString(result, 'batch_id') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtractString(name_address, 'name') as name,
JSONExtractString(name_address, 'ipv4') as ipv4,
JSONExtractString(name_address, 'ipv6') as ipv6
FROM
${CH_DATABASE}.raw_results_data
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'addresses') as name_address;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.addresses ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.addresses_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, addresses_data);
--
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
name String,
ip String,
asn Nullable(UInt32),
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.a_aaaa_per_domain_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
name, -- {domain} or www.{domain}
JSONExtractString(aaaa_ipv6, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
null
) as asn
FROM ${CH_DATABASE}.addresses_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as aaaa_ipv6
WHERE ip != '' and asn is not null;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.a_aaaa_per_domain_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
name,
JSONExtractString(a_ipv4, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
null
) as asn
from ${CH_DATABASE}.addresses_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as a_ipv4
WHERE ip != '' and asn is not null;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.a_aaaa_per_domain_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, a_aaaa_per_domain_data);
-- MXs views
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mail_exchanges_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
mx_name String,
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
ipv4 String,
ipv6 String,
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mail_exchanges_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mail_exchanges_data AS
SELECT
JSONExtractString(result, 'batch_id') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtractString(mail_exchange, 'name') as mx_name,
JSONExtractString(mail_exchange, 'ipv4') as ipv4,
JSONExtractString(mail_exchange, 'ipv6') as ipv6
FROM
${CH_DATABASE}.raw_results_data
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'mx', 'exchanges') as mail_exchange;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mail_exchanges ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.mail_exchanges_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, mail_exchanges_data);
--
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
mx_name String,
ip String,
asn Nullable(UInt32),
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mx_per_domain_data AS
select
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
mx_name,
JSONExtractString(mx_ipv6, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
null
) as asn
FROM ${CH_DATABASE}.mail_exchanges_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as mx_ipv6
WHERE ip != '' and asn is not null;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mx_per_domain_data AS
SELECT
batch_id,
tld,
metadata,
job_id,
domain,
started_at,
mx_name,
JSONExtractString(mx_ipv4, 'ip') as ip,
if(
ip != '',
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
null
) as asn
FROM ${CH_DATABASE}.mail_exchanges_data
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as mx_ipv4
WHERE ip != '' and asn is not null;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mx_per_domain ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.mx_per_domain_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, mx_per_domain_data);
-- SOA view
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.dns_soa_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
rcode Nullable(String),
error Nullable(String)
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.dns_soa_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.dns_soa_data AS
SELECT
JSONExtractString(result, 'batch_id') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtract(result, 'dns', 'soa', 'rcode', 'Nullable(String)') AS rcode,
JSONExtract(result, 'dns', 'soa', 'error', 'Nullable(String)') AS error
FROM
${CH_DATABASE}.raw_results_data;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.dns_soa ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.dns_soa_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, dns_soa_data);