mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/scripts.git
synced 2025-04-04 19:45:48 +02:00
313 lines
10 KiB
SQL
313 lines
10 KiB
SQL
-- Nameservers views
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.nameservers_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
ns_name String,
|
|
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
|
|
ipv4 String,
|
|
ipv6 String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.nameservers_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.nameservers_data AS
|
|
SELECT
|
|
JSONExtractString(result, 'batch_id') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
JSONExtractString(nameserver, 'name') as ns_name,
|
|
JSONExtractString(nameserver, 'ipv4') as ipv4,
|
|
JSONExtractString(nameserver, 'ipv6') as ipv6
|
|
FROM
|
|
${CH_DATABASE}.raw_results_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'ns', 'nameservers') as nameserver;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.nameservers ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.nameservers_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, nameservers_data);
|
|
|
|
--
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
ns_name String,
|
|
ip String,
|
|
asn Nullable(UInt32),
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.ns_per_domain_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
ns_name,
|
|
JSONExtractString(ns_ipv6, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
FROM ${CH_DATABASE}.nameservers_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as ns_ipv6
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.ns_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.ns_per_domain_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
ns_name,
|
|
JSONExtractString(ns_ipv4, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
FROM ${CH_DATABASE}.nameservers_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as ns_ipv4
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.ns_per_domain ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.ns_per_domain_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, ns_per_domain_data);
|
|
|
|
-- Addresses views
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.addresses_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
name String,
|
|
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
|
|
ipv4 String,
|
|
ipv6 String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.addresses_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.addresses_data AS
|
|
SELECT
|
|
JSONExtractString(result, 'batch_id') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
JSONExtractString(name_address, 'name') as name,
|
|
JSONExtractString(name_address, 'ipv4') as ipv4,
|
|
JSONExtractString(name_address, 'ipv6') as ipv6
|
|
FROM
|
|
${CH_DATABASE}.raw_results_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'addresses') as name_address;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.addresses ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.addresses_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, addresses_data);
|
|
|
|
--
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
name String,
|
|
ip String,
|
|
asn Nullable(UInt32),
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.a_aaaa_per_domain_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
name, -- {domain} or www.{domain}
|
|
JSONExtractString(aaaa_ipv6, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
FROM ${CH_DATABASE}.addresses_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as aaaa_ipv6
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.a_aaaa_per_domain_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
name,
|
|
JSONExtractString(a_ipv4, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
from ${CH_DATABASE}.addresses_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as a_ipv4
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.a_aaaa_per_domain ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.a_aaaa_per_domain_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, a_aaaa_per_domain_data);
|
|
|
|
-- MXs views
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mail_exchanges_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
mx_name String,
|
|
-- ipv4 and ipv6 are JSON objects: {"addresses": [{"ip": "..."}]}
|
|
ipv4 String,
|
|
ipv6 String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mail_exchanges_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mail_exchanges_data AS
|
|
SELECT
|
|
JSONExtractString(result, 'batch_id') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
JSONExtractString(mail_exchange, 'name') as mx_name,
|
|
JSONExtractString(mail_exchange, 'ipv4') as ipv4,
|
|
JSONExtractString(mail_exchange, 'ipv6') as ipv6
|
|
FROM
|
|
${CH_DATABASE}.raw_results_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(result, 'dns', 'mx', 'exchanges') as mail_exchange;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mail_exchanges ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.mail_exchanges_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, mail_exchanges_data);
|
|
|
|
--
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
mx_name String,
|
|
ip String,
|
|
asn Nullable(UInt32),
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_ipv6_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mx_per_domain_data AS
|
|
select
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
mx_name,
|
|
JSONExtractString(mx_ipv6, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv6', 'autonomous_system_number', tuple(IPv6StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
FROM ${CH_DATABASE}.mail_exchanges_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv6, 'addresses') as mx_ipv6
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.mx_per_domain_ipv4_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.mx_per_domain_data AS
|
|
SELECT
|
|
batch_id,
|
|
tld,
|
|
metadata,
|
|
job_id,
|
|
domain,
|
|
started_at,
|
|
mx_name,
|
|
JSONExtractString(mx_ipv4, 'ip') as ip,
|
|
if(
|
|
ip != '',
|
|
dictGetUInt32('${CH_DICT_DATABASE}.geolite_asn_blocks_ipv4', 'autonomous_system_number', tuple(IPv4StringToNumOrDefault(ip))),
|
|
null
|
|
) as asn
|
|
FROM ${CH_DATABASE}.mail_exchanges_data
|
|
LEFT ARRAY JOIN JSONExtractArrayRaw(ipv4, 'addresses') as mx_ipv4
|
|
WHERE ip != '' and asn is not null;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.mx_per_domain ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.mx_per_domain_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, mx_per_domain_data);
|
|
|
|
-- SOA view
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.dns_soa_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
rcode Nullable(String),
|
|
error Nullable(String)
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.dns_soa_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.dns_soa_data AS
|
|
SELECT
|
|
JSONExtractString(result, 'batch_id') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
JSONExtract(result, 'dns', 'soa', 'rcode', 'Nullable(String)') AS rcode,
|
|
JSONExtract(result, 'dns', 'soa', 'error', 'Nullable(String)') AS error
|
|
FROM
|
|
${CH_DATABASE}.raw_results_data;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.dns_soa ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.dns_soa_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, dns_soa_data);
|