frcrawler-scripts/schemas/20-view_siren.clickhouse.sql
Gaël Berthaud-Müller 38679a009a add all files
2024-03-06 14:49:11 +01:00

31 lines
1.2 KiB
SQL

CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.siren_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
siren Nullable(String),
url Nullable(String),
input Nullable(String)
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.siren_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.siren_data AS
SELECT
toUUID(JSONExtractString(result, 'batch_id')) as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtract(result, 'siren', 'siren', 'Nullable(String)') as siren,
JSONExtract(result, 'siren', 'url', 'Nullable(String)') as url,
JSONExtract(result, 'siren', 'input', 'Nullable(String)') as input
FROM ${CH_DATABASE}.raw_results_data
WHERE siren IS NOT NULL;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.siren ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.siren_mv
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, siren_mv);