frcrawler-scripts/schemas/20-view_metadata.clickhouse.sql
Gaël Berthaud-Müller 38679a009a add all files
2024-03-06 14:49:11 +01:00

32 lines
1.3 KiB
SQL

CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.metadata_summary_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
started_at SimpleAggregateFunction(min, DateTime),
job_count AggregateFunction(uniq, UUID),
) ENGINE=AggregatingMergeTree ORDER BY (batch_id, tld, metadata)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.metadata_summary_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.metadata_summary_data AS
SELECT
toUUID(JSONExtractString(result, 'batch_id')) as batch_id,
splitByChar('.', JSONExtractString(result, 'domain'))[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
min(parseDateTimeBestEffort(JSONExtractString(result, 'started_at'))) as started_at,
uniqState(job_id) as job_count
FROM ${CH_DATABASE}.raw_results_data
GROUP BY batch_id, tld, metadata;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.metadata_summary_all ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.metadata_summary_data
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, metadata_summary_data);
CREATE VIEW IF NOT EXISTS ${CH_DATABASE}.metadata_summary ON CLUSTER ${CH_CLUSTER} AS
SELECT
batch_id,
tld,
metadata,
min(started_at) as started_at,
uniqMerge(job_count) as job_count
FROM ${CH_DATABASE}.metadata_summary_all
GROUP BY batch_id, tld, metadata;