frcrawler-scripts/schemas/21-view_clustering.clickhouse.sql
Gaël Berthaud-Müller 38679a009a add all files
2024-03-06 14:49:11 +01:00

29 lines
1.2 KiB
SQL

CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_data ON CLUSTER ${CH_CLUSTER} (
batch_id UUID,
tld String,
metadata String,
job_id UUID,
domain String,
started_at DateTime,
ssdeep String,
lzjd String,
) ENGINE = MergeTree
ORDER BY (batch_id, tld, domain)
TTL started_at + INTERVAL 12 MONTH;
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.clustering_hashes_data AS
SELECT
JSONExtract(result, 'batch_id', 'UUID') as batch_id,
splitByChar('.', domain)[-1] as tld,
JSONExtractString(result, '_metadata') as metadata,
job_id,
JSONExtractString(result, 'domain') as domain,
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
JSONExtract(result, 'clustering', 'hashes', 'ssdeep', 'Nullable(String)') as ssdeep,
JSONExtract(result, 'clustering', 'hashes', 'lzjd', 'Nullable(String)') as lzjd
FROM ${CH_DATABASE}.raw_results_data
WHERE ssdeep IS NOT NULL and lzjd IS NOT NULL;
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes ON CLUSTER ${CH_CLUSTER}
AS ${CH_DATABASE}.clustering_hashes_mv
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_hashes_mv);