mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/scripts.git
synced 2025-04-04 19:45:48 +02:00
29 lines
1.2 KiB
SQL
29 lines
1.2 KiB
SQL
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
domain String,
|
|
started_at DateTime,
|
|
ssdeep String,
|
|
lzjd String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.clustering_hashes_data AS
|
|
SELECT
|
|
JSONExtract(result, 'batch_id', 'UUID') as batch_id,
|
|
splitByChar('.', domain)[-1] as tld,
|
|
JSONExtractString(result, '_metadata') as metadata,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
|
|
JSONExtract(result, 'clustering', 'hashes', 'ssdeep', 'Nullable(String)') as ssdeep,
|
|
JSONExtract(result, 'clustering', 'hashes', 'lzjd', 'Nullable(String)') as lzjd
|
|
FROM ${CH_DATABASE}.raw_results_data
|
|
WHERE ssdeep IS NOT NULL and lzjd IS NOT NULL;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_hashes_mv
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_hashes_mv);
|