mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/scripts.git
synced 2025-04-04 19:45:48 +02:00
74 lines
3.1 KiB
SQL
74 lines
3.1 KiB
SQL
-- Clustering
|
|
|
|
-- * clustering_similiarities : Table temporaire pour stocker les similarité entre hashs.
|
|
-- Si deux hash passe le test de similarité alors ils sont insérer dans cette table, l'ordre
|
|
-- entre first_id et second_id n'a pas d'importance.
|
|
-- * clustering_label_hints : Table stockant des indices pour créer les labels, le `hint_id` doit
|
|
-- être unique, label_id correspond au label dans table `clustering_labels`.
|
|
-- * clustering_labels : Liste des labels existant, `label_id` est unique dans cette table.
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_similiarities_data ON CLUSTER ${CH_CLUSTER} (
|
|
first_id UInt32,
|
|
second_id UInt32,
|
|
batch_id UInt32,
|
|
clustering_started_at DateTime
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, first_id, second_id)
|
|
TTL clustering_started_at + INTERVAL 4 MONTH;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_label_hints_data ON CLUSTER ${CH_CLUSTER} (
|
|
hint_id UUID,
|
|
label_id UUID,
|
|
lzjd String,
|
|
ssdeep String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (hint_id, label_id);
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_labels_data ON CLUSTER ${CH_CLUSTER} (
|
|
label_id UUID,
|
|
label_name String,
|
|
) ENGINE = MergeTree
|
|
ORDER BY (label_id, label_name);
|
|
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_batches_metadata_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
clustering_batch_id Nullable(UInt32),
|
|
started_at DateTime,
|
|
ended_at Nullable(DateTime)
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id)
|
|
TTL started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_batches_metadata ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_batches_metadata_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_batches_metadata_data, murmurHash3_32(toString(batch_id)));
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_results_data ON CLUSTER ${CH_CLUSTER} (
|
|
batch_id UUID,
|
|
tld String,
|
|
metadata String,
|
|
job_id UUID,
|
|
clustering_batch_id Nullable(UInt32),
|
|
domain String,
|
|
cluster_label Nullable(UUID),
|
|
clustering_started_at DateTime
|
|
) ENGINE = MergeTree
|
|
ORDER BY (batch_id, tld, domain)
|
|
TTL clustering_started_at + INTERVAL 12 MONTH;
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_similiarities ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_similiarities_data
|
|
ENGINE = Distributed(${CH_CLUSTER},${CH_DATABASE}, clustering_similiarities_data, rand());
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_label_hints ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_label_hints_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_label_hints_data, rand());
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_labels ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_labels_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_labels_data, rand());
|
|
|
|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_results ON CLUSTER ${CH_CLUSTER}
|
|
AS ${CH_DATABASE}.clustering_results_data
|
|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_results_data, murmurHash3_32(toString(job_id)));
|