mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/content-clustering.git
synced 2025-04-04 11:35:13 +02:00
52 lines
1.5 KiB
SQL
52 lines
1.5 KiB
SQL
create table test.clustering_similiarities (
|
|
first_id UInt32,
|
|
second_id UInt32,
|
|
batch_id UInt32,
|
|
clustering_started_at DateTime
|
|
) engine = Memory;
|
|
|
|
create table test.clustering_label_hints (
|
|
hint_id UUID,
|
|
lzjd String,
|
|
ssdeep String,
|
|
label_id UUID,
|
|
) engine = Memory;
|
|
|
|
create table test.clustering_labels (
|
|
label_id UUID,
|
|
label_name String,
|
|
) engine = Memory;
|
|
|
|
create table test.clustering_results (
|
|
batch_id UUID,
|
|
job_id UUID,
|
|
clustering_batch_id UInt32,
|
|
domain String,
|
|
cluster_label UUID,
|
|
clustering_started_at DateTime
|
|
) engine = Memory;
|
|
|
|
create table test.raw_results (
|
|
job_id UUID,
|
|
result String,
|
|
) ENGINE = Memory;
|
|
|
|
create view test.clustering_hashes as
|
|
select
|
|
JSONExtract(result, 'batch_id', 'UUID') as batch_id,
|
|
job_id,
|
|
JSONExtractString(result, 'domain') as domain,
|
|
JSONExtract(result, 'clustering', 'hashes', 'ssdeep', 'Nullable(String)') as ssdeep,
|
|
JSONExtract(result, 'clustering', 'hashes', 'lzjd', 'Nullable(String)') as lzjd
|
|
from test.raw_results;
|
|
|
|
create view test.batches as
|
|
SELECT
|
|
JSONExtract(result, 'batch_id', 'UUID') as batch_id,
|
|
JSONExtractString(result, 'job_name') as job_name,
|
|
JSONExtractString(result, 'datasource') as datasource,
|
|
min(parseDateTimeBestEffort(JSONExtractString(result, 'started_at'))) as started_at,
|
|
max(parseDateTimeBestEffort(JSONExtractString(result, 'started_at'))) as ended_at,
|
|
count(job_id) as job_count
|
|
FROM test.raw_results
|
|
GROUP BY batch_id, job_name, datasource;
|