-- frcrawler-clustering/schema.dev.sql
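
-- Pairs of items (first_id, second_id) recorded as similar within a clustering batch.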
create table test.clustering_similiarities (
    first_id UInt32,
    second_id UInt32,
    batch_id UInt32,
    clustering_started_at DateTime
) engine = Memory;
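
-- Label hints: known lzjd/ssdeep hashes associated with a label_id.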
create table test.clustering_label_hints (
    hint_id UUID,
    lzjd String,
    ssdeep String,
    label_id UUID
) engine = Memory;
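
-- Human-readable names for cluster labels.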
create table test.clustering_labels (
    label_id UUID,
    label_name String
) engine = Memory;
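
-- Cluster assignment (cluster_label) per domain for a given crawl batch and clustering run.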
create table test.clustering_results (
    batch_id UUID,
    job_id UUID,
    clustering_batch_id UInt32,
    domain String,
    cluster_label UUID,
    clustering_started_at DateTime
) engine = Memory;
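
-- Raw crawl job results stored as JSON strings; the views below extract fields from them.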
create table test.raw_results (
    job_id UUID,
    result String
) engine = Memory;
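
-- Per-job fuzzy hashes (ssdeep, lzjd) extracted from the raw JSON results; either may be null.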
create view test.clustering_hashes as
select
    JSONExtract(result, 'batch_id', 'UUID') as batch_id,
    job_id,
    JSONExtractString(result, 'domain') as domain,
    JSONExtract(result, 'clustering', 'hashes', 'ssdeep', 'Nullable(String)') as ssdeep,
    JSONExtract(result, 'clustering', 'hashes', 'lzjd', 'Nullable(String)') as lzjd
from test.raw_results;
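
-- One row per (batch_id, job_name, datasource): first/last job start times and job count.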
create view test.batches as
select
    JSONExtract(result, 'batch_id', 'UUID') as batch_id,
    JSONExtractString(result, 'job_name') as job_name,
    JSONExtractString(result, 'datasource') as datasource,
    min(parseDateTimeBestEffort(JSONExtractString(result, 'started_at'))) as started_at,
    max(parseDateTimeBestEffort(JSONExtractString(result, 'started_at'))) as ended_at,
    count(job_id) as job_count
from test.raw_results
group by batch_id, job_name, datasource;
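
-- Example query (illustrative only, not part of the schema): fuzzy hashes per batch,
-- newest batch first. Assumes test.raw_results has been populated with crawl output.
-- select b.batch_id, b.job_name, h.domain, h.ssdeep, h.lzjd
-- from test.batches as b
-- join test.clustering_hashes as h on h.batch_id = b.batch_id
-- order by b.started_at desc, h.domain;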