mirror of
				https://gitlab.rd.nic.fr/labs/frcrawler/scripts.git
				synced 2025-11-04 15:47:00 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			29 lines
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			SQL
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
	
		
			1.2 KiB
		
	
	
	
		
			SQL
		
	
	
	
	
	
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_data ON CLUSTER ${CH_CLUSTER} (
 | 
						|
    batch_id UUID,
 | 
						|
    tld String,
 | 
						|
    metadata String,
 | 
						|
    job_id UUID,
 | 
						|
    domain String,
 | 
						|
    started_at DateTime,
 | 
						|
    ssdeep String,
 | 
						|
    lzjd String,
 | 
						|
) ENGINE = MergeTree
 | 
						|
ORDER BY (batch_id, tld, domain)
 | 
						|
TTL started_at + INTERVAL 12 MONTH;
 | 
						|
 | 
						|
CREATE MATERIALIZED VIEW IF NOT EXISTS ${CH_DATABASE}.clustering_hashes_mv ON CLUSTER ${CH_CLUSTER} TO ${CH_DATABASE}.clustering_hashes_data AS
 | 
						|
SELECT
 | 
						|
    JSONExtract(result, 'batch_id', 'UUID') as batch_id,
 | 
						|
    splitByChar('.', domain)[-1] as tld,
 | 
						|
    JSONExtractString(result, '_metadata') as metadata,
 | 
						|
    job_id,
 | 
						|
    JSONExtractString(result, 'domain') as domain,
 | 
						|
    parseDateTimeBestEffort(JSONExtractString(result, 'started_at')) as started_at,
 | 
						|
    JSONExtract(result, 'clustering', 'hashes', 'ssdeep', 'Nullable(String)') as ssdeep,
 | 
						|
    JSONExtract(result, 'clustering', 'hashes', 'lzjd', 'Nullable(String)') as lzjd
 | 
						|
FROM ${CH_DATABASE}.raw_results_data
 | 
						|
WHERE ssdeep IS NOT NULL and lzjd IS NOT NULL;
 | 
						|
 | 
						|
CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.clustering_hashes ON CLUSTER ${CH_CLUSTER}
 | 
						|
AS ${CH_DATABASE}.clustering_hashes_mv
 | 
						|
ENGINE = Distributed(${CH_CLUSTER}, ${CH_DATABASE}, clustering_hashes_mv);
 |