# Source: frcrawler-clustering/examples/compute_similarity.py
# Last commit: Gaël Berthaud-Müller 767f39e53e "add all files" (2024-02-12 14:46:47 +01:00)
# 44 lines, 1.2 KiB, Python
# (repository-viewer metadata preserved as a comment so the file stays valid Python)

#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
import sys
import logging
import datetime
from clickhouse_driver import Client
from frcrawler_content_clustering import HashedContentEntry, HashedContentBag
# Log line layout: level, logger name, timestamp, source location, message.
FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
# ClickHouse connection over the native protocol.  Port 9001 rather than the
# default 9000 — presumably a local port mapping; verify against deployment.
# NOTE(review): credentials are hard-coded; acceptable only for an example script.
client = Client(host='localhost', port=9001, secure=False, password='test', user='test', database='test')
# Fetch the highest batch id already stored so this run can write batch_id + 1.
# The table name 'clustering_similiarities' (extra 'i') presumably matches the
# actual schema — do not "correct" the spelling here alone.
((last_batch_id, ), ) = client.execute('SELECT max(batch_id) FROM clustering_similiarities')
def load_content(path):
    """Read space-separated hash records from *path* into a HashedContentBag.

    Each line is expected to hold at least three space-separated fields:
    ``domain lzjd_hash ssdeep_hash`` (any extra fields are ignored, matching
    the original indexing behavior).  Blank or malformed lines are skipped
    with a warning instead of crashing the whole run.

    :param path: path to the text file of hashes
    :return: a populated HashedContentBag
    """
    logging.info('Reading hashes')
    bag = HashedContentBag()
    with open(path) as hashes:
        for line_no, raw in enumerate(hashes, start=1):
            line = raw.strip()
            if not line:
                # A blank line would previously raise IndexError on parts[1].
                continue
            try:
                # *_ swallows any trailing fields, like the old parts[0..2] indexing.
                domain, lzjd, ssdeep, *_ = line.split(' ')
            except ValueError:
                logging.warning('Skipping malformed line %d: %r', line_no, line)
                continue
            bag.insert(domain, lzjd, ssdeep)
    return bag
# Guard against a missing argument so the failure mode is a usage message,
# not an IndexError traceback.
if len(sys.argv) < 2:
    sys.exit('usage: compute_similarity.py <hashes-file>')
bag = load_content(sys.argv[1])
logging.info('Computing similarities')
# BUG FIX: datetime.utcnow() returns a *naive* datetime, and .timestamp() on a
# naive datetime interprets it as LOCAL time — producing a wrong epoch on any
# host whose timezone is not UTC.  Use an aware UTC datetime instead.
run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
# Arguments: worker count, similarity budget, ClickHouse HTTP DSN,
# next batch id, run timestamp, similarity threshold — all as in the original.
bag.compute_similarities(4, 100000000, 'http://test:test@localhost:8124/?database=test', last_batch_id + 1, run_timestamp, 80)
logging.info('Done')