#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

import sys
import logging
import datetime

from clickhouse_driver import Client

from frcrawler_content_clustering import HashedContentEntry, HashedContentBag

FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
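
# Connect over the ClickHouse native protocol; the host, port and the
# user/password pair below are the test values used by this script, so
# adjust them for a real deployment.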
client = Client(host='localhost', port=9001, secure=False, password='test', user='test', database='test')
# Read the highest batch_id already stored, so this run's results are
# written under a fresh id (last_batch_id + 1 below).
((last_batch_id, ), ) = client.execute('SELECT max(batch_id) FROM clustering_similiarities')

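
# The loader below expects one entry per line, with three space-separated
# fields: the domain name, its LZJD digest and its ssdeep digest.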
def load_content(path):
    logging.info('Reading hashes')

    bag = HashedContentBag()

    with open(path) as hashes:
        for line in hashes:
            parts = line.strip().split(' ')
            domain = parts[0]
            lzjd = parts[1]
            ssdeep = parts[2]

            bag.insert(domain, lzjd, ssdeep)

    return bag

bag = load_content(sys.argv[1])
logging.info('Computing similarities')
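# Three of the positional arguments below are evident from the code: the
# ClickHouse HTTP endpoint that receives the results, the new batch id
# (last_batch_id + 1) and the batch timestamp. The other values
# (4, 100000000, 80) are not documented in this script; 80 plausibly acts
# as a similarity threshold, but treat these readings as assumptions and
# check the frcrawler_content_clustering documentation.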
bag.compute_similarities(4, 100000000, 'http://test:test@localhost:8124/?database=test', last_batch_id + 1, int(datetime.datetime.utcnow().timestamp()), 80)
logging.info('Done')
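
# Example invocation (the script and input file names are illustrative):
#
#   python compute_similarities.py hashes.txt
#
# where each line of hashes.txt looks like:
#
#   example.fr <lzjd digest> <ssdeep digest>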