#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

import sys
import logging
import datetime

from clickhouse_driver import Client

from frcrawler_content_clustering import HashedContentEntry, HashedContentBag

FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
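
# Connect over the ClickHouse native protocol; the host, port and the
# user/password pair below are the test values used by this script, so
# adjust them for a real deployment.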
client = Client(host='localhost', port=9001, secure=False, password='test', user='test', database='test')
# Read the highest batch_id already stored, so this run's results are
# written under a fresh id (last_batch_id + 1 below).
((last_batch_id, ), ) = client.execute('SELECT max(batch_id) FROM clustering_similiarities')

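
# The loader below expects one entry per line, with three space-separated
# fields: the domain name, its LZJD digest and its ssdeep digest.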
def load_content(path):
    logging.info('Reading hashes')

    bag = HashedContentBag()

    with open(path) as hashes:
        for line in hashes:
            parts = line.strip().split(' ')
            domain = parts[0]
            lzjd = parts[1]
            ssdeep = parts[2]

            bag.insert(domain, lzjd, ssdeep)

    return bag

bag = load_content(sys.argv[1])
logging.info('Computing similarities')
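# Three of the positional arguments below are evident from the code: the
# ClickHouse HTTP endpoint that receives the results, the new batch id
# (last_batch_id + 1) and the batch timestamp. The other values
# (4, 100000000, 80) are not documented in this script; 80 plausibly acts
# as a similarity threshold, but treat these readings as assumptions and
# check the frcrawler_content_clustering documentation.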
bag.compute_similarities(4, 100000000, 'http://test:test@localhost:8124/?database=test', last_batch_id + 1, int(datetime.datetime.utcnow().timestamp()), 80)
logging.info('Done')
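
# Example invocation (the script and input file names are illustrative):
#
#   python compute_similarities.py hashes.txt
#
# where each line of hashes.txt looks like:
#
#   example.fr <lzjd digest> <ssdeep digest>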