# mirror of https://gitlab.rd.nic.fr/labs/frcrawler/content-clustering.git
# synced 2025-04-11 23:15:14 +02:00
#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

import json
import logging
from urllib.parse import urlparse, urlencode, parse_qsl

from clickhouse_driver import Client

from frcrawler_content_clustering.dbscan import DbScan

# Verbose single-line log format including the source location of each record.
FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'

logging.basicConfig(format=FORMAT, level=logging.DEBUG)

# Native-protocol connection to the local test ClickHouse instance.
client = Client(host='localhost', port=19000, secure=False, password='test', user='test', database='test')

# Number of content entries in batch 1 -- sizes the DbScan working set below.
# execute() returns a list of row tuples; unpack the single count cell.
((item_counts, ), ) = client.execute(
    'SELECT count(*) FROM clustering_content_entries WHERE batch_id = %(batch_id)s',
    {'batch_id': 1},
)
def build_query_url(base_url, query):
    """Return *base_url* with its ``query`` query-string parameter set to *query*.

    Parameters already present in the URL (e.g. ``database=...``) are kept;
    an existing ``query`` parameter is overwritten.
    """
    parts = urlparse(base_url)
    # keep_blank_values so parameters like "?flag=" survive the round-trip
    params = dict(parse_qsl(parts.query, keep_blank_values=True))
    params['query'] = query
    return parts._replace(query=urlencode(params)).geturl()
logging.info('Reading data from db')

# Stream the pre-computed pairwise similarities for batch 1 straight from
# ClickHouse's HTTP interface (RowBinary) into the clustering engine.
db_scan = DbScan(item_counts)
similarity_url = build_query_url(
    'http://test:test@localhost:8124/?database=test',
    'SELECT first_id, second_id FROM clustering_similiarities WHERE batch_id = 1 order by first_id, second_id FORMAT RowBinary',
)
db_scan.feed_from_http(similarity_url)

logging.info('Computing clusters')
db_scan.compute()

# Persist the cluster assignment next to the script.
with open('clusters.json', 'w') as out_file:
    json.dump(db_scan.clusters(), out_file)