frcrawler-clustering/examples/dbscan.py

#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
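# Example: compute content clusters with DBSCAN from the pairwise similarity
# records stored in ClickHouse by the frcrawler content-clustering pipeline.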
import json
import logging
from urllib.parse import urlparse, urlencode, parse_qsl

from clickhouse_driver import Client

from frcrawler_content_clustering.dbscan import DbScan

FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT, level=logging.DEBUG)
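
# Connect to ClickHouse over the native TCP protocol (clickhouse-driver).
# The host, port and test credentials below belong to a local test instance;
# adjust them for your own setup.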
client = Client(host='localhost', port=19000, secure=False, password='test', user='test', database='test')
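# execute() returns a list of row tuples; the nested unpacking pulls the
# single count out of the single row returned by count(*).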
((item_counts, ), ) = client.execute('SELECT count(*) FROM clustering_content_entries WHERE batch_id = %(batch_id)s', { 'batch_id': 1},)

def build_query_url(base_url, query):
    """Return ``base_url`` with ``query`` set as the ``query`` URL parameter,
    preserving any parameters already present in the URL."""
    base_url = urlparse(base_url)
    query_string = dict(parse_qsl(base_url.query, keep_blank_values=True))
    query_string['query'] = query
    return base_url._replace(query=urlencode(query_string)).geturl()
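
# A deterministic stdlib example of what build_query_url produces:
#   build_query_url('http://localhost:8124/?database=test', 'SELECT 1')
#   -> 'http://localhost:8124/?database=test&query=SELECT+1'
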
logging.info('Reading data from db')
db_scan = DbScan(item_counts)
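# feed_from_http fetches the (first_id, second_id) similarity pairs over
# ClickHouse's HTTP interface (port 8124 here), presumably streaming them in
# the RowBinary format requested by the query; the SQL is passed through the
# standard `query` URL parameter, hence build_query_url above.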
db_scan.feed_from_http(
    build_query_url(
        'http://test:test@localhost:8124/?database=test',
        'SELECT first_id, second_id FROM clustering_similiarities WHERE batch_id = 1 ORDER BY first_id, second_id FORMAT RowBinary'
    )
)
logging.info('Computing clusters')
db_scan.compute()
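
# Persist the result; this assumes clusters() returns a JSON-serialisable
# structure describing the computed cluster assignments.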
with open('clusters.json', 'w') as out_file:
    json.dump(db_scan.clusters(), out_file)