# mirror of https://gitlab.rd.nic.fr/labs/frcrawler/content-clustering.git
# synced 2025-04-11 23:15:14 +02:00
#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

import json
import logging
from urllib.parse import urlparse, urlencode, parse_qsl

from clickhouse_driver import Client

from frcrawler_content_clustering.dbscan import DbScan

# Verbose single-line log format including the source location of each record.
FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'

logging.basicConfig(format=FORMAT, level=logging.DEBUG)

# Native-protocol connection to the local test ClickHouse instance.
client = Client(host='localhost', port=19000, secure=False, password='test', user='test', database='test')

# Number of content entries in batch 1 -- sizes the DbScan working set below.
# execute() returns a list of row tuples; unpack the single count cell.
((item_counts, ), ) = client.execute(
    'SELECT count(*) FROM clustering_content_entries WHERE batch_id = %(batch_id)s',
    {'batch_id': 1},
)
def build_query_url(base_url, query):
    """Return *base_url* with its ``query`` query-string parameter set to *query*.

    Parameters already present in the URL (e.g. ``database=...``) are kept;
    an existing ``query`` parameter is overwritten.
    """
    parts = urlparse(base_url)
    # keep_blank_values so parameters like "?flag=" survive the round-trip
    params = dict(parse_qsl(parts.query, keep_blank_values=True))
    params['query'] = query
    return parts._replace(query=urlencode(params)).geturl()
logging.info('Reading data from db')

# Stream the pre-computed pairwise similarities for batch 1 straight from
# ClickHouse's HTTP interface (RowBinary) into the clustering engine.
db_scan = DbScan(item_counts)
similarity_url = build_query_url(
    'http://test:test@localhost:8124/?database=test',
    'SELECT first_id, second_id FROM clustering_similiarities WHERE batch_id = 1 order by first_id, second_id FORMAT RowBinary',
)
db_scan.feed_from_http(similarity_url)

logging.info('Computing clusters')
db_scan.compute()

# Persist the cluster assignment next to the script.
with open('clusters.json', 'w') as out_file:
    json.dump(db_scan.clusters(), out_file)