mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/content-clustering.git
synced 2025-04-11 23:15:14 +02:00
41 lines
1.3 KiB
Rust
41 lines
1.3 KiB
Rust
//
|
|
// SPDX-FileCopyrightText: 2023 Afnic
|
|
//
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
//
|
|
|
|
use std::env;
|
|
use std::fs::File;
|
|
use std::io::BufReader;
|
|
use std::io::prelude::*;
|
|
use std::sync::Arc;
|
|
use std::time::SystemTime;
|
|
|
|
use content_classification::init_logger;
|
|
use content_classification::similarity::{HashedContentBag, HashedContentEntry, ComputeSimilarity};
|
|
use content_classification::storage::DbClickhouseSaver;
|
|
|
|
fn main () {
|
|
init_logger();
|
|
|
|
let filename = env::args().nth(1).unwrap();
|
|
let file = File::open(filename).unwrap();
|
|
let buf_reader = BufReader::new(file);
|
|
let bag: HashedContentBag = buf_reader.lines()
|
|
.map(|line| line.unwrap())
|
|
.map(|line| {
|
|
let splitted_line = line.split(' ').collect::<Vec<_>>();
|
|
(splitted_line[0].to_string(), HashedContentEntry::from_encoded_str(splitted_line[1], splitted_line[2]).unwrap())
|
|
})
|
|
.collect();
|
|
let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).expect("System time before unix epoch");
|
|
|
|
let system = ComputeSimilarity {
|
|
data: Arc::new(bag),
|
|
saver: DbClickhouseSaver::new("http://test:test@localhost:8124/?database=test", 1, now.as_secs(), 80),
|
|
nb_workers: 1,
|
|
max_memory: 100_000_000,
|
|
};
|
|
|
|
system.compute();
|
|
}
|