frcrawler-clustering/examples/compute_similarity.rs
Gaël Berthaud-Müller 767f39e53e add all files
2024-02-12 14:46:47 +01:00

41 lines
1.3 KiB
Rust

//
// SPDX-FileCopyrightText: 2023 Afnic
//
// SPDX-License-Identifier: GPL-3.0-or-later
//
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::io::prelude::*;
use std::sync::Arc;
use std::time::SystemTime;
use content_classification::init_logger;
use content_classification::similarity::{HashedContentBag, HashedContentEntry, ComputeSimilarity};
use content_classification::storage::DbClickhouseSaver;
/// Example entry point: loads hashed content entries from a text file and runs
/// the similarity computation over them.
///
/// Usage: `compute_similarity <input-file>`
///
/// Each line of the input file must hold at least three fields separated by a
/// single space character: a key, followed by the two encoded strings that are
/// handed to `HashedContentEntry::from_encoded_str`. Any further fields on the
/// line are ignored.
///
/// If no input file is given, a usage message is printed to stderr and the
/// process exits with status 2.
///
/// # Panics
///
/// Panics with a descriptive message if the file cannot be opened or read, if
/// a line has fewer than three fields, if an entry cannot be decoded, or if the
/// system clock is set before the Unix epoch.
fn main() {
    init_logger();

    // A missing argument is a usage error, not a bug: report it instead of
    // panicking on `None`.
    let filename = env::args().nth(1).unwrap_or_else(|| {
        eprintln!("usage: compute_similarity <input-file>");
        std::process::exit(2)
    });
    let file = File::open(&filename)
        .unwrap_or_else(|err| panic!("cannot open {}: {}", filename, err));

    let bag: HashedContentBag = BufReader::new(file)
        .lines()
        .enumerate()
        .map(|(index, line)| {
            // 1-based line number, only used to locate problems in the input.
            let line_no = index + 1;
            let line = line
                .unwrap_or_else(|err| panic!("{}:{}: read error: {}", filename, line_no, err));

            // Pull the three fields straight from the iterator; no need to
            // collect the whole line into a vector first.
            let mut fields = line.split(' ');
            let (key, first, second) = match (fields.next(), fields.next(), fields.next()) {
                (Some(key), Some(first), Some(second)) => (key, first, second),
                _ => panic!(
                    "{}:{}: expected at least 3 space-separated fields",
                    filename, line_no
                ),
            };

            let entry = HashedContentEntry::from_encoded_str(first, second)
                .expect("failed to decode hashed content entry from input line");
            (key.to_string(), entry)
        })
        .collect();

    let now = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .expect("System time before unix epoch");

    // NOTE(review): the saver URL and the numeric arguments are hard-coded
    // example values; their exact meaning is defined by `DbClickhouseSaver::new`
    // (not visible here) — confirm before reusing outside of a local test setup.
    let system = ComputeSimilarity {
        data: Arc::new(bag),
        saver: DbClickhouseSaver::new(
            "http://test:test@localhost:8124/?database=test",
            1,
            now.as_secs(),
            80,
        ),
        nb_workers: 1,
        // NOTE(review): presumably a byte budget — verify against `ComputeSimilarity`.
        max_memory: 100_000_000,
    };
    system.compute();
}