frcrawler-clustering/frcrawler_content_clustering/crawler.py
Gaël Berthaud-Müller 767f39e53e add all files
2024-02-12 14:46:47 +01:00

30 lines
728 B
Python

#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
import logging
from frcrawler.tasks import BaseTask
from . import ContentHash
logger = logging.getLogger(__name__)
class ComputeHashes(BaseTask):
    """Task that computes content hashes from the ``'web'`` section of its input."""

    def do(self, data):
        """Hash the content of the first terminal web response found in *data*.

        A response is treated as terminal here when its
        ``['is_redirect']['redirected']`` value is falsy and its
        ``['response']`` container has a ``'content'`` entry.

        Returns ``{'clustering': {'hashes': ...}}`` where the inner value is
        ``ContentHash(content).as_dict()`` for the first such response, or an
        empty dict when *data* has no ``'web'`` key or no response qualifies.
        """
        if 'web' not in data:
            return {}

        # Every entry is inspected, even though only the first match is used.
        candidates = []
        for entry in data['web']['details']:
            if entry['is_redirect']['redirected'] or 'content' not in entry['response']:
                continue
            candidates.append(entry)

        if not candidates:
            return {}

        content = candidates[0]['response']['content']
        digest = ContentHash(content)
        return {'clustering': {'hashes': digest.as_dict()}}