mirror of
https://gitlab.rd.nic.fr/labs/frcrawler/content-clustering.git
synced 2025-04-11 23:15:14 +02:00
30 lines
728 B
Python
30 lines
728 B
Python
#
|
|
# SPDX-FileCopyrightText: 2023 Afnic
|
|
#
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
#
|
|
|
|
import logging
|
|
|
|
from frcrawler.tasks import BaseTask
|
|
|
|
from . import ContentHash
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
class ComputeHashes(BaseTask):
    """Task that builds a ``ContentHash`` from the crawled web content of a job."""

    def do(self, data):
        """Return the clustering hashes for the first usable web response.

        An entry of ``data['web']['details']`` is usable when its
        ``['is_redirect']['redirected']`` flag is falsy and its ``'response'``
        mapping contains a ``'content'`` key. Only the first usable entry is
        hashed.

        :param data: mapping of results for the job; the ``'web'`` key is
            optional, but when present it must hold a ``'details'`` iterable.
        :returns: ``{'clustering': {'hashes': <ContentHash(...).as_dict()>}}``
            when a usable entry exists, otherwise an empty dict.
        """
        # Nothing to hash when the job carries no web result.
        if 'web' not in data:
            return {}

        # Every entry is inspected (not just up to the first match), so a
        # malformed entry anywhere in the list still raises KeyError.
        candidates = []
        for entry in data['web']['details']:
            if entry['is_redirect']['redirected']:
                # Redirect hop: skipped without looking at its response.
                continue
            if 'content' in entry['response']:
                candidates.append(entry)

        if not candidates:
            return {}

        first_body = candidates[0]['response']['content']
        digest = ContentHash(first_body)
        return {'clustering': {'hashes': digest.as_dict()}}
|