#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

metrics:
  workers:
    # The PROMETHEUS_MULTIPROC_DIR environment variable needs to be set if metrics are enabled for the workers.
    # The PROMETHEUS_MULTIPROC_DIR directory needs to be emptied before starting the workers.
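    # For example, the metrics directory could be prepared like this before
    # starting the workers (illustrative shell commands; the path is an
    # assumption, not taken from the project documentation):
    #   export PROMETHEUS_MULTIPROC_DIR=/run/frcrawler-metrics
    #   rm -rf "$PROMETHEUS_MULTIPROC_DIR" && mkdir -p "$PROMETHEUS_MULTIPROC_DIR"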
    enabled: yes
    listen_port: 8150

  results_processor:
    enabled: yes
    listen_port: 8151

messaging:
  # Kombu is used as an AMQP messaging engine to distribute the jobs and get
  # the results back. It supports many transports, including Redis, RabbitMQ,
  # Amazon SQS and more.
  # More information on Kombu: <https://docs.celeryq.dev/projects/kombu/en/latest/introduction.html>
  # For configuration options see: <https://docs.celeryq.dev/projects/kombu/en/latest/reference/index.html#kombu-transports>
  type: frcrawler.messaging.kombu:KombuQueue
  config:
    transport: redis
    hostname: localhost
    port: 6379
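    # A RabbitMQ broker could presumably be configured along the same lines
    # (an assumption based on Kombu's supported transports, not verified
    # against frcrawler), for example:
    #   transport: amqp
    #   hostname: localhost
    #   port: 5672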

datasources:
  definitions:
    - name: test_ds
      # Input domains are read from a PostgreSQL query.
      # The query must at least return a column named 'domain' that contains the domains.
      # If other columns are present they will be put in a '_metadata' object.
      type: frcrawler.datasource.postgres:SimplePostgres
      config:
        db_url: postgres://frcrawler:frcrawler@localhost:5432/frcrawler?sslmode=disable
        query: SELECT 'example.org' as domain, 'something' as registrar;
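        # With the query above, each job's input would carry something like
        # {"domain": "example.org", "_metadata": {"registrar": "something"}}
        # (an illustrative shape, inferred from the '_metadata' comment above).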

    - name: jsonfile
      # Input domains are read from a file.
      # Each line of the file contains a JSON object that must have a field
      # `domain` with the domain name.
      # Extra fields will be kept and available in the result objects.
      type: frcrawler.datasource.json_file:JsonFile
      config:
        path: 'params.json'
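        # An input line of 'params.json' could look like this (illustrative):
        #   {"domain": "example.org", "registrar": "something"}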

storage_backends:
  definitions:
    - name: jsonfile
      # Results are saved in a file, one line per result.
      type: frcrawler.storage.json_file:JsonFile
      config:
        path: results.json

    - name: pg
      # Results are saved in PostgreSQL.
      type: frcrawler.storage.postgres:SimplePostgres
      config:
        # Connection string to connect to the database, see
        # <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING>
        # for more information.
        db_url: postgres://frcrawler:frcrawler@localhost:5432/frcrawler?sslmode=disable
        # Query used to save the results. The substitution strings (%s) are
        # required; the values will be passed in the following order:
        # - job_id: a UUID that is the unique identifier of the job
        # - result: a JSON-encoded string of the result data of the job
        result_query: 'INSERT INTO crawler_results (job_id, result) VALUES (%s, %s)'
        # Query used to save the batch metadata, optional. If it is omitted
        # or set to `null` the batch metadata will not be saved. The
        # substitution strings (%s) are required; the values will be passed in
        # the following order:
        # - batch_id: a UUID that is the unique identifier of the batch
        # - scheduled_at: a UTC timestamp of when the batch has finished being scheduled
        # - queue: the name of the queue used for message passing
        # - datasource: the name of the datasource as configured
        # - storage: the name of the storage engine as configured
        # - job_name: the name of the job as configured
        # - scheduled_job_count: the total number of jobs that have been scheduled
        batch_info_query: 'INSERT INTO batches_metadata (batch_id, scheduled_at, queue, datasource, storage, job_name, scheduled_job_count) VALUES (%s, %s, %s, %s, %s, %s, %s)'
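        # A minimal pair of tables accepting the two queries above could look
        # like this (a sketch; the column types are assumptions, not taken
        # from the project):
        #   CREATE TABLE crawler_results (job_id uuid, result jsonb);
        #   CREATE TABLE batches_metadata (batch_id uuid, scheduled_at timestamptz,
        #     queue text, datasource text, storage text, job_name text,
        #     scheduled_job_count integer);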

    - name: clickhouse
      # Results are saved in ClickHouse.
      type: frcrawler.storage.clickhouse:SimpleClickhouse
      config:
        # Connection string to connect to the database, see
        # <https://clickhouse-driver.readthedocs.io/en/latest/api.html#clickhouse_driver.Client.from_url>
        # for more information.
        db_url: clickhouse://frcrawler:frcrawler@localhost:9000/frcrawler
        # Query used to save the results.
        # The values will be passed in the following order:
        # - job_id: a UUID that is the unique identifier of the job
        # - result: a JSON-encoded string of the result data of the job
        result_query: 'INSERT INTO crawler_results (job_id, result) VALUES'
        # Query used to save the batch metadata, optional. If it is omitted
        # or set to `null` the batch metadata will not be saved.
        # The values will be passed in the following order:
        # - batch_id: a UUID that is the unique identifier of the batch
        # - scheduled_at: a UTC timestamp of when the batch has finished being scheduled
        # - queue: the name of the queue used for message passing
        # - datasource: the name of the datasource as configured
        # - storage: the name of the storage engine as configured
        # - job_name: the name of the job as configured
        # - scheduled_job_count: the total number of jobs that have been scheduled
        batch_info_query: 'INSERT INTO batches_metadata (batch_id, scheduled_at, queue, datasource, storage, job_name, scheduled_job_count) VALUES'
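        # Note: unlike the PostgreSQL backend, these statements carry no %s
        # placeholders; the values are presumably supplied separately by the
        # ClickHouse client in the order listed above (an assumption, not
        # verified against frcrawler).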

jobs:
  polling: 5
  definitions:
    - name: example
      tasks:
        - type: frcrawler.tasks.example:ExampleTask
          config:
            value: example

        - type: frcrawler.tasks.example:ExampleTask
          config:
            other: value

    - name: full
      tasks:
        - type: frcrawler.tasks.dns:DnsCrawler
          #config:
            # List of DNS request types to save in the result object.
            # Possible values: 'ns', 'soa', 'addresses', 'mx'
            # The requests and their associated responses are saved in the
            # array `dns.queries`.
            # Default: []
            #save_queries: ['soa']

            # List of maps representing extra DNS requests to perform.
            # Each map has a key `qname` that contains the domain to request
            # (use `{domain}` as a placeholder for the current domain) and a
            # key `qtype` for the query type.
            # The requests and their associated responses are saved in the
            # array `dns.queries`.
            # Default: []
            #custom_queries:
            #  - qname: '_matrix._tcp.{domain}'
            #    qtype: 'SRV'

            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'
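            # Putting it together, enabling this task would mean uncommenting
            # `config:` above and setting the documented options, for example
            # (illustrative values only):
            #   save_queries: ['ns', 'soa']
            #   custom_queries:
            #     - qname: '_dmarc.{domain}'
            #       qtype: 'TXT'
            #   nameservers:
            #     - '127.0.0.1'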

        - type: frcrawler.tasks.dnssec:DnssecCrawler
          #config:
            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'

        - type: frcrawler.tasks.http:HttpCrawler
          #config:
            # Value of the `User-Agent` header.
            # Defaults to the default value of the HTTPX library
            # (`python-httpx/{version}`).
            #useragent: null

            # Global timeout used to configure the connection and read timeouts,
            # in seconds.
            # Default: 5
            #timeout: 5

            # Maximum number of redirections to follow.
            # Default: 5
            #max_redirect: 5

            # If true, the crawler stops as soon as a successful non-redirected
            # response is received.
            # If false, all IP and domain combinations will be requested.
            # Default: yes
            #first_terminal: yes

            # If true, and the response includes a content-type header
            # indicating a textual content type, the response content will be
            # saved in the result field `web.details[].response.content`.
            # Default: no
            #save_content: no

            # Maximum size of the downloaded content if `save_content` is
            # enabled. Content will be truncated if its size exceeds the
            # maximum content length.
            # Default: 5242880 (5 MiB)
            #max_content_length: 5242880

            # If true, invalid certificates will be treated as errors.
            # If `first_terminal` is enabled, such a response is not considered
            # terminal and the crawler will continue.
            # Default: no
            #invalid_tls_as_error: no

            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'
- type: frcrawler.tasks.filters:IgnoreKeys
|
|
#config:
|
|
# List of paths in the result object to delete.
|
|
# Default: []
|
|
#ignore:
|
|
# - '/web/details/*/response/content'
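            # The example path above matches the page content saved by the
            # HttpCrawler when `save_content` is enabled, so large response
            # bodies can be dropped before storage (an illustrative reading
            # based on the `web.details[].response.content` field documented
            # above).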