# frcrawler/config.example.yml

#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
metrics:
  workers:
    # The PROMETHEUS_MULTIPROC_DIR environment variable needs to be set if metrics are enabled for workers.
    # The PROMETHEUS_MULTIPROC_DIR directory needs to be emptied before starting the workers.
    enabled: yes
    listen_port: 8150
  results_processor:
    enabled: yes
    listen_port: 8151
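  # Illustration only (the path and commands are placeholders, not part of the
  # project): one way to prepare the multiprocess metrics directory before
  # starting the workers:
  #   export PROMETHEUS_MULTIPROC_DIR=/var/run/frcrawler-metrics
  #   mkdir -p "$PROMETHEUS_MULTIPROC_DIR" && rm -f "$PROMETHEUS_MULTIPROC_DIR"/*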
messaging:
  # Kombu is used as an AMQP messaging engine to distribute the jobs and get
  # the results back. It supports many transports, including Redis, RabbitMQ,
  # Amazon SQS and more.
  # More information on Kombu: <https://docs.celeryq.dev/projects/kombu/en/latest/introduction.html>
  # For configuration options see: <https://docs.celeryq.dev/projects/kombu/en/latest/reference/index.html#kombu-transports>
  type: frcrawler.messaging.kombu:KombuQueue
  config:
    transport: redis
    hostname: localhost
    port: 6379
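    # Commented sketch of a RabbitMQ transport instead of Redis; this assumes
    # the keys below are handed straight to kombu.Connection (adjust to your
    # deployment before uncommenting):
    #transport: pyamqp
    #hostname: localhost
    #port: 5672
    #userid: guest
    #password: guest
    #virtual_host: '/'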
datasources:
  definitions:
    - name: test_ds
      # Input domains are read from a PostgreSQL query.
      # The query must at least return a column named 'domain' that contains the domain names.
      # If other columns are present, they will be put in a '_metadata' object.
      type: frcrawler.datasource.postgres:SimplePostgres
      config:
        db_url: postgres://frcrawler:frcrawler@localhost:5432/frcrawler?sslmode=disable
        query: SELECT 'example.org' as domain, 'something' as registrar;
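        # With the query above, each scheduled job would receive an input
        # roughly like the following (illustration only, the exact layout may
        # differ):
        #   {"domain": "example.org", "_metadata": {"registrar": "something"}}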
    - name: jsonfile
      # Input domains are read from a file.
      # Each line of the file contains a JSON object that must have a field
      # `domain` with the domain name.
      # Extra fields will be kept and available in the result objects.
      type: frcrawler.datasource.json_file:JsonFile
      config:
        path: 'params.json'
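        # Example line of params.json (the `registrar` field is illustrative;
        # any extra field is simply carried over into the results):
        #   {"domain": "example.org", "registrar": "example registrar"}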
storage_backends:
  definitions:
    - name: jsonfile
      # Results are saved in a file, one line per result.
      type: frcrawler.storage.json_file:JsonFile
      config:
        path: results.json
    - name: pg
      # Results are saved in PostgreSQL.
      type: frcrawler.storage.postgres:SimplePostgres
      config:
        # Connection string to connect to the database, see
        # <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING>
        # for more information.
        db_url: postgres://frcrawler:frcrawler@localhost:5432/frcrawler?sslmode=disable
        # Query used to save the results. The substitution strings (%s) are
        # required; the values will be passed in the following order:
        # - job_id: a UUID that uniquely identifies the job
        # - result: a JSON-encoded string of the result data of the job
        result_query: 'INSERT INTO crawler_results (job_id, result) VALUES (%s, %s)'
        # Query used to save the batch metadata, optional. If it is omitted
        # or set to `null`, the batch metadata will not be saved. The
        # substitution strings (%s) are required; the values will be passed in
        # the following order:
        # - batch_id: a UUID that uniquely identifies the batch
        # - scheduled_at: a UTC timestamp of when the batch has finished being scheduled
        # - queue: the name of the queue used for message passing
        # - datasource: the name of the datasource as configured
        # - storage: the name of the storage engine as configured
        # - job_name: the name of the job as configured
        # - scheduled_job_count: the total number of jobs that have been scheduled
        batch_info_query: 'INSERT INTO batches_metadata (batch_id, scheduled_at, queue, datasource, storage, job_name, scheduled_job_count) VALUES (%s, %s, %s, %s, %s, %s, %s)'
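        # Illustrative target schema only; it is not shipped with this file and
        # the column types are assumptions matching the placeholders above:
        #   CREATE TABLE crawler_results (job_id uuid, result jsonb);
        #   CREATE TABLE batches_metadata (batch_id uuid, scheduled_at timestamptz,
        #     queue text, datasource text, storage text, job_name text,
        #     scheduled_job_count integer);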
    - name: clickhouse
      # Results are saved in ClickHouse.
      type: frcrawler.storage.clickhouse:SimpleClickhouse
      config:
        # Connection string to connect to the database, see
        # <https://clickhouse-driver.readthedocs.io/en/latest/api.html#clickhouse_driver.Client.from_url>
        # for more information.
        db_url: clickhouse://frcrawler:frcrawler@localhost:9000/frcrawler
        # Query used to save the results.
        # The values will be passed in the following order:
        # - job_id: a UUID that uniquely identifies the job
        # - result: a JSON-encoded string of the result data of the job
        result_query: 'INSERT INTO crawler_results (job_id, result) VALUES'
        # Query used to save the batch metadata, optional. If it is omitted
        # or set to `null`, the batch metadata will not be saved.
        # The values will be passed in the following order:
        # - batch_id: a UUID that uniquely identifies the batch
        # - scheduled_at: a UTC timestamp of when the batch has finished being scheduled
        # - queue: the name of the queue used for message passing
        # - datasource: the name of the datasource as configured
        # - storage: the name of the storage engine as configured
        # - job_name: the name of the job as configured
        # - scheduled_job_count: the total number of jobs that have been scheduled
        batch_info_query: 'INSERT INTO batches_metadata (batch_id, scheduled_at, queue, datasource, storage, job_name, scheduled_job_count) VALUES'
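        # Illustrative table sketch only (engine and column types are
        # assumptions, adapt to your own schema):
        #   CREATE TABLE crawler_results (job_id UUID, result String)
        #     ENGINE = MergeTree ORDER BY job_id;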
jobs:
  polling: 5
  definitions:
    - name: example
      tasks:
        - type: frcrawler.tasks.example:ExampleTask
          config:
            value: example
        - type: frcrawler.tasks.example:ExampleTask
          config:
            other: value
    - name: full
      tasks:
        - type: frcrawler.tasks.dns:DnsCrawler
          #config:
            # List of DNS request types to save in the result object.
            # Possible values: 'ns', 'soa', 'addresses', 'mx'
            # The requests and their associated responses are saved in the
            # array `dns.queries`.
            # Default: []
            #save_queries: ['soa']
            # List of maps representing extra DNS requests to perform.
            # Each map has a key `qname` that contains the domain to query
            # (use `{domain}` as a placeholder for the current domain) and a
            # key `qtype` for the query type.
            # The requests and their associated responses are saved in the
            # array `dns.queries`.
            # Default: []
            #custom_queries:
            #  - qname: '_matrix._tcp.{domain}'
            #    qtype: 'SRV'
            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'
        - type: frcrawler.tasks.dnssec:DnssecCrawler
          #config:
            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'
        - type: frcrawler.tasks.http:HttpCrawler
          #config:
            # Value of the `User-Agent` header.
            # Defaults to the HTTPX library's default value
            # (`python-httpx/{version}`).
            #useragent: null
            # Global timeout used to configure the connection and read timeout,
            # in seconds.
            # Default: 5
            #timeout: 5
            # Maximum number of redirections.
            # Default: 5
            #max_redirect: 5
            # If true, the crawler stops as soon as a successful non-redirected
            # response is received.
            # If false, all IP and domain combinations will be requested.
            # Default: yes
            #first_terminal: yes
            # If true, and the response includes a content type header
            # indicating textual content, the response body will be saved in
            # the result field `web.details[].response.content`.
            # Default: no
            #save_content: no
            # Maximum size of the downloaded content if `save_content` is
            # enabled. Content will be truncated if its size exceeds the
            # maximum content length.
            # Default: 5242880 (5MiB)
            #max_content_length: 5242880
            # If true, invalid certificates will be treated as errors.
            # If `first_terminal` is enabled, this will cause the crawler to
            # continue.
            # Default: no
            #invalid_tls_as_error: no
            # List of resolver addresses to use.
            # If empty, the crawler uses the nameservers configured on the host.
            # Default: []
            #nameservers:
            #  - '127.0.0.1'
            #  - '[::1]:5353'
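          # For reference, an enabled configuration using only the options
          # documented above might look like this (values chosen purely for
          # illustration):
          #config:
          #  timeout: 10
          #  max_redirect: 10
          #  save_content: yes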
        - type: frcrawler.tasks.filters:IgnoreKeys
          #config:
            # List of paths in the result object to delete.
            # Default: []
            #ignore:
            #  - '/web/details/*/response/content'
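            # Illustration (result shape assumed from the HttpCrawler notes
            # above): the path above would turn
            #   {"web": {"details": [{"response": {"content": "<html>..."}}]}}
            # into
            #   {"web": {"details": [{"response": {}}]}}
            # before the result is stored.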