#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#

import os
import logging
import pathlib
import hashlib
from urllib.parse import urlparse
from string import Template

import click
from dotenv import load_dotenv
from clickhouse_driver import Client
import clickhouse_driver.errors


# Load connection settings from an env file; the file path can be overridden
# through the FRCRAWLER_SCRIPT_ENV_FILE environment variable.
load_dotenv(os.getenv('FRCRAWLER_SCRIPT_ENV_FILE', 'crawler.env'))

FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'

logging.basicConfig(format=FORMAT, level=logging.INFO)

CH_DB_URL = os.getenv('CH_DB_URL', 'clickhouse://test:test@localhost:9001/test')
# The default database is the path component of the connection URL ('test' above).
DEFAULT_DATABASE = urlparse(CH_DB_URL).path.replace('/', '', 1)
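
# A minimal sketch of a `crawler.env` file this script could load; the variable
# name matches the lookup above, the value is illustrative only:
#
#     CH_DB_URL=clickhouse://crawler:secret@ch-node-1:9000/frcrawler
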
@click.command()
@click.option(
    '--schemas',
    default=pathlib.Path(__file__).parent.parent / 'schemas',
    help='Schema directory or file',
    type=pathlib.Path,
)
@click.option(
    '--init-db',
    is_flag=True,
    default=False,
    help='If set, the script will also create the database and configure the users',
)
@click.option(
    '--database',
    default=DEFAULT_DATABASE,
    help='Database to create the tables in (defaults to the database from the connection string)',
)
@click.option(
    '--cluster',
    required=True,
    help='Cluster to create the tables in (queries use ON CLUSTER to create the tables and views)',
)
@click.option(
    '--dictionnary-database',
    required=True,
    help='Database where the `geolite_asn_blocks_ipv6` and `geolite_asn_blocks_ipv4` dictionaries live',
)
@click.option(
    '--dry-run',
    is_flag=True,
    default=False,
    help='If set, only print the queries instead of executing them',
)
def main(schemas, init_db, database, dictionnary_database, cluster, dry_run):
    if not database:
        logging.fatal('No database provided')
        return

    client = Client.from_url(CH_DB_URL)

    if schemas.is_dir():
        schemas_files = sorted(f for f in schemas.iterdir() if f.is_file())
        logging.info('Found %d scripts in directory %s', len(schemas_files), schemas)
    else:
        schemas_files = [schemas]

    crawler_user_password = None

    if not init_db:
        # Schema files prefixed with '00' bootstrap the database and its users;
        # they are skipped unless --init-db is given.
        schemas_files = [f for f in schemas_files if not f.name.startswith('00')]
    else:
        # Only the SHA-256 digest of the password is ever substituted into the queries.
        crawler_user_password = click.prompt('`crawler` user password', hide_input=True)
        crawler_user_password = hashlib.sha256(crawler_user_password.encode()).hexdigest()
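
    # Each schema file is plain SQL with string.Template placeholders that are
    # filled in below. Hypothetical examples, not actual repository schemas:
    #
    #     CREATE USER IF NOT EXISTS crawler ON CLUSTER ${CH_CLUSTER}
    #         IDENTIFIED WITH sha256_hash BY '${CH_PASSWORD_SHA256}';
    #
    #     CREATE TABLE IF NOT EXISTS ${CH_DATABASE}.example ON CLUSTER ${CH_CLUSTER}
    #         (id UInt64) ENGINE = ReplicatedMergeTree() ORDER BY id;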
    for schema_path in schemas_files:
        logging.info('Processing file %s', schema_path)

        with schema_path.open() as schema_file:
            schema = schema_file.read()

        try:
            schema = Template(schema).safe_substitute(
                CH_DATABASE=database,
                CH_CLUSTER=cluster,
                CH_PASSWORD_SHA256=crawler_user_password,
                CH_DICT_DATABASE=dictionnary_database,
            )

            # clickhouse-driver executes a single statement per call, so split
            # the schema on ';' and run each non-empty statement separately.
            for query in schema.split(';'):
                query = query.strip()
                if not query:
                    continue
                try:
                    if not dry_run:
                        client.execute(query)
                    else:
                        print('\n' + query + ';')
                except clickhouse_driver.errors.ServerException:
                    logging.exception('Error while processing query from schema %s:\n%s', schema_path, query)
                    return

        except Exception:
            logging.exception('Error while processing schema %s', schema_path)
            return


if __name__ == '__main__':
    main()
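
# Example invocation; the script file name and option values below are
# illustrative, not taken from the repository:
#
#     python create_tables.py --cluster frcrawler \
#         --dictionnary-database dictionaries --init-db --dry-run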