frcrawler-scripts/scripts/create-db.py

#
# SPDX-FileCopyrightText: 2023 Afnic
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
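"""Create the crawler's ClickHouse database objects from the SQL schema files.

Illustrative invocation (the cluster and dictionary database names below are
placeholders, not defaults):

    python create-db.py --cluster main --dictionnary-database dicts --init-db
"""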
import os
import logging
import pathlib
import hashlib
from urllib.parse import urlparse
from string import Template

import click
from dotenv import load_dotenv
from clickhouse_driver import Client
import clickhouse_driver.errors

load_dotenv(os.getenv('FRCRAWLER_SCRIPT_ENV_FILE', 'crawler.env'))

FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

CH_DB_URL = os.getenv('CH_DB_URL', 'clickhouse://test:test@localhost:9001/test')
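# The default database is the path component of the connection URL, e.g.
# 'clickhouse://test:test@localhost:9001/test' -> database 'test'.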
DEFAULT_DATABASE = urlparse(CH_DB_URL).path.replace('/', '', 1)


@click.command()
@click.option(
    '--schemas',
    default=pathlib.Path(__file__).parent.parent / 'schemas',
    help='Directory containing the schema files, or a single schema file',
    type=pathlib.Path,
)
@click.option(
    '--init-db',
    is_flag=True,
    default=False,
    help='If set, also create the database and configure the users',
)
@click.option(
    '--database',
    default=DEFAULT_DATABASE,
    help='Database to create the tables in (defaults to the one in the connection string)',
)
@click.option(
    '--cluster',
    required=True,
    help='Cluster to create the tables in (queries use ON CLUSTER to create the tables and views)',
)
@click.option(
    '--dictionnary-database',
    required=True,
    help='Database where the `geolite_asn_blocks_ipv6` and `geolite_asn_blocks_ipv4` dictionaries live',
)
@click.option(
    '--dry-run',
    is_flag=True,
    default=False,
    help='If set, only print the queries instead of executing them',
)
def main(schemas, init_db, database, dictionnary_database, cluster, dry_run):
    if not database:
        logging.fatal('No database provided')
        return

    client = Client.from_url(CH_DB_URL)

    schemas_files = []
    if schemas.is_dir():
        schemas_files = [f for f in schemas.iterdir() if f.is_file()]
        schemas_files.sort()
        logging.info('Found %d scripts in directory %s', len(schemas_files), schemas)
    else:
        schemas_files = [schemas]
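
    # Schema files prefixed with '00' are presumed to hold the bootstrap DDL
    # (database and user creation), so they are skipped unless --init-db is given.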
    crawler_user_password = None
    if not init_db:
        schemas_files = [f for f in schemas_files if not f.name.startswith('00')]
    else:
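        # ClickHouse can check passwords against a SHA-256 digest (e.g.
        # `IDENTIFIED WITH sha256_hash BY '<hex>'`), so only the digest is
        # substituted into the schemas, never the cleartext password.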
        crawler_user_password = click.prompt('`crawler` user password', hide_input=True)
        crawler_user_password = hashlib.sha256(crawler_user_password.encode()).hexdigest()

    for schema_path in schemas_files:
        logging.info('Processing file %s', schema_path)
        with schema_path.open() as schema_file:
            schema = schema_file.read()
        try:
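            # safe_substitute leaves unrecognised `$placeholders` in place
            # instead of raising KeyError.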
            schema = Template(schema).safe_substitute(
                CH_DATABASE=database,
                CH_CLUSTER=cluster,
                CH_PASSWORD_SHA256=crawler_user_password,
                CH_DICT_DATABASE=dictionnary_database,
            )
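            # Naive statement split on ';' is enough here as long as no schema
            # embeds a literal semicolon inside a string.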
            for query in schema.split(';'):
                query = query.strip()
                if not query:
                    continue
                try:
                    if not dry_run:
                        client.execute(query)
                    else:
                        print('\n' + query + ';')
                except clickhouse_driver.errors.ServerException:
                    logging.exception('Error while processing query from schema %s:\n%s', schema_path, query)
                    return
        except Exception:
            logging.exception('Error while processing schema %s', schema_path)
            return


if __name__ == '__main__':
    main()