nyaa/es_mapping.yml

146 lines
3.8 KiB
YAML

---
# CREATE DTABASE/TABLE equivalent for elasticsearch, in yaml
# fo inline comments.
settings:
analysis:
analyzer:
my_search_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- lowercase
my_index_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- resolution
- lowercase
- word_delimit
- my_ngram
- trim_zero
- unique
# For exact matching - separate each character for substring matching + lowercase
exact_analyzer:
tokenizer: exact_tokenizer
filter:
- lowercase
# For matching full words longer than the ngram limit (15 chars)
my_fullword_index_analyzer:
type: custom
tokenizer: standard
char_filter:
- my_char_filter
filter:
- lowercase
- word_delimit
# Skip tokens shorter than N characters,
# since they're already indexed in the main field
- fullword_min
- unique
tokenizer:
# Splits input into characters, for exact substring matching
exact_tokenizer:
type: pattern
pattern: "(.)"
group: 1
filter:
my_ngram:
type: edge_ngram
min_gram: 1
max_gram: 15
fullword_min:
type: length
# Remember to change this if you change the max_gram below!
min: 16
resolution:
type: pattern_capture
patterns: ["(\\d+)[xX](\\d+)"]
trim_zero:
type: pattern_capture
patterns: ["0*([0-9]*)"]
word_delimit:
type: word_delimiter_graph
preserve_original: true
split_on_numerics: false
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-word-delimiter-graph-tokenfilter.html#word-delimiter-graph-tokenfilter-configure-parms
# since we're using "trim" filters downstream, otherwise
# you get weird lucene errors about startOffset
adjust_offsets: false
char_filter:
my_char_filter:
type: mapping
mappings: ["-=>_", "!=>_", "_=>\\u0020"]
index:
# we're running a single es node, so no sharding necessary,
# plus replicas don't really help either.
number_of_shards: 1
number_of_replicas : 0
query:
default_field: display_name
mappings:
# disable elasticsearch's "helpful" autoschema
dynamic: false
properties:
id:
type: long
display_name:
# TODO could do a fancier tokenizer here to parse out the
# the scene convention of stuff in brackets, plus stuff like k-on
type: text
analyzer: my_index_analyzer
fielddata: true # Is this required?
fields:
# Multi-field for full-word matching (when going over ngram limits)
# Note: will have to be queried for, not automatic
fullword:
type: text
analyzer: my_fullword_index_analyzer
# Stored for exact phrase matching
exact:
type: text
analyzer: exact_analyzer
created_time:
type: date
#
# Only in the ES index for generating magnet links
info_hash:
type: keyword
index: false
filesize:
type: long
anonymous:
type: boolean
trusted:
type: boolean
remake:
type: boolean
complete:
type: boolean
hidden:
type: boolean
deleted:
type: boolean
has_torrent:
type: boolean
download_count:
type: long
leech_count:
type: long
seed_count:
type: long
comment_count:
type: long
# these ids are really only for filtering, thus keyword
uploader_id:
type: keyword
main_category_id:
type: keyword
sub_category_id:
type: keyword