From c2c5ecf6af1ce169468ad0adf68a0b827df32efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Bedna=C5=99=C3=ADk?= <jan.bednarik@gmail.com> Date: Wed, 29 Nov 2017 23:00:48 +0100 Subject: [PATCH] Custom thesaurus of czech synonyms. --- Dockerfile | 1 + README.md | 10 +++++++++- analysis/cs_CZ/synonym.txt | 8 ++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 analysis/cs_CZ/synonym.txt diff --git a/Dockerfile b/Dockerfile index 016a5e5..84f2356 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,3 +4,4 @@ RUN elasticsearch-plugin remove --purge x-pack RUN elasticsearch-plugin install analysis-icu ADD hunspell/ /usr/share/elasticsearch/config/hunspell/ +ADD analysis/ /usr/share/elasticsearch/config/analysis/ diff --git a/README.md b/README.md index ac6a62e..063f786 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,11 @@ Create Index with these settings: 'type': 'hunspell', 'locale': 'cs_CZ', 'dedup': True, - } + }, + 'czech_synonym': { + 'type': 'synonym', + 'synonyms_path': 'analysis/cs_CZ/synonym.txt', + }, }, 'analyzer': { 'czech': { @@ -38,6 +42,7 @@ Create Index with these settings: 'filter': [ 'icu_folding', 'lowercase', + 'czech_synonym', 'czech_stop', 'czech_stemmer', 'cs_CZ', @@ -51,6 +56,9 @@ Create Index with these settings: You can use `'czech'` analyzer on text fields now. +A custom dictionary of synonyms is included. You can adjust it to your +needs or remove it from the analyzer settings. + ## Build If you don't want to use pre-built container from diff --git a/analysis/cs_CZ/synonym.txt b/analysis/cs_CZ/synonym.txt new file mode 100644 index 0000000..a76a5e2 --- /dev/null +++ b/analysis/cs_CZ/synonym.txt @@ -0,0 +1,8 @@ +# Czech Synonyms in Solr format +# ============================= +# +# Write a comma-separated list of lowercase words without diacritics. +# One set of synonyms per line. + +kava, kafe +tramvaj, salina -- GitLab