本文主要介绍在执行一次字符串搜索(query string)时,如何排查查询字符串与返回数据之间的匹配过程。
注意:因为query string有自己的查询语法,所以查询字符串中的内容(例如“-”)很容易与其语法中的operator冲突。
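首先创建索引。下面的内容是 GET test_rs_012_index 返回的索引定义;用它创建索引时,需要去掉最外层的索引名,以及 provided_name、creation_date、uuid、version 这些由ES自动生成的设置,再把 aliases、mappings、settings 作为 PUT test_rs_012_index 的请求体。另外 icu_folding 需要安装 analysis-icu 插件。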
{
"test_rs_012_index" : {
"aliases" : { },
"mappings" : {
"dynamic" : "strict",
"properties" : {
"title" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
},
"ngram" : {
"type" : "text",
"term_vector" : "with_positions_offsets",
"analyzer" : "ngram_analyzer",
"search_analyzer" : "ngram_search_analyzer",
"search_quote_analyzer" : "ngram_quote_search_analyzer"
}
},
"analyzer" : "text_analyzer"
}
}
},
"settings" : {
"index" : {
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"mapping" : {
"total_fields" : {
"limit" : "10000"
}
},
"number_of_shards" : "1",
"provided_name" : "test_rs_012_index",
"creation_date" : "1737513576830",
"analysis" : {
"filter" : {
"ngram_identifier_search_filter" : {
"length" : "36",
"type" : "truncate"
},
"ngram_search_filter" : {
"length" : "36",
"type" : "truncate"
},
"stemmer_en" : {
"name" : "light_english",
"type" : "stemmer"
},
"ngram_filter" : {
"type" : "edgeNGram",
"min_gram" : "1",
"max_gram" : "10"
},
"word_delimiter_filter" : {
"catenate_all" : "true",
"type" : "word_delimiter_graph",
"catenate_numbers" : "true",
"preserve_original" : "true",
"catenate_words" : "true"
},
"ngram_identifier_filter" : {
"type" : "edgeNGram",
"min_gram" : "1",
"max_gram" : "36"
},
"stemmer_de" : {
"name" : "light_german",
"type" : "stemmer"
}
},
"analyzer" : {
"ngram_analyzer" : {
"filter" : [
"icu_folding",
"ngram_filter"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"ngram_search_analyzer" : {
"filter" : [
"icu_folding",
"ngram_search_filter"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"text_analyzer_en" : {
"filter" : [
"icu_folding",
"stemmer_en"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"ngram_identifier_search_analyzer" : {
"filter" : [
"icu_folding",
"ngram_identifier_search_filter"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"ngram_quote_search_analyzer" : {
"filter" : [
"icu_folding",
"ngram_search_filter"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"text_analyzer_de" : {
"filter" : [
"icu_folding",
"stemmer_de"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"text_analyzer" : {
"filter" : [
"icu_folding",
"stemmer_en",
"stemmer_de"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"ngram_identifier_analyzer" : {
"filter" : [
"icu_folding",
"ngram_identifier_filter"
],
"type" : "custom",
"tokenizer" : "whitespace"
},
"path_analyzer" : {
"tokenizer" : "path_hierarchy"
}
}
},
"number_of_replicas" : "1",
"uuid" : "p05jJcfwR5epj6PMM1aNqA",
"version" : {
"created" : "7171299"
}
}
}
}
}
批量创建测试数据
POST /test_rs_012_index/_doc/_bulk
{ "index" : { "_id" : "8" } }
{ "title": "RS00134567814688"}
{ "index" : { "_id" : "9" } }
{ "title": "RS01134567814688"}
{ "index" : { "_id" : "1" } }
{ "title": "RS-011-345-678-14666" }
{ "index" : { "_id" : "2" } }
{ "title": "RS-011-345-678-14688"}
{ "index" : { "_id" : "3" } }
{ "title": "RS-012-345-678-14666"}
{ "index" : { "_id" : "4" } }
{ "title": "RS-012-345-678-14688"}
{ "index" : { "_id" : "5" } }
{ "title": "RS-001-345-678-14666"}
{ "index" : { "_id" : "6" } }
{ "title": "RS-001-345-678-14688"}
{ "index" : { "_id" : "7" } }
{ "title": "RS01234567814688"}
使用query string做一次字符串查询
注意:“-”是query string语法中的operator,如果希望它在查询字符串中被当作普通字符处理,则需要使用双反斜线(\\)进行escape。"explain": true 会使查询结果包含每条数据的计分解释(_explanation),非调试时请勿使用。
GET test_rs_012_index/_search?size=100
{
"_source": ["title"],
"explain": true,
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"query_string": {
"allow_leading_wildcard": false,
"analyze_wildcard": true,
"boost": "1.0",
"fields": [
"title.ngram"
],
"analyzer": "ngram_search_analyzer",
"fuzziness": "0",
"minimum_should_match": "1",
"query": "RS\\-012\\-345"
}
}
]
}
}
}
查询结果如下
_explanation中的内容解释了每条数据的计分过程。
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 2.1647458,
"hits" : [
{
"_shard" : "[test_rs_012_index][0]",
"_node" : "AXjZEwjjT5ivDZNJj4WJaw",
"_index" : "test_rs_012_index",
"_type" : "_doc",
"_id" : "3",
"_score" : 2.1647458,
"_source" : {
"title" : "RS-012-345-678-14666"
},
"_explanation" : {
"value" : 2.1647458,
"description" : "weight(title.ngram:rs-012-345 in 2) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 2.1647458,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 1.3862944,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 2,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 9,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.7097882,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 5.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 41.333332,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
},
{
"_shard" : "[test_rs_012_index][0]",
"_node" : "AXjZEwjjT5ivDZNJj4WJaw",
"_index" : "test_rs_012_index",
"_type" : "_doc",
"_id" : "4",
"_score" : 2.1647458,
"_source" : {
"title" : "RS-012-345-678-14688"
},
"_explanation" : {
"value" : 2.1647458,
"description" : "weight(title.ngram:rs-012-345 in 3) [PerFieldSimilarity], result of:",
"details" : [
{
"value" : 2.1647458,
"description" : "score(freq=1.0), computed as boost * idf * tf from:",
"details" : [
{
"value" : 2.2,
"description" : "boost",
"details" : [ ]
},
{
"value" : 1.3862944,
"description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details" : [
{
"value" : 2,
"description" : "n, number of documents containing term",
"details" : [ ]
},
{
"value" : 9,
"description" : "N, total number of documents with field",
"details" : [ ]
}
]
},
{
"value" : 0.7097882,
"description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details" : [
{
"value" : 1.0,
"description" : "freq, occurrences of term within document",
"details" : [ ]
},
{
"value" : 1.2,
"description" : "k1, term saturation parameter",
"details" : [ ]
},
{
"value" : 0.75,
"description" : "b, length normalization parameter",
"details" : [ ]
},
{
"value" : 5.0,
"description" : "dl, length of field",
"details" : [ ]
},
{
"value" : 41.333332,
"description" : "avgdl, average length of field",
"details" : [ ]
}
]
}
]
}
]
}
}
]
}
}
Troubleshooting
接下来做一次troubleshooting的演示,即调查某一条数据为什么会被当前query string搜索到。
比如我们想查询query string “RS\\-012\\-345”与index中数据“RS-012-345-678-14688”的匹配过程。
1. 首先查出数据“RS-012-345-678-14688”在index中被拆成了哪些分词。请求路径中的参数“4”是该条数据的_id,可以在上面的查询结果中查到。
GET test_rs_012_index/_termvectors/4
{
"fields": ["title.ngram"]
}
查询结果:
{
"_index" : "test_rs_012_index",
"_type" : "_doc",
"_id" : "4",
"_version" : 1,
"found" : true,
"took" : 0,
"term_vectors" : {
"title.ngram" : {
"field_statistics" : {
"sum_doc_freq" : 294,
"doc_count" : 9,
"sum_ttf" : 372
},
"terms" : {
"0" : {
"term_freq" : 2,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
},
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 6
}
]
},
"01" : {
"term_freq" : 2,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
},
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 6
}
]
},
"012" : {
"term_freq" : 2,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
},
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 6
}
]
},
"0123" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"01234" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"012345" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"0123456" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"01234567" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"012345678" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"0123456781" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 1,
"start_offset" : 3,
"end_offset" : 20
}
]
},
"1" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 4,
"start_offset" : 15,
"end_offset" : 20
}
]
},
"14" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 4,
"start_offset" : 15,
"end_offset" : 20
}
]
},
"146" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 4,
"start_offset" : 15,
"end_offset" : 20
}
]
},
"1468" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 4,
"start_offset" : 15,
"end_offset" : 20
}
]
},
"14688" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 4,
"start_offset" : 15,
"end_offset" : 20
}
]
},
"3" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 10
}
]
},
"34" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 10
}
]
},
"345" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 2,
"start_offset" : 7,
"end_offset" : 10
}
]
},
"6" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 3,
"start_offset" : 11,
"end_offset" : 14
}
]
},
"67" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 3,
"start_offset" : 11,
"end_offset" : 14
}
]
},
"678" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 3,
"start_offset" : 11,
"end_offset" : 14
}
]
},
"r" : {
"term_freq" : 3,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
},
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
},
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 2
}
]
},
"rs" : {
"term_freq" : 3,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
},
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
},
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 2
}
]
},
"rs-" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-0" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-01" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-012" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-012-" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-012-3" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-012-34" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs-012-345" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs0" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs01" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs012" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs0123" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs01234" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs012345" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs0123456" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
},
"rs01234567" : {
"term_freq" : 1,
"tokens" : [
{
"position" : 0,
"start_offset" : 0,
"end_offset" : 20
}
]
}
}
}
}
}
2. 使用搜索时的analyzer(ngram_search_analyzer)对查询输入的字符串进行analyze,查看分词情况
GET test_rs_012_index/_analyze
{
"analyzer": "ngram_search_analyzer",
"text": "RS-012 345"
}
查询结果
{
"tokens" : [
{
"token" : "rs-012",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 0
},
{
"token" : "345",
"start_offset" : 7,
"end_offset" : 10,
"type" : "word",
"position" : 1
}
]
}
3. 将第2步得到的分词与第1步查出的分词列表进行对比,观察是否有匹配的分词。例如这里的“rs-012”和“345”都出现在id为4的数据的分词列表中,所以这条数据能够被匹配到。
下面介绍一些ES常用的语句
在index被创建后,如何修改analyzer的定义:
1. 首先调用 POST test_rs_012_index/_close
2. 执行修改语句(注意:下面的请求体只是一个例子;该请求体是声明式的,即描述的是analyzer修改后应有的定义)
PUT test_rs_012_index/_settings
{
"index" : {
"analysis" : {
"analyzer" : {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"icu_folding",
"ngram_filter"
]
}
}
}
}
}
3. 调用 POST test_rs_012_index/_open
精确匹配查询(使用term查询keyword子字段title.keyword;示例中的索引名和title取值仅为示意,请替换为实际的索引名和取值)
GET space-datamanagement-informationobject-autotest_v3/_search?size=100
{
"_source": ["title"],
"query" : {
"constant_score" : {
"filter" : {
"term" : {
"title.keyword" : "RS-012956-01-96475"
}
}
}
}
}