elasticsearch 搜索功能搭建
标签(空格分隔): python scrapy elasticsearch
es完成搜索建议
```python
# mappings
# Error fix: override the analysis definition so the analyzer can be used
# in the mapping without raising.
class CustomAnalyzer(_customanalyzer):
    """Analyzer wrapper that contributes no custom analysis settings.

    The base class would normally describe a custom analyzer definition.
    Returning an empty dict here means nothing is emitted for this analyzer
    (presumably because ``ik_max_word`` is supplied by a server-side plugin
    and must not be redefined in the index settings -- confirm).
    """

    def get_analysis_definition(self):
        """Return an empty analysis definition."""
        return {}
# Constant definitions: analyzer object named "ik_max_word" with a
# "lowercase" filter, built from the CustomAnalyzer class.
ik_analyzer = CustomAnalyzer("ik_max_word",filter=["lowercase"])
# Search-suggestion field: a Completion field that uses ik_analyzer.
suggest = Completion(analyzer=ik_analyzer)
#items
# Imports
from elasticsearch_dsl.connections import connections
# Module-level Elasticsearch connection used by get_suggests below.
# ESArticleJobbole._doc_type.using is passed as the first argument
# (presumably the connection alias -- confirm).
es = connections.create_connection(ESArticleJobbole._doc_type.using)
def get_suggests(index,info_tuple):
    """Build the list of search-suggestion entries from weighted texts.

    Each text is tokenized through the Elasticsearch analyze API using the
    ``ik_max_word`` analyzer; tokens of length 1 are discarded.

    Args:
        index: Name of the index whose analyzer configuration is used.
        info_tuple: Iterable of ``(text, weight)`` pairs. A token is kept
            only for the first pair it appears in, so list the pairs in
            descending order of weight.

    Returns:
        A list of ``{"input": [tokens], "weight": weight}`` dicts, one per
        pair that contributed at least one new token.
    """
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if not text:
            # Empty or missing text contributes no suggestions.
            continue
        # Call the Elasticsearch analyze API to tokenize the string.
        words = es.indices.analyze(
            index=index,
            analyzer="ik_max_word",
            params={'filter': ['lowercase']},
            body=text,
        )
        analyzed_words = {r['token'] for r in words['tokens'] if len(r['token']) > 1}
        new_words = analyzed_words - used_words
        if new_words:
            # Record emitted tokens; without this the subtraction above is a
            # no-op and the same token is suggested again for later texts.
            used_words |= new_words
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests
# Fill the suggest field from the title (weight 10) and the tags (weight 7),
# analyzed against the index of the ESArticleJobbole document type.
article.suggest = get_suggests(ESArticleJobbole._doc_type.index,((article.title,10),(article.tags,7)))
```
django 搭建web界面
编辑距离
是字符串之间相似程度的计算方法,即两个字符串之间的编辑距离等于将一个字符串通过插入、删除、替换或者相邻字符交换位置等操作转换成另一个字符串所需的最少操作次数。
-
django中web
- 通过urls控制请求转向
- 通过views处理内容
# urls
url(r'^$', TemplateView.as_view(template_name="index.html"), name="index"),
url(r'^suggest/$', SearchSuggest.as_view(), name="suggest"),
url(r'^search/$', SearchView.as_view(), name="search")


# views
class SearchView(View):
    """Render one page of full-text search results from the "jobbole" index."""

    # Number of hits requested per results page.
    page_size = 10

    def get(self, request):
        """Run the search and render ``result.html``.

        Query parameters:
            q: Search keywords (defaults to an empty string).
            p: 1-based page number. Non-numeric values and values below 1
               fall back to page 1.

        The template context contains the current page, the total page
        count, the elapsed search time in seconds, the total hit count,
        the formatted hits and the original keywords.
        """
        key_words = request.GET.get('q', '')
        try:
            page = int(request.GET.get('p', '1'))
        except ValueError:
            page = 1
        # A page below 1 would yield a negative "from" offset in the query.
        page = max(page, 1)
        page_size = self.page_size

        start_time = datetime.now()
        response = Client.search(
            index="jobbole",
            body={
                "query": {
                    "multi_match": {
                        "query": key_words,
                        "fields": ['tags', 'title', 'content'],
                    }
                },
                "from": (page - 1) * page_size,
                "size": page_size,
                "highlight": {
                    "pre_tags": ['<span class = "keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "content": {},
                    },
                },
            },
        )
        end_time = datetime.now()
        last_second = (end_time - start_time).total_seconds()

        # Total number of matching documents. Elasticsearch 7+ reports it as
        # {"value": n, "relation": ...}; older versions return a plain int.
        total_num = response['hits']['total']
        if isinstance(total_num, dict):
            total_num = total_num['value']
        # Ceiling division on the hit count (the page number plays no part).
        page_num = (total_num + page_size - 1) // page_size

        hit_list = [self._build_hit(hit) for hit in response['hits']['hits']]

        return render(request, 'result.html', {
            'page': page,
            'page_num': page_num,
            'last_second': last_second,
            'total_num': total_num,
            'all_hits': hit_list,
            'key_words': key_words,
        })

    @staticmethod
    def _build_hit(hit):
        """Convert one raw search hit into the dict used by the template.

        Highlighted fragments are preferred for title and content; the
        stored ``_source`` value is used when a field has no highlight.
        """
        source = hit['_source']
        highlight = hit.get('highlight', {})
        title_parts = highlight.get('title', source['title'])
        content_parts = highlight.get('content', source['content'])
        return {
            'title': ''.join(title_parts),
            # NOTE: cutting at 500 characters can slice through a highlight
            # tag when the highlighted text is long.
            'content': "".join(content_parts)[:500],
            'create_date': source['create_time'],
            'url': source['url'],
            'score': hit['_score'],
        }
- 原视频UP主慕课网(聚焦Python分布式爬虫必学框架Scrapy 打造搜索引擎)
- 本篇博客撰写人: XiaoJinZi 个人主页 转载请注明出处
- 学生能力有限 附上邮箱: 986209501@qq.com 不足以及错误之处请大佬指正