本文介绍如何快速上手 Elasticsearch 的 Python 客户端 pyes，并用它来索引自己的代码仓库。
集群搭建请使用笔者搭建的开箱即用的集成版本：
https://github.com/full-stack-engineer/elasticsearch-integrated
# -*- coding: utf-8 -*-
import os
import sys
from pyes import *
# Name of the Elasticsearch index that receives the indexed files.
INDEX_NAME = 'javafiles'
# Alias that is pointed at INDEX_NAME.
INDEX_ALIAS = 'javafiles_alias'
# Document type under which each file is stored.
TYPE_NAME = 'code'
class IndexFiles(object):
    """Index every ``.java`` file under a directory tree into Elasticsearch.

    Constructing an instance performs the whole job: the ``INDEX_NAME`` index
    is dropped and recreated, the field mapping for ``TYPE_NAME`` is
    installed, the tree below ``root`` is walked and each non-empty ``.java``
    file is indexed, and finally the ``INDEX_ALIAS`` alias is added and the
    index is refreshed.
    """

    def __init__(self, root):
        """Rebuild the index from the files found under ``root``.

        Args:
            root: Directory that is walked recursively for ``.java`` files.
        """
        conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to the local ES node
        try:
            # Start from a clean slate so stale documents do not survive.
            conn.indices.delete_index(INDEX_NAME)
        except Exception:
            # Best effort: presumably this fails only because the index does
            # not exist yet. ``Exception`` (instead of a bare ``except``)
            # lets KeyboardInterrupt / SystemExit propagate.
            pass
        conn.indices.create_index(INDEX_NAME)  # create the (empty) index
        # Install the storage layout for TYPE_NAME documents.
        conn.indices.put_mapping(
            TYPE_NAME, {'properties': self._build_mapping()}, [INDEX_NAME])
        self.addIndex(conn, root)
        conn.indices.add_alias(INDEX_ALIAS, INDEX_NAME)
        conn.default_indices = [INDEX_NAME]  # make it the default index
        conn.indices.refresh()  # make the newly inserted documents searchable

    def _build_mapping(self):
        """Return the ``properties`` mapping for the three indexed fields.

        ``content``, ``name`` and ``dirpath`` all share the same settings;
        each field gets its own dict so they can be changed independently.
        """
        mapping = {}
        for field_name in (u'content', u'name', u'dirpath'):
            mapping[field_name] = {
                'boost': 1.0,
                'index': 'analyzed',
                'store': 'yes',
                'type': u'string',
                "indexAnalyzer": "ik",
                "searchAnalyzer": "ik",
                "term_vector": "with_positions_offsets",
            }
        return mapping

    def addIndex(self, conn, root):
        """Index every non-empty ``.java`` file found under ``root``.

        A failure on a single file is printed and the walk continues with
        the next file.

        Args:
            conn: Connection whose ``index(doc, index, doc_type)`` method is
                called once per file.
            root: Directory to walk recursively.
        """
        print(root)
        # ``dirpath`` is a separate name so the ``root`` argument is not
        # rebound while walking.
        for dirpath, _, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.java'):
                    continue
                print('Indexing file  %s' % filename)
                try:
                    path = os.path.join(dirpath, filename)
                    # Read raw bytes and decode explicitly; ``with`` closes
                    # the file even when reading or decoding fails.
                    with open(path, 'rb') as source:
                        contents = source.read().decode('utf-8')
                    if contents:
                        conn.index(
                            {'name': filename,
                             'dirpath': dirpath,
                             'content': contents},
                            INDEX_NAME, TYPE_NAME)
                    else:
                        print('no contents in file %s' % path)
                except Exception as e:
                    print(e)
if __name__ == '__main__':
    # Index the directory given as the first command-line argument; fall
    # back to the built-in default path when none is supplied.
    IndexFiles(sys.argv[1] if len(sys.argv) > 1 else '/Users/xxx/Projects')