最近有个语义分词工具的需求,在比较查找了一番后决定使用百度开源的LAC
百度语义分词 https://github.com/baidu/lac
环境准备(测试时是centos 7.5系统)
安装py37
1、安装前准备
a、yum更新yum源
yum update
b、安装Python 3.7所需的依赖,否则后续编译安装时会报错
yum install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel libffi-devel gcc make
c、在官网下载所需版本,这里用的是3.7.1版本(使用淘宝镜像)
wget http://npm.taobao.org/mirrors/python/3.7.1/Python-3.7.1.tgz
2、安装Python
a、解压
tar -xvf Python-3.7.1.tgz
b、配置编译
./configure --prefix=/usr/local/python3 # 方式一:指定编译安装的路径(这里--prefix是指定编译安装的文件夹)
./configure --enable-optimizations # 方式二(与方式一二选一):不指定--prefix时,会编译安装到 /usr/local/bin/ 下,且不用添加软连接或环境变量
make && make install
ln -s /usr/local/python3/bin/python3 /usr/bin/python3 # 添加软连接
ln -s /usr/local/python3/bin/pip3 /usr/bin/pip3
c、将/usr/local/python3/bin加入PATH
vim /etc/profile
export PATH=$PATH:/usr/local/python3/bin
source /etc/profile # 修改完后,还需要让这个环境变量在配置信息中生效,执行命令
安装LAC
理论上一键安装
pip3 install lac -i https://mirror.baidu.com/pypi/simple
安装LAC过程中问题排坑
1.opencv 安装失败
最好先升级下pip3 pip3 install --upgrade pip --user
pip3 install numpy==1.15.4 --user -i https://mirror.baidu.com/pypi/simple
手动安装 pip3 install opencv-python==4.3.0.36
2. opencv 安装后 出现 from .cv2 import * 模块不能引入的问题
缺少 libSM.so.6 和 libXext.so.6 ,需安装下
yum install libSM-1.2.2-2.el7.x86_64 --setopt=protected_multilib=false
yum install libXext-1.3.3-3.el7.x86_64 --setopt=protected_multilib=false
yum install mesa-libGL.x86_64
包装成web 服务提供http 接口
将该模块使用轻量级web框架 包装成服务对外提供
1. 安装 web.py
pip3 install web.py
2 .代码编写
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 15:27:51 2020
@author: 67022
"""
import web
from LAC import LAC
import logging
import json
# Root logger; no handler or level is configured in the visible script.
logger = logging.getLogger()
# web.py routing table: a URL pattern followed by the name of its handler class.
urls = (
'/lac', 'lac'
)
# globals() is passed so the handler name 'lac' can be looked up in this module.
app = web.application(urls, globals())
class lac():
    """web.py handler for ``/lac``: segment the posted text with Baidu LAC.

    The request body is a JSON object with these keys:

    * ``text``      -- required; the text to segment.
    * ``model``     -- optional; path of a custom-trained model to load.
    * ``meddledic`` -- optional; path of a customization dictionary file.
    """

    # Default segmentation endpoint.
    def POST(self):
        """Run LAC on the posted text and return the result as a JSON string.

        Returns:
            The JSON encoding of the value returned by ``LAC.run``.  On any
            failure (bad JSON, missing ``text``, model loading error, ...)
            the JSON encoding of ``[[], []]`` is returned instead.
        """
        web.header("Access-Control-Allow-Origin", "*")
        web.header('content-type', 'text/json')
        try:
            payload = json.loads(str(web.data(), "utf-8"))
            # Text to be segmented.
            text = payload['text']
            lacmodel = ''
            meddledic = ''

            # NOTE(security): `model` and `meddledic` are file paths taken
            # straight from the request body.  Restrict them to a known
            # directory / whitelist before exposing this service to
            # untrusted clients.
            if 'model' in payload:
                # A custom-trained model can be selected via the `model` key.
                lacmodel = payload['model']
                segmenter = LAC(model_path=lacmodel)
            else:
                segmenter = LAC(mode='lac')

            # Optional customization dictionary.  sep is the separator used
            # in the dictionary file; None means space or tab.
            if 'meddledic' in payload:
                # The dictionary file is expected to sit in the same
                # directory as this script.
                meddledic = payload['meddledic']
                segmenter.load_customization(meddledic, sep=None)

            logger.info('lac input text:%s lacmodel:%s meddledic:%s',
                        text, lacmodel, meddledic)
            seg_result = segmenter.run(text)
            logger.info(seg_result)
            # Serialize explicitly: a handler that returns a list has it
            # rendered with str(), which yields a Python repr, not JSON.
            return json.dumps(seg_result, ensure_ascii=False)
        except Exception as e:
            # Top-level boundary of the request: log with traceback at ERROR
            # level so failures stay visible, then answer with an empty result.
            logger.exception("lac exception: %s: ", e)
            return json.dumps([[], []])
# Start the web.py server only when this file is executed as a script.
if __name__ == "__main__":
    app.run()
3.代码升级 使用更高性能的tornado
安装所需模块
pip3 install tornado==5.1.1 -i https://mirror.baidu.com/pypi/simple
pip3 install nest_asyncio -i https://mirror.baidu.com/pypi/simple
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 15:34:25 2020
@author: 67022
"""
import logging
import tornado
import tornado.ioloop
import tornado.web
import tornado.httpserver
import tornado.gen
from tornado.concurrent import run_on_executor
# 这个并发库在python3自带在python2需要安装sudo pip install futures
from concurrent.futures import ThreadPoolExecutor
import json
from lacmodel import lacmod
import nest_asyncio
nest_asyncio.apply()
# Port the HTTP server is bound to in the __main__ section.
LISTEN_PORT = 8080
# Value passed to HTTPServer.start() in the __main__ section.
PROCESS_NUM = 1
LOG_FORMAT = "%(asctime)s %(thread)s %(name)s %(levelname)s %(pathname)s %(message)s "# output format of each log record
DATE_FORMAT = '%Y-%m-%d %H:%M:%S %a ' # output format of timestamps; take care not to mix up month and day
class LacHandler(tornado.web.RequestHandler):
    """Tornado handler for ``/lac``: segment the request's text with ``lacmod``.

    The request body is a JSON object with a ``text`` key.  On success the
    response carries ``code``, ``words``, ``tags`` and ``msg``; on failure it
    carries ``resultMsg`` and ``resultCode``.
    """

    # Thread pool used by @run_on_executor so the blocking lacmod.run() call
    # does not stall the IOLoop.
    executor = ThreadPoolExecutor(100)

    # @tornado.web.asynchronous was dropped: it is redundant together with
    # @gen.coroutine, deprecated in Tornado 5.1 and removed in Tornado 6.
    @tornado.gen.coroutine
    def get(self):
        """Handle GET: run segmentation and write the result dict."""
        # Yield the executor future to obtain handle()'s return value.
        res = yield self.handle()
        self.write(res)

    @tornado.gen.coroutine
    def post(self):
        """Handle POST: run segmentation and write the result as JSON."""
        res = yield self.handle()
        self.writejson(res)

    @run_on_executor
    def handle(self):
        """Parse the request body, run LAC and build the response dict.

        Runs on ``self.executor``.

        Returns:
            dict: the success payload, or an error payload containing
            ``resultMsg`` / ``resultCode`` if anything raised.
        """
        try:
            bodyStr = str(self.request.body, "utf-8")
            data = json.loads(bodyStr)
            text = data['text']
            seg_result = lacmod.run(text)
            result = {
                'code': "200",
                'words': seg_result[0],
                'tags': seg_result[1],
                'msg': "分词成功",
            }
            logging.info("LacHandler input: %s, result: %s", bodyStr, result)
        except Exception as e:
            logging.exception("%s %s", e, self.request.remote_ip)
            # Build a fresh dict so no half-filled success fields leak into
            # the error response.
            result = {
                'resultMsg': str(e),
                'resultCode': "1111",
            }
        # A plain return (instead of `finally: return`) no longer swallows
        # exceptions that are not caught above.
        return result

    def prepare_head(self):
        """Set the JSON content type and a 200 status on the response."""
        self.set_header("Content-Type", "application/json;charset=utf-8")
        self.set_status(200)

    def writejson(self, obj):
        """Serialize ``obj`` to JSON and write it with the JSON headers."""
        jsonstr = json.dumps(obj)
        self.prepare_head()
        self.write(jsonstr)
# Keyword settings forwarded to tornado.web.Application; currently empty.
settings = {
}
# Route table: requests to /lac are served by LacHandler.
application = tornado.web.Application([
(r"/lac", LacHandler),
], **settings)
if __name__ == "__main__":
    # `import logging` alone does not load the logging.handlers submodule;
    # import it explicitly instead of relying on another package having
    # imported it already.
    import logging.handlers
    import os

    myapplog = logging.getLogger()
    myapplog.setLevel(logging.INFO)
    formatter = logging.Formatter(LOG_FORMAT)
    # RotatingFileHandler does not create missing directories.
    os.makedirs("log", exist_ok=True)
    # Rotate once the file reaches 100 MiB; keep 10 (backupCount) old files.
    filehandler = logging.handlers.RotatingFileHandler(
        "log/lacsize.log", mode='a', maxBytes=1024*1024*100, backupCount=10)
    filehandler.setFormatter(formatter)
    myapplog.addHandler(filehandler)

    # bind() + start(PROCESS_NUM) is used rather than application.listen()
    # so that the number of server processes is controlled by PROCESS_NUM.
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.bind(LISTEN_PORT)
    http_server.start(PROCESS_NUM)
    tornado.ioloop.IOLoop.instance().start()
# Shared LAC instance used by the Tornado service (`from lacmodel import lacmod`).
# NOTE(review): this snippet presumably lives in its own file, lacmodel.py — confirm.
from LAC import LAC
# Created once at import time, so requests reuse the same instance.
lacmod = LAC(mode='lac')
# Load the customization dictionary; sep=None means the file is split on space or tab.
lacmod.load_customization("custom.txt", sep=None)
启动服务
python3 lac_web.py
http请求 (POST):
接口调用 POST
参数
{
"text":"lac是百度开源的一款优秀分词工具", /* 待分词文本 */
"meddledic":"custom.txt", /* 特定词语字典 ,放在python的web文件同级目录,该字段可不传,不传则按默认分词规则*/
"model":"modelPath" /* 可不传,自己训练的模型地址 */
}
返回值:分词后的数组