#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: xxx
import string
import sys
import csv
import json
import logging
import random
import pymysql
import os
import sys
# import jieba
# import jieba.posseg
# import jieba.analyse
# import json
import time
#from text_quality_classifier import para_sims_tagger_feature_exact
#from text_quality_classifier import layout_tagger_feature_extract
#from content_base_feature import content_base_fea_extract
#from custom_dict_feature import feature_custom_dict_v4_train
#from infor_entropy_fea import part_of_speech_extract_jieba
# Root logger setup: each record shows timestamp, source file and line number,
# level name, then the message. The threshold is FATAL.
logging.basicConfig(
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    level=logging.FATAL,
)
def connect_db(host='10.xxx.83.15',
               port=5203,
               user='om_article_r',
               password='0f59ee296',
               database='om_article',
               charset='utf8'):
    """Open a new MySQL connection through pymysql.

    All arguments are forwarded unchanged to ``pymysql.connect``. The
    defaults are the settings that used to be hard-coded here, so calling
    ``connect_db()`` with no arguments behaves exactly as before.

    NOTE(review): the default password lives in source control; consider
    supplying it from the environment or a config file instead.

    Args:
        host: Database server address.
        port: Database server TCP port.
        user: Login name.
        password: Login password.
        database: Schema selected after connecting.
        charset: Connection character set.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    return pymysql.connect(host=host,
                           port=port,
                           user=user,
                           password=password,
                           database=database,
                           charset=charset)
# Open the shared connection and cursor when the module is loaded.
# select_from_sql() runs its queries through `cur`; both objects are closed
# by the cur.close()/con.close() calls at the end of get_baicao_metas().
con = connect_db()
cur = con.cursor()
def select_from_sql(cmsid):
    """Fetch ``(media_id, title, content)`` for a single article.

    Articles live in tables named ``news_article_<suffix>``. The suffix is
    cut out of the cmsid itself: its first six characters when the cmsid
    starts with a digit, otherwise characters 3 through 8 (the first three
    characters are skipped). The original comment describes this as
    "computing the time", so the suffix is presumably a date -- unverified.

    Args:
        cmsid: Article id string.

    Returns:
        The row returned by the cursor, or ``("", "", "")`` when the cmsid is
        empty, its table suffix is not purely alphanumeric, no row matches,
        or the query fails.
    """
    empty = ("", "", "")
    if not cmsid:
        return empty

    suffix = cmsid[:6] if cmsid[0].isdigit() else cmsid[3:9]
    # A table name cannot be sent as a bound parameter, so it is only spliced
    # into the statement after passing an allow-list check.
    if not suffix.isalnum():
        logging.warning("invalid table suffix in cmsid: %r", cmsid)
        return empty

    # The cmsid value itself is bound by the driver (%s placeholder) instead
    # of being formatted into the SQL text.
    sql_str = ("select media_id,title,content from news_article_"
               + suffix + " where cmsid=%s limit 1")
    logging.info("%s -- cmsid=%s", sql_str, cmsid)
    try:
        cur.execute(sql_str, (cmsid,))
        row = cur.fetchone()
    except Exception:
        # Best effort: a failed lookup is reported as "nothing found".
        logging.exception("query failed for cmsid=%s", cmsid)
        return empty
    # fetchone() yields None when nothing matches; keep the return shape
    # uniform so callers can always unpack three values.
    return row if row is not None else empty
def get_baicao_metas(art_file, out_file):
    """Look up every article listed in ``art_file`` and export it to ``out_file``.

    ``art_file`` is read as UTF-8 text. Its first line is treated as a header
    and skipped; every other line is split on tabs, with the label taken from
    column 0 and the cmsid from column 1. Each cmsid is resolved through
    ``select_from_sql``. Articles with an empty title are logged as
    "not found" and left out of the output.

    ``out_file`` is written as UTF-8 text: a header line followed by one
    ``cmsid, label, media_id, title, content`` line per article, separated by
    tabs. Tabs and line breaks are removed from title and content first.

    Side effects: prints the running number of exported rows, and closes the
    module-level ``cur`` and ``con`` when finished (also on error), so the
    function can only be used once per process.

    Args:
        art_file: Path of the input list.
        out_file: Path of the file to create or overwrite.
    """
    def _text(value):
        # Database values may be None or non-string; the output needs text.
        return "" if value is None else str(value)

    def _flatten(value):
        # A tab or line break inside a field would break the TSV layout.
        return _text(value).replace("\n", "").replace("\r", "").replace("\t", "")

    rows = []
    try:
        with open(art_file, 'r', encoding='utf-8') as fr:
            next(fr, None)  # first line is the header, not data
            for line in fr:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    logging.warning("malformed line skipped: %r", line)
                    continue
                label, cmsid = fields[0], fields[1]
                try:
                    # Query the baicao MySQL store.
                    record = select_from_sql(cmsid)
                except Exception:
                    # Best effort: one bad id must not abort the whole export.
                    logging.exception("lookup failed: %s", cmsid)
                    continue
                title = _flatten(record[1]) if record else ""
                if not title:
                    logging.warning("not found: " + cmsid)
                    continue
                media_id, _, content = record
                rows.append(
                    (cmsid, label, _text(media_id), title, _flatten(content)))
                print(len(rows))
                # Rate-limit hook; the delay is currently zero.
                time.sleep(0.0)

        with open(out_file, "w", encoding='utf-8') as fw:
            fw.write("cmsid\tlabel\tmedia_id\ttitle\tcontent\n")
            fw.writelines('\t'.join(row) + '\n' for row in rows)
    finally:
        cur.close()
        con.close()
# Script entry. Input and output paths come from the command line when both
# are supplied:
#     ./request_baicao.py corpus[IN] corpus.new[OUT]
# With fewer arguments the built-in default paths are used, as before.
if len(sys.argv) >= 3:
    art_file, out_file = sys.argv[1], sys.argv[2]
else:
    art_file = '../ft_local/titles_n_o_2.eval.a.1_30'
    out_file = '../ft_local/titles_n_o_2_out.eval.a.1_30'
get_baicao_metas(art_file, out_file)
pymysql
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
推荐阅读更多精彩内容
- 上面可见已经安装成功 但是引入的时候会报错: 解决方法: 在uwsgi.ini中添加
- 每天都一堆的事情等着去研究等着去做。。。又是晚上11点了。。。再研究一下吧。。。 按照上一天说的进度今天研究下py...
- 今天在用pymysql插入数据时出现以下异常。 后来检查数据发现your-price列对应的数据为float,长度...
- Mysql驱动介绍 MySQL-python(弃用):也就是MySQLdb,是对C语言操作MySQL数据库的一个简...