Tips:
python3 打开http端口
python -m http.server 9000
pandas、numpy相关
返回数组中不为0的元素的索引
np.nonzero(ndarray)
函数返回值为tuple元组类型,tuple内的元素数目与ndarray维度相同。
dataframe追加写csv
data = {'vid':data['vid'], 'text':data['token'], 'final_label':json.dumps(final_label)}
df = pd.DataFrame(data, index = [0]) # data是json数据
if os.path.exists(filepath):
df.to_csv(filepath, header=0, mode='a', index=False, sep=',', encoding='utf-8-sig')
else:
df.to_csv(filepath, mode='a', index=False, sep=',', encoding='utf-8-sig')
dataframe追加写列
data.loc[index, 'celebrity'] = str(name_list)
pandas读tsv文件并指定列名
data = pd.read_csv(path, sep='\t', dtype=object, header=None, error_bad_lines=False)
data.columns = ['gid', 'uid', 'uri', 'label', 'inference', 'result', 'msg']
读csv,修复错误行
pd.read_csv(filePath, on_bad_lines='skip')  # pandas>=1.3;pandas 2.0 已移除 error_bad_lines,旧版可用 error_bad_lines=False
数据分布统计
def status(x):
    """Summarize the distribution of a numeric pandas Series.

    Returns a pandas Series of ten descriptive statistics
    (min, 25% quantile, median, 75% quantile, mean, max,
    variance, std, skewness, kurtosis), labelled in Chinese.
    """
    stats = [
        x.min(),
        x.quantile(.25),
        x.median(),
        x.quantile(.75),
        x.mean(),
        x.max(),
        x.var(),
        x.std(),
        x.skew(),
        x.kurt(),
    ]
    labels = ['最小值', '25%分位数', '中位数', '75%分位数', '均值',
              '最大值', '方差', '标准差', '偏度', '峰度']
    return pd.Series(stats, index=labels)
自定义函数
pickle读写
with open('vid_dic.pkl','wb') as in_data:
pickle.dump(vid_dic, in_data, pickle.HIGHEST_PROTOCOL)
with open('vid_dic.pkl','rb') as out_data:
vid_dic = pickle.load(out_data)
列举出文件夹下需要的所有xx格式文档
import os
# 第一种情况:当前文件夹下存在,不遍历子文件夹
docLabels = [f for f in os.listdir(filepath) if f.endswith('.xml')]
#第二种情况:遍历当前文件夹及其所有子文件夹的某种类型
def print_dir(path_list, filepath, pattern):
    """Recursively collect files under *filepath* whose names end with *pattern*.

    Args:
        path_list: list accumulator; matching paths are appended in place.
        filepath: directory to scan.
        pattern: filename suffix to match, e.g. ".xml".

    Returns:
        The same *path_list*, for call-chaining convenience.
    """
    for entry in os.listdir(filepath):
        full = os.path.join(filepath, entry)
        if os.path.isdir(full):
            # descend into subdirectories
            print_dir(path_list, full, pattern)
        elif full.endswith(pattern):
            # BUG FIX: the original used two independent `if`s, so a
            # directory whose name ended with the pattern was recursed
            # into AND appended; only regular files are collected now.
            path_list.append(full)
    return path_list
path_list = []
path_list = print_dir(path_list, filepath, ".xxx")
for path in tqdm(path_list, ncols=70):
xxx
写json
with open('train_data.json', 'a') as f:
json.dump(info_dic, f)
f.write('\n')
# f.write(json.dumps(info_dic, ensure_ascii=False)+'\n')
json转tsv
with open('test.json', 'r') as f:
json_data = f.readlines()
for data in tqdm(json_data, ncols=70):
data = json.loads(json.loads(data))
data['label'] = label_dic[data['poi_id']]
del data['poi_id']
head = ['uri','ai_ocr_sentence','backend_type_name','comment_cnt_all']
# 第一次打开文件时,第一行写入表头
path = 'train.tsv'
if not os.path.exists(path):
with open(path, "w", newline='', encoding='utf-8') as csvfile: # newline='' 去除空白行
writer = csv.DictWriter(csvfile, fieldnames=head, delimiter='\t') # 写字典的方法
writer.writeheader() # 写表头的方法
# 接下来追加写入内容
with open(path, "a", newline='', encoding='utf-8') as csvfile: # newline='' 一定要写,否则写入数据有空白行
writer = csv.DictWriter(csvfile, fieldnames=head, delimiter='\t')
writer.writerow(data) # 按行写入数据
tsv转json
lines = pd.read_csv(path, sep='\t', dtype=object, header=None, error_bad_lines=False)
lines.columns = ['vid', 'cid', 'album', 'cid_title', 'score', 'item_id', 'item_title', 'vv_all']
lines = lines.drop_duplicates()
for i in tqdm(range(len(lines)), ncols=70):
line = lines.iloc[i].to_json()
line = json.loads(line)
参数
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--hdfs_path", type=str, default='hdfs://haruna/home/byte_arnold_hl_vc/user/name.1997/')
parser.add_argument("--save_path", type=str, default = '')
args = parser.parse_args()
os.system('hdfs dfs -get {}'.format(args.hdfs_path))
os.system('hdfs dfs -put {} {}'.format(item, args.save_path))
统计出现次数并存入字典
date_dic = {}
if not date in date_dic:
date_dic[date]=1
else:
date_dic[date]=date_dic[date]+1
字典(dict)按键(key)和值(value)排序
- 字典按键排序:
>>> dic = {'a':2, 'b':1}
>>> d = sorted(dic.items(), key = lambda k: k[0])
>>> print(d)
[('a', 2), ('b', 1)]
- 字典按值排序:
>>> dic = {'a':2, 'b':1}
>>> d = sorted(dic.items(), key = lambda k: k[1])
>>> print(d)
[('b', 1), ('a', 2)]
打印进度条
方法一:print()函数实现
print("\r", "---- 处理到第" + str(j) + "个", end="", flush=True)
方法二:tqdm库
for patent in tqdm(patent_list, ncols=10):
pass
字符串与日期转换
- 将列表中的日期转为字符串
# 输出时间戳对应的年月日信息
test['日期'].apply(lambda x: print(x.year, x.month, x.day))
# 将时间戳转换为字符串
myString = test['日期'].apply(lambda x: x.strftime('%Y-%m-%d'))
- 字符串转时间(年月日)
published_time = "20" + pub_time[0]
published_time = datetime.datetime.strptime(published_time,'%Y%m%d').date()
python时间与Unix时间戳相互转换
import time
def unix_time(dt):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to a Unix timestamp.

    Interprets *dt* in the local timezone (via time.mktime).
    """
    # parse the string into a struct_time, then to seconds since epoch
    parsed = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
    return time.mktime(parsed)
def local_time(timestamp):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' in the local timezone."""
    # epoch seconds -> local struct_time -> formatted string (e.g. 2016-05-05 20:28:54)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
if __name__ == '__main__':
time_now = '2018-09-25 15:33:20'
unix_t = unix_time(time_now)
local_t = local_time(unix_t)
print(unix_t)
根据经纬度计算距离
def distence(lon1, lat1, lon2, lat2):  # lon/lat pairs in decimal degrees
    """Great-circle distance between two points in metres (haversine formula).

    Note: the function name keeps the original (misspelled) public identifier
    so existing callers continue to work.
    """
    # decimal degrees -> radians
    rlon1, rlat1, rlon2, rlat2 = (
        radians(float(v)) for v in (lon1, lat1, lon2, lat2)
    )
    half_dlat = (rlat2 - rlat1) / 2
    half_dlon = (rlon2 - rlon1) / 2
    # haversine term
    h = sin(half_dlat) ** 2 + cos(rlat1) * cos(rlat2) * sin(half_dlon) ** 2
    earth_radius_km = 6371  # mean Earth radius
    return 2 * asin(sqrt(h)) * earth_radius_km * 1000
自动解压当前文件夹下所有zip包
import zipfile
import os
def unzip(path, zfile):
    """Extract *zfile* (found in directory *path*) into a sibling folder
    named after the archive, recursing into any nested .zip members.

    Args:
        path: directory containing the archive.
        zfile: archive filename; must contain '.zip'.
    """
    file_path = os.path.join(path, zfile)
    # extraction target: archive name with the '.zip' suffix stripped
    desdir = os.path.join(path, zfile[:zfile.index('.zip')])
    # BUG FIX: the original never closed the ZipFile handle; the context
    # manager guarantees it is released even if extraction fails.
    with zipfile.ZipFile(file_path) as archive:
        members = archive.namelist()
        archive.extractall(desdir)
    for member in members:
        if member.endswith('.zip'):
            # a nested archive may live in a sub-folder of this archive;
            # recurse with its containing directory and bare filename
            inner_dir = os.path.join(desdir, os.path.dirname(member))
            unzip(inner_dir, os.path.basename(member))
# 定位到每个zip文件
def print_dir(filepath):
    """Walk *filepath* recursively and extract every .zip archive found.

    Delegates the actual extraction to the sibling unzip() helper.
    """
    for entry in os.listdir(filepath):
        full = os.path.join(filepath, entry)
        if os.path.isdir(full):
            print_dir(full)
        elif full.endswith(".zip"):
            # BUG FIX: the original called unzip(file_path, ...) with the
            # module-level global, so archives found in subdirectories were
            # looked up in the wrong folder; pass the current directory and
            # the entry name (also avoids the non-portable split('/')).
            unzip(filepath, entry)
file_path = "D:/Pythonworkspace/patent/data/Application/2010/"
print_dir(file_path)
读取xml文本
import xml.dom.minidom
#打开xml文档
dom = xml.dom.minidom.parse('C:/Users/asus/Desktop/1.xml')
#得到文档元素对象
root = dom.documentElement
urls = dom.getElementsByTagName('url')
copus = ""
for url in urls:
copus = copus + url.firstChild.data + ";"
# copus.append(url.firstChild.data)
text = "https://www.drugs.com/sfx/nytol-quickcaps-side-effects.html"
if copus.find(text) == 0:
print("已经存在")
使用RandomForestClassifier查看不同特征的重要程度
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)
rf_clf = clf.fit(x, y)
rf_clf.feature_importances_
数据库相关
读mysql数据库
import pymysql
def read_content(path):
    """Fetch one `result` row from MySQL and return it.

    Returns:
        The first matching row as a tuple, or None if nothing matched
        or the query failed.

    NOTE(review): the original fetched the row but never returned it,
    used a bare `except`, and leaked the connection on error paths.
    """
    con = pymysql.connect(host="", user="", password="", port=3306,
                          charset="utf8", db="")
    result = None
    try:
        cursor = con.cursor()
        sql = "SELECT result FROM table_name WHERE id= '' limit 1;"
        try:
            # execute the SQL statement and grab the first row
            cursor.execute(sql)
            result = cursor.fetchone()
        except pymysql.MySQLError:
            # roll back on database errors only (not on every exception)
            con.rollback()
        finally:
            cursor.close()
    finally:
        # always release the connection
        con.close()
    return result
写mysql数据库
import pymysql
# 保存到数据库
# Persist one record to the database
def save_data_to_mysql(val):
    """Insert one row into match_result.

    Args:
        val: sequence of five values: application_id, filed_time,
             published_time, location, result.
    """
    con = pymysql.connect(host="", user="", password="", port=3306,
                          charset="utf8", db="")
    try:
        cursor = con.cursor()
        # SECURITY FIX: the original concatenated val[...] directly into
        # the SQL string, which breaks on quotes and is injectable; use a
        # parameterized query so the driver escapes the values.
        sql = ("insert into match_result"
               "(application_id,filed_time,published_time,location,result)"
               " VALUES(%s,%s,%s,%s,%s)")
        try:
            cursor.execute(sql, (val[0], val[1], val[2], val[3], val[4]))
            # commit the transaction on success
            con.commit()
        except pymysql.MySQLError:
            # roll back on database errors only (not on every exception)
            con.rollback()
        finally:
            cursor.close()
    finally:
        # always release the connection
        con.close()