import pandas as pd
Excel Sheet
- read one
pd.read_excel("xxx.xlsx", sheetname="first")
- read all
di = pd.read_excel("xxx.xlsx", sheetname=None)
return a dict: {"sheetname": xx, ...}
- write Sheets
writer = pd.ExcelWriter("xxx.xlsx", engine="xlsxwriter")
for df in dfs:
df.to_excel(writer, sheet_name="xx")
- write time
df.to_excel("xxx.xlsx", engin="openpyxl")
- read sql
from sqlalchemy import create_engine
db_url = "postgresql+psycopg2://db_name:password@ip/db_name"
engine = create_engine(db_url)
SQL_QUERY = """ xxx """
df = pd.read_sql(SQL_QUERY, engine)
- del Column / Rows
del df["column"]
df = df[:500]
- Judge nan
import math
xx = df.get_value(index, column)
math.isnan(xx)
- Write in memory (Supply download)
import io
buffer = io.BytesIO()
writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
df = ...
df.to_excel(wirter)
writer.save()
data = buffer.getvalue()
response = HttpResponse(data)
response['Content-Type'] = 'application/octet-stream'
response['Content-Disposition'] = 'attachment;filename="{0}"'.format("target.xlsx")
Read Json
from pandas.io.json import json_normalize
di = {"a": 1, "b": 2}
df = json_normalize(di)
Create New Column
df = ...
df["new_line"] = df["line1"] + df["line2"]
Groupby
合并重复项
df.groupby(["小区名称", "行政区", "面积", "所在层" ...])["时间"].min().reset_index() 合并重复项,时间取最小值
# 注: 如果某一行作为groupby的列为空(nan),那么这一列必定不会出现在groupby的结果里
可以先填充空值: df.fillna("-")
Calc Rate
计算每一部分占总体的比例
total = len(df)
pd.value_counts(pd.cut(df["column"], bins=[字段分割])) / total
合并
df = pd.concat([df1, df2])
读取大的csv文件
reader = pd.read_csv("/home/manbug/小区.csv", iterator=True)
# r = reader.get_chunk(5)
while reader:
r = reader.get_chunk(5)
TODO...
filter技巧
索引
df.ix["xxx"]
字符串
df[df["抓取时间"].str.startswith('2017')]
时间
df[df["时间"].dt.month>5]
转list
Series:
df["MAC地址"].tolist()
# df["MAC地址"].values.tolist()