本章开始，进入实战

0 实验数据

环境：MySQL + Python 3.6
表名：EMP

-- Sample employee table used by every example below.
-- Column types are kept as in the original tutorial (hiredate is a
-- 'YYYY-MM-DD' string, sal/comm are DOUBLE) so the examples see the same data.
CREATE TABLE IF NOT EXISTS EMP (
    empno    INT,
    ename    VARCHAR(10),
    job      VARCHAR(9),
    mgr      INT,
    hiredate VARCHAR(10),
    sal      DOUBLE,
    comm     DOUBLE,
    deptno   INT
);

-- Explicit column list: the load keeps working if columns are added or reordered.
INSERT INTO EMP (empno, ename, job, mgr, hiredate, sal, comm, deptno) VALUES
    (7369, 'SMITH',  'CLERK',     7902, '1980-12-17',  800, NULL, 20),
    (7499, 'ALLEN',  'SALESMAN',  7698, '1981-02-10', 1600,  300, 30),
    (7521, 'WARD',   'SALESMAN',  7698, '1981-02-22', 1250,  500, 30),
    (7566, 'JONES',  'MANAGER',   7839, '1981-04-02', 2975, NULL, 20),
    (7654, 'MARTIN', 'SALESMAN',  7698, '1981-09-28', 1250, 1400, 30),
    (7698, 'BLAKE',  'MANAGER',   7839, '1981-05-01', 2850, NULL, 30),
    (7782, 'CLARK',  'MANAGER',   7839, '1981-07-09', 2450, NULL, 10),
    (7788, 'SCOTT',  'ANALYST',   7566, '1982-12-09', 3000, NULL, 20),
    (7839, 'KING',   'PRESIDENT', NULL, '1981-11-17', 5000, NULL, 10),
    (7844, 'TURNER', 'SALESMAN',  7698, '1981-09-08', 1500,    0, 30),
    (7876, 'ADAMS',  'CLERK',     7788, '1983-01-12', 1100, NULL, 20),
    (7900, 'JAMES',  'CLERK',     7698, '1981-12-03',  950, NULL, 30),
    (7902, 'FORD',   'ANALYST',   7566, '1981-12-03', 3000, NULL, 20),
    (7934, 'MILLER', 'CLERK',     7782, '1982-11-23', 1300, NULL, 10);

1 SQL和pandas对照

# Connect to the MySQL database and load the whole emp table into `df`.
# numpy and pandas are imported here because the examples below use `np` and `pd`.
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy URL layout: dialect+driver://user:password@host:port/dbname
# (user, password, ip, port and dbname are placeholders to fill in).
ce = "mysql+mysqldb://user:password@ip:port/dbname?charset=utf8"
# echo=True logs every statement the engine sends. The character set is
# requested through ?charset=utf8 in the URL; create_engine's `encoding`
# argument is omitted because 'utf-8' was already its default and the
# argument no longer exists in SQLAlchemy 2.0.
from_conn = create_engine(ce, echo=True)
df = pd.read_sql_query("select * from emp", con=from_conn)

# -- emp.sal multiplied by 10
# select a.sal*10 from emp a;
# A plain scalar multiply: routing the value through sal * sal / sal would
# turn a salary of 0 into NaN (0/0) and add floating-point noise.
df["sal"] = df["sal"] * 10

# -- Average, total, max and min of sal per department
#    (missing values need special handling)
# select a.deptno,avg(a.sal) as avg_sal,sum(a.sal) as sum_sal,max(a.sal) as max_sal,min(a.sal) as min_sal from emp a group by a.deptno;
grouped = df.groupby(["deptno"])
# Named aggregation gives the result columns the same names as the SQL aliases.
# String aggregator names replace np.mean/np.sum/np.max/np.min, which recent
# pandas versions deprecate as groupby aggregators.
# NOTE(review): for a department whose sal values are all missing, pandas "sum"
# yields 0 where SQL SUM yields NULL -- confirm which is wanted.
grouped["sal"].agg(avg_sal="mean", sum_sal="sum", max_sal="max", min_sal="min").reset_index()

# -- Rows of emp whose sal is below 1000
# select * from emp where sal<1000;
# Label-based selection with a boolean mask; rows with a missing sal compare
# as False and are left out.
df = df.loc[df["sal"].lt(1000)]

# -- Names (ename) that contain the letter A
# select ename  from emp where ename like'%A%';
# .loc replaces .ix, which was removed in pandas 1.0.
# regex=False matches the literal substring; na=False drops rows whose ename
# is missing, as LIKE does for NULL.
df = df.loc[df["ename"].str.contains("A", regex=False, na=False), "ename"]

# -- Rows of emp whose ename does not contain the letter A
# select * from emp where ename not like'%A%';
# na=True makes a missing ename count as a match *before* the negation, so
# such rows are excluded -- the same outcome as NOT LIKE on NULL. Without it
# the mask would hold NaN and `~` would fail on it.
df = df[~df["ename"].str.contains("A", regex=False, na=True)]

# -- Per department, total salary of employees earning less than 5000
# select deptno,sum(sal) from emp where sal<5000 group by deptno;
# The string "sum" selects the groupby sum directly; passing np.sum here is
# deprecated in recent pandas versions.
df = df[df["sal"] < 5000].groupby(["deptno"]).agg({"sal": "sum"}).reset_index()

# -- Per department, total salary of employees paid below their department's average
# select a.deptno, sum(a.sal)
#   from emp as a
#   inner join (select c.deptno, avg(c.sal) as avg_sal from emp as c group by c.deptno) as b
#     on a.deptno = b.deptno
#  where a.sal < b.avg_sal
#  group by a.deptno;
# NOTE(review): the heading reads "not higher than" the average, but both the
# SQL and the code compare with a strict `<`; switch both to `<=` if a salary
# equal to the average should be counted.

# Step 1: average salary per department (the derived table b).
dfg = df.groupby(["deptno"]).agg({"sal": "mean"}).reset_index()
dfg = dfg.rename(columns={"sal": "avg_sal"})
# Step 2: attach each employee's department average (the join).
df = pd.merge(left=df, right=dfg, on="deptno", how="inner")
# Step 3: keep below-average rows and total them per department.
# reset_index() returns deptno as a regular column, as the SQL result does.
df = df[df["sal"] < df["avg_sal"]].groupby(["deptno"]).agg({"sal": "sum"}).reset_index()

# -- Per department, average salary of employees earning less than 4000
# select a.deptno, avg(a.sal) from (select b.deptno, b.sal from emp as b where b.sal < 4000) as a group by a.deptno;
# The string "mean" selects the groupby mean directly; passing np.mean here is
# deprecated in recent pandas versions.
df = df[df["sal"] < 4000].groupby(["deptno"]).agg({"sal": "mean"}).reset_index()

# -- Name and salary of employees earning less than 10000
# select ename,sal from emp where  sal<10000;
# .loc replaces .ix, which was removed in pandas 1.0.
df = df.loc[df["sal"] < 10000, ["ename", "sal"]]

# -- All columns for employees in department 20 earning more than 1000
# select * from emp where sal>1000 and deptno=20;
# Each comparison needs its own parentheses: `&` binds tighter than `>` and
# `==`, so the unparenthesised form evaluates `1000 & df["deptno"]` first and
# then fails on the chained comparison.
df = df[(df["sal"] > 1000) & (df["deptno"] == 20)]

# -- Department of the employee named "SMITH"
# select deptno from emp where ename='SMITH';
# .loc replaces .ix, which was removed in pandas 1.0; a direct equality test
# mirrors the SQL predicate.
df = df.loc[df["ename"] == "SMITH", "deptno"]

# -- Name of the highest-paid employee(s) in each department
# select e.ename, e.deptno, e.sal
#   from emp e
#   inner join (select deptno as did, max(sal) as m from emp group by deptno) s
#     on s.did = e.deptno and e.sal = s.m;

# Step 1: highest salary per department (the derived table s).
df_max = df.groupby(["deptno"]).agg({"sal": "max"}).reset_index()
df_max = df_max.rename(columns={"sal": "sal_max"})
# Step 2: attach the department maximum to every employee row.
df = pd.merge(left=df, right=df_max, on="deptno", how="inner")
# Step 3: keep the rows that reach the maximum (ties are all kept).
# .loc replaces .ix, which was removed in pandas 1.0.
df = df.loc[df["sal"] == df["sal_max"], ["ename", "deptno", "sal"]]

# -- The first employee(s) to join the company after "JONES"
# select * from emp where hiredate=(select min(hiredate) from emp where hiredate>(select hiredate from emp where ename='JONES')) ;
# hiredate is stored as a 'YYYY-MM-DD' string, so plain string comparison
# orders the dates chronologically.
jones_hired = df.loc[df["ename"] == "JONES", "hiredate"]
if jones_hired.empty:
    # No JONES row: the SQL subquery would be NULL and the result empty.
    df = df.iloc[0:0]
else:
    # Everyone hired strictly after JONES (the earliest JONES if there are several)...
    later = df[df["hiredate"] > jones_hired.min()]
    # ...narrowed to the earliest of those dates; like the SQL, every employee
    # sharing that date is returned.
    df = later[later["hiredate"] == later["hiredate"].min()]

# -- Department and salary for the highest and the lowest salary
# select d.dname,e.sal from emp e,dept d where e.deptno=d.deptno and sal=(select max(m) from (select deptno,max(sal) as m from emp e group by deptno) s) union select d.dname,e.sal from emp e,dept d where e.deptno=d.deptno and sal=(select min(m) from (select deptno,min(sal) as m from emp e group by deptno) s);
# NOTE(review): the SQL reads dname from a dept table that is not loaded here,
# so the pandas version reports deptno instead.
# Rows with a missing salary are set aside first; otherwise sorting would put
# them last and they would be mistaken for the highest salary.
known_sal = df.dropna(subset=["sal"])
sal_min, sal_max = known_sal["sal"].min(), known_sal["sal"].max()
# Keep every row at either extreme, de-duplicated like UNION, lowest first.
df = (
    known_sal.loc[known_sal["sal"].isin([sal_min, sal_max]), ["deptno", "sal"]]
    .drop_duplicates()
    .sort_values(by=["sal"])
)

# -- Head-count of employees in the targeted jobs (group, then count people)
# select case
#               when job='ANALYST' then 'Y'
#               when job='MANAGER' then 'Y'
#               when job='SALESMAN' then 'N'
#               when job='CLERK' then 'N'
#               else 'N' end as is_target, count(distinct empno) as csr_cnt
#   from EMP
#   group by 1
# Only the 'Y' jobs are listed; every other job is filled in with 'N' below.
job_map = {'ANALYST': 'Y',
           'MANAGER': 'Y'}
# fillna('N') is the ELSE branch of the CASE: without it unmapped jobs stay
# NaN and groupby silently drops them, losing the whole 'N' group.
df['is_target'] = df['job'].map(job_map).fillna('N')
# 'nunique' is count(distinct empno); the result column keeps the name 'empno'.
new_df = df.groupby(['is_target']).agg({'empno': 'nunique'}).reset_index()

# -- Distinct values in the company (de-duplication)
# NOTE(review): the original heading says "departments", but the SQL and the
# code both de-duplicate the job column.
# select distinct job from EMP group by 1
# Select the job column before de-duplicating so the result holds only the
# distinct jobs, as the SQL does, rather than one full row per job.
new_df = df[['job']].drop_duplicates(keep='first')

# -- Sort by job and employee number (multi-key sort)
# NOTE(review): the original heading says "department", but the sort key is job.
# select job, empno from EMP order by 1,2
# ORDER BY defaults to ascending, so both keys sort ascending here.
# drop=True discards the old row labels instead of adding them as an
# extra 'index' column.
df = df.sort_values(['job', 'empno'], ascending=[True, True]).reset_index(drop=True)

# -- Flatten the employee names of each job into one row (rows to a delimited list)
# Expected result looks like:
# CLERK ADAMS,JAMES,MILLER,SMITH
# ANALYST   FORD,SCOTT
# Sort by name first (the sorted frame is fed straight into the groupby, so
# the order is actually used), then join each job's distinct names with ','
# to match the expected output above. Missing names are skipped because they
# cannot be joined as strings.
new_df = (
    df.sort_values(['job', 'ename'])
    .groupby('job')['ename']
    .agg(lambda names: ','.join(names.dropna().unique()))
    .reset_index()
)

# -- Non-distinct count
# select ename, count(empno) from EMP group by 1
# count() ignores missing empno values, as SQL count(empno) ignores NULL;
# size() would count every row of the group instead.
df.groupby(['ename'])['empno'].count()

# -- Distinct count
# select ename, count(distinct empno) from EMP group by 1
# Selecting [['empno']] keeps the result a one-column DataFrame indexed by ename.
df.groupby(['ename'])[['empno']].nunique()

2 pandas效率

# -- When the data is too large for memory, read the source table in chunks.
# In testing, chunked and unchunked reads took about the same time to extract.
from sqlalchemy import text

# The date bounds are passed as bound parameters (:s_day / :e_day) instead of
# being concatenated into the SQL text, so their values cannot alter the query.
sql = "select BIN_SaleRecordID,BIN_BrandInfoID,SaleType,TicketType,SaleRecordCode,BillCode,BillCodePre,SaleTime as SaleDate,MemberCode,BIN_OrganizationID,EmployeeCode,InvoiceFlag from Sale.BIN_SaleRecord where SaleDate >= :s_day and SaleDate < :e_day"
date_bounds = {"s_day": s_day, "e_day": e_day}
# With chunksize set, read_sql_query yields DataFrames of up to batch_no rows.
for BIN_SaleRecord in pd.read_sql_query(text(sql), con=sql_server_engine, params=date_bounds, chunksize=batch_no):
    (obs_n, _) = BIN_SaleRecord.shape
    if obs_n > 0:
        # Drop rows flagged 'NO'. The filtered chunk has to be consumed inside
        # the loop: the name is rebound to the next chunk on every iteration.
        BIN_SaleRecord = BIN_SaleRecord[BIN_SaleRecord['InvoiceFlag'] != 'NO']

5.pandas进阶(SQL对照，不定时更新)

5.pandas进阶(SQL对照，不定时更新)

0 实验数据

1 SQL和pandas对照

2 pandas效率