1.单列where条件
select * from data where id < 8;
df[df.id < 8]
df[df['id'] < 8]
df[df.created == pd.Timestamp('2013-08-24 11:37:35')]
df.query("id < 8")
df.query('object_name == "C_COBJ#"')
df[df.B == df['C C']]
# The previous expression is equivalent to
df.query("B == `C C`")
2. 排序sort
select * from data order by object_type desc;
df.sort_values(by='object_type', ascending=False)
select * from data order by object_type desc, object_name asc;
df.sort_values(by=['object_type', 'object_name'], ascending=[False, True])
3. 逻辑运算 and(&) or(|) not (~)
3-1. & and
select * from data where id < 8 and object_name = 'OBJ$';
df[(df.id < 8) & (df.object_name == 'OBJ$')]
# The previous expression is equivalent to
df.query("id<8 and object_name== 'OBJ$'")
3-2. | or
select * from data where value>1 or itemid=139476;
data[(data['db_name'] == 'shhxbase') | (data['nodes'] > 1)]
data[(data.value >1) | ( data.itemid==139476)]
data.query("value > 1 or itemid == 139476")
3-3. ~ not
select * from data where not (value>1 or itemid=139476);
data[~((data.value > 1) | (data.itemid == 139476))]
data.query( "value <= 1 and itemid != 139476")
4. null 判断
data[data.value.isna()]
or
data[data.value.notna()]
5. in or like
5-1. in
data[data.itemid.isin([139479, 139471])]
#not in
data[~data.itemid.isin([139479, 139471])]
data.query("itemid in (139479, 139471)")
data.query("itemid not in (139479, 139471)")
5-2. like
df[df.name.str.contains(r'Tablespace|Sysmetric')]
6. groupby
select itemid, count(1) from data group by itemid;
data.groupby('itemid').size()
data.groupby('itemid').count()  # 注意: count() 统计各列非空值, 列中有 NaN 时结果与 count(1) 不同
# select itemid, avg(value), count(1) from data group by itemid;
data.groupby('itemid').agg(dict(value=np.mean, itemid=np.size))
#select itemid, count(distinct value) from data group by itemid;
data.groupby('itemid').agg(dict(value=pd.Series.nunique))
7. join
# select * from emp, dept where emp.deptno = dept.deptno order by empno desc;
# 方法1
pd.merge(emp, dept, on='deptno').sort_values(by='empno', ascending=False)
#方法2
pd.merge(emp, dept, left_on='deptno', right_on='deptno')
#方法3
emp.set_index('deptno').join(dept.set_index('deptno'), on='deptno')
8. union all
pd.concat([df1, df2])
9. union
pd.concat([df1, df2]).drop_duplicates()
10. update
update data set value=value+1 where value<1;
data.loc[data.value < 1, 'value'] += 1
11. drop/add column
#使用drop()函数,此函数有一个列表形参labels,写的时候可以加上labels=[xxx],也可以不加,列表内罗列要删除行或者列的名称,默认是行名称,如果要删除列,则要增加参数axis=1,
#drop columns
test_dict_df.drop(['id'], axis=1)
# 替换原数据
test_dict_df.drop(['id'], axis=1, inplace=True)
# 增加列
emp.insert(0, 'test', ['test']*14)
# 调整列的位置, 把empno调到最后,此操作会就地修改数据
emp.insert(len(emp.columns), 'empno', emp.pop('empno'))
12.delete row
df.drop([index])
13. insert
# 增加一行 (注: DataFrame.append 已在 pandas 2.0 中移除, 新版本请改用 pd.concat)
emp.append(emp.iloc[2], ignore_index=True)
# df 增加多行
emp.append(emp.iloc[-3:], ignore_index=True)
# 增加内置列表list
# nan = np.nan
row = [7902, 'FORD', 'ANALYST', 7566.0, pd.Timestamp('1981-12-03 00:00:00'), 3000.0, np.nan, 20]
# 方法1:使用参数ignore_index=True
row = pd.Series(row, index=emp.columns)
emp.append(row, ignore_index=True)
# 方法2:给series命名,对应emp的索引
row = pd.Series(row, index=emp.columns, name=14)
emp.append(row)
# 方法3 在指定位置插入行,第二行位置,插入索引为88的行
df = emp.reindex(emp.index.insert(1, 88), copy=False)
df.loc[88] = row
14.拆分为多行
数据
country | score | value | label |
---|---|---|---|
China,US | 100 | 1 | a |
Japan,EU | 150 | 2 | b |
UK,Australia | 120 | 3 | c |
Singapore,Netherland | 90 | 4 | d |
实现结果:
score | value | label | country |
---|---|---|---|
100 | 1 | a | China |
100 | 1 | a | US |
150 | 2 | b | Japan |
150 | 2 | b | EU |
120 | 3 | c | UK |
120 | 3 | c | Australia |
90 | 4 | d | Singapore |
90 | 4 | d | Netherland |
df = pd.DataFrame({'country': ['China,US', 'Japan,EU', 'UK,Australia', 'Singapore,Netherland'],
'score': [100, 150, 120, 90],
'value': [1, 2, 3, 4],
'label': list('abcd')})
df.to_sql('df', odb, index=False,
dtype=dict(country=VARCHAR(30), score=NUMBER(8), value=NUMBER(8), label=VARCHAR(10)))
select regexp_substr(country, '[^,]+', 1, level) country, score, value, label
from df
connect by value= prior value --这里的value为主键
and prior dbms_random.value is not null
and level <= regexp_count(country, ',') + 1;
df.drop('country', axis=1).join(
df['country'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('country')
).reset_index(drop=True)