1. Introduction
The Python Data Analysis Library, pandas, is a tool built on top of NumPy that was created for data analysis work. It bundles a large number of utilities and some standard data models, and provides what you need to manipulate large datasets efficiently. pandas offers a wealth of functions and methods for processing data quickly and conveniently, and it is one of the key reasons Python has become such a powerful and efficient data analysis environment.
- a data analysis library for Python
- built on NumPy (it operates on ndarrays)
- feels a bit like doing Excel/SQL/R work from Python
2. The Series data structure
2.1 Constructing and initializing a Series
import pandas as pd
import numpy as np
#A Series is a one-dimensional data structure; below are a few ways to initialize one.
s = pd.Series([7, 'Beijing', 2.17, -12344, 'Happy Birthday!'])
#By default pandas uses 0..n-1 as the index of a Series, but we can also specify the index ourselves. You can think of the index as the keys of a dict.
s = pd.Series([7, 'Beijing', 2.17, -12344, 'Happy Birthday!'],
index=['A', 'B', 'C', 'D', 'E'])
#A Series can also be constructed from a dictionary, since a Series is essentially a set of key-value pairs.
cities = {'Beijing': 55000, 'Shanghai': 60000, 'Shenzhen': 50000, 'Hangzhou': 20000, 'Guangzhou': 25000, 'Suzhou': None}
# apts = pd.Series(cities)
apts = pd.Series(cities, name="price")
#A Series can also be built from a NumPy ndarray.
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
2.2 Selecting data
#We can index a Series by position, much like a list (.iloc is the more explicit way to do this).
apts[[4,3,1]]
apts[1:]
#Why does the following give two NaNs? Addition aligns the two operands on their index labels: the first label appears only in apts[:-1] and the last only in apts[1:], and any label present on just one side produces NaN.
apts[1:] + apts[:-1]
#A Series also behaves like a dict; the index defined earlier is used to select data.
apts["Hangzhou"]
apts[["Hangzhou", "Beijing", "Shenzhen"]]
"Hangzhou" in apts
apts.get("Hangzhou")
#Boolean indexing works much like in NumPy.
apts[apts < 50000]
apts.median()
apts[apts > apts.median()]
#Let's look in a bit more detail at how boolean indexing works.
less_than_50000 = apts < 50000
print(less_than_50000)
print(apts[less_than_50000])
2.3 Assigning to Series elements
#Series elements can be assigned to.
apts['Shenzhen'] = 55000
#The boolean indexing covered above can also be used on the left-hand side of an assignment.
apts[apts <= 50000] = 40000
2.4 Arithmetic
#Now for some basic arithmetic.
apts / 2
apts * 2
#NumPy operations can be applied to pandas objects as well.
np.square(apts)
#Let's define another Series and add the two together.
cars = pd.Series({'Beijing': 300000, 'Shanghai': 400000, 'Shenzhen': 300000, \
'Tianjin': 200000, 'Guangzhou': 200000, 'Chongqing': 150000})
cars + apts * 100
2.5 Missing data
apts.notnull()
apts.isnull()
apts[apts.isnull()]
apts[apts.isnull() == False]
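#Besides detecting missing values, the usual next step is to drop or fill them; a minimal sketch:
apts.dropna()              # drop the missing entry
apts.fillna(apts.mean())   # or fill it, e.g. with the mean price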
3. The DataFrame data structure
A DataFrame is a table: where a Series is a one-dimensional array, a DataFrame is a two-dimensional one, comparable to an Excel spreadsheet. You can also think of a DataFrame as a collection of Series that share the same index.
3.1 Creating a DataFrame
#A DataFrame can be constructed from a dictionary.
data = {'city': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chongqing'],
'year': [2016,2017,2016,2017,2016, 2016],
'population': [2100, 2300, 1000, 700, 500, 500]}
pd.DataFrame(data)
#The names and order of the columns can be specified.
pd.DataFrame(data, columns = ['year', 'city', 'population'])
pd.DataFrame(data, columns = ['year', 'city', 'population', 'debt'])
frame2 = pd.DataFrame(data, columns = ['year', 'city', 'population', 'debt'],
index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
3.2 Selecting data from a DataFrame
frame2['city']
type(frame2['city'])
frame2.loc['three']
#Plain frame2[...] indexing selects columns; to pick a row by its integer position, use .iloc.
frame2.iloc[2]
3.3 Assigning to DataFrame elements
frame2.loc["one", "population"] = 2100  # assign a single cell with .loc
#An entire column can be assigned at once.
frame2['debt'] = 100000000
frame2.loc['six'] = 0
frame2 = pd.DataFrame(data, \
columns = ['year', 'city', 'population', 'debt'],
index = ['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
frame2.debt = np.arange(6)
#A Series can also be assigned to a column: only the index labels it specifies get values, and the rest default to NaN.
val = pd.Series([100, 200, 300], index=['two', 'three', 'five'])
frame2['debt'] = val
print(frame2)
frame2['western'] = (frame2.city == 'Chongqing')
print(frame2)
frame2.columns
frame2.index
#Like a NumPy 2D array, a DataFrame can be transposed.
frame2.T
#We can also fix the order of the index and initialize new data from slices of an existing frame. frame3 below is built from a nested dict, as sketched right after this comment.
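#(a minimal sketch of the assumed construction: frame3 is built from a nested dict, whose
# outer keys become the columns and inner keys become the index; passing index= fixes the row order)
pop = {'Beijing': {2016: 2100, 2017: 2200},
       'Shanghai': {2015: 2400, 2016: 2500, 2017: 2600}}
frame3 = pd.DataFrame(pop, index=[2015, 2016, 2017])
frame3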
frame3['Beijing'][1:3]
frame3['Shanghai'][:-1]
pdata = {'Beijing': frame3['Beijing'][:-1], 'Shanghai':frame3['Shanghai'][:-1]}
print(pd.DataFrame(pdata))
#The index and the columns can also be given names.
frame3.index.name = 'year'
frame3.columns.name = 'city'
frame3
type(frame3.values)
4. The Index data structure
4.1 Index objects
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index)
print(index[1:])
#Index values cannot be reassigned.
index[1] = 'd' # wrong: Index objects are immutable, so this raises an error
index = pd.Index(np.arange(3))
index
obj2 = pd.Series([2,5,7], index=index)
print(obj2)
print(obj2.index is index)
pop = {'Beijing': {2016: 2100, 2017:2200},
'Shanghai': {2015:2400, 2016:2500, 2017:2600}}
frame3 = pd.DataFrame(pop)
print('Shanghai' in frame3.columns)
print(2015 in frame3.columns)
4.2 Indexing and slicing with the index
obj = pd.Series(np.arange(4), index=['a','b','c','d'])
obj[['b', 'a']]
obj[[0, 2]]
#The default integer positions can still be used.
obj[1:3]
obj['b':'d'] = 5
#Indexing a DataFrame works much the same way as indexing a Series.
frame = pd.DataFrame(np.arange(9).reshape(3,3),
index = ['a', 'c', 'd'],
columns = ['Hangzhou', 'Shenzhen', 'Nanjing'])
frame['Hangzhou']
frame[:2]
frame.loc['a':'d']
#Plain [] slicing with labels applies to rows; to slice columns by label, use .loc.
frame.loc[:, 'Hangzhou':'Nanjing']
frame.loc[:, 'Shenzhen':'Nanjing']
frame.loc[:'c', 'Hangzhou']
#Boolean conditions can be used to select from a DataFrame as well.
frame[frame.Hangzhou > 1]
frame[frame < 5] = 0
print(frame)
4.3 reindex
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html
#reindex rearranges a Series or DataFrame to follow a new index.
obj = pd.Series([4.5, 7.2, -5.3, 3.2], index=['d', 'b', 'a', 'c'])
obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value = 0)
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0,2,4])
obj3.reindex(range(6), method='ffill')
obj3.reindex(range(6), method='bfill')
#Just as with a Series, a DataFrame can be reindexed in the same way.
frame = pd.DataFrame(np.arange(9).reshape(3,3),
index = ['a', 'c', 'd'],
columns = ['Hangzhou', 'Shenzhen', 'Nanjing'])
print(frame)
frame.reindex(['a' , 'b', 'c', 'd'])
#While reindexing, the columns can be respecified as well.
frame.reindex(columns = ['Shenzhen', 'Hangzhou', 'Chongqing'])
frame.reindex(index = ['a', 'b', 'c', 'd'],
method = 'ffill',
columns = ['Shenzhen', 'Hangzhou', 'Chongqing'])
#Next, drop can be used to remove entries from a Series or DataFrame by index label.
obj4 = obj3.drop(2)
obj3.drop([2, 4])
frame.drop(['a', 'c'])
frame.drop('Shenzhen', axis=1)
frame.drop(['Shenzhen', 'Hangzhou'], axis=1)
5. Hierarchical indexing
#Hierarchical indexing on a Series
data = pd.Series(np.random.randn(10),
index=[['a','a','a','b','b','c','c','c','d','d'], \
[1,2,3,1,2,1,2,3,1,2]])
data
data.index
data["b"]
data['b':'d']
data[1:4]
#unstack and stack let us switch between a hierarchically indexed Series and a DataFrame.
data.unstack()
type(data.unstack())
data.unstack().stack()
#Hierarchical indexing on a DataFrame
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
index = [['a','a','b','b'], [1,2,1,2]],
columns = [['Beijing', 'Beijing', 'Shanghai'], ['apts', 'cars', 'apts']])
print(frame)
frame.index.names = ['alpha', 'number']
frame.columns.names = ['city', 'type']
print(frame)
frame.loc[('a', 1)]
frame.loc[('a', 2), ('Beijing', 'apts')]
6. Concatenate, merge and join
df1 = pd.DataFrame({'apts': [55000, 60000],
'cars': [200000, 300000],},
index = ['Shanghai', 'Beijing'])
df1
df2 = pd.DataFrame({'apts': [25000, 20000],
'cars': [150000, 120000],},
index = ['Hangzhou', 'Nanjing'])
print(df2)
df3 = pd.DataFrame({'apts': [30000, 10000],
'cars': [180000, 100000],},
index = ['Guangzhou', 'Chongqing'])
print(df3)
6.1 concatenate
frames = [df1, df2, df3]
result = pd.concat(frames)
result
#keys can be passed to concat to label each piece;
#the example below therefore ends up with a hierarchical index.
result2 = pd.concat(frames, keys=['x', 'y', 'z'])
print(result2)
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000]},
index = ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin'])
print(df4)
result3 = pd.concat([result, df4], axis=1)
result3
#join='inner' keeps only the rows present on both sides, which gets rid of the NaNs.
result3 = pd.concat([result, df4], axis=1, join='inner')
result3
#append can also be used for concatenation (it is deprecated in newer pandas; see the pd.concat sketch after these examples).
df1.append(df2)
df1.append(df4) # probably not the result you want
#A Series and a DataFrame can also be concatenated together; the Series is first converted to a DataFrame and then joined, since a Series is essentially a single-column DataFrame.
s1 = pd.Series([60, 50], index=['Shanghai', 'Beijing'], name='meal')
print(s1)
df1
pd.concat([df1, s1], axis=1)
#How to append a row to a DataFrame
s2 = pd.Series([18000, 120000], index=['apts', 'cars'], name='Xiamen')
s2
df1.append(s2)
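#Note: DataFrame.append was deprecated and removed in pandas 2.0; pd.concat covers the
#same use cases. A rough equivalent of the calls above:
pd.concat([df1, df2])
pd.concat([df1, s2.to_frame().T])  # appending a single row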
6.2 Merge
df1 = pd.DataFrame({'apts': [55000, 60000, 58000],
'cars': [200000, 300000,250000],
'cities': ['Shanghai', 'Beijing','Shenzhen']})
print(df1)
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000],
'cities': ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin']})
print(df4)
result = pd.merge(df1, df4, on='cities')
result
result2 = pd.merge(df1, df4, on='cities', how='outer')
result2
6.3 Join
df1 = pd.DataFrame({'apts': [55000, 60000, 58000],
'cars': [200000, 300000,250000]},
index=['Shanghai', 'Beijing','Shenzhen'])
print(df1)
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000]},
index=['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin'])
print(df4)
df1.join(df4)
df4.join(df1)
df1.join(df4, how='outer')
#The same thing can also be written with merge.
pd.merge(df1, df4, left_index=True, right_index=True, how='outer')
7. Group By
import pandas as pd
salaries = pd.DataFrame({
'Name': ['July', 'Chu', 'Chu', 'Lin', 'July', 'July', 'Chu', 'July'],
'Year': [2016,2016,2016,2016,2017,2017,2017,2017],
'Salary': [10000,2000,4000,5000,18000,25000,3000,4000],
'Bonus': [3000,1000,1000,1200,4000,2300,500,1000]
})
print(salaries)
group_by_name = salaries.groupby('Name')
group_by_name
#groupby is often used together with aggregate.
group_by_name.aggregate(sum)
group_by_name.sum()
group_by_name_year = salaries.groupby(['Name', 'Year'])
group_by_name_year.sum()
group_by_name_year.size()
group_by_name_year.max()
#describe shows a range of useful summary statistics.
group_by_name_year.describe()
8. Read from CSV
#Let's start by reading some data from a CSV file.
#bikes.csv records data for bicycle paths in Montreal: there are 7 paths, and for each path the file records how many people rode it each day.
bikes = pd.read_csv('bikes.csv', encoding='latin1', sep=';',
parse_dates=['Date'], dayfirst=True, index_col='Date')
bikes.head()
bikes.dropna() #dropna drops every row that contains an NA
bikes.dropna(how='all').head()
bikes.dropna(axis=1, how='all').head()
#Next, let's look at how to fill in missing data.
row = bikes.iloc[0].copy()
row.fillna(row.mean())
m = bikes.mean(axis=1)
for i, col in enumerate(bikes):
    bikes.iloc[:, i] = bikes.iloc[:, i].fillna(m)
bikes.head()
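#The loop above can also be written without explicit iteration: each column aligns with m
#on the Date index, so a single apply performs the same fill.
bikes = bikes.apply(lambda col: col.fillna(m))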
berri_bikes = bikes[['Berri 1']].copy()
berri_bikes.head()
berri_bikes.index
berri_bikes.index.weekday
berri_bikes.loc[:, 'weekday'] = berri_bikes.index.weekday
berri_bikes[:5]
#With the weekday information in place, we can use groupby (covered earlier) to group the rider counts by weekday, and then aggregate to get the total number of riders for each day of the week.
weekday_counts = berri_bikes.groupby('weekday').aggregate(sum)
weekday_counts
weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts
#Next, let's add up all the paths and compute the total number of people cycling each day.
bikes_sum = bikes.sum(axis=1).to_frame()
bikes_sum.head()
# bikes_sum.index
bikes_sum.loc[:, 'weekday'] = bikes_sum.index.weekday
bikes_sum.head()
#type(berri_bikes)
weekday_counts = bikes_sum.groupby('weekday').aggregate(sum)
weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts
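#If %matplotlib inline is enabled (as it is in the later sections), a quick bar chart makes the weekday pattern easy to see:
weekday_counts.plot(kind='bar')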
9. Stock project
import pandas as pd
import numpy as np
%matplotlib inline
goog = pd.read_csv("data/GOOG.csv", index_col=0)
goog.index = pd.to_datetime(goog.index)
# goog
goog["Adj Close"].plot(grid = True)
goog["Close"].plot(grid = True)
#shift lets us move the time series forward or backward in time.
goog.shift(1).head()
#datetime indexing
goog["log-return"] = np.log(goog["Adj Close"] / goog["Adj Close"].shift(1))
print(goog["log-return"].head())
goog["log-return"].plot(grid=True)
aapl = pd.read_csv("data/AAPL.csv", index_col=0)
aapl.index = pd.to_datetime(aapl.index)
#aapl["Adj Close"].plot(grid=True)
goog.join(aapl, lsuffix=" goog", rsuffix=" aapl").plot()
aapl["Adj Close"][aapl["Adj Close"] == "null"] = np.NaN
aapl["Adj Close"] = aapl["Adj Close"].bfill()
aapl["Adj Close"] = aapl["Adj Close"].apply(lambda x: float(x))
aapl["Adj Close"].plot(grid=True)
msft = pd.read_csv("data/MSFT.csv", index_col=0)
msft.index = pd.to_datetime(msft.index)
stocks = pd.DataFrame({"AAPL": aapl["Adj Close"].bfill(),
"MSFT": msft["Adj Close"].bfill(),
"GOOG": goog["Adj Close"].bfill()})
pd.concat([aapl["Adj Close"], msft["Adj Close"], goog["Adj Close"]], 1, keys=["aapl", "msft", "goog"]).plot()
stocks = pd.concat([aapl["Adj Close"], msft["Adj Close"], goog["Adj Close"]], 1, keys=["aapl", "msft", "goog"])
valid_stocks = stocks[stocks.index >= stocks["goog"].first_valid_index()]
# stocks = pd.to_numeric(stocks)
stocks.plot(grid=True)
valid_stocks.plot(grid=True)
valid_stocks_lr = np.log(valid_stocks / valid_stocks.shift(1))
valid_stocks_lr.loc["2017-01-01":"2017-02-01"].plot(grid=True)
#Next, let's see whether we can turn the daily chart into a monthly one.
monthly_stocks = valid_stocks.groupby([valid_stocks.index.year, valid_stocks.index.month]).last()
monthly_stocks
index = [ str(i[0]) + "-" + str(i[1]) for i in monthly_stocks.index.values]
print(index)
index = pd.PeriodIndex(index, freq="M")
monthly_stocks.index = index
monthly_stocks.plot(grid=True)
monthly_stocks.loc["2011-8"]
monthly_stocks.loc["2011/8"]
monthly_stocks.loc["8/2011"]
10. Feature engineering
10.1 Credit project
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("data/credit-data.csv")
df.head()
for i, val in enumerate(df):
    print(df[val].value_counts())
df['income_bins'] = pd.cut(df.monthly_income, bins=15)
pd.value_counts(df['income_bins'])
df['income_bins'] = pd.cut(df.monthly_income, bins=15, labels=False)
pd.value_counts(df.income_bins)
df["monthly_income"] = df["monthly_income"].fillna(df["monthly_income"].mean())
df["income_bins"] = np.log(df.monthly_income)
df["income_bins"] = df["income_bins"].replace([np.inf, -np.inf], 0)
df["income_bins"] = df["income_bins"].astype("int")
df[["income_bins", "serious_dlqin2yrs"]].groupby("income_bins").mean()
cols = ['income_bins', 'serious_dlqin2yrs']
income_means = df[cols].groupby("income_bins").mean()
income_means.plot()
cols = ['age', 'serious_dlqin2yrs']
age_means = df[cols].groupby("age").mean()
age_means.plot()
mybins = [0] + list(range(20, 80, 5)) + [120]
df['age_bucket'] = pd.cut(df.age, bins=mybins)
df['age_bucket'].value_counts()
df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean().fillna(0)
df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean().plot()
#Turn the categorical buckets into numeric codes.
labels, levels = pd.factorize(df.age_bucket)
df.age_bucket = labels
df.age_bucket.head()
#quantile
bins = []
for q in [0.2, 0.4, 0.6, 0.8, 1.0]:
    bins.append(df.debt_ratio.quantile(q))
debt_ratio_binned = pd.cut(df.debt_ratio, bins=bins)
debt_ratio_binned
print(pd.value_counts(debt_ratio_binned))
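#pd.qcut performs quantile-based binning directly, so the manual quantile loop above can be
#written roughly as (duplicates='drop' guards against repeated bin edges):
debt_ratio_binned = pd.qcut(df.debt_ratio, 5, duplicates='drop')
print(pd.value_counts(debt_ratio_binned))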
10.2 Train project
Step 1. Load the required libraries
import pandas as pd
import numpy as np
%matplotlib inline
#Load the data:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
train.shape, test.shape
Step 2. Take a first look at the data
train.dtypes
train.head(5)
#Combine train and test into one dataset
train['source']= 'train'
test['source'] = 'test'
data=pd.concat([train, test],ignore_index=True)
data.shape
Step 3. An important part of putting data to use or modeling it is checking for anomalies, for example missing values
data.apply(lambda x: sum(x.isnull()))
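#Equivalently, isnull().sum() gives the per-column count of missing values.
data.isnull().sum()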
Step 4. Build a deeper understanding of the data; for example, look at how many distinct values each of these fields takes (you could even look at the distributions)
var = ['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']
for v in var:
    print('\nDistinct values and counts for column %s\n' % v)
    print(data[v].value_counts())
Step 5. Now you can start processing the fields (features)
#Handling the City field
len(data['City'].unique())
data.drop('City',axis=1,inplace=True) #City has a lot of distinct values; let's be blunt and just drop this field
#Handling the DOB field
#DOB is the exact date of birth; the exact date is not that useful, but age might be, so let's compute the age
data['DOB'].head()
#Create an Age field
data['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))  # assumes two-digit 19yy years, i.e. age as of 2015
data['Age'].head()
#Drop the original DOB field:
data.drop('DOB',axis=1,inplace=True)
#Handling the EMI_Loan_Submitted field
data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')
#There seem to be a lot of missing values, so let's just create a new field that flags whether the value is missing or not
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)
#The original column is no longer needed
data.drop('EMI_Loan_Submitted',axis=1,inplace=True)
#Handling the Employer_Name field
len(data['Employer_Name'].value_counts())
data.drop('Employer_Name',axis=1,inplace=True)
#The Existing_EMI field
data.boxplot(column='Existing_EMI',return_type='axes')
data['Existing_EMI'].describe()
#Only a few values are missing, so just fill them in (with 0 here)
data['Existing_EMI'].fillna(0, inplace=True)
#The Interest_Rate field:
data.boxplot(column=['Interest_Rate'],return_type='axes')
#Too many missing values, so again create a field that only records whether the value is present
data['Interest_Rate_Missing'] = data['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0)
print(data[['Interest_Rate','Interest_Rate_Missing']].head(10))
data.drop('Interest_Rate',axis=1,inplace=True)
#The Lead_Creation_Date field
#We simply don't need this one, so out it goes
data.drop('Lead_Creation_Date',axis=1,inplace=True)
data.head()
#The Loan_Amount_Applied and Loan_Tenure_Applied fields
#Fill the missing values with the median (there aren't many of them)
data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)
data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)
#The Loan_Amount_Submitted and Loan_Tenure_Submitted fields
#Too many missing values again, so just record whether each value is missing
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
#The original fields are no longer needed
data.drop(['Loan_Amount_Submitted','Loan_Tenure_Submitted'],axis=1,inplace=True)
#LoggedIn
#No good idea of how to use it, so drop it
data.drop('LoggedIn',axis=1,inplace=True)
#Salary_Account
#A customer may have accounts with several banks, so drop this one as well
data.drop('Salary_Account',axis=1,inplace=True)
#Processing_Fee
#Same treatment as before: keep only a present/missing indicator
data['Processing_Fee_Missing'] = data['Processing_Fee'].apply(lambda x: 1 if pd.isnull(x) else 0)
#The old field is no longer needed
data.drop('Processing_Fee',axis=1,inplace=True)
#Source
data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)
data['Source'].value_counts()
#The final form of the data
data.head()
data.describe()
data.apply(lambda x: sum(x.isnull()))
data.dtypes
Step 6. Numeric encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
data.head()
data.dtypes
Step 7. One-hot encoding of the categorical fields
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Step 8. Split back into training and test data
train = data.loc[data['source']=='train'].copy()
test = data.loc[data['source']=='test'].copy()
train.drop('source',axis=1,inplace=True)
test.drop(['source','Disbursed'],axis=1,inplace=True)
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv',index=False)