pandas DataFrame add Series experiment
import pandas as pd
# Change False to True for each block of code to see what it does
# Adding a Series to a square DataFrame.
# `df + s` matches the Series index labels (0-3) against the DataFrame's
# column labels (0-3), so s[0] is added to every value in column 0, s[1] to
# every value in column 1, and so on.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding a Series to a one-row DataFrame.
# The Series labels 0-3 line up with the column labels 0-3, so the single
# row 10, 20, 30, 40 has 1, 2, 3, 4 added to it column by column.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding a Series to a one-column DataFrame.
# Only column label 0 exists in both operands, so s[0] is added to every
# value of column 0; the Series labels 1-3 have no matching column and show
# up in the result as all-NaN columns.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding when DataFrame column names match Series index.
# The Series is indexed 'a'-'d', the same labels as the columns, so s['a']
# is added to column 'a', s['b'] to column 'b', and so on.
if False:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding when DataFrame column names don't match Series index.
# The Series keeps its default labels 0-3 while the columns are 'a'-'d'.
# No label is shared, so every value in the result is NaN.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
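pandas 加减乘除的方向确定方法:如果要按行对齐(即让 Series 的 index 与 DataFrame 的行 index 匹配,对同一组 index 进行操作),就改用 add、sub、mul、div 方法,并把 axis 设为 'index';默认的 +、-、*、/ 运算符都是按列对齐,也就是让 Series 的 index 与 DataFrame 的 columns 匹配,对同一组 column 操作。聚合函数的方向与此类似:例如 df.mean() 返回一个 Series,其中每个元素代表某一列的平均值,即 df.mean() 等价于 df.mean(axis='index')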
- 求每一行的平均值
df.mean(axis = 'columns')
- 将某个df减去每一行的平均值
df.sub(df.mean(axis = 'columns'), axis = 'index')
Exercise
import pandas as pd
# Adding using +
# The `+` operator matches the Series index against the DataFrame's column
# labels, so s[0] is added to every value in column 0, s[1] to column 1, etc.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df + s)
# Adding with axis='index'
# With axis='index' the Series index is matched against the DataFrame's row
# labels instead of its columns, so s[0] is added to every value in row 0,
# s[1] to every value in row 1, and so on.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df.add(s, axis='index'))
    # The functions sub(), mul(), and div() work similarly to add()
# Adding with axis='columns'
# axis='columns' matches the Series index against the column labels, which
# is also what the plain `+` operator does.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160],
    })
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(df)
    print('')  # Create a blank line between outputs
    print(df.add(s, axis='columns'))
    # The functions sub(), mul(), and div() work similarly to add()
# Scores of ten students on two exams: one row per student (labelled by
# first name) and one column per exam.
grades_df = pd.DataFrame(
    {
        'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
        'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60],
    },
    index=[
        'Andre', 'Barry', 'Chris', 'Dan', 'Emilio',
        'Fred', 'Greta', 'Humbert', 'Ivan', 'James',
    ],
)
def standardize(df):
    '''
    Standardize each column of the given DataFrame.

    Every value is converted to the number of standard deviations it lies
    above (positive) or below (negative) the mean of its own column. Only
    vectorized operations are used; apply() is not needed.

    df: DataFrame whose columns are all numeric.

    Returns a new DataFrame with the same index and columns as df. The
    input is not modified. A column whose values are all equal has a
    standard deviation of zero and therefore comes back as NaN.

    NOTE(review): the population standard deviation (ddof=0) is used so
    that each standardized column has unit population variance; pandas'
    own default for std() is ddof=1. Confirm this matches the results
    expected from the earlier apply()-based version.
    '''
    # df.mean() and df.std() each return a Series indexed by column label,
    # so plain `-` and `/` line them up with the matching columns of df.
    column_means = df.mean()
    column_stds = df.std(ddof=0)
    return (df - column_means) / column_stds
def standardize_rows(df):
    '''
    Standardize each row of the given DataFrame.

    Every value is converted to the number of standard deviations it lies
    above (positive) or below (negative) the mean of its own row. Only
    vectorized operations are used; apply() is not needed.

    df: DataFrame whose columns are all numeric.

    Returns a new DataFrame with the same index and columns as df. The
    input is not modified. A row whose values are all equal has a standard
    deviation of zero and therefore comes back as NaN.

    NOTE(review): the population standard deviation (ddof=0) is used;
    pandas' own default for std() is ddof=1. Confirm which one is wanted.
    '''
    # Reducing along axis='columns' gives one mean / std per row, returned
    # as Series indexed by the row labels.
    row_means = df.mean(axis='columns')
    row_stds = df.std(axis='columns', ddof=0)
    # The plain `-` and `/` operators would match those Series against the
    # column labels, so sub()/div() with axis='index' are used to match
    # them against the row labels instead.
    centered = df.sub(row_means, axis='index')
    return centered.div(row_stds, axis='index')
Exercise about groupby()
import numpy as np
import pandas as pd
values = np.array([1, 3, 2, 4, 1, 6, 4])
# Small sample frame for the groupby() experiments: the raw value plus two
# boolean flags derived from it, with rows labelled 'a' through 'g'.
example_df = pd.DataFrame(
    {
        'value': values,
        'even': values % 2 == 0,
        'above_three': values > 3,
    },
    index=list('abcdefg'),
)
# Change False to True for each block of code to see what it does
# Standardize each group
# The rows of example_df are split by the 'even' flag and the 'value'
# column of each group is standardized using that group's own mean and
# standard deviation.
if False:
    def standardize(xs):
        '''Return xs minus its mean, divided by its std (pandas default).'''
        return (xs - xs.mean()) / xs.std()
    grouped_data = example_df.groupby('even')
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(grouped_data['value'].apply(standardize))
# Find second largest value in each group
# The rows of example_df are split by the 'even' flag and, for each group,
# the second-highest entry of the 'value' column is reported.
if False:
    def second_largest(xs):
        '''
        Return the second-largest element of the Series xs.

        Raises IndexError when xs has fewer than two elements.
        '''
        # Series.sort(inplace=False, ...) no longer exists in pandas;
        # sort_values() returns the sorted copy without touching xs.
        sorted_xs = xs.sort_values(ascending=False)
        # Position 0 is the largest value, position 1 the second largest.
        return sorted_xs.iloc[1]
    grouped_data = example_df.groupby('even')
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print(grouped_data['value'].apply(second_largest))
# --- Quiz ---
# Sample turnstile readings for two stations (the 'UNIT' column). Rows
# alternate between the stations; 'ENTRIESn' and 'EXITSn' hold running
# (cumulative) counter values rather than per-interval counts.
ridership_df = pd.DataFrame({
    'UNIT': [
        'R051', 'R079', 'R051', 'R079', 'R051',
        'R079', 'R051', 'R079', 'R051',
    ],
    'TIMEn': [
        '00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00',
        '10:00:00', '12:00:00', '14:00:00', '16:00:00',
    ],
    'ENTRIESn': [
        3144312, 8936644, 3144335, 8936658, 3144353,
        8936687, 3144424, 8936819, 3144594,
    ],
    'EXITSn': [
        1088151, 13755385, 1088159, 13755393, 1088177,
        13755598, 1088231, 13756191, 1088275,
    ],
})
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Convert cumulative entry/exit counters into per-reading counts,
    computed separately for each station.

    entries_and_exits: DataFrame with a 'UNIT' column identifying the
        station and cumulative counters in the 'ENTRIESn' and 'EXITSn'
        columns. Rows are used in the order given; they are not sorted
        here, so each station's readings should already be in
        chronological order.

    Returns a DataFrame with the same index as the input and the two
    columns 'ENTRIESn' and 'EXITSn'. Each value is the counter minus the
    previous reading of the same 'UNIT'. The first reading of every
    station has no predecessor and is therefore NaN. The input is not
    modified.
    '''
    counter_columns = ['ENTRIESn', 'EXITSn']
    # Grouping by station before diff() keeps one station's counters from
    # being subtracted from another's when their rows are interleaved.
    per_station = entries_and_exits.groupby('UNIT')[counter_columns]
    return per_station.diff()