# -*- coding: utf-8 -*-
import numpy
import pandas
# Load the data set that every sampling example below draws from.
data = pandas.read_csv('D:\\PDA\\4.9\\data.csv')

# Set the random seed.
numpy.random.seed(2)

# Sample a fixed number of rows.
data.sample(n=10)

# Sample a fraction of the rows.
data.sample(frac=0.02)

# Choose whether rows may be drawn more than once:
#   replace=True  -> sampling with replacement
#   replace=False -> sampling without replacement
data.sample(replace=True, n=10)

# Typical (stratified) sampling: start by grouping the rows by class.
gbr = data.groupby(by="class")
gbr.groups
# Per-class row counts for stratified sampling:
# draw 2 people from class 1, 4 from class 2 and 6 from class 3.
typicalNDict = {1: 2, 2: 4, 3: 6}
def typicalSampling(group, typicalNDict):
    """Draw a fixed number of rows from one group of a grouped frame.

    Meant to be passed to ``groupby(...).apply``, which calls it once per
    group.

    NOTE: a second function with the same name is defined further down in
    this file and rebinds ``typicalSampling`` from that point on.

    Args:
        group: The rows of a single group. Its ``name`` attribute is read
            as the group key.
        typicalNDict: Mapping from group key to the number of rows to draw
            from that group.

    Returns:
        The result of ``group.sample(n=...)`` for this group's count.

    Raises:
        KeyError: If the group's key is missing from ``typicalNDict``.
    """
    name = group.name
    n = typicalNDict[name]
    return group.sample(n=n)
# apply() calls typicalSampling on each class group in turn, passing
# typicalNDict along as the extra argument.
result = data.groupby(by='class', group_keys=False).apply(
    typicalSampling, typicalNDict
)
# Per-class sampling fractions for stratified sampling by percentage.
typicalFracDict = {1: 0.2, 2: 0.4, 3: 0.6}
def typicalSampling(group, typicalFracDict):
    """Draw a fixed fraction of rows from one group of a grouped frame.

    Meant to be passed to ``groupby(...).apply``, which calls it once per
    group.

    NOTE: this rebinds the name ``typicalSampling`` that was defined
    earlier in this file for count-based sampling.

    Args:
        group: The rows of a single group. Its ``name`` attribute is read
            as the group key.
        typicalFracDict: Mapping from group key to the fraction of that
            group's rows to draw.

    Returns:
        The result of ``group.sample(frac=...)`` for this group's fraction.

    Raises:
        KeyError: If the group's key is missing from ``typicalFracDict``.
    """
    name = group.name
    frac = typicalFracDict[name]
    return group.sample(frac=frac)
# Run the fraction-based sampler on each class group, passing
# typicalFracDict along as the extra argument.
result = data.groupby(by='class', group_keys=False).apply(
    typicalSampling, typicalFracDict
)
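# Pasted console output (apparently a printout of a stratified sample of
# the data); kept as comments so the file remains valid Python. The first
# number on each row is the DataFrame index.
#
# id class score
# 39 40 1 45
# 4 5 1 63
# 53 54 1 95
# 25 26 1 64
# 37 38 1 107
# 70 71 1 75
# 85 86 2 77
# 81 82 2 63
# 54 55 2 121
# 68 69 2 56
# 13 14 2 69
# 86 87 2 93
# 57 58 2 82
# 84 85 2 85
# 94 95 2 103
# 96 97 2 108
# 35 36 2 101
# 89 90 2 86
# 45 46 2 95
# 80 81 2 81
# 20 21 2 138
# 65 66 3 83
# 83 84 3 52
# 34 35 3 66
# 6 7 3 87
# 77 78 3 77
# 82 83 3 54
# 55 56 3 126
# 17 18 3 58
# 67 68 3 93
# 10 11 3 89
# 26 27 3 64
# 61 62 3 103
# 88 89 3 89
# 69 70 3 96
# 0 1 3 77
# 90 91 3 91
# 91 92 3 59
# 48 49 3 98
# 7 8 3 48
# 52 53 3 62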