R数据挖掘实战-数据抽样

简单随机抽样

library(sampling)
sample(x,size,replace=F,prob=NULL) 
sample()是R的基础函数，无需加载sampling包。x表示待抽取对象，size为抽取样本个数，replace表示是否有放回抽样（默认为FALSE，即不放回抽样），prob表示各个样本的抽样概率

sample(1:5,5,replace = T,prob = c(0.1,0.1,0.1,0.1,0.6))
[1] 5 5 5 5 5

分层抽样

strata(data, stratanames=NULL, size,method=c("srswor","srswr","poisson","systematic"), pik,description=FALSE)
stratanames放入分层所依据的变量名称；size为各层的抽样个数组成的向量（顺序与各层在数据集中出现的顺序一致）；method依次为无放回、有放回、泊松、系统抽样；pik表示各样本的入样概率（或用于计算入样概率的辅助变量），仅在泊松抽样和系统抽样这两种不等概率抽样中使用；description表示是否输出各层基本信息的结果
strata(Insurance,stratanames = 'District',size = c(1,2,3,4),method = 'srswor')
   District ID_unit   Prob Stratum
9         1       9 0.0625       1
24        2      24 0.1250       2
25        2      25 0.1250       2
37        3      37 0.1875       3
43        3      43 0.1875       3
48        3      48 0.1875       3
51        4      51 0.2500       4
53        4      53 0.2500       4
62        4      62 0.2500       4
63        4      63 0.2500       4

getdata(data, m)：data为原始数据集，m表示抽样后的结果（即strata或cluster的返回值），getdata返回抽样后的数据

a = strata(Insurance,stratanames = 'District',size=c(1,2,3,4),method = 'systematic',pik=Insurance$Claims)
a
   District ID_unit       Prob Stratum
8         1       8 0.28964518       1
22        2      22 0.11447811       2
27        2      27 0.08754209       2
36        3      36 0.36612022       3
40        3      40 1.00000000       3
43        3      43 0.13114754       3
55        4      55 0.29464286       4
56        4      56 1.00000000       4
60        4      60 0.84375000       4
64        4      64 0.44196429       4
getdata(Insurance,a)  返回被抽选出的原始数据
   Group   Age Holders Claims District ID_unit       Prob Stratum
8  1-1.5l   >35    3582    400        1       8 0.28964518       1
22 1-1.5l 25-29     313     51        2      22 0.11447811       2
27 1.5-2l 30-35     221     39        2      27 0.08754209       2
36    <1l   >35     648     67        3      36 0.36612022       3
40 1-1.5l   >35    1635    187        3      40 1.00000000       3
43 1.5-2l 30-35     121     24        3      43 0.13114754       3
55 1-1.5l 30-35     122     22        4      55 0.29464286       4
56 1-1.5l   >35     724    102        4      56 1.00000000       4
60 1.5-2l   >35     344     63        4      60 0.84375000       4
64    >2l   >35     114     33        4      64 0.44196429       4

整群抽样

cluster(data, clustername, size, method=c("srswor","srswr","poisson",
"systematic"),pik,description=FALSE)
clustername指用来划分群的变量名称,size表示要抽取的群的数量
sub7 = cluster(Insurance,clustername = 'District',size=2,method='srswor',description = T)
此处按照District不同的值分群，再抽出2群
   District ID_unit Prob
1         1       1  0.5
2         1       3  0.5
......
20        3      39  0.5
21        3      40  0.5
32        3      38  0.5
getdata(Insurance,sub7)
    Group   Age Holders Claims District ID_unit Prob
1     <1l   <25     197     38        1       1  0.5
3     <1l 30-35     246     20        1       3  0.5
.......
36    <1l   >35     648     67        3      36  0.5
37 1-1.5l   <25      53     10        3      37  0.5
38 1-1.5l 25-29     155     24        3      38  0.5

训练集和测试集

一般训练集和测试集的比例为3:1

 train_id=sample(nrow(Insurance),3/4*nrow(Insurance)) #训练集编号
 train_data= Insurance[train_id,] # 训练集数据
 test_data= Insurance[-train_id,] #测试集数据
 train_data
   District  Group   Age Holders Claims
9         1 1.5-2l   <25     133     19
43        3 1.5-2l 30-35     121     24
.......
46        3    >2l 25-29      29      2
37        3 1-1.5l   <25      53     10
test_data
   District  Group   Age Holders Claims
6         1 1-1.5l 25-29     536     84
10        1 1.5-2l 25-29     286     52
......
64        4    >2l   >35     114     33