当你需要一组简单的示例数据用于模拟分析时,除了使用R中自带的数据集,也可以自己通过随机数生成。
下面的示例展示了如何在已有数据的基础上,额外生成两组随机的sd(标准差),以便模拟这两组sd变化时对第三组数据的影响。过程是:先按数据行数生成对应数量、且指定均值的两组sd;随后通过公式计算第三组结果,以模拟三组数据在指定公式下的相互影响。
# 文件预览
> head(row_data)
sample number trait_A trait_B trait_C
1 Dhi 1 62.40 80.72 143.12
2 Dhi 1 68.01 88.09 156.10
3 Dhi 2 59.30 62.24 121.54
4 Dhi 2 61.27 65.52 126.79
5 Dhi 3 47.01 64.12 111.13
6 Dhi 3 47.01 75.06 122.07
# 分组汇总(求和),分型
# Aggregate the replicate rows of every (sample, number) pair, derive the
# ratio AR and the percentage CI, then classify each row by its AR value.
# The summed columns are trait_A / trait_B / trait_C, i.e. the columns shown
# in the `head(row_data)` preview; the earlier sum_A / sum_B / sum_C names do
# not exist in that table.
sum_tmp <- row_data %>%
  group_by(sample, number) %>%
  summarise(
    trait_A = sum(trait_A),
    trait_B = sum(trait_B),
    trait_C = sum(trait_C),
    # Drop the grouping explicitly instead of relying on the default
    # "drop_last" behaviour (which also prints a message).
    .groups = "drop"
  ) %>%
  data.frame() %>%
  mutate(
    AR = trait_B / trait_A,
    CI = trait_A / (trait_A + trait_B) * 100
  ) %>%
  # `check` is stored as text ("TRUE"/"FALSE"), not as a logical value.
  mutate(check = if_else(trait_A <= trait_B, "TRUE", "FALSE")) %>%
  mutate(type = case_when(
    # Keep a missing AR as a missing type, as the nested if_else() did.
    is.na(AR) ~ NA_character_,
    AR >= 1.00 & AR <= 1.70 ~ "typeA",
    AR > 1.70 & AR <= 3.00 ~ "typeB",
    AR > 3.00 & AR <= 7.00 ~ "typeC",
    # Everything else, including AR < 1 and AR > 7, falls through to typeD.
    TRUE ~ "typeD"
  )) %>%
  # label = row index as text; used as the join key for the sd tables.
  rownames_to_column(var = "label")
### 两个数据的sd
# Relative sd factors for AR: one normal draw per row of sum_tmp.
# `mean` sets the typical sd-to-value proportion and `sd` how much that
# proportion fluctuates between rows.
sd_1 <- data.frame(
  tmp_AR = rnorm(nrow(sum_tmp), mean = 0.05, sd = 0.008)
) %>%
  # label = row index as text; used to join this table to the others.
  rownames_to_column(var = "label")
# Relative sd factors for CI: one normal draw per row of sum_tmp.
sd_2 <- data.frame(
  tmp_CI = rnorm(nrow(sum_tmp), mean = 0.02, sd = 0.001)
) %>%
  # label = row index as text; used to join this table to the others.
  rownames_to_column(var = "label")
# 根据label合并
# Attach both sd-factor tables to the summary by `label`, turn the relative
# factors into absolute sd values, and keep only the reporting columns.
sum_data <- Reduce(
  function(acc, sd_tbl) left_join(acc, sd_tbl, by = "label"),
  list(sd_1, sd_2),
  sum_tmp
) %>%
  mutate(
    AR_sd = AR * tmp_AR,
    CI_sd = CI * tmp_CI
  ) %>%
  dplyr::select(sample, number, AR, AR_sd, CI, CI_sd, type)
将上述分步脚本合并,同时增加对小数位数的控制等。
# 整合脚本
# Integrated pipeline: aggregate row_data per (sample, number), derive AR, CI
# and the relative share of trait_C within each sample, simulate a relative
# sd for each of the three measures, and format every measure as
# "value ± sd" text for reporting.
# The summed columns are trait_A / trait_B / trait_C, i.e. the columns shown
# in the `head(row_data)` preview; the earlier sum_A / sum_B / sum_C names do
# not exist in that table.
sum_tmp <- row_data %>%
  group_by(sample, number) %>%
  summarise(
    trait_A = sum(trait_A),
    trait_B = sum(trait_B),
    trait_C = sum(trait_C),
    .groups = "drop"
  ) %>%
  # Per-sample total of trait_C, computed on the aggregated rows instead of
  # re-aggregating row_data a second time and joining the result back.
  group_by(sample) %>%
  mutate(sum_C = sum(trait_C)) %>%
  ungroup() %>%
  data.frame() %>%
  # Round straight after computing so later steps work on 2-decimal values.
  mutate(
    AR = round(trait_B / trait_A, 2),
    CI = round(trait_A / (trait_A + trait_B) * 100, 2),
    R_sum_C = round(trait_C * 100 / sum_C, 2)
  ) %>%
  mutate(chr_type = case_when(
    # Keep a missing AR as a missing type, as the nested if_else() did.
    is.na(AR) ~ NA_character_,
    AR >= 1.00 & AR <= 1.70 ~ "TypeA",
    AR > 1.70 & AR <= 3.00 ~ "TypeB",
    AR > 3.00 & AR <= 7.00 ~ "TypeC",
    # Everything else, including AR < 1 and AR > 7, falls through to TypeD.
    TRUE ~ "TypeD"
  )) %>%
  # One random relative-sd factor per aggregated row. n() is the current row
  # count, so no surplus draws are generated and no label join is needed.
  # The factors are small, hence 4 decimals are kept at this stage.
  mutate(
    tmp_AR = round(rnorm(n(), mean = 0.05, sd = 0.008), 4),
    tmp_CI = round(rnorm(n(), mean = 0.02, sd = 0.001), 4),
    tmp_RL = round(rnorm(n(), mean = 0.04, sd = 0.002), 4)
  ) %>%
  # Scale each factor by its value (back to 2 decimals) and build the text.
  mutate(
    AR_sd = paste(AR, " ± ", round(tmp_AR * AR, 2)),
    CI_sd = paste(CI, " ± ", round(tmp_CI * CI, 2)),
    RL_sd = paste(R_sum_C, " ± ", round(tmp_RL * R_sum_C, 2))
  ) %>%
  # NOTE(review): chr_type only takes the values TypeA..TypeD above, so the
  # "T" branch never fires and "--" is never emitted. Confirm which type (if
  # any) should be blanked out.
  mutate(
    AR = if_else(chr_type == "T", "--", AR_sd),
    CI = if_else(chr_type == "T", "--", CI_sd),
    RL = if_else(chr_type == "T", "--", RL_sd)
  ) %>%
  dplyr::select(sample, number, AR, CI, RL, chr_type)