########### 练习一 ###########
Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose",
"David Jones", "Janice Markhammer", "Cheryl Cushing",
"Reuven Ytzrhak", "Greg Knox", "Joel England","Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(Student, Math, Science, English,stringsAsFactors=FALSE)
#1.将成绩单按照姓名进行排序
a <- roster[order(roster$Student),]
#2.将学生的各科考试成绩组合为单一的成绩衡量指标
##对各项成绩进行标准化
options(digits = 2)
b <- scale(roster[,2:4])
##求平均数
score <- apply(b,1,mean)
roster1 <- cbind(roster,score)
#3.基于相对名次(四等分)给出从A到D的评分(因子型)
c<-quantile(score1$score,c(0.75,0.5,0.25))
roster1$level[score>=c[1]]<-"A"
roster1$level[score<c[1] & score>=c[2]] <-"B"
roster1$level[score<c[2] & score>=c[3]] <-"C"
roster1$level[score<c[3]] <-"D"
########### 练习二 ###########
#1.分别使用for/while 求n!
## 1.1 for循环
factorial1 <- function(n){
s=1
for(i in 1:n){
s=i*s
i=i+1
}
print(s)
}
## 括号内输入n的值,例如n=4
factorial1(4)
##[1] 24
## 1.2 while循环
factorial2 <- function(n){
i=n
s=1
while(i>1){
s=i*s
i=i-1
}
print(s)
}
factorial2(3)
#[1] 6
#2.编写程序计算 h(x,n)=1+x+x^2+……+x^n
my_function <- function(x,n){
h=0
for(i in 0:n){
h <- h+x^i
i=i+1
}
print(h)
}
my_function(3,2)
##[1] 13
#3.编写程序,求斐波那契数列第n项
Fibobacci <- function(n){
f1=0
f2=1
if(n==1){
print(f1)
}
else if(n==2){
print(f2)
}
else{
for(i in 3:n){
f3=f1+f2
f1=f2
f2=f3
}
print(f3)
}
}
Fibobacci(1)
##[1] 0
Fibobacci(2)
##[1] 1
Fibobacci(4)
##[1] 2
########### 练习三 ###########
#1.编写函数进行体操评分,输入10名评委所评分数,去除一个最高分,一个最低分,
# 再计算出平均分作为选手得分。
#2.使用备注代码生成数据,计算运动员得分。
set.seed(123465)
my_data <- data.frame(matrix(sample.int(100,1000,replace = T),100,10))
names(my_data) <- paste0('评委',1:10)
my_data$ID <- 1:100
average <- function(i){
data <- subset(my_data,my_data$ID==i)[,1:10]
data <- as.numeric(data)
score <- sum(data)-max(data)-min(data)
averagescore <- score/8
return(averagescore)
}
average(5)
##[1] 54.875
average(6)
##[1] 61.375
考试
library(dplyr)
library(tidyr)
library(ggplot2)
library(MASS)
#一、 模拟实验
#从相同的正态分布总体N(50,52)中随机抽两个样本,样本含量均为10。对两个样本进行成组t检验,检验水准为0.05,
#n=10
pval<-c()
for(i in 1:1000)
{
x<- rnorm(10,mean=50,sd=5)
y<- rnorm(10,mean=50,sd=5)
norm <- data.frame(x,y,stringsAsFactors=FALSE)
library(MASS)
outcome<- t.test(x,y)
p<-outcome$p.value
pval<-c(pval,p)
i=i+1
}
n1<-which(pval < 0.05)
n2<-which(pval < 0.1)
n3<-which(pval < 0.2)
#n=20
pval<-c()
for(i in 1:1000)
{
x<- rnorm(20,mean=50,sd=5)
y<- rnorm(20,mean=50,sd=5)
norm <- data.frame(x,y,stringsAsFactors=FALSE)
library(MASS)
outcome<- t.test(x,y)
p<-outcome$p.value
pval<-c(pval,p)
i=i+1
}
n4<-which(pval < 0.05)
n5<-which(pval < 0.1)
n6<-which(pval < 0.2)
#n=50
pval<-c()
for(i in 1:1000)
{
x<- rnorm(50,mean=50,sd=5)
y<- rnorm(50,mean=50,sd=5)
norm <- data.frame(x,y,stringsAsFactors=FALSE)
library(MASS)
outcome<- t.test(x,y)
p<-outcome$p.value
pval<-c(pval,p)
i=i+1
}
n7<-which(pval < 0.05)
n8<-which(pval < 0.1)
n9<-which(pval < 0.2)
setwd("C:\\users\\Lenovo\\Desktop\\workspace\\2021-2022-1\\R\\test")
test <- read.csv("test_data.csv")
# 1 绘图
hist(test$age, freq = F, col="red4")
lines(density(test$age),col="blue4",lwd=2)
# 2 自定义函数
M_quantile <- function(x){
q1 <- signif(quantile(x,na.rm = T)[2], digits = 4)
q2 <- signif(quantile(x,na.rm = T)[3], digits = 4)
q3 <- signif(quantile(x,na.rm = T)[4], digits = 4)
q <- paste0(q2,"(",q1,"-",q3,")")
return(q)
}
sapply(test[c("bmi","packyr")], M_quantile)
# 3 差异性分析
# age
test$age_group <- cut(test$age,c(40,50,60,70,80),c(1,2,3,4))
table_age <- xtabs(~test$age_group+test$sex)
fisher.test(table_age)
############ p-value = 0.00468
# education
table_edu <- xtabs(~test$education+test$sex)
fisher.test(table_edu)
############ p-value = 0.003732
# bmi
test$bmi_group <- cut(test$bmi,c(0,18.5,24,28,60),c(1,2,3,4))
table_bmi <- xtabs(~test$bmi_group+test$sex)
fisher.test(table_bmi)
############ p-value = 0.2109
# smoke
table_smoke <- xtabs(~test$smoke+test$sex)
fisher.test(table_smoke)
############ p-value < 2.2e-16
# 广义线性模型
test$packyr[is.na(test$packyr)] <- 0 # 填补缺失值
test1 <- test[,-1]
head(test1)
model <- glm(lung_ca~.,data = test1) #gaussian family
summary(model)
model <- glm(lung_ca~.,data = test1,family = binomial)
summary(model)
model1 <- glm(lung_ca~age+packyr+respdis,data = test1,family = binomial)
summary(model)
step_ <- step(model,direction = "backward") #Error in step(model) : 行数有变化:是不是删除了遺漏值?
# 查找缺失值
apply(test1, 2, function(x) sum(is.na(x)))
# 发现bmi有缺失,填补缺失值
test1$bmi[is.na(test1$bmi)] <- mean(test1$bmi,na.rm = T)
model <- glm(lung_ca~.,data = test1,family = binomial)
step_ <- step(model,direction = "backward")
model2 <- glm(lung_ca~age + sex + packyr + respdis + exposure + drink,data = test1,family = binomial)
summary(model2)
# 最后还是只有age、packyr、repdis这三个因素有统计学意义