ctrl shift c 加注释
1.模拟试验
2.利用ggplot2绘制图形
3.简单的数据分析
>检验均匀分布——ks.test()
>正态性检验——shapiro.test(data)
ifelse(cond, statement1, statement2)
1.绘图基础绘图 +回归线+阴影 散点图,直方图,箱线图
# Regression line with shaded band: scatter plot of car weight against fuel
# economy, overlaid with a linear fit drawn by geom_smooth().
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(shape = 17, colour = "blue", size = 2) +
  geom_smooth(method = "lm", colour = "red", linetype = "dashed") +
  labs(title = "Automobile Data", x = "Weight", y = "Miles Per Gallon")
#散点图
#直方图
# Box plot of mpg by cylinder count (senior student's version).
# cyl is converted to a factor in place so that every cylinder count is
# treated as its own discrete group on the x axis.
mtcars$cyl <- factor(mtcars$cyl)
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  geom_boxplot(
    aes(fill = cyl),
    color = "black",
    notch = TRUE,
    show.legend = FALSE
  ) +
  # The colours follow the factor levels in order instead of relying on the
  # boxes happening to be drawn in a particular order.
  scale_fill_manual(values = c("red", "blue", "green")) +
  labs(x = "Number of Cylinders", y = "Miles Per Gallon") +
  geom_point(position = "jitter", color = "blue", alpha = 0.5)
# Combine two plots side by side (senior student's version).
# Install gridExtra only when it is missing rather than on every run.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)
p1 <- ggplot(data = mtcars) +
  geom_point(aes(x = wt, y = mpg))
# factor(cyl) makes the fill grouping discrete whether or not cyl has already
# been converted to a factor; labs() keeps the legend title as "cyl".
p2 <- ggplot(data = mtcars) +
  geom_bar(aes(x = am, fill = factor(cyl)), position = "fill") +
  labs(fill = "cyl")
grid.arrange(p1, p2, ncol = 2)
2.数据整理 整合,分类汇总,转置,重塑
# apply() with MARGIN = 1 works over rows, MARGIN = 2 over columns.
# NOTE(review): mydata is not defined before this point in the visible file.
apply(mydata, 1, mean) #row means
apply(mydata, 2,mean) #column means
# Mean of every mtcars column within each cyl x gear combination.
# NOTE(review): if mtcars$cyl has already been converted to a factor (the
# box-plot snippet in this file does that), mean() on that column returns NA
# with a warning -- confirm which copy of mtcars is intended here.
aggdata <-aggregate(mtcars, by=list(G_cyl=mtcars$cyl,G_gear=mtcars$gear),
FUN=mean, na.rm=TRUE)
#拼接
#分类汇总
ddply
#加一列
mutate
#转置
#重塑reshape2
# Sort the roster by student name.
roster <- roster[order(roster$Student), ]
# 1
# Standardise the three subjects, then average the z-scores of each student
# to get one composite score per row.
Z_new <- scale(roster[, c("Math", "Science", "English")],
               center = TRUE, scale = TRUE)
roster$score <- rowMeans(Z_new)
# Quartile cut points of the composite score. include.lowest = TRUE keeps the
# minimum score inside the first interval instead of turning it into NA.
Y_Cut <- quantile(roster$score, c(0, 0.25, 0.50, 0.75, 1))
roster$Rank <- cut(roster$score, breaks = Y_Cut, include.lowest = TRUE,
                   labels = c("D", "C", "B", "A"))
# Simulated score sheet: 100 rows, each with 10 integer scores drawn from
# 1..100 (with replacement). The seed makes the draw reproducible.
set.seed(123465)
my_data <- data.frame(matrix(sample.int(100, 1000, replace = TRUE), 100, 10))
# One shared vector of column names, so the names assigned here and the names
# selected below cannot drift apart.
score_cols <- paste0("打分", 1:10)
names(my_data) <- score_cols
my_data$ID <- 1:100

#' Mean of a score vector after dropping one highest and one lowest value
#'
#' @param x Numeric vector of scores.
#' @return The mean of the remaining values (NA values are ignored); NaN when
#'   nothing remains.
Score <- function(x) {
  # which.max()/which.min() return the first position of the extreme value
  highest <- which.max(x)[1]
  lowest <- which.min(x)[1]
  mean(x[-c(highest, lowest)], na.rm = TRUE)
}

# MARGIN = 1: one trimmed mean per row, i.e. one value for each of the 100 IDs.
my_data$score <- apply(my_data[, score_cols], 1, Score)
3.数据添加列 定量数据转变为定性数据
# Data: ten students with raw scores in three subjects.
Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose", "David Jones",
  "Janice Markhammer", "Cheryl Cushing", "Reuven Ytzrhak", "Greg Knox",
  "Joel England", "Mary Rayburn"
)
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 37, 18)
roster <- data.frame(Student, Math, Science, English, stringsAsFactors = FALSE)
# Standardise columns 2:4 (Math, Science, English) so that subjects measured
# on different scales become comparable.
z <- scale(roster[, 2:4])
z
# Composite score: mean of the standardised values in each row.
score <- rowMeans(z)
# cbind() appends the composite score to the roster as a new column.
roster <- cbind(roster, score)
library(stringr)
# Split every "First Last" entry at the first space into a two-column
# character matrix, label the columns, and bind them onto the roster.
name <- str_split_fixed(roster$Student, pattern = " ", n = 2)
dimnames(name) <- list(NULL, c("firstname", "lastname"))
a <- cbind(roster, name)
# One-step alternative for deriving the composite score; the result is only
# printed here, it is not stored.
# NOTE(review): mutate() must come from an attached package (dplyr or plyr);
# none is loaded in the visible part of this file -- confirm.
mutate(roster, score = apply(scale(roster[, 2:4]), 1, mean))
# !!! Important !!!
# Quintile cut points of the composite score, in ascending order.
y <- quantile(roster$score, c(0, 0.2, 0.4, 0.6, 0.8, 1))
y
# cut() sorts its breaks in ascending order, so the labels have to be listed
# from the lowest interval to the highest: the bottom 20% get "E" and the
# top 20% get "A". Using the computed quantiles instead of pasted numbers
# keeps the grades correct when the data change.
x <- cut(roster$score, breaks = y, include.lowest = TRUE,
         labels = c("E", "D", "C", "B", "A"))
roster <- cbind(roster, x)
# Mean bmi within every sex x smoke group, ignoring missing bmi values.
# NOTE(review): ddply()/summarise are assumed to come from plyr, and `test`
# is not defined in the visible part of this file -- confirm.
b <- ddply(test, c("sex", "smoke"), summarise,
           means = mean(bmi, na.rm = TRUE))
4.数据分析 简单分析 t检验,卡方检验,秩和检验
ttest
table(x, useNA = "ifany")  # useNA = "ifany":存在缺失值时在频数表中单独列出 NA
# Cross-tabulate Treatment by Improved, then run a chi-squared test on the
# resulting contingency table.
# NOTE(review): Arthritis is not created in the visible part of this file.
mytable <- xtabs(~ Treatment + Improved, data = Arthritis)
chisq.test(mytable)
# Analysis-of-variance model of Y on Weight_A and group_.
res_5 <- aov(Y ~ Weight_A + group_, data = data5)
广义线性模型,变量筛选,交叉验证
glm
library(car)
# Keep five columns of the built-in state.x77 matrix as a data frame and
# regress Murder on the other four. Each assignment sits on one logical line:
# a bare name followed by "<- ..." on the next line does not parse.
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
# Regression diagnostics from the car package for the fitted model:
# a Q-Q plot labelled with the state names, variance inflation factors,
# and an outlier test.
qqPlot(fit, labels = row.names(states), simulate = TRUE, main = "Q-Q Plot")
vif(fit)
outlierTest(fit)
anova()函数可以比较两个嵌套模型的拟合优度
stepAIC(fit, direction="backward")  # stepAIC() 来自 MASS 包,使用前需 library(MASS)
#' Report how much R-squared shrinks under k-fold cross-validation
#'
#' @param fit A fitted model whose `fit$model` holds the response in column 1
#'   and the predictors in the remaining columns (as `lm()` produces).
#' @param k Number of cross-validation folds, passed to `crossval()` as
#'   `ngroup`.
#' @return Invisibly, a named numeric vector with the original R-squared, the
#'   cross-validated R-squared and their difference. The same three values
#'   are printed with `cat()`.
shrinkage <- function(fit, k = 10) {
  # Fail with a clear message instead of require()'s silent FALSE.
  if (!requireNamespace("bootstrap", quietly = TRUE)) {
    stop("Package 'bootstrap' is required by shrinkage().", call. = FALSE)
  }
  # Fitting and prediction callbacks in the form crossval() expects.
  theta.fit <- function(x, y) {
    lsfit(x, y)
  }
  theta.predict <- function(fit, x) {
    cbind(1, x) %*% fit$coef
  }
  # Predictors as a matrix (kept two-dimensional even with one predictor).
  x <- as.matrix(fit$model[, 2:ncol(fit$model), drop = FALSE])
  y <- fit$model[, 1]
  results <- bootstrap::crossval(x, y, theta.fit, theta.predict, ngroup = k)
  r2 <- cor(y, fit$fitted.values)^2
  r2cv <- cor(y, results$cv.fit)^2
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2 - r2cv, "\n")
  invisible(c(r2 = r2, r2cv = r2cv, change = r2 - r2cv))
}

# The function is defined before it is used; calling it first fails with
# "could not find function".
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data = states)
shrinkage(fit)
5.模拟试验 简单检验方法:变量参数改变的影响,bootstrap,置换检验
# Bootstrap estimate of the standard error of cor(height, weight) in `women`.
data("women")
names(women)
B <- 200          # number of bootstrap replicates
n <- nrow(women)  # sample size
# Each replicate resamples the row indices with replacement and recomputes
# the correlation on the resampled rows.
R <- vapply(
  seq_len(B),
  function(b) {
    idx <- sample(seq_len(n), size = n, replace = TRUE)
    cor(women$height[idx], women$weight[idx])
  },
  numeric(1)
)
# Standard error = standard deviation of the bootstrap replicates.
print(se.R <- sd(R))
# [1] 0.001321346   (value recorded from an earlier run)
hist(R, prob = TRUE)
# Empirical type I error of the one-sided, one-sample t test: data are drawn
# with true mean mu0, so every rejection of H0: mu = mu0 is a false positive.
n <- 20        # sample size
alpha <- 0.05  # type I error rate
mu0 <- 500     # mean
sigma <- 100   # standard deviation (passed to rnorm() as sd, not a variance)
m <- 1000      # replicates per (sample size, alpha) setting
n1 <- c(20, 50, 100)
alpha1 <- c(0.05, 0.10, 0.20)
q <- numeric(m)
for (i in n1) {
  for (j in alpha1) {
    for (k in seq_len(m)) {
      x <- rnorm(i, mu0, sigma)
      ttest <- t.test(x, alternative = "greater", mu = mu0)
      # Compare against this setting's level j; a fixed 0.05 here would make
      # the loop over alpha1 pointless.
      q[k] <- if (ttest$p.value > j) 0 else 1
    }
    # sample size, nominal alpha, number of rejections out of m
    print(paste(c(i, j, sum(q))))
  }
}
# 1000 bootstrap replicates of the statistic `rsq` for mpg ~ wt + disp on
# mtcars. The assignment is on one logical line so that it parses.
# NOTE(review): boot() (boot package) and the statistic function rsq are not
# loaded or defined in the visible part of this file -- confirm.
results <- boot(
  data = mtcars, statistic = rsq,
  R = 1000, formula = mpg ~ wt + disp
)
library(coin)
# Two groups of five scores each; each assignment sits on one logical line
# (a bare name followed by "<- ..." on the next line does not parse).
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A", 5), rep("B", 5)))
mydata <- data.frame(treatment, score)
# Classical two-sample t test assuming equal variances in both groups.
t.test(score ~ treatment, data = mydata, var.equal = TRUE)
6.function写法 输出描述性变量 学会paste()
# Build the string "M(p25-p75)". paste0() has no `sep` argument, so the
# pieces are passed explicitly in order instead of through a vector that
# only happens to be named `sep`.
# NOTE(review): assumes M, p25 and p75 are single values -- confirm.
paste0(M, "(", p25, "-", p75, ")")
# apply() with MARGIN = 1 works over rows, MARGIN = 2 over columns.
# NOTE(review): the mydata built just above contains a factor column
# (treatment), so these calls are presumably meant for an all-numeric
# mydata -- confirm.
apply(mydata, 1, mean) #row means
apply(mydata, 2,mean) #column means