R实战学习笔记1

ctrl shift c 加注释


1.模拟试验

2.利用ggplot2绘制图形

3.简单的数据分析

>检验均匀分布——ks.test()

>正态性检验——shapiro.test(data)

R实战:重塑数据reshape

方差分析

ggplot2高级绘图

paste用法

数据管理

ifelse(cond, statement1, statement2)


1.绘图基础绘图 +回归线+阴影 散点图,直方图,箱线图


#回归线阴影

# Scatter plot of car weight against fuel economy with a fitted linear
# regression line drawn on top (dashed red), plus title and axis labels.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(pch = 17, color = "blue", size = 2) +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  labs(
    title = "Automobile Data",
    x = "Weight",
    y = "Miles Per Gallon"
  )

#散点图

#直方图

#箱线图(学姐版本)

# Notched box plot of mpg for each cylinder count, with the raw observations
# overlaid as jittered, semi-transparent points.
# cyl is turned into a factor first so it is treated as a grouping variable.
mtcars$cyl <- factor(mtcars$cyl)
ggplot(mtcars, aes(x = cyl, y = mpg)) +
  geom_boxplot(
    color = "black",
    notch = TRUE,
    # three fill colours; presumably one per cylinder group -- confirm that
    # cyl has exactly three levels
    fill = c("red", "blue", "green")
  ) +
  labs(x = "Number of Cylinders", y = "Miles Per Gallon") +
  geom_point(position = "jitter", color = "blue", alpha = .5)

#拼图(学姐版)

# Draw two ggplot objects side by side with gridExtra::grid.arrange().
# The package is installed only when it is missing, so re-running these notes
# does not download and reinstall it every time.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)

# Left panel: weight vs. mpg scatter plot.
p1 <- ggplot(data = mtcars) +
  geom_point(aes(x = wt, y = mpg))

# Right panel: bar chart by am; position = "fill" scales every bar to the
# same height so the fill colours show proportions.
# NOTE(review): fill = cyl presumably relies on mtcars$cyl already being a
# factor -- confirm it has been converted before this runs.
p2 <- ggplot(data = mtcars) +
  geom_bar(aes(x = am, fill = cyl), position = "fill")

grid.arrange(p1, p2, ncol = 2)


2.数据整理 整合,分类汇总,转置,重塑


# Row-wise and column-wise means of mydata.
apply(mydata, MARGIN = 1, FUN = mean)  # mean of each row
apply(mydata, MARGIN = 2, FUN = mean)  # mean of each column

# Mean of every mtcars column within each (cyl, gear) combination.
# The grouping columns are named G_cyl / G_gear in the result, and
# na.rm = TRUE is forwarded to mean().
aggdata <- aggregate(
  mtcars,
  by = list(G_cyl = mtcars$cyl, G_gear = mtcars$gear),
  FUN = mean,
  na.rm = TRUE
)

#拼接


#分类汇总

ddply

#加一列

mutate

#转置


#重塑reshape2


# Order the roster alphabetically by student name.
roster <- roster[order(roster$Student), ]

# Standardise the three subject scores (centre and scale each column) and use
# the per-student mean of the z-scores as the composite score.
Z_new <- scale(
  roster[, c("Math", "Science", "English")],
  center = TRUE,
  scale = TRUE
)
roster$score <- apply(Z_new, 1, mean)

# Quartile cut points of the composite score (min, 25%, 50%, 75%, max).
Y_Cut <- quantile(roster$score, c(0, 0.25, 0.50, 0.75, 1))

# Bin students into quartiles: lowest quartile -> "D", highest -> "A".
# include.lowest = TRUE keeps the minimum score inside the first bin.
roster$Rank <- cut(
  roster$score,
  breaks = Y_Cut,
  include.lowest = TRUE,
  labels = c("D", "C", "B", "A")
)



# Simulated judging data: a 100 x 10 table of integer marks drawn uniformly
# from 1..100 (with replacement); each row is scored across the ten columns.
set.seed(123465)
my_data <- data.frame(
  matrix(sample.int(100, 1000, replace = TRUE), 100, 10)
)

# Build the column names once and reuse them for both naming and selection.
# The original named the columns with a garbled (mis-encoded) prefix and then
# selected them with a different prefix, so the selection failed.
score_cols <- paste0("打分", 1:10)
names(my_data) <- score_cols
my_data$ID <- 1:100

# Trimmed average of a vector of marks: drop the first occurrence of the
# maximum and the first occurrence of the minimum, then average the rest
# (ignoring NA).
#
# x: numeric vector of marks.
# Returns a single number (NaN when nothing is left after trimming).
Score <- function(x) {
  drop_idx <- c(which.max(x)[1], which.min(x)[1])
  trimmed <- x[-drop_idx]
  mean(trimmed, na.rm = TRUE)
}

# MARGIN = 1: one trimmed average per row. The original used MARGIN = 2,
# which produced 10 per-column values that were silently recycled over the
# 100 rows.
my_data$score <- apply(my_data[, score_cols], 1, Score)


3.数据添加列 定量数据转变为定性数据


#数据

# Example roster: ten students with Math, Science and English scores.
Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose", "David Jones",
  "Janice Markhammer", "Cheryl Cushing", "Reuven Ytzrhak", "Greg Knox",
  "Joel England", "Mary Rayburn"
)
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 37, 18)
roster <- data.frame(Student, Math, Science, English, stringsAsFactors = FALSE)

# Standardise the three score columns (columns 2-4) so they are comparable.
z <- scale(roster[, 2:4])
# Print the standardised scores. The original line was a pasted console
# prompt ("> z"), which is a syntax error in a script.
z

# Composite score: mean of each row of z-scores.
score <- apply(z, 1, mean)

# Append the composite score to the roster as a new column.
roster <- cbind(roster, score)

# Split each "First Last" student name at the first space into two columns.
library(stringr)
name <- str_split_fixed(roster$Student, pattern = " ", n = 2)
colnames(name) <- c("firstname", "lastname")

# Roster with the two name columns appended.
a <- cbind(roster, name)

# Same composite score as a one-liner; the result is only printed, not stored.
# NOTE(review): mutate() is not loaded in the visible code -- presumably it
# comes from plyr or dplyr; confirm which package is attached.
mutate(
  roster,
  score = apply(scale(roster[, 2:4]), 1, mean)
)


!!!!!!!


# Grade students A-E by quintile of the composite score (A = top 20%).
# y holds the max, the 80/60/40/20% quantiles and the min, in that order.
y <- quantile(roster$score, c(1, 0.8, 0.6, 0.4, 0.2, 0))
y

# cut() sorts its breaks into ascending order, so the labels must be listed
# from the lowest interval to the highest. The original listed the breaks
# descending with labels A..E, which handed "A" to the LOWEST scores.
# The inner cut points are taken from y instead of being hard-coded from a
# previous run, so the grades stay correct when the data change.
x <- cut(
  roster$score,
  breaks = c(-Inf, rev(y[2:5]), Inf),
  labels = c("E", "D", "C", "B", "A")
)

# Append the grade to the roster (the column is named "x").
roster <- cbind(roster, x)


# Mean bmi (NA removed) for every sex x smoke combination of `test`.
# NOTE(review): ddply()/summarise presumably come from plyr, which is not
# loaded in the visible code -- confirm.
b <- ddply(
  test,
  c("sex", "smoke"),
  summarise,
  means = mean(bmi, na.rm = TRUE)
)


4.数据分析 简单分析 t检验,卡方检验,秩和检验

ttest

table(x, useNA = "ifany")  # 频数表中把缺失值(NA)也计入

# Contingency table of Treatment by Improved, then a chi-square test on it.
# NOTE(review): Arthritis is presumably the data set from the vcd package --
# confirm it is loaded before running.
mytable <- xtabs(~ Treatment + Improved, data = Arthritis)
chisq.test(mytable)

# Analysis of variance of Y on Weight_A and group_ in data5.
res_5 <- aov(Y ~ Weight_A + group_, data = data5)


广义线性模型,变量筛选,交叉验证

glm


library(car)

# Subset of the built-in state.x77 matrix as a data frame, then a linear model
# of the murder rate on the other four variables.
# Each assignment is written on one logical line: in the original the name and
# the `<-` were on separate lines, which R cannot parse.
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)

# Regression diagnostics for `fit` using functions from the car package.

# Q-Q plot of the model residuals, points labelled by state name;
# simulate = TRUE is passed through to qqPlot().
qqPlot(fit, labels = row.names(states), simulate = TRUE, main = "Q-Q Plot")

# Variance inflation factors (multicollinearity check).
vif(fit)

# Outlier test on the fitted model.
outlierTest(fit)

anova()函数可以比较两个嵌套模型的拟合优度

stepAIC(fit, direction="backward")  # 逐步回归(向后),需先加载MASS包:library(MASS)

# Data and model used to demonstrate cross-validated R-square.
# Assignments are joined onto one logical line each; the original had the name
# and the `<-` on separate lines, which does not parse.
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data = states)

# Print the original R-square of a fitted linear model, its k-fold
# cross-validated R-square, and the difference between the two.
#
# fit: a fitted lm object; the response is taken from the first column of
#      fit$model and the predictors from the remaining columns.
# k:   number of cross-validation folds (default 10).
#
# Output is written with cat(); the function returns NULL invisibly.
# Stops with an error when the bootstrap package is not installed.
shrinkage <- function(fit, k = 10) {
  # Fail with a clear message instead of require()'s silent FALSE.
  if (!requireNamespace("bootstrap", quietly = TRUE)) {
    stop("package 'bootstrap' is required by shrinkage()", call. = FALSE)
  }

  # Fitting and prediction callbacks in the form crossval() expects.
  theta.fit <- function(x, y) {
    lsfit(x, y)
  }
  theta.predict <- function(fit, x) {
    # Prepend a column of 1s for the intercept.
    cbind(1, x) %*% fit$coefficients
  }

  x <- fit$model[, 2:ncol(fit$model)]
  y <- fit$model[, 1]

  results <- bootstrap::crossval(x, y, theta.fit, theta.predict, ngroup = k)

  r2 <- cor(y, fit$fitted.values)^2
  r2cv <- cor(y, results$cv.fit)^2

  # The labels are single-line strings; the original literals contained stray
  # line breaks from the text extraction.
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2 - r2cv, "\n")
}

# Run the demo only after shrinkage() is defined (the original called it
# before the definition).
shrinkage(fit)

5.模拟试验 简单检验方法:变量参数改变的影响,bootstrap,置换检验



# Load the built-in women data set and list its column names.
data("women")
names(women)

# Bootstrap estimate of the standard error of the correlation between
# height and weight.
B <- 200             # number of bootstrap replicates
n <- nrow(women)     # sample size
R <- numeric(B)      # one resampled correlation per replicate

for (b in seq_len(B)) {
  # Resample row indices with replacement, then correlate the resampled pairs.
  i <- sample.int(n, size = n, replace = TRUE)
  R[b] <- cor(women$height[i], women$weight[i])
}

# Standard error = standard deviation of the bootstrap replicates.
se.R <- sd(R)
print(se.R)

# Density-scaled histogram of the replicates.
hist(R, probability = TRUE)


# Monte Carlo estimate of the type I error rate of a one-sided, one-sample
# t-test, for every combination of sample size (n1) and significance level
# (alpha1). Data are generated under H0, so every rejection is a false
# positive.
n <- 20        # single sample size (not used by the loops below)
alpha <- 0.05  # single significance level (not used by the loops below)
mu0 <- 500     # mean under the null hypothesis
sigma <- 100   # standard deviation -- rnorm() takes an sd, not a variance
m <- 1000      # simulated samples per combination

n1 <- c(20, 50, 100)
alpha1 <- c(0.05, 0.10, 0.20)
q <- numeric(m)  # 1 = H0 rejected in that replicate, 0 = not rejected

for (i in n1) {
  for (j in alpha1) {
    for (k in 1:m) {
      x <- rnorm(i, mu0, sigma)
      ttest <- t.test(x, alternative = "greater", mu = mu0)
      # Reject at the CURRENT significance level j. The original compared
      # against a hard-coded 0.05, so alpha1 had no effect on the results.
      q[k] <- as.numeric(ttest$p.value <= j)
    }
    # Sample size, significance level, number of rejections out of m.
    print(paste(c(i, j, sum(q))))
  }
}

# Bootstrap the statistic `rsq` over mtcars with 1000 replicates; `formula`
# is passed through to the statistic function.
# The assignment is written on one logical line (the original split the name
# and the `<-` across lines, which does not parse).
# NOTE(review): neither boot() (presumably from the boot package) nor the
# statistic function `rsq` is defined or loaded in the visible code -- both
# must exist before this runs.
results <- boot(
  data = mtcars,
  statistic = rsq,
  R = 1000,
  formula = mpg ~ wt + disp
)


library(coin)

# Two-group example: five scores under treatment A followed by five under B,
# compared with a pooled-variance two-sample t-test.
# Assignments are joined onto one logical line each; the original had the name
# and the `<-` on separate lines, which does not parse.
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A", 5), rep("B", 5)))
mydata <- data.frame(treatment, score)

# var.equal = TRUE requests the pooled-variance t-test.
t.test(score ~ treatment, data = mydata, var.equal = TRUE)


6.function写法 输出描述性变量 学会paste()

# Build the string "M(p25-p75)" from three values.
# paste0() has no `sep` argument: in the original, `sep = c("(", "-", ")")`
# was silently treated as one more vector to paste element-wise, which only
# happened to give the intended result. Spelling the pieces out gives the same
# string for scalar inputs without relying on that accident.
paste0(M, "(", p25, "-", p75, ")")

R数据管理高级版

# Row-wise and column-wise means of mydata.
apply(mydata, MARGIN = 1, FUN = mean)  # mean of each row
apply(mydata, MARGIN = 2, FUN = mean)  # mean of each column

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容