因子
[if !supportLists]¢ [endif]一种特殊的向量,可对元素进行归类。
[if !supportLists]¢ [endif]创建factor对象
> f1<-factor( paste("x",rep(c(1,2),3),
sep=""))
[1] x1 x2 x1 x2 x1 x2
Levels: x1 x2
[if !supportLists]¢ [endif]获得因子中不重复的元素
> levels(f1)
[1] "x1" "x2"
[if !supportLists]¢ [endif]获得因子中不重复元素的数目
> nlevels(f1)
[1] 2
[if !supportLists]¢ [endif]获得因子中元素的频数
> table(f1)
f1
x1 x2
3 3
[if !supportLists]¢ [endif]将连续性变量进行切割,定义新的分组因子
> cut(1:20,breaks=5*(0:4))
> t(x) #转置
> solve(x) #矩阵的逆
> dim(x) #获得矩阵的维数
> x[1,1] #取出第1行、第1列元素
> x[c(1,3),] #取出第1行、第3行元素
> x[,c(1,3)] #取出第1列、第3列元素
> x[,”col2”] #取出名称为col2的列
> x[”row2”,] #取出名称为row2的行
> head(x) #取出矩阵前6行
> tail(x) #取出矩阵后6行
R 内置lm()函数,线性回归模型
> summary(lm(ds[,1]~ds[,-1]))$coefficients
自编函数
mystats <- function(x, parametric=TRUE,print=FALSE) {
if(parametric) {
center <- round(mean(x),2); spread <- round(sd(x),2)
}else {
center <- round(median(x),2);spread <- round(quantile(x,c(0.25,0.75)) ,2)
}
if(print & parametric) {
cat(paste0(center, "±", spread))
}else if (print & !parametric) {
cat(paste0(center, "(", spread[1],',',spread[2],')'))
}
result <- list(center=center, spread=spread)
return(result)
}
转置
cars <- mtcars[1:5,1:4]
数据整理示意
Student <- c("John Davis","Angela Williams", "Bullwinkle Moose","DavidJones", "Janice Markhammer", "CherylCushing","Reuven Ytzrhak", "Greg Knox", "JoelEngland","Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512,410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80,95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15,30, 27, 18)
roster <- data.frame(Student, Math,Science, English,stringsAsFactors=FALSE)
#将成绩单按照姓名进行排序
newdata<-roster[order(roster$student),]
roster[order(roster$Student),]
#将学生的各科考试成绩组合为单一的成绩衡量指标
sca<-scale(roster[,2:4])
sca
score<-apply(sca,1,mean)
score
roster<-cbind(roster,score)
roster
#基于相对名次(四等分)给出从A到D的评分(因子型)
y<-quantile(score,c(0.25,0.50,0.75))
y
roster$grade[score
roster$grade[score=y[1]]<-"C"
roster$grade[score=y[2]]<-"B"
roster$grade[score>=y[3]]<-"A"
roster
基础统计分析
#列联表
mytable <- with(,table())
mytable<- xtabs(~,data=)
library(gmodels)
t.test(y1,y2,paired = TRUE)#配对检验
aov()#方差分析
attach()
dose<-factor()
fit<- aov(len ~ supp*dose)
summary(fit)
¢相关性矩阵可视化
library(corrplot)#先加载包
corrplot(res,
type = "upper", order = "hclust", tl.col =
"black", tl.srt =45)
回归
简单线性回归
基础安装中的women数据,15个年龄在30~39岁间女性的身高和体重信息。
fit <- lm(weight ~ height, data=women)
summary(fit)
women$weight
fitted(fit)
residuals(fit)
plot(women$height,women$weight,xlab="Height (in inches)",ylab="Weight (in pounds)")
abline(fit)
多项式回归
fit2<- lm(weight ~ height + I(height^2), data=women)
summary(fit2)
plot(women$height,women$weight,
xlab="Height (in inches)",
ylab="Weight (in lbs)")
lines(women$height,fitted(fit2))
coef(fit2) #系数
coef(summary(fit2)) #假设检验结果
confint(fit2) #系数可信区间
箱线图
install.packages("car")
library(car)
scatterplot(weight ~ height, data=women,
pch=19,
main="Women Age 30-39",
xlab="Height (inches)",
ylab="Weight (lbs.)")
多重线性回归
state.x77
states <- as.data.frame(state.x77[,c("Murder","Population","Illiteracy", "Income", "Frost")])
cor(states)
library(car)
scatterplotMatrix(states, main="Scatter Plot Matrix")
states <- as.data.frame(state.x77[,c("Murder", "Population","Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,data=states)
summary(fit)
#有交互项的多重线性回归
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)
#回归诊断
fit <- lm(weight ~ height, data=women)
Op <- par()
par(mfrow=c(2,2))
plot(fit)
par(Op)
states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
Op <- par()
par(mfrow=c(2,2))
plot(fit)
par(Op)
#基于car包的回归诊断
library(car)
states <- as.data.frame(state.x77[,c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
qqPlot(fit, labels=row.names(states), simulate=TRUE, main="Q-Q Plot")
#判断误差方差是否相等
library(car)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
ncvTest(fit)
spreadLevelPlot(fit)
#线性模型假设的综合验证
install.packages("gvlma")
library(gvlma)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=data.frame(state.x77))
gvmodel <- gvlma(fit)
summary(gvmodel)
多重共线性诊断
library(car)
fit<- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=data.frame(state.x77))
vif(fit)
sqrt(vif(fit)) > 2
异常观测值离群点检验
library(car)
fit<- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=data.frame(state.x77))
outlierTest(fit)
异常观测值 高杠杆点
hat.plot <- function(fit) {
p<- length(coefficients(fit))
n<- length(fitted(fit))
plot(hatvalues(fit),main="Index Plot of Hat Values")
abline(h=c(2,3)*p/n, col="red", lty=2)
identify(1:n,hatvalues(fit),
names(hatvalues(fit)))
}
hat.plot(fit)
异常观测值 强影响点
cutoff<- 4/(nrow(state.x77)-length(fit$coefficients)-2)
plot(fit,which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")
car包 可以将强影响点 高杠杆点等整合到一张图片中
library(car)
influencePlot(fit, main="Influence Plot",
sub="Circle size isproportional to Cook's distance")
改进措施
library(car)
summary(powerTransform(state.x77[,'Murder'])) 变量变换
states
<- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit1<- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=states)
fit2<- lm(Murder ~ Population + Illiteracy, data=states)
anova(fit2,fit1)
anova函数可以比较两个嵌套模型的拟合优度
fit1<- lm(Murder ~ Population + Illiteracy + Income + Frost,data=states)
fit2<- lm(Murder ~ Population + Illiteracy, data=states)
AIC(fit1,fit2) 优先考虑AIC小的模型
逐步回归 MASS包
library(MASS)
states<- as.data.frame(state.x77[,c("Murder", "Population","Illiteracy", "Income",
"Frost")])
fit<- lm(Murder ~ Population + Illiteracy + Income + Frost,data=states)
stepAIC(fit, direction="backward")
交叉验证
shrinkage<- function(fit, k=10){
require(bootstrap)
theta.fit <- function(x,y){lsfit(x,y)}
theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef}
x<- fit$model[,2:ncol(fit$model)]
y<- fit$model[,1]
results<- crossval(x,
y, theta.fit, theta.predict, ngroup=k)
r2<- cor(y, fit$fitted.values)^2
r2cv<- cor(y, results$cv.fit)^2
cat("Original R-square =", r2, "\n")
cat(k,"Fold Cross-Validated R-square =", r2cv, "\n")
cat("Change=", r2-r2cv, "\n")
}
states<- as.data.frame(state.x77[,c("Murder", "Population","Illiteracy", "Income","Frost")])
fit<- lm(Murder ~ Population + Income + Illiteracy + Frost, data=states)
shrinkage(fit)
states<- as.data.frame(state.x77[,c("Murder", "Population","Illiteracy", "Income","Frost")])
fit<- lm(Murder ~ Population + Illiteracy + Frost, data=states)
shrinkage(fit)