R基础
1、查看当前工作目录
getwd()
2、更改目录
setwd("~/Downloads")
注意:如果读取文件时报错 error: cannot open file 'data': No such file or directory,通常是因为当前工作目录不是文件存储所在的位置
3、读取文件
# Read the CSV from the working directory into a data frame named statesInfo.
# NOTE(review): 'stateDate.csv' may be a typo for 'stateData.csv' -- confirm
# the file name on disk.
statesInfo <- read.csv('stateDate.csv')
4、查找表中的数据
# Bracket indexing has the form statesInfo[rows, columns]; leaving the column
# slot empty keeps every column. Here: rows whose state.region equals 1.
statesInfo[statesInfo$state.region == 1, ]
5、取数据集中前2行和打印数据的大小
head(data,2)
dim(data)
6、查看数据信息;快捷键option+cmd+I
?cars      # open the help page for the cars dataset
str(cars)  # compact overview of the structure of cars
7、查找数据
subset(data,mpg>=30|hp<60)
8、把变量制成表,观察每个组的人数
# Tabulate the "employment.status" column: shows how many times each value
# occurs in that column, comparable to a GROUP BY count.
table(data$employment.status)
9、查看统计信息
summary(reddit)
10、查看一个变量的级别
levels(data$columns)
11、画直方图,图片不显示在plots要先运行dev.off()
library(ggplot2)
# Distribution of the values of age.range in the reddit data set
# (suited to columns that hold only a few discrete values).
qplot(data = reddit, x = age.range)
12、对直方图列上面的级别排序
# Turn age.range into an ordered factor so the bars follow age order.
# ordered() already implies ordered = TRUE; the misspelled `Drdered=T`
# argument is not accepted by factor() and has been dropped. The source column
# is now the same column that is being overwritten.
reddit$age.range <- ordered(
  reddit$age.range,
  levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64',
             '65 or Above')
)
13、画直方图的两种方式
# Approach 1: the qplot() shortcut.
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31)

# Approach 2: full ggplot() syntax.
# Tuning binwidth / bins matters: smaller bins show the data more clearly.
ggplot(data = pf, aes(x = dob_day)) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = 1:31)

# Tuned number of bins.
ggplot(data = diamonds, aes(price)) +
  geom_histogram(bins = 300) +
  scale_x_log10()

# Default parameters for comparison: no bins argument, so geom_histogram()
# falls back to its default of 30 bins.
ggplot(data = diamonds, aes(price)) +
  geom_histogram() +
  scale_x_log10()
- 总体来说,更大的binwidth(更少的组数)更能反映数据的整体趋势;更小的binwidth(更多的bins,即组数)更能展示数据中的细节
14、直方图按按列dob_month的值分别画图,为每个类别变量创建相同类型的图形
# One dob_day histogram per dob_month value; ncol sets the number of columns
# in the facet layout.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(~ dob_month, ncol = 3)
facet_wrap(formula):琢面包裹,用法为 facet_wrap(~variable)
facet_grid(formula):琢面网格,用法为 facet_grid(vertical~horizontal),~左边是垂直方向分割的变量,右边是水平方向分割的变量
教程链接<http://www.cookbook-r.com/Graphs/Facets_(ggplot2)/>
15、设置直方图x轴的起点位置和终点位置
# Option 1: restrict the x axis through the xlim argument of qplot().
qplot(x = friend_count, data = pf, xlim = c(0, 1000))
# Option 2: add a scale layer that carries the limits.
qplot(x = friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))
16、忽略na值,
# friend_count histogram per gender, leaving out rows whose gender is NA.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 25
) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 25)
  ) +
  facet_wrap(~ gender, ncol = 2)
1.subset(pf,!is.na(gender))忽略gender列中的na值;
2.binwidth调组距;
3.scale_x_continuous(limits=c(0,1000))添加标度图层,设置X轴上起始和终点位置;
4.breaks=seq(0,1000,25)在0-1000之间每隔25设置一个刻度;
5.facet_wrap(~gender,ncol=2)按gender的类别分面,排成两列
17、查看统计数据
table(pf$gender)#查看数据集pf中gender字段各个值有多少
18、查看统计值
by(pf$friend_count,pf$gender,summary)
查看gender各类别的friend_count值统计
19、设置直方图的颜色
# Fixed outline and fill colours: wrapping a colour in I() sets it as a
# constant rather than mapping it as a variable.
qplot(
  x = tenure, data = pf, binwidth = 30,
  color = I('black'), fill = I('#099DD9')
)

# Fill mapped to cut, one panel per color, log10 x axis, qualitative palette.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram() +
  facet_wrap(~ color) +
  scale_x_log10() +
  scale_fill_brewer(type = "qual")
1. fill=cut 设置填充颜色
20、
# Age histogram with one-year bins; scale_x_continuous() places the x axis
# breaks every 5 units from 0 to 113.
qplot(
  x = age, data = pf, binwidth = 1,
  color = I('black'), fill = I('#099DD9')
) +
  scale_x_continuous(breaks = seq(from = 0, to = 113, by = 5))
21、对变量取对数转为正态分布
##summary(log10(pf$friend_count+1))
以10为底的对数;对变量做变换常用于把偏态分布转换为接近正态的分布
# Price per carat (plus 1) on a log10 x axis, one panel per cut.
qplot(x = (price / carat + 1), data = diamonds, binwidth = 50) +
  facet_wrap(~ cut) +
  scale_x_log10()
22、取对数、平方根画图
# grid.arrange() comes from gridExtra, so attach it before the first call
# (the original attached it only after grid.arrange() had already been used).
library(gridExtra)
library(grid)

# friend_count on the raw, log10 and square-root scales, stacked in one column.
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count + 1), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)

# diamonds price: raw scale next to a log10 x axis.
plot1 <- qplot(data = diamonds, x = price, binwidth = 100,
               fill = I('#099DD9')) +
  ggtitle('Price')
plot2 <- qplot(data = diamonds, x = price, binwidth = 0.01,
               fill = I('#F79420')) +
  scale_x_log10() +
  ggtitle('Price (log10)')
grid.arrange(plot1, plot2, ncol = 2)
23、创建频数多边形
# Frequency polygon of the proportion of users at each friend count, one line
# per gender. The ggplot() call was commented out by a leading `##`, which left
# the following layers with no plot to attach to; it is restored here.
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('好友数量') +
  ylab('Percentage of users with that friend count')
24、求变量中类别的和
by(pf$www_likes,pf$gender,sum)
25、箱线图
# Three ways to show friend_count boxplots per gender restricted to 0-1000.

# 1) ylim argument of qplot().
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot', ylim = c(0, 1000))

# 2) Limits on the y scale layer.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  scale_y_continuous(limits = c(0, 1000))

# 3) coord_cartesian() limits y to 0-1000 by zooming the view only, so the box
#    statistics are still computed from all observations.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 1000))
26、ifelse和转换成因素变量
# Flag users with at least one mobile like and store the flag as a factor
# column on pf.
# Initialise the column on pf itself; the original assigned NA to an unrelated
# global variable named mobile_check_in.
pf$mobile_check_in <- NA
pf$mobile_check_in <- ifelse(pf$mobile_likes > 0, 1, 0)
pf$mobile_check_in <- factor(pf$mobile_check_in)  # convert to a factor
summary(pf$mobile_check_in)
27、研究两个连续变量之间的关系,做散点图
# alpha = 1/20 means 20 overlapping points are needed for a fully opaque mark;
# coord_trans() puts y on a square-root scale (log10 is possible as well).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')

# geom_jitter() adds random noise to the points; height = 0 switches the
# vertical noise off so friend_count cannot be pushed below zero.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = age, y = friend_count)) +
  geom_jitter(aes(colour = gender), alpha = 1/20, height = 0) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt', limy = c(0, 3000))

# price against table, coloured by cut, x axis breaks every 2 from 50 to 80.
ggplot(data = diamonds, mapping = aes(x = table, y = price)) +
  geom_point(aes(color = cut), alpha = 1/5) +
  scale_x_continuous(breaks = seq(from = 50, to = 80, by = 2))
28、数据分组、取每组里面的均值、中位数
# group_by(), summarise(), n() and arrange() come from dplyr, so attach it
# before first use.
library(dplyr)

# Build a new per-age table: mean and median friend_count plus the group size.
# The column name fried_coun_mean is kept as spelled because other snippets in
# these notes refer to it by that name.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups,
                          fried_coun_mean = mean(friend_count),
                          friend_count_median = median(friend_count),
                          n = n())
qplot(x = fried_coun_mean, data = pf.fc_by_age)
pf.fc_by_age <- arrange(pf.fc_by_age, age)  # sort by age
head(pf.fc_by_age)
# Jittered scatter (horizontal jitter only) on a sqrt y axis, overlaid with the
# per-age mean (solid) and the per-age 90% quantile (dashed blue).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = 'orange'
  ) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .9),
    linetype = 2, color = 'blue'
  )
fun.y=quantile,fun.args=list(probs= .9)是设置分位数
29、散点图中添加中位数、分位数图层
# Scatter zoomed to age 13-70 and friend_count 0-1000, overlaid with the
# per-age mean plus the 90%, 50% (median) and 10% quantile lines.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = 'orange'
  ) +
  # coord_trans(y='sqrt')+
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .9),
    linetype = 2, color = 'blue'
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .5),
    linetype = 2, color = 'blue'
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = .1),
    linetype = 2, color = 'blue'
  )
1. coord_cartesian设置x、y轴的起始和终点位置
2. alpha=0.05(即1/20)表示20个重叠的数据点才显示为一个完全不透明的点;
2.1:position=position_jitter(h=0)只在水平方向抖动数据,h=0表示垂直方向不抖动,避免y出现负值
3.coord_trans(y='sqrt')对y轴取平方根,也可以取对数;
4.取y 轴的均值geom_line(stat='summary',fun.y=mean);
5.取y轴的分位数geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .5),
linetype=2,color='blue')
30、查看两个变量的相关系数
# Correlation test between age and friend_count: first on all of pf, then on
# users aged 70 or younger with the Pearson method named explicitly.
with(pf, cor.test(age, friend_count))
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
1.pearson 积矩关联衡量两个变量之间的关联强度(default)
31、散点图+相关系数图+划分数据子集
# Scatter of likes_received against www_likes_received, with both axes capped
# at their 95% quantile and a red linear-model fit drawn on top.
ggplot(data = pf,
       mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
1.设置x、y的上下限用xlim、ylim,这里选取的是95%分位数以内的数据;
2.添加线性拟合(回归)线:geom_smooth(method='lm')
32、平滑数据
library(gridExtra)

# p1: mean friend count per age in years, for ages below 71.
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71),
             mapping = aes(x = age, y = fried_coun_mean)) +
  geom_line()

# p2: mean friend count per age measured with months, plus a smoother.
# NOTE(review): pf.fc_by_age_months is not created anywhere in these notes --
# confirm where it is built.
p2 <- ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
             mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to the nearest multiple of 5 before averaging.
p3 <- ggplot(data = subset(pf, age < 71),
             mapping = aes(x = round(age / 5) * 5, y = friend_count)) +
  geom_line(stat = 'summary', fun.y = mean)

grid.arrange(p2, p1, p3, ncol = 1)
分析更多变量
library(dplyr)

# Summary table by age and gender (rows with missing gender removed): mean and
# median friend_count plus the group size, ungrouped and sorted by age.
pf.fc_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age)
1.按性别、年龄、分组的数据框;
# List the columns of the summary table, then draw mean_friend_count against
# age with one coloured line per gender.
names(pf.fc_age_gender)
ggplot(data = pf.fc_age_gender,
       mapping = aes(x = age, y = mean_friend_count)) +
  geom_line(aes(color = gender))
2.画折线图(geom_line,每个性别一条线)
2.长格式转换成宽格式
# Long-format summary by age and gender (rows with missing gender removed).
pf.fc_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            n = n()) %>%
  ungroup() %>%
  arrange(age)

# Reshape to wide format: one row per age, one column per gender, filled with
# the median friend count. The argument is spelled value.var; the original
# `value.vat` was swallowed by `...`, so dcast() guessed the value column
# instead of using median_friend_count.
library(reshape2)
pf.fc_by_age_gender.wide <- dcast(pf.fc_age_gender,
                                  age ~ gender,
                                  value.var = 'median_friend_count')
3.比率图
# Ratio of the female column to the male column by age, with a faint dashed
# reference line at a ratio of 1.
ggplot(data = pf.fc_by_age_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
4.切割一个变量函数cut
# Bin year_joined into the intervals delimited by the given break points.
# NOTE(review): cut() returns NA for values outside the outermost breaks --
# confirm that 1994-2004 covers the range of year_joined.
pf$year_joined.bucket <- cut(
  pf$year_joined,
  breaks = c(1994, 1998, 2001, 2004)
)
1.这里c(...)指定的是切割点(breaks);如果写breaks=4,则表示把数据范围等分成4个区间,而不是以4为间隔切割
5.探索加入时间与好友数量的关系
# Mean friend_count by age: one coloured line per year_joined.bucket plus a
# dashed line for the overall mean; rows without a bucket are dropped.
ggplot(data = subset(pf, !is.na(year_joined.bucket)),
       mapping = aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = mean, linetype = 2)
1.geom_line(aes(color=year_joined.bucket),stat="summary",fun.y=mean)是设置year_joined.bucket)每个组内的均值
2. geom_line(stat="summary",fun.y=mean,linetype=2)
是设置整体的均值
6.建立友谊与使用时长之间关系的线图
需要利用的变量有年龄、使用时长、建立的友谊和year_joined.bucket
# Mean friendships initiated per day of tenure, with tenure rounded to
# multiples of 30, one line per year_joined.bucket; needs tenure >= 1.
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = 30 * round(tenure / 30),
                     y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = mean)
1.30*round(tenure/30)是调整组距,降低噪声
# Same axes as the binned line plot, drawn as a smoothed curve per
# year_joined.bucket instead of the per-bin mean.
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = 30 * round(tenure / 30),
                     y = friendships_initiated / tenure)) +
  geom_smooth(aes(color = year_joined.bucket))
geom_smooth平滑
7.创建散点图 矩阵
# Scatterplot matrix of columns 2 to 15 of pf, drawn from a random sample of
# 1000 rows; the seed makes the sample reproducible.
library(GGally)
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset <- pf[, 2:15]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
带有分面和颜色的价格直方图
# Price histogram on a log10 x axis, filled by cut and split into one panel
# per color, using a qualitative Brewer palette for the fill.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram() +
  facet_wrap(~ color) +
  scale_x_log10() +
  scale_fill_brewer(type = "qual")
8.画散点图、选取99%的数据、按某离散变量分类、轴log10
# volume is the product of the x, y and z columns.
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z

# price (log10 axis) against volume, coloured by clarity, with the x axis
# capped at the 99% quantile of volume.
ggplot(data = diamonds,
       mapping = aes(x = volume, y = price,
                     fill = clarity, color = clarity)) +
  xlim(0, quantile(diamonds$volume, 0.99)) +
  geom_point() +
  scale_y_log10() +
  scale_color_brewer(type = 'div')
9.散点图,取X轴的立方根,取Y轴的log10
library(scales)  # provides trans_new() and log10_trans()
library('RColorBrewer')

# Cube-root axis transformation. The plots below call cuberoot_trans(), which
# is not defined anywhere else in these notes, so it is defined here.
cuberoot_trans <- function() {
  trans_new('cuberoot',
            transform = function(x) x^(1/3),
            inverse = function(x) x^3)
}

# Basic scatter: cube-root x axis, log10 y axis.
ggplot(aes(carat, price), data = diamonds) +
  geom_point() +
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) +
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price(log10) by Cube-Root of Carat')

# Same plot with smaller, semi-transparent, jittered points.
ggplot(aes(carat, price), data = diamonds) +
  geom_point(alpha = 0.5, position = 'jitter', size = 0.75) +
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) +
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

# Coloured by clarity. The legend order is reversed, and override.aes draws the
# legend keys fully opaque and larger than the plotted points.
ggplot(aes(x = carat, y = price, colour = clarity), data = diamonds) +
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Clarity', reverse = TRUE,
                                          override.aes = list(alpha = 1,
                                                              size = 2))) +
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) +
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
1.设置颜色参数是净度-colour=clarity
2.显示的时候,最好的在最上面
创建线性模型并预测
需要安装并加载memisc包(mtable函数来自该包)
# Store the natural log of price as a new column.
bigdiamonds$logprice <- log(bigdiamonds$price)
建立模型,模型有5个输入特征
library(memisc)  # provides mtable()

# Nested linear models for log(price), fitted on GIA-certified diamonds priced
# under 10000; each update() adds one more predictor to the previous model.
m1 <- lm(log(price) ~ I(carat^(1/3)),
         data = bigdiamonds[bigdiamonds$price < 10000 &
                              bigdiamonds$cert == 'GIA', ])
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)

# Feature values of the diamond to price.
# NOTE(review): the original level 'y.Good' looks like a typo for 'V.Good' and
# would make predict() fail as an unknown factor level; confirm the spelling
# against the cut values in bigdiamonds.
thisDiamond <- data.frame(carat = 2.00, cut = 'V.Good',
                          color = 'I', clarity = 'VS1')

# Prediction interval; `level` sets its confidence level.
modelEstimate <- predict(m5, newdata = thisDiamond,
                         interval = 'prediction', level = .95)

# The model predicts log(price), so exp() converts the result back to price.
exp(modelEstimate)
# Nested linear models for quality; each update() adds one more predictor.
m1 <- lm(I(quality) ~ I(alcohol), data = pp_subset)
m2 <- update(m1, ~ . + pH)
m3 <- update(m2, ~ . + volatile.acidity)
m4 <- update(m3, ~ . + citric.acid)
m5 <- update(m4, ~ . + residual.sugar)
mtable(m1, m2, m3, m4, m5)

# Feature values to predict for (the variable name is carried over from the
# diamonds example).
thisDiamond <- data.frame(alcohol = 0.5, pH = 0.4, volatile.acidity = 5,
                          citric.acid = 2, residual.sugar = 5)
modelEstimate <- predict(m5, newdata = thisDiamond,
                         interval = 'prediction')

# Prediction. The response here is quality itself, not its logarithm, so the
# estimate is shown as is; the exp() copied from the log(price) model would
# distort it.
modelEstimate
调整字体大小和水平位置
# Theme fragment to add to a plot with `+`: sets both axis titles to size 60
# and centres the plot title horizontally (hjust = 0.5).
theme(
  axis.title.x = element_text(size = 60),
  axis.title.y = element_text(size = 60)
) +
  theme(plot.title = element_text(hjust = 0.5))