# Show the working directory and its contents so the data file can be located.
getwd()
list.files()

# Load the tab-separated pseudo-Facebook data set (header row, tab delimiter).
pf <- read.delim("pseudo_facebook.tsv")
Third Qualitative Variable
在以性别为分类的年龄箱线图中,加入每个性别的平均年龄
原箱线图:
# Box plot of age for each gender; rows with a missing gender are left out.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = age)
) +
  geom_boxplot()
添加后的箱线图:
# Box plot of age for each gender with the per-gender mean age overlaid as a
# point (shape 4). Rows with a missing gender are left out.
# `fun` is the current name of the summary-function argument; `fun.y` was
# deprecated in ggplot2 3.3.0.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = age)
) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)
年龄、朋友数、性别三个变量:
# Median friend count at each age, one coloured line per gender; rows with a
# missing gender are left out.
# The summary function must be passed as `fun`: the old `fun.y` name was
# deprecated in ggplot2 3.3.0, and leaving it in place risks the median not
# being applied at all.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(aes(color = gender), stat = "summary", fun = median)
按年龄和性别对数据进行分组,并计算每个组里的平均好友数,中位数好友数和每个组的数据条目数
# Detach plyr before loading dplyr (presumably to avoid plyr/dplyr name
# clashes such as summarise()). The guard keeps the script from stopping with
# an error when plyr was never attached in this session.
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}
library(dplyr)

# For every (age, gender) pair with a known gender, compute the mean and
# median friend count and the number of rows, then sort by age.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age)

head(pf.fc_by_age_gender)
用上面的分组创建图表
# Plot the pre-computed median friend count against age, one line per gender.
ggplot(
  data = pf.fc_by_age_gender,
  mapping = aes(x = age, y = friend_count_median)
) +
  geom_line(aes(color = gender))
Thinking in Ratios
女性用户的好友数是男性用户好友数的几倍?
要回答这个问题,先重塑我们的数据
pf.fc_by_age_gender是长格式数据,我们要把它转化成宽格式数据,
每一行包括:
年龄
对应该年龄的男性用户的好友数(中位数)
对应该年龄的女性用户的好友数(中位数)
library(reshape2)

# Reshape long -> wide: one row per age, one column per gender, each cell
# holding that group's median friend count.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "friend_count_median"
)

head(pf.fc_by_age_gender.wide)
函数dcast()中的d表示输出的数据结构为dataframe
如果要输出矩阵或者数组,应使用acast()
Ratio Plot
横轴:年龄
纵轴:女/男好友数中位数之比(female/male)
# Female-to-male ratio of median friend count by age. The faint dashed
# horizontal line at 1 marks where both genders have equal medians.
ggplot(
  data = pf.fc_by_age_gender.wide,
  mapping = aes(x = age, y = female / male)
) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3)
探索四个变量:年龄,性别,好友数,使用时长tenure
以2014为基准年,添加[加入时间]这个变量
# Year each user joined: 2014 minus tenure (days) converted to years,
# rounded down.
pf[["year_joined"]] <- with(pf, floor(2014 - tenure / 365))
floor为向下取整,返回不大于该数字的最大整数
Cut a Variable
切割变量year_joined,分为以下几组:
(2004,2009],(2009,2011],(2011,2012],(2012,2014](左开右闭区间)
# Inspect the distribution of join years before bucketing.
summary(pf[["year_joined"]])
table(pf[["year_joined"]])

# Bucket the join year into the intervals delimited by the break points.
pf[["year_joined.buckets"]] <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Count users per bucket, showing NA as its own category when present.
table(pf[["year_joined.buckets"]], useNA = "ifany")
use variable year_joined.buckets to create a line graph
# Median friend count at each age, one coloured line per join-year bucket;
# users without a bucket are left out.
# The summary function is passed as `fun` (the old `fun.y` name was
# deprecated in ggplot2 3.3.0) so the median is actually applied.
ggplot(
  data = subset(pf, !is.na(year_joined.buckets)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = median)
the parameter linetype can take the values 0-6:
0 = blank,
1 = solid,
2 = dashed
3 = dotted
4 = dotdash
5 = longdash
6 = twodash
Plot the Grand Mean
# Mean friend count at each age per join-year bucket (coloured lines), plus
# the mean over all bucketed users as a dashed line (linetype 2).
# `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
ggplot(
  data = subset(pf, !is.na(year_joined.buckets)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)
Friending Rate
# Summary of friends gained per day of tenure. Users need at least one day of
# tenure so the division is defined; `>= 1` (rather than `> 1`) keeps day-1
# users and matches the filter used by the other per-tenure rates in this
# script.
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Friendships Initiated
# Mean friendships initiated per day of tenure, plotted against tenure with
# one coloured line per join-year bucket. Only users with tenure >= 1 are
# used.
# `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)
偏差-方差权衡
# Four versions of the same plot (mean friendships initiated per day of
# tenure, per join-year bucket) with increasingly coarse x bins. Coarser bins
# give smoother lines at the cost of detail.
# `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.

# 1. Raw tenure in days.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)

# 2. Tenure rounded to the nearest multiple of 7 days.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)

# 3. Tenure rounded to the nearest multiple of 30 days.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)

# 4. Tenure rounded to the nearest multiple of 90 days.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)
# Let geom_smooth() fit a smoothed curve per join-year bucket instead of
# binning tenure by hand.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_smooth(mapping = aes(color = year_joined.buckets))
the Yogurt Data Set
getwd()

# Load the yogurt purchase data and open it in the data viewer.
yo <- read.csv(file = "yogurt.csv")
View(yo)

# Treat the id column as a categorical variable rather than a number.
yo[["id"]] <- factor(yo[["id"]])
str(yo)
酸奶价格直方图
# Histogram of yogurt prices with a fixed fill colour.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(fill = I("#FF6374"))
不同的酸奶价格
# Distinct price values, how many there are, and how often each occurs.
unique(yo[["price"]])
length(unique(yo[["price"]]))
table(yo[["price"]])
将一条购买记录中不同口味的酸奶数量加总,汇总成新变量all_purchases
names(yo)

# Total units bought per purchase record: the sum of the five flavour
# columns.
yo$all_purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)
all_purchases histogram
# Histogram of total units per purchase record, one bar per unit count.
ggplot(data = yo, mapping = aes(x = all_purchases)) +
  geom_histogram(binwidth = 1)
随时间变化的价格
# Price against time as jittered, mostly transparent points so that dense
# regions stand out.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = 1 / 10)
Sampling Observations
对于酸奶数据集,我们可能需要更详细地调查小样本的家庭
Looking at Samples of Households
# Fix the RNG seed so the same 16 household ids are drawn on every run.
set.seed(4230)
sample.ids <- sample(levels(yo$id), size = 16)
sample.ids

# One panel per sampled household: price over time as a line, with open
# circles sized by the number of units bought on each record.
ggplot(
  data = subset(yo, id %in% sample.ids),
  mapping = aes(x = time, y = price)
) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(mapping = aes(size = all_purchases), pch = 1)
Scatterplot Matrix 散点图矩阵
library(GGally)

# Use the minimal theme with base font size 20 for subsequent plots.
theme_set(theme_minimal(base_size = 20))

# Seed the RNG so the row sample below is reproducible.
set.seed(1836)

# Keep columns 2 through 15 of pf for the scatterplot matrix.
pf_subset <- pf[, 2:15]
names(pf_subset)

# Scatterplot matrix on a random sample of 1000 rows.
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
set.seed确保得到可重复的结果
Even More Variables
# Load the NCI data set and show its current column names.
nci <- read.table("nci.tsv")
colnames(nci)

# Number the columns 1..ncol(nci) (1:64 for the 64-column file this script
# expects -- TODO confirm). The original statement assigned the vector to a
# new variable called `colnames` and left nci's column names unchanged.
colnames(nci) <- seq_len(ncol(nci))
Heat Maps
library(reshape2)

# Melt the first 200 rows of nci into long format: one row per
# (row, column) cell.
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

# Heat map of the melted values on a 100-step blue-to-red gradient.
ggplot(
  data = nci.long.samp,
  mapping = aes(x = case, y = gene, fill = value)
) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
习题集
1.带有分面和颜色的价格直方图
scale_fill_brewer(type = 'qual')可以修改颜色的编码方式
data(diamonds)
View(diamonds)

# Histograms of price on a log10 x axis, filled by cut and faceted by
# color, using a qualitative Brewer palette for the fill.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram(bins = 35) +
  facet_wrap(~color) +
  scale_x_log10() +
  scale_fill_brewer(type = "qual")
2.价格与台宽(table)的散点图,按切工(cut)着色
names(diamonds)

# Price against table, points coloured by cut, with the x axis limited to
# 50-80 and ticks every 2 units.
p1 <- ggplot(data = diamonds, mapping = aes(x = table, y = price)) +
  geom_point(mapping = aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Same plot with `fill = cut` also mapped at the plot level.
p2 <- ggplot(
  data = diamonds,
  mapping = aes(x = table, y = price, fill = cut)
) +
  geom_point(mapping = aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Draw both plots together for comparison.
library(gridExtra)
grid.arrange(p1, p2)
3.价格与体积和钻石净度
# Volume proxy: the product of the x, y and z columns.
diamonds$v <- diamonds$x * diamonds$y * diamonds$z

# Price (log10 y axis) against volume, coloured by clarity. The x axis is
# cut off at the 99th percentile of volume.
ggplot(data = diamonds, mapping = aes(x = v, y = price, fill = clarity)) +
  xlim(0, quantile(diamonds$v, 0.99)) +
  scale_y_log10() +
  geom_point(mapping = aes(color = clarity)) +
  scale_color_brewer(type = "div")
4.新建友谊的比例
# Share of each user's friendships that the user initiated.
pf[["prop_initiated"]] <- with(pf, friendships_initiated / friend_count)
5.prop_initiated 与使用时长
# Join year (2014 as the reference year) and its buckets.
pf$year_joined <- floor(2014 - pf$tenure / 365)
pf$year_joined.buckets <- cut(
  pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# In all three plots the summary function is passed as `fun`: the old `fun.y`
# name was deprecated in ggplot2 3.3.0, and leaving it in place risks the
# median not being applied at all.

# Median prop_initiated by tenure per bucket, using only rows of pf that
# contain no NA in any column.
ggplot(
  data = na.omit(pf),
  mapping = aes(x = tenure, y = prop_initiated)
) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = median)

# Same summary on all of pf, with missing values removed by the layer
# (na.rm = TRUE).
ggplot(
  data = pf,
  mapping = aes(x = tenure, y = prop_initiated, color = year_joined.buckets)
) +
  geom_line(stat = "summary", fun = median, na.rm = TRUE)

# As above, with a smoothed curve added per bucket.
ggplot(
  data = pf,
  mapping = aes(x = tenure, y = prop_initiated, color = year_joined.buckets)
) +
  geom_line(stat = "summary", fun = median, na.rm = TRUE) +
  geom_smooth()
6.最大的组均值 prop_initiated
# Mean prop_initiated for users in the (2012,2014] bucket, skipping users
# whose proportion is missing.
# (An unfinished `with(pf, year_joined.buckets=)` call that preceded this was
# removed: it supplied no expression to evaluate and stopped the script.)
with(
  subset(pf, !is.na(prop_initiated) & year_joined.buckets == "(2012,2014]"),
  mean(prop_initiated)
)

# Summary statistics of prop_initiated for every bucket, for comparison.
by(pf$prop_initiated, pf$year_joined.buckets, summary)
7.经过分组、分面和填色的价格/克拉
# Price per carat for each cut as jittered points coloured by color, one
# panel per clarity, with a diverging Brewer palette.
ggplot(data = diamonds, mapping = aes(x = cut, y = price / carat)) +
  geom_jitter(mapping = aes(color = color)) +
  facet_wrap(~clarity) +
  scale_color_brewer(type = "div")