# Worked examples of subsetting a data frame in base R: selecting and dropping
# columns, filtering rows by condition and by date range, using subset(), and
# drawing a random sample. `leadership` is only modified in section 5, where
# its `date` column is parsed from text into Date values.

# 1. Create the leadership data frame ----
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(
  manager, date, country, gender, age,
  q1, q2, q3, q4, q5,
  stringsAsFactors = FALSE
)

# 2. Select variables q1, q2, q3, q4, q5 from leadership and store them in
#    the data frame newdata ----
# Method 1: by column position.
newdata <- leadership[, 6:10]
# Method 2: by column name.
myvars <- c("q1", "q2", "q3", "q4", "q5")
# Exercise: which function could build c("q1", "q2", "q3", "q4", "q5")
# equivalently?
newdata <- leadership[myvars]

# 3. Drop variables q4 and q5 ----
# Method 1: logical mask over the column names.
myvars <- names(leadership) %in% c("q4", "q5")
newdata <- leadership[!myvars]
# Method 2: negative column positions.
# %in% is used rather than == because == recycles c("q4", "q5") along the
# column names and only matches a column when its position happens to line up
# with the recycled value.
idx <- which(names(leadership) %in% c("q4", "q5"))
newdata <- leadership[-idx]
# Method 3: assign NULL to the column positions.
# Done on a copy so that `leadership` keeps q4, which section 6 selects.
idx <- which(names(leadership) %in% c("q4", "q5"))
newdata <- leadership
newdata[idx] <- NULL
# Method 4: assign NULL to the columns by name (again on a copy).
newdata <- leadership
newdata$q4 <- newdata$q5 <- NULL

# 4. Select observations where gender is "M" and age is greater than 30 ----
# Method 1: index the rows directly.
newdata <- leadership[1:3, ] # select the observations in rows 1 through 3
newdata <- leadership[leadership$gender == "M" & leadership$age > 30, ]
# Method 2: evaluate the condition inside the data frame.
# with() is used instead of attach()/detach(): the global vectors `gender` and
# `age` created in section 1 would mask the attached columns, so the filter
# would read the globals rather than the data frame.
newdata <- with(leadership, leadership[gender == "M" & age > 30, ])

# 5. Select observations dated between 2009-01-01 and 2009-10-31 ----
# Parse the month/day/two-digit-year text into Date values so the column can
# be compared against the range limits.
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-10-31")
newdata <- leadership[
  which(leadership$date >= startdate & leadership$date <= enddate),
]

# 6. Extract subsets with subset() ----
# Select rows where age is at least 35 or at most 24 (matching the >= and <=
# in the condition below), keeping variables q1 to q4.
newdata <- subset(
  leadership,
  age >= 35 | age <= 24,
  select = c(q1, q2, q3, q4)
)
# Select all males older than 25, keeping variables gender through q4
# (that is, gender, q4 and every column between them).
newdata <- subset(
  leadership,
  age > 25 & gender == "M",
  select = gender:q4
)

# 7. Draw a random sample of 3 rows from leadership with sample() ----
mysample <- leadership[
  sample(seq_len(nrow(leadership)), 3, replace = FALSE),
]