- data.frame
- kids<-c("Jack","Jill")
- ages<-c(12,10)
- d<-data.frame(kids,ages,stringsAsFactors=FALSE)
- d
- kids ages
- Jack 12
- Jill 10
- d[[1]] # "Jack" "Jill"
- d$kids # same as above
- d[,1] # same as above
- examsquiz<-read.table("exams",header=TRUE)
- examsquiz[2:5,]
- examsquiz[2:5,2]
- class(examsquiz[2:5,2]) # "numeric"
- class(examsquiz[2:5,2,drop=FALSE]) # "data.frame"
- x<-c(2,NA,4)
- mean(x) # NA
- mean(x,na.rm=TRUE) # 3 or use subset()
- examsquiz[examsquiz$Exam.1>=3.8,]
- subset(examsquiz,Exam.1>=3.8) # columns are looked up inside examsquiz, so subset(examsquiz,examsquiz$Exam.1>=3.8) also works but the prefix is redundant
- d4
- kids states
- Jack CA
- <NA> MA
- Jillian MA
- John <NA>
- complete.cases(d4) # TRUE FALSE TRUE FALSE, row view
- d5<-d4[complete.cases(d4),]
- d5
- kids states
- Jack CA
- Jillian MA
- rbind(d,list("Laura",19))
- kids ages
- Jack 12
- Jill 10
- Laura 19
- eq<-cbind(examsquiz,examsquiz$Exam.2-examsquiz$Exam.1)
- class(eq) # "data.frame"
- examsquiz$ExamDiff<-examsquiz$Exam.2-examsquiz$Exam.1
- d$one<-1
- d
- kids ages one
- Jack 12 1
- Jill 10 1
- d1
- kids states
- Jack CA
- Jill MA
- d2
- ages kids
- 10 Jill
- 7 Lillian
- 12 Jack
- d3
- ages pals
- 12 Jack
- 10 Jill
- 7 Lillian
- merge(d1,d3,by.x="kids",by.y="pals")
- kids states ages
- Jack CA 12
- Jill MA 10
- d2a<-rbind(d2,list(15,"Jill"))
- ages kids
- 12 Jack
- 10 Jill
- 7 Lillian
- 15 Jill
- merge(d1,d2a)
- kids states ages
- Jack CA 12
- Jill MA 10
- Jill MA 15
- dl<-lapply(d,sort) # "Jack" "Jill" and 10 12
- as.data.frame(dl)
- kids ages
- Jack 10
- Jill 12
- example 1
- all2006<-read.csv("2006.csv",header=TRUE,as.is=TRUE) # as.is=TRUE keeps strings as character, i.e. equivalent to stringsAsFactors=FALSE
- all2006<-all2006[all2006$Wage_Per=="Year",] # exclude hourly-wagers, one kind of data cleaning
- all2006<-all2006[all2006$Wage_Offered_From>20000,] # exclude weird cases, one kind of data cleaning
- all2006$rat<-all2006$Wage_Offered_From / all2006$Prevailing_Wage_Amount # create a new column
# Return the rows of the global data frame `all2006` whose Employer_Name
# equals `corpname`.
#
# corpname: employer name to match exactly (compared with `==`).
# Returns a data frame with the same columns as `all2006`; zero rows when
# nothing matches.
makecorp <- function(corpname) {
  # which() discards NA comparison results. Indexing with the raw logical
  # vector would add an all-NA row for every missing Employer_Name.
  matched <- which(all2006$Employer_Name == corpname)
  all2006[matched, ]
}
# Flat list of pairs: the employer name as passed to makecorp(), followed by
# the short name used to build the result variable's name.
corplist <- c("MICROSOFT", "ms", "INTEL", "intel", "GOOGLE", "google")
# Walk the pairs. seq_len() gives an empty sequence for an empty corplist,
# whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(length(corplist) %/% 2)) {
  corp <- corplist[2 * i - 1]
  newdtf <- paste0(corplist[2 * i], "2006") # concatenate, e.g. "ms2006"
  # Create a global variable named by newdtf holding that employer's rows.
  assign(newdtf, makecorp(corp), pos = .GlobalEnv)
}
- example 2
- count.fields("DA",sep=",")
- all(count.fields("DA",sep=",")>=5) # if TRUE, ok
- da<-read.csv("DA",header=TRUE,stringsAsFactors=FALSE)
- for (col in 1:6)
- print(unique(sort(da[,col])))
- mrg<-merge(da,db,by.x=1,by.y=1) # merge by their ids
- example 3
- aba<-read.csv("abalone.data",header=T)
- abamf<-aba[aba$Gender!="I",] # exclude infants from the analysis
- lftn<-function(clmn) {
- glm(abamf$Gender ~ clmn, family=binomial)$coef # return coef to loall
- # R returns the value of the last expression automatically; no explicit return() statement is needed
- }
- loall<-sapply(abamf[,-1],lftn)
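- class(loall) # "matrix" (also "array" in R >= 4.0): lftn returns $coef, so sapply() binds the coefficient vectors into a matrix, not a "glm" object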