1. Download Data
if(!file.exists("data")) { dir.create("data")}
fileUrl<-"https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./data/cameras.csv",method="curl")
list.files("./data")
2. Reading Local File (.csv)
cameraData<-read.table("./data/cameras.csv",sep=",",header=TRUE)
head(cameraData)
3. Reading Excel File (.xlsx)
library(xlsx)
cameraData<-read.xlsx("./data/cameras.xlsx",sheetIndex=1,header=TRUE)
head(cameraData)
## Reading specific rows and columns
colIndex<-2:3
rowIndex<-1:4
cameraDataSubset<-read.xlsx("./data/cameras.xlsx",sheetIndex=1,colIndex=colIndex,rowIndex=rowIndex)
cameraDataSubset
3. Reading XML and HTML
library(XML)
fileUrl<-"http://www.w3schools.com/xml/simple.xml"
doc<-xmlTreeParse(fileUrl,useInternal=TRUE)
rootNode<-xmlRoot(doc)
xmlName(rootNode) #查看文件标题
names(rootNode) #查看所有子主题
rootNode[[1]] #查看子主题第一级
rootNode[[1]][[1]] #查看子主题第一级的第一个Element
xmlSApply(rootNode,xmlValue) #查看所有Element的Value
XPath:
/nodeTop level node
//nodeNode at any level
node[@attr-name]Node with an attribute name
node[@attr-name='bob']Node with attribute name attr-name='bob'
Information from:http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf
xpathSApply(rootNode,"//name",xmlValue)
xpathSApply(rootNode,"//price",xmlValue)
fileUrl<-"http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"doc<-htmlTreeParse(fileUrl,useInternal=TRUE)scores<-xpathSApply(doc,"//li[@class='score']",xmlValue)teams<-xpathSApply(doc,"//li[@class='team-name']",xmlValue)scores
4. Reading JSON
library(jsonlite)
jsonData<fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
jsonData$name
names(jsonData$owner)
jsonData$owner$login
#Writing data frames to JSON
myjson<-toJSON(iris,pretty=TRUE)
cat(myjson)
#Convert back to JSON
iris2<-fromJSON(myjson)
head(iris2)
5. Data Table
library(data.table)
DF=data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DF,3)
DT=data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))head(DT,3)
# See all data tables in Memory
tables()
# Subsetting rows
DT[2,]
DT[DT$y=="a",] #选出y=a的
DT[c(2,3)] #选出行12,列123
# Calculating values for variables with expressions
DT[,list(mean(x),sum(z))] #返回x的mean,z的sum两个值
# Adding new columns
DT[,w:=z^2]
# 多重操作,tep意指中间变量
DT[,m:={tmp<-(x+z); log2(tmp+5)}]
# plyr like operations
DT[,a:=x>0] #增加一个变量 true false
DT[,b:=mean(x+w),by=a] #by语句
# Special Variable
.N An integer, length 1, containing the number of elements of a factor level
set.seed(123);
DT<-data.table(x=sample(letters[1:3],1E5,TRUE))
DT[, .N,by=x]
# Keys (重要)
DT<-data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
setkey(DT,x)
DT['a']
# Fread指令 Fast reading
big_df<-data.frame(x=rnorm(1E6),y=rnorm(1E6))
file<-tempfile()write.table(big_df,file=file,row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
system.time(fread(file))