课程github地址
Week2 内容
Content
- Reading from MySQL
- Reading from HDF5
- Reading from The web
- Reading from APIs
- Reading from Other Sources
1.Reading from MySQL
- install MySQL
- install RMySQL
Connecting and listing databases
# Attach RMySQL so that MySQL() and the connection functions can be found.
library(RMySQL)

# Open a connection to the MySQL server at genome-mysql.soe.ucsc.edu as the
# "genome" user.
ucscDb <- dbConnect(
  MySQL(),
  user = "genome",
  host = "genome-mysql.soe.ucsc.edu"
)
# List the databases on the server, then close the connection right away.
result <- dbGetQuery(ucscDb, "show Databases;")
dbDisconnect(ucscDb)
Connecting to hg19 and listing tables
# Connect directly to the hg19 database on the same server.
hg19 <- dbConnect(
  MySQL(),
  user = "genome",
  db = "hg19",
  host = "genome-mysql.soe.ucsc.edu"
)
# List the tables through the open hg19 connection. The earlier ucscDb
# connection has already been disconnected, so it cannot be used here.
allTables <- dbListTables(hg19)
length(allTables)
[1] 12535
allTables[1:5]
[1] "HInv" "HInvGeneMrna" "acembly" "acemblyClass" "acemblyPep"
Get dimensions of a specific table
# Column names of the affyU133Plus2 table.
dbListFields(hg19, "affyU133Plus2")
[1] "bin" "matches" "misMatches" "repMatches" "nCount" "qNumInsert" "qBaseInsert" "tNumInsert"
[9] "tBaseInsert" "strand" "qName" "qSize" "qStart" "qEnd" "tName" "tSize"
[17] "tStart" "tEnd" "blockCount" "blockSizes" "qStarts" "tStarts"
# Row count of the same table, obtained with a SQL count query.
dbGetQuery(hg19, "Select count(*) from affyU133Plus2")
count(*)
1 58463
Read from the table
# Read the whole affyU133Plus2 table into R and preview the first rows.
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData)
bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart
1 585 530 4 0 23 3 41 3 898 - 225995_x_at 637 5
2 585 3355 17 0 109 9 67 9 11621 - 225035_x_at 3635 0
3 585 4156 14 0 83 16 18 2 93 - 226340_x_at 4318 3
4 585 4667 9 0 68 21 42 3 5743 - 1557034_s_at 4834 48
5 585 5180 14 0 167 10 38 1 29 - 231811_at 5399 0
6 585 468 5 0 14 0 0 0 0 - 236841_at 487 0
Select a specific subset
2.Reading from HDF5
HDF官网
Create group
# The installer package is BiocManager (capital B). R names are
# case-sensitive, so biocManager::install() cannot be resolved.
BiocManager::install("rhdf5")
library(rhdf5)
---
# Create the HDF5 file, then a group "foo" and a nested group "foo/foobaa"
# inside it. The return value of each call is kept in `created`.
created <- h5createFile("example.h5")
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "foo/foobaa")
# Show the resulting file structure.
h5ls("example.h5")
Write to groups
# A 5 x 2 integer matrix, written to the "foo" group.
A <- matrix(1:10, nrow = 5, ncol = 2)
h5write(A, "example.h5", "foo/A")

# A 5 x 2 x 2 array carrying a "scale" attribute, written to the nested group.
B <- array(seq(0.1, 2.0, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")

h5ls("example.h5")
Write a data set
# Build a three-column data frame (integers, doubles, strings) and write it
# to the top level of the file under the name "df".
df <- data.frame(
  1L:5L,
  seq(0, 1, length.out = 5),
  c("ab", "cde", "fghi", "a", "s"),
  stringsAsFactors = FALSE
)
h5write(df, "example.h5", "df")
h5ls("example.h5")
Reading data
# Read each dataset back by its path inside the file, then print the matrix.
readA <- h5read("example.h5", "foo/A")
readB <- h5read("example.h5", "foo/foobaa/B")
readdf <- h5read("example.h5", "df")
readA
Writing and reading chunks
# Overwrite rows 1-3 of the first column of foo/A, then read the dataset back.
h5write(c(12, 13, 14), "example.h5", "foo/A", index = list(1:3, 1))
h5read("example.h5", "foo/A")
3.Reading from The web
Getting data off webpages - readLines()
Parsing with XML
library(XML)
# Google Scholar profile page to scrape. The host needs the dot in
# "google.com".
# NOTE(review): the user id was restored to its upper-case form with a zero
# ("C0"), as it appears to be in the course material; confirm.
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
# Parse into internal nodes, which the XPath helpers below require.
html <- htmlTreeParse(url, useInternalNodes = TRUE)
# Page title.
xpathSApply(html, "//title", xmlValue)
# Text of every table cell whose id is "col-citedby".
xpathSApply(html, "//td[@id='col-citedby']", xmlValue)
GET from the httr package
# Fetch the same URL with httr, extract the response body as text, and parse
# that text as HTML before querying the title.
library(httr)
html2 <- GET(url)
content2 <- content(html2, as = "text")
parsedHtml <- htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
Accessing websites with passwords
# authenticate() takes the user name and password as strings. The bare symbol
# `user` is not defined anywhere in these notes, so it must be quoted.
pgl <- GET(
  "http://httpbin.org/basic-auth/user/passwd",
  authenticate("user", "passwd")
)
pgl
4.Reading from APIs
5.Reading from Other Sources