Installing MySQL
Download link : http://biostat.mc.vanderbilt.edu/wiki/Main/RMySQL
Tutorial for download : http://www.ahschulz.de/2013/07/23/installing-rmysql-under-windows/
setwd("C:/Users/krith/OneDrive/Documents/coursera/getting_cleaning_data/week2")
#install.packages("RMySQL")
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.3.3
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.3.3
ucscDB <- dbConnect(MySQL(),user="genome",host="genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDB,"show databases;");
dbDisconnect(ucscDB) # mysql command
## [1] TRUE
head(result) # these are databases
## Database
## 1 information_schema
## 2 ailMel1
## 3 allMis1
## 4 anoCar1
## 5 anoCar2
## 6 anoGam1
hg19 <- dbConnect(MySQL(),user="genome",db="hg19",
host="genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
## [1] 11048
allTables[1:5]
## [1] "HInv" "HInvGeneMrna" "acembly" "acemblyClass"
## [5] "acemblyPep"
# Each table is a different dataset
Get dimensions of a specific table
dbListFields(hg19,"affyU133Plus2")
## [1] "bin" "matches" "misMatches" "repMatches" "nCount"
## [6] "qNumInsert" "qBaseInsert" "tNumInsert" "tBaseInsert" "strand"
## [11] "qName" "qSize" "qStart" "qEnd" "tName"
## [16] "tSize" "tStart" "tEnd" "blockCount" "blockSizes"
## [21] "qStarts" "tStarts"
dbGetQuery(hg19,"select count(*) from affyU133Plus2")
## count(*)
## 1 58463
Read from table
affyData <- dbReadTable(hg19,"affyU133Plus2")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 1 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 2 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 3 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 4 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 5 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 6 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 7 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 8 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 11
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 12
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 13
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 15
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 16
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 17
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 18
## imported as numeric
head(affyData)
## bin matches misMatches repMatches nCount qNumInsert qBaseInsert
## 1 585 530 4 0 23 3 41
## 2 585 3355 17 0 109 9 67
## 3 585 4156 14 0 83 16 18
## 4 585 4667 9 0 68 21 42
## 5 585 5180 14 0 167 10 38
## 6 585 468 5 0 14 0 0
## tNumInsert tBaseInsert strand qName qSize qStart qEnd tName
## 1 3 898 - 225995_x_at 637 5 603 chr1
## 2 9 11621 - 225035_x_at 3635 0 3548 chr1
## 3 2 93 - 226340_x_at 4318 3 4274 chr1
## 4 3 5743 - 1557034_s_at 4834 48 4834 chr1
## 5 1 29 - 231811_at 5399 0 5399 chr1
## 6 0 0 - 236841_at 487 0 487 chr1
## tSize tStart tEnd blockCount
## 1 249250621 14361 15816 5
## 2 249250621 14381 29483 17
## 3 249250621 14399 18745 18
## 4 249250621 14406 24893 23
## 5 249250621 19688 25078 11
## 6 249250621 27542 28029 1
## blockSizes
## 1 93,144,229,70,21,
## 2 73,375,71,165,303,360,198,661,201,1,260,250,74,73,98,155,163,
## 3 690,10,32,33,376,4,5,15,5,11,7,41,277,859,141,51,443,1253,
## 4 99,352,286,24,49,14,6,5,8,149,14,44,98,12,10,355,837,59,8,1500,133,624,58,
## 5 131,26,1300,6,4,11,4,7,358,3359,155,
## 6 487,
## qStarts
## 1 34,132,278,541,611,
## 2 87,165,540,647,818,1123,1484,1682,2343,2545,2546,2808,3058,3133,3206,3317,3472,
## 3 44,735,746,779,813,1190,1195,1201,1217,1223,1235,1243,1285,1564,2423,2565,2617,3062,
## 4 0,99,452,739,764,814,829,836,842,851,1001,1016,1061,1160,1173,1184,1540,2381,2441,2450,3951,4103,4728,
## 5 0,132,159,1460,1467,1472,1484,1489,1497,1856,5244,
## 6 0,
## tStarts
## 1 14361,14454,14599,14968,15795,
## 2 14381,14454,14969,15075,15240,15543,15903,16104,16853,17054,17232,17492,17914,17988,18267,24736,29320,
## 3 14399,15089,15099,15131,15164,15540,15544,15549,15564,15569,15580,15587,15628,15906,16857,16998,17049,17492,
## 4 14406,20227,20579,20865,20889,20938,20952,20958,20963,20971,21120,21134,21178,21276,21288,21298,21653,22492,22551,22559,24059,24211,24835,
## 5 19688,19819,19845,21145,21151,21155,21166,21170,21177,21535,24923,
## 6 27542,
select specific subset
query <- dbSendQuery(hg19,"select * from affyU133Plus2 where misMatches between 1 and 3")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 1 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 2 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 3 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 4 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 5 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 6 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 7 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 8 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 11
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 12
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 13
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 15
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 16
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 17
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 18
## imported as numeric
affyMis <- fetch(query); quantile(affyMis$misMatches)
## 0% 25% 50% 75% 100%
## 1 1 2 2 3
# only to extract a small chunk of data from server
affyMisSMall <- fetch(query,n=10); dbClearResult(query); # second command is to stop mysql query connection
## [1] TRUE
dim(affyMisSMall)
## [1] 10 22
# remember to close connection
dbDisconnect(hg19)
## [1] TRUE
library(rhdf5)
created = h5createFile("example.h5")
## file 'C:\Users\krith\OneDrive\Documents\coursera\getting_cleaning_data\week2\example.h5' already exists.
created
## [1] FALSE
# primarily used to genomics but it has good big data sets
create groups
created = h5createGroup("example.h5","foo")
## Can not create group. Object with name 'foo' already exists.
created = h5createGroup("example.h5","baa")
## Can not create group. Object with name 'baa' already exists.
created = h5createGroup("example.h5","foo/foobaa")
## Can not create group. Object with name 'foo/foobaa' already exists.
# to list all components of this file
h5ls("example.h5")
## group name otype dclass dim
## 0 / baa H5I_GROUP
## 1 / df H5I_DATASET COMPOUND 5
## 2 / foo H5I_GROUP
## 3 /foo A H5I_DATASET INTEGER 5 x 2
## 4 /foo foobaa H5I_GROUP
## 5 /foo/foobaa B H5I_DATASET FLOAT 5 x 2 x 2
write to groups
A = matrix(1:10,nr = 5, nc = 2) # create matrix
A
## [,1] [,2]
## [1,] 1 6
## [2,] 2 7
## [3,] 3 8
## [4,] 4 9
## [5,] 5 10
h5write(A,"example.h5","foo/A") # writing this matrix to group
B = array(seq(0.1,2.0,by = 0.1),dim=c(5,2,2))
B
## , , 1
##
## [,1] [,2]
## [1,] 0.1 0.6
## [2,] 0.2 0.7
## [3,] 0.3 0.8
## [4,] 0.4 0.9
## [5,] 0.5 1.0
##
## , , 2
##
## [,1] [,2]
## [1,] 1.1 1.6
## [2,] 1.2 1.7
## [3,] 1.3 1.8
## [4,] 1.4 1.9
## [5,] 1.5 2.0
attr(B,"scale") <- "liter"
h5write(B,"example.h5","foo/foobaa/B")
h5ls("example.h5")
## group name otype dclass dim
## 0 / baa H5I_GROUP
## 1 / df H5I_DATASET COMPOUND 5
## 2 / foo H5I_GROUP
## 3 /foo A H5I_DATASET INTEGER 5 x 2
## 4 /foo foobaa H5I_GROUP
## 5 /foo/foobaa B H5I_DATASET FLOAT 5 x 2 x 2
write dataset
#install.packages("stringi")
library(stringi)
## Warning: package 'stringi' was built under R version 3.3.3
if (!file.exists("example/")) {
dir.create("example/")
}
## Warning in dir.create("example/"): 'example' already exists
df = data.frame(1L:5L,seq(0,1,length.out = 5),
c("ab","cde","fghi","a","s"),stringsAsFactors=FALSE)
#h5write(df, "./example/example.h5", "df")
#h5ls("example.h5")
Read data
readA = h5read("example.h5", "foo/A")
readB = h5read("example.h5", "foo/foobaa/B")
readdf = h5read("example.h5", "df")
readA
## [,1] [,2]
## [1,] 1 6
## [2,] 2 7
## [3,] 3 8
## [4,] 4 9
## [5,] 5 10
foo = h5read("example.h5","foo")
class(foo)
## [1] "list"
names(foo)
## [1] "A" "foobaa"
class(foo$foobaa)
## [1] "list"
Writing and reading chunks
h5write(c(12, 13, 14), "example.h5", "foo/A", index = list(1:3, 1))
h5read("example.h5", "foo/A")
## [,1] [,2]
## [1,] 12 6
## [2,] 13 7
## [3,] 14 8
## [4,] 4 9
## [5,] 5 10
con = url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode = readLines(con)
## Warning in readLines(con): incomplete final line found on 'http://
## scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'
close(con)
substr(htmlCode, start = 1, stop = 1000)
## [1] "<!doctype html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=Edge\"><meta name=\"referrer\" content=\"always\"><meta name=\"viewport\" content=\"width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2\"><style>@viewport{width:device-width;min-zoom:1;max-zoom:2;}</style><meta name=\"format-detection\" content=\"telephone=no\"><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}#gs_top{position:relative;min-width:964px;-webkit-tap-highlight-color:rgba(0,0,0,0);}#gs_top>*:not(#x){-webkit-tap-highlight-color:rgba(204,204,204,.5);}.gs_el_ph #gs_top,.gs_el_ta #gs_top{min-width:300px;}#gs_top.gs_nscl{position:fixed;width:100%;}body,td,input{font-size:13px;font-family:Arial,sans-serif;line-height:1.24}body{background:#fff;color:#222;-webkit-text-size-adjust:100%;-moz-text-size-adjust:none;}.gs_gray{color:#7777"
class(htmlCode) # character vector
## [1] "character"
length(htmlCode)
## [1] 1
nchar(htmlCode)
## [1] 88799
parsing with XML
library(XML)
## Warning: package 'XML' was built under R version 3.3.3
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
html <- htmlTreeParse(url, useInternalNodes = T)
xpathSApply(html,"//title",xmlValue)
## [1] "Jeff Leek - Google Scholar Citations"
xpathSApply(html,"//td[@id='col-citedby']",xmlValue)
## list()
GET from the httr package
library(httr)
## Warning: package 'httr' was built under R version 3.3.3
html2 = GET(url)
class(html2)
## [1] "response"
content2 = content(html2, as = "text")
class(content2)
## [1] "character"
parsedHtml = htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
## [1] "Jeff Leek - Google Scholar Citations"
Accessing websites with passwords
#install.packages("httr")
library(httr)
pg1 = GET("http://httpbin.org/basic-auth/user/passwd")
pg1
## Response [http://httpbin.org/basic-auth/user/passwd]
## Date: 2017-04-10 05:06
## Status: 401
## Content-Type: <unknown>
## <EMPTY BODY>
pg2 = GET("http://httpbin.org/basic-auth/user/passwd",
authenticate("user","passwd"))
pg2
## Response [http://httpbin.org/basic-auth/user/passwd]
## Date: 2017-04-10 05:06
## Status: 200
## Content-Type: application/json
## Size: 47 B
## {
## "authenticated": true,
## "user": "user"
## }
# status 200 indicates we got authorization
names(pg2)
## [1] "url" "status_code" "headers" "all_headers" "cookies"
## [6] "content" "date" "times" "request" "handle"
Using handles
# to authenticate one hand;le for once and set it as dafult so you dont need to keep setting authorization
google = handle("http://google.com")
pg1 = GET(handle = google, path = "/")
pg2 = GET(handle = google, path = "search")
myapp<-oauth_app("twitter",
key="796216345939546113-W7jYnEs0FqvhScFe3e6sPLSxAVYUsJM",secret = "NloOpqck3cf1g85WqO7R0pMZt90y0LtMOmSomMqjzJ0CC")
sig <- sign_oauth1.0(myapp,
token = "74waGoAWG1ArrOqCRdrnVIZcY",token_secret ="YLrrMmGs7rUkkaLumXqjYV6wD5pXaFLxO7a8y83HygtP1aa5VK")
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json",sig)
converting JSON object
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.3.3
json1 <- content(homeTL)
json2 <- jsonlite::fromJSON(toJSON(json1))
# or json2 = fromJSON(toJSON(json1))
json2$errors
## code message
## 1 89 Invalid or expired token.