week 2

0201 : Reading from MySQL

  • Widely used
  • free
  • used in internet app
  • row -> record

Installing MySQL

Download link : http://biostat.mc.vanderbilt.edu/wiki/Main/RMySQL

Tutorial for download : http://www.ahschulz.de/2013/07/23/installing-rmysql-under-windows/

setwd("C:/Users/krith/OneDrive/Documents/coursera/getting_cleaning_data/week2")

#install.packages("RMySQL")
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.3.3
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.3.3
ucscDB <- dbConnect(MySQL(),user="genome",host="genome-mysql.cse.ucsc.edu")

result <- dbGetQuery(ucscDB,"show databases;"); 
dbDisconnect(ucscDB)   # mysql command
## [1] TRUE
head(result) # these are databases
##             Database
## 1 information_schema
## 2            ailMel1
## 3            allMis1
## 4            anoCar1
## 5            anoCar2
## 6            anoGam1
hg19 <- dbConnect(MySQL(),user="genome",db="hg19",
                  host="genome-mysql.cse.ucsc.edu")

allTables <- dbListTables(hg19)

length(allTables)
## [1] 11048
allTables[1:5]
## [1] "HInv"         "HInvGeneMrna" "acembly"      "acemblyClass"
## [5] "acemblyPep"
# Each table is a different dataset

Get dimensions of a specific table

dbListFields(hg19,"affyU133Plus2")
##  [1] "bin"         "matches"     "misMatches"  "repMatches"  "nCount"     
##  [6] "qNumInsert"  "qBaseInsert" "tNumInsert"  "tBaseInsert" "strand"     
## [11] "qName"       "qSize"       "qStart"      "qEnd"        "tName"      
## [16] "tSize"       "tStart"      "tEnd"        "blockCount"  "blockSizes" 
## [21] "qStarts"     "tStarts"
dbGetQuery(hg19,"select count(*) from affyU133Plus2")
##   count(*)
## 1    58463

Read from table

affyData <- dbReadTable(hg19,"affyU133Plus2")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 1 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 2 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 3 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 4 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 5 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 6 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 7 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 8 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 11
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 12
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 13
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 15
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 16
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 17
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 18
## imported as numeric
head(affyData)
##   bin matches misMatches repMatches nCount qNumInsert qBaseInsert
## 1 585     530          4          0     23          3          41
## 2 585    3355         17          0    109          9          67
## 3 585    4156         14          0     83         16          18
## 4 585    4667          9          0     68         21          42
## 5 585    5180         14          0    167         10          38
## 6 585     468          5          0     14          0           0
##   tNumInsert tBaseInsert strand        qName qSize qStart qEnd tName
## 1          3         898      -  225995_x_at   637      5  603  chr1
## 2          9       11621      -  225035_x_at  3635      0 3548  chr1
## 3          2          93      -  226340_x_at  4318      3 4274  chr1
## 4          3        5743      - 1557034_s_at  4834     48 4834  chr1
## 5          1          29      -    231811_at  5399      0 5399  chr1
## 6          0           0      -    236841_at   487      0  487  chr1
##       tSize tStart  tEnd blockCount
## 1 249250621  14361 15816          5
## 2 249250621  14381 29483         17
## 3 249250621  14399 18745         18
## 4 249250621  14406 24893         23
## 5 249250621  19688 25078         11
## 6 249250621  27542 28029          1
##                                                                   blockSizes
## 1                                                          93,144,229,70,21,
## 2              73,375,71,165,303,360,198,661,201,1,260,250,74,73,98,155,163,
## 3                 690,10,32,33,376,4,5,15,5,11,7,41,277,859,141,51,443,1253,
## 4 99,352,286,24,49,14,6,5,8,149,14,44,98,12,10,355,837,59,8,1500,133,624,58,
## 5                                       131,26,1300,6,4,11,4,7,358,3359,155,
## 6                                                                       487,
##                                                                                                  qStarts
## 1                                                                                    34,132,278,541,611,
## 2                        87,165,540,647,818,1123,1484,1682,2343,2545,2546,2808,3058,3133,3206,3317,3472,
## 3                   44,735,746,779,813,1190,1195,1201,1217,1223,1235,1243,1285,1564,2423,2565,2617,3062,
## 4 0,99,452,739,764,814,829,836,842,851,1001,1016,1061,1160,1173,1184,1540,2381,2441,2450,3951,4103,4728,
## 5                                                     0,132,159,1460,1467,1472,1484,1489,1497,1856,5244,
## 6                                                                                                     0,
##                                                                                                                                      tStarts
## 1                                                                                                             14361,14454,14599,14968,15795,
## 2                                     14381,14454,14969,15075,15240,15543,15903,16104,16853,17054,17232,17492,17914,17988,18267,24736,29320,
## 3                               14399,15089,15099,15131,15164,15540,15544,15549,15564,15569,15580,15587,15628,15906,16857,16998,17049,17492,
## 4 14406,20227,20579,20865,20889,20938,20952,20958,20963,20971,21120,21134,21178,21276,21288,21298,21653,22492,22551,22559,24059,24211,24835,
## 5                                                                         19688,19819,19845,21145,21151,21155,21166,21170,21177,21535,24923,
## 6                                                                                                                                     27542,

select specific subset

query <- dbSendQuery(hg19,"select * from affyU133Plus2 where misMatches between 1 and 3")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 1 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 2 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 3 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 4 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 5 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 6 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 7 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 8 imported
## as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 11
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 12
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 13
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 15
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 16
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 17
## imported as numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 18
## imported as numeric
affyMis <- fetch(query); quantile(affyMis$misMatches)
##   0%  25%  50%  75% 100% 
##    1    1    2    2    3
# only to extract a small chunk of data from server
affyMisSMall <- fetch(query,n=10); dbClearResult(query); # second command is to stop mysql query connection
## [1] TRUE
dim(affyMisSMall)
## [1] 10 22
# remember to close connection

dbDisconnect(hg19)
## [1] TRUE
  • do not use other commands of sql like del join when handling ourside servers for sql

0202 :reading HDF5

library(rhdf5)
created = h5createFile("example.h5")
## file 'C:\Users\krith\OneDrive\Documents\coursera\getting_cleaning_data\week2\example.h5' already exists.
created
## [1] FALSE
# primarily used to genomics but it has good big data sets

create groups

created = h5createGroup("example.h5","foo")
## Can not create group. Object with name 'foo' already exists.
created = h5createGroup("example.h5","baa")
## Can not create group. Object with name 'baa' already exists.
created = h5createGroup("example.h5","foo/foobaa")
## Can not create group. Object with name 'foo/foobaa' already exists.
# to list all components of this file

h5ls("example.h5")
##         group   name       otype   dclass       dim
## 0           /    baa   H5I_GROUP                   
## 1           /     df H5I_DATASET COMPOUND         5
## 2           /    foo   H5I_GROUP                   
## 3        /foo      A H5I_DATASET  INTEGER     5 x 2
## 4        /foo foobaa   H5I_GROUP                   
## 5 /foo/foobaa      B H5I_DATASET    FLOAT 5 x 2 x 2

write to groups

A = matrix(1:10,nr = 5, nc = 2) # create matrix
A
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
h5write(A,"example.h5","foo/A")  # writing this matrix to group

B = array(seq(0.1,2.0,by = 0.1),dim=c(5,2,2))
B
## , , 1
## 
##      [,1] [,2]
## [1,]  0.1  0.6
## [2,]  0.2  0.7
## [3,]  0.3  0.8
## [4,]  0.4  0.9
## [5,]  0.5  1.0
## 
## , , 2
## 
##      [,1] [,2]
## [1,]  1.1  1.6
## [2,]  1.2  1.7
## [3,]  1.3  1.8
## [4,]  1.4  1.9
## [5,]  1.5  2.0
attr(B,"scale") <- "liter"
h5write(B,"example.h5","foo/foobaa/B")
h5ls("example.h5")
##         group   name       otype   dclass       dim
## 0           /    baa   H5I_GROUP                   
## 1           /     df H5I_DATASET COMPOUND         5
## 2           /    foo   H5I_GROUP                   
## 3        /foo      A H5I_DATASET  INTEGER     5 x 2
## 4        /foo foobaa   H5I_GROUP                   
## 5 /foo/foobaa      B H5I_DATASET    FLOAT 5 x 2 x 2

write dataset

#install.packages("stringi")
library(stringi)
## Warning: package 'stringi' was built under R version 3.3.3
if (!file.exists("example/")) {
    dir.create("example/")
}
## Warning in dir.create("example/"): 'example' already exists
df = data.frame(1L:5L,seq(0,1,length.out = 5),
                c("ab","cde","fghi","a","s"),stringsAsFactors=FALSE)
#h5write(df, "./example/example.h5", "df")
#h5ls("example.h5")

Read data

readA = h5read("example.h5", "foo/A")
readB = h5read("example.h5", "foo/foobaa/B")
readdf = h5read("example.h5", "df")
readA
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
foo = h5read("example.h5","foo")
class(foo)
## [1] "list"
names(foo)
## [1] "A"      "foobaa"
class(foo$foobaa)
## [1] "list"

Writing and reading chunks

  • We can write to or read from subsets of data structures in HDF5 files
h5write(c(12, 13, 14), "example.h5", "foo/A", index = list(1:3, 1))
h5read("example.h5", "foo/A")
##      [,1] [,2]
## [1,]   12    6
## [2,]   13    7
## [3,]   14    8
## [4,]    4    9
## [5,]    5   10

0203 : Reading from Web

  • webscraping : data from HTML <- extracting
  • do not scrape from websites that do not want information to be scraped
con = url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode = readLines(con)
## Warning in readLines(con): incomplete final line found on 'http://
## scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'
close(con)
substr(htmlCode, start = 1, stop = 1000)
## [1] "<!doctype html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=Edge\"><meta name=\"referrer\" content=\"always\"><meta name=\"viewport\" content=\"width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2\"><style>@viewport{width:device-width;min-zoom:1;max-zoom:2;}</style><meta name=\"format-detection\" content=\"telephone=no\"><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}#gs_top{position:relative;min-width:964px;-webkit-tap-highlight-color:rgba(0,0,0,0);}#gs_top>*:not(#x){-webkit-tap-highlight-color:rgba(204,204,204,.5);}.gs_el_ph #gs_top,.gs_el_ta #gs_top{min-width:300px;}#gs_top.gs_nscl{position:fixed;width:100%;}body,td,input{font-size:13px;font-family:Arial,sans-serif;line-height:1.24}body{background:#fff;color:#222;-webkit-text-size-adjust:100%;-moz-text-size-adjust:none;}.gs_gray{color:#7777"
class(htmlCode)  # character vector
## [1] "character"
length(htmlCode)
## [1] 1
nchar(htmlCode)
## [1] 88799

parsing with XML

library(XML)
## Warning: package 'XML' was built under R version 3.3.3
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"

html <- htmlTreeParse(url, useInternalNodes = T)
xpathSApply(html,"//title",xmlValue)
## [1] "Jeff Leek - Google Scholar Citations"
xpathSApply(html,"//td[@id='col-citedby']",xmlValue)
## list()

GET from the httr package

  • alternative to htmltreeparse
  • use when website requires authorization
library(httr)
## Warning: package 'httr' was built under R version 3.3.3
html2 = GET(url)
class(html2)
## [1] "response"
content2 = content(html2, as = "text")
class(content2)
## [1] "character"
parsedHtml = htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
## [1] "Jeff Leek - Google Scholar Citations"

Accessing websites with passwords

#install.packages("httr")
library(httr)

pg1 = GET("http://httpbin.org/basic-auth/user/passwd")
pg1
## Response [http://httpbin.org/basic-auth/user/passwd]
##   Date: 2017-04-10 05:06
##   Status: 401
##   Content-Type: <unknown>
## <EMPTY BODY>
pg2 = GET("http://httpbin.org/basic-auth/user/passwd",
          authenticate("user","passwd"))
pg2
## Response [http://httpbin.org/basic-auth/user/passwd]
##   Date: 2017-04-10 05:06
##   Status: 200
##   Content-Type: application/json
##   Size: 47 B
## {
##   "authenticated": true, 
##   "user": "user"
## }
# status 200 indicates we got authorization
  • Error 401 indicates authorization is required. It might also show list where authorization is a part of that list
names(pg2)
##  [1] "url"         "status_code" "headers"     "all_headers" "cookies"    
##  [6] "content"     "date"        "times"       "request"     "handle"

Using handles

# to authenticate one hand;le for once and set it as dafult so you dont need to keep setting authorization
google = handle("http://google.com")
pg1 = GET(handle = google, path = "/")
pg2 = GET(handle = google, path = "search")

Reading API

myapp<-oauth_app("twitter",
                 key="796216345939546113-W7jYnEs0FqvhScFe3e6sPLSxAVYUsJM",secret = "NloOpqck3cf1g85WqO7R0pMZt90y0LtMOmSomMqjzJ0CC")

sig <- sign_oauth1.0(myapp,
                     token = "74waGoAWG1ArrOqCRdrnVIZcY",token_secret ="YLrrMmGs7rUkkaLumXqjYV6wD5pXaFLxO7a8y83HygtP1aa5VK")
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json",sig)

converting JSON object

library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.3.3
json1 <- content(homeTL)
json2 <- jsonlite::fromJSON(toJSON(json1))
# or json2 = fromJSON(toJSON(json1))
json2$errors
##   code                   message
## 1   89 Invalid or expired token.