WEEK2- database & web

RSQLite

library(DBI)
library(RSQLite)

driver <- dbDriver("SQLite")  #引擎
db_file <- "./data/crabtag.sqlite" #数据库文件
conn <- dbConnect(driver, db_file)#新建链接
dbListTables(conn = conn, "idblock") #查询表名称

## [1] "Daylog"             "DeploymentNotebook" "IdBlock"           
## [4] "LifetimeNotebook"   "TagNotebook"

query <- "SELECT * FROM IdBlock" #查询语句
id_block <- dbGetQuery(conn = conn, query)#发送查询语句 
id_block

##   Tag ID Firmware Version No Firmware Build Level
## 1 A03401                   2                   70

dbDisconnect(conn) #关闭连接
dbUnloadDriver(driver)#卸载驱动程序

# 所有的过程封装成函数，以免不关连接和卸载程序
#' FUNCTION_TITLE
#'
#' FUNCTION_DESCRIPTION
#' 数据库查询函数封装
#' @param  query 查询语句
#' @param  db_file 数据库文件地址

#' @return 查询结果
#' @examples
query_crab_tag_db <- function(query, db_file){
    driver <- dbDriver("SQLite")
    conn <- dbConnect(driver, db_file)
    on.exit(
        {
            #函数结束时运行程序
            #即使数据库连接有错误
            dbDisconnect(conn)
            dbUnloadDriver(driver)
        }
    )
    dbGetQuery(conn, query)
}

query_crab_tag_db(query = "SELECT * FROM IdBlock",
                  db_file = "./data/crabtag.sqlite")

##   Tag ID Firmware Version No Firmware Build Level
## 1 A03401                   2                   70

RMySQL

library(RMySQL)

ucscDb <- dbConnect(MySQL(),user="genome", 
                    host="genome-mysql.cse.ucsc.edu")

result <- dbGetQuery(ucscDb,"show databases;") 
dbDisconnect(ucscDb)

## [1] TRUE

head(result)

##             Database
## 1 information_schema
## 2            ailMel1
## 3            allMis1
## 4            anoCar1
## 5            anoCar2
## 6            anoGam1

hg19 <- dbConnect(MySQL(),user="genome", db="hg19",
                    host="genome-mysql.cse.ucsc.edu")

allTables <- dbListTables(hg19)
length(allTables)

## [1] 11107

#Connecting to hg19 and listing tables
allTables[1:5]

## [1] "HInv"         "HInvGeneMrna" "acembly"      "acemblyClass"
## [5] "acemblyPep"

#Get dimensions of a specific table，(colnames )
dbListFields(hg19,"affyU133Plus2")

##  [1] "bin"         "matches"     "misMatches"  "repMatches"  "nCount"     
##  [6] "qNumInsert"  "qBaseInsert" "tNumInsert"  "tBaseInsert" "strand"     
## [11] "qName"       "qSize"       "qStart"      "qEnd"        "tName"      
## [16] "tSize"       "tStart"      "tEnd"        "blockCount"  "blockSizes" 
## [21] "qStarts"     "tStarts"

#Get dimensions of a specific table，(length)
dbGetQuery(hg19, "select count(*) from affyU133Plus2")

##   count(*)
## 1    58463

#Read from the table
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData[,c(1,3)], 2)

##   bin misMatches
## 1 585          4
## 2 585         17

#https://stackoverflow.com/questions/4084028/how-to-close-resultset-in-rmysql
# Select a specific subset
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")

affyMis <- dbFetch(query) #获取数据

affyMisSmall <- fetch(query, n=10) #获取子集合中前10行数据
 
dbClearResult(query) #关闭查询

## [1] TRUE

quantile(affyMis$misMatches)  #百分位函数

##   0%  25%  50%  75% 100% 
##    1    1    2    2    3

dim(affyMisSmall)

## [1] 10 22

HDF5

source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")

library(rhdf5)
created = h5createFile("example.h5")
created

## [1] FALSE

created = h5createGroup("example.h5","foo") 
created = h5createGroup("example.h5","baa")
created = h5createGroup("example.h5","foo/foobaa")
h5ls("example.h5")

##         group   name       otype   dclass       dim
## 0           /    baa   H5I_GROUP                   
## 1           /     df H5I_DATASET COMPOUND         5
## 2           /    df1 H5I_DATASET COMPOUND         5
## 3           /    foo   H5I_GROUP                   
## 4        /foo      A H5I_DATASET  INTEGER     5 x 2
## 5        /foo foobaa   H5I_GROUP                   
## 6 /foo/foobaa      B H5I_DATASET    FLOAT 5 x 2 x 2

# Write to groups
A = matrix(1:10,nr=5,nc=2)
h5write(A, "example.h5","foo/A")

B = array(seq(0.1,2.0,by=0.1),dim=c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5","foo/foobaa/B")
h5ls("example.h5")

##         group   name       otype   dclass       dim
## 0           /    baa   H5I_GROUP                   
## 1           /     df H5I_DATASET COMPOUND         5
## 2           /    df1 H5I_DATASET COMPOUND         5
## 3           /    foo   H5I_GROUP                   
## 4        /foo      A H5I_DATASET  INTEGER     5 x 2
## 5        /foo foobaa   H5I_GROUP                   
## 6 /foo/foobaa      B H5I_DATASET    FLOAT 5 x 2 x 2

# Write a data set 
df1 = data.frame(1L:5L,seq(0,1,length.out=5),
                c("ab","cde","fghi","a","s"),
                stringsAsFactors=FALSE)
h5write(df1, "example.h5","df1")
h5ls("example.h5")

Webscarping

Rcul and httr

1 Register an application with the Github API here https://github.com/settings/applications. Access the API to get information on your instructors repositories (hint: this is the url you want “https://api.github.com/users/jtleek/repos”). Use this data to find the time that the datasharing repo was created. What time was it created?

This tutorial may be useful (https://github.com/hadley/httr/blob/master/demo/oauth2-github.r). You may also need to run the code in the base R package and not R studio.

#tutorial code
library(httr)

# 1. Find OAuth settings for github:
#    http://developer.github.com/v3/oauth/
oauth_endpoints("github")

## <oauth_endpoint>
##  authorize: https://github.com/login/oauth/authorize
##  access:    https://github.com/login/oauth/access_token

# 2. To make your own application, register at 
#    https://github.com/settings/developers. Use any URL for the homepage URL
#    (http://github.com is fine) and  http://localhost:1410 as the callback url
#
#    Replace your key and secret below.
myapp <- oauth_app("github",
  key = "275a4e65fd6c3ddbe46e",
  secret = "64d283aff66e6816446d91b2bca4c9e9007fd9c6")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
gtoken <- config(token = github_token)
req <- GET("https://api.github.com/rate_limit", gtoken)

#' stop_for_status
#'
#' Converts http errors to R errors or warnings - these should always be used whenever you’re creating requests inside a function, so that the user knows why a request has failed.
#'
#' @param x a response, or numeric http code.
#' @param task The text of the message: either NULL or a character vector.
#'
#' @return RETURN_DESCRIPTION
#' @examples
#' # ADD_EXAMPLES_HERE
stop_for_status(req)
content(req)

## $resources
## $resources$core
## $resources$core$limit
## [1] 5000
## 
## $resources$core$remaining
## [1] 4997
## 
## $resources$core$reset
## [1] 1523108309
## 
## 
## $resources$search
## $resources$search$limit
## [1] 30
## 
## $resources$search$remaining
## [1] 30
## 
## $resources$search$reset
## [1] 1523105035
## 
## 
## $resources$graphql
## $resources$graphql$limit
## [1] 5000
## 
## $resources$graphql$remaining
## [1] 5000
## 
## $resources$graphql$reset
## [1] 1523108575
## 
## 
## 
## $rate
## $rate$limit
## [1] 5000
## 
## $rate$remaining
## [1] 4997
## 
## $rate$reset
## [1] 1523108309

# # OR:
# req <- with_config(gtoken, GET("https://api.github.com/rate_limit"))
# stop_for_status(req)
# content(req)

# 本题解答
library(httr)
library(RJSONIO)
oauth_endpoints("github")

## <oauth_endpoint>
##  authorize: https://github.com/login/oauth/authorize
##  access:    https://github.com/login/oauth/access_token

myapp <- oauth_app("github",
  key = "275a4e65fd6c3ddbe46e",
  secret = "64d283aff66e6816446d91b2bca4c9e9007fd9c6")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
gtoken <- config(token = github_token)

req <- GET("https://api.github.com/users/jtleek/repos", gtoken)
# stop_for_status(req) 
jtleek_repos_RJSONIO <- RJSONIO::fromJSON(toJSON(content(req))) #list

data <- lapply(jtleek_repos_RJSONIO, 
                    function(x) {
                        if (x$name =="datasharing"){
                            time_data <- x$created_at
                            print(time_data)
                        }
                    })

## [1] "2013-11-07T13:25:07Z"

2, The sqldf package allows for execution of SQL commands on R data frames. We will use the sqldf package to practice the queries we might send with the dbSendQuery command in RMySQL.

Download the American Community Survey data and load it into an R object called acs

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv

Which of the following commands will select only the data for the probability weights pwgtp1 with ages less than 50?

if (!file.exists("data")) {
  dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
download.file(fileUrl, destfile = "./data/ss06pid.csv", method = "curl")
dataDownloaded <- data()

#library(readr)
#library(sqldf)
#acs <- read_csv("data/ss06pid.csv", col_names = TRUE)

#select only the data for the probability weights pwgtp1 with ages less than 50

#data <- sqldf("select pwgtp1 from acs where AGEP < 50")
#head(acs[acs[["AGEP"]]<50, c("pwgtp1","AGEP")]) #基本方法

#head(data) #查询方法

3, Using the same data frame you created in the previous problem, what is the equivalent function to unique(acs$AGEP)

#head(unique(acs$AGEP))

 # head(sqldf("select distinct AGEP from acs"))

4, How many characters are in the 10th, 20th, 30th and 100th lines of HTML from this page:

http://biostat.jhsph.edu/~jleek/contact.html

(Hint: the nchar() function in R may be helpful)

con <-  url("http://biostat.jhsph.edu/~jleek/contact.html")
htmlCode <- readLines(con = con)
str(htmlCode)

##  chr [1:180] "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" ...

nchar(htmlCode[c(10, 20, 30, 100)])

## [1] 45 31  7 25

5, Read this data set into R and report the sum of the numbers in the fourth of the nine columns.

https://d396qusza40orc.cloudfront.net/getdata%2Fwksst8110.for

Original source of the data: http://www.cpc.ncep.noaa.gov/data/indices/wksst8110.for

(Hint this is a fixed width file format)

ff <- tempfile()
cat(file = ff, "123456", "987654", sep = "\n")
read.fwf(ff, widths = c(1,2,3))    #> 1 23 456 \ 9 87 654

##   V1 V2  V3
## 1  1 23 456
## 2  9 87 654

read.fwf(ff, widths = c(1,-2,3))# 负号表示去掉

##   V1  V2
## 1  1 456
## 2  9 654

#"123456", "987654"
read.fwf(ff, widths = list(c(1,0, 2,3), c(2,2,2)))# 0表示NA list表示读成一行。

##   V1 V2 V3  V4 V5 V6 V7
## 1  1 NA 23 456 98 76 54

#本题解法
data <- read.fwf("./data/wksst8110.for", skip = 4, header = FALSE, widths = c(-28,4))
head(data)

##     V1
## 1 25.1
## 2 25.2
## 3 25.3
## 4 25.5
## 5 25.8
## 6 26.1

sum(as.numeric(data[, "V1"]))

## [1] 32426.7