library(DBI)
library(RSQLite)
driver <- dbDriver("SQLite") #引擎
db_file <- "./data/crabtag.sqlite" #数据库文件
conn <- dbConnect(driver, db_file)#新建链接
dbListTables(conn = conn, "idblock") #查询表名称
## [1] "Daylog" "DeploymentNotebook" "IdBlock"
## [4] "LifetimeNotebook" "TagNotebook"
query <- "SELECT * FROM IdBlock" #查询语句
id_block <- dbGetQuery(conn = conn, query)#发送查询语句
id_block
## Tag ID Firmware Version No Firmware Build Level
## 1 A03401 2 70
dbDisconnect(conn) #关闭连接
dbUnloadDriver(driver)#卸载驱动程序
# 所有的过程封装成函数,以免不关连接和卸载程序
#' FUNCTION_TITLE
#'
#' FUNCTION_DESCRIPTION
#' 数据库查询函数封装
#' @param query 查询语句
#' @param db_file 数据库文件地址
#' @return 查询结果
#' @examples
query_crab_tag_db <- function(query, db_file){
driver <- dbDriver("SQLite")
conn <- dbConnect(driver, db_file)
on.exit(
{
#函数结束时运行程序
#即使数据库连接有错误
dbDisconnect(conn)
dbUnloadDriver(driver)
}
)
dbGetQuery(conn, query)
}
query_crab_tag_db(query = "SELECT * FROM IdBlock",
db_file = "./data/crabtag.sqlite")
## Tag ID Firmware Version No Firmware Build Level
## 1 A03401 2 70
library(RMySQL)
ucscDb <- dbConnect(MySQL(),user="genome",
host="genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb,"show databases;")
dbDisconnect(ucscDb)
## [1] TRUE
head(result)
## Database
## 1 information_schema
## 2 ailMel1
## 3 allMis1
## 4 anoCar1
## 5 anoCar2
## 6 anoGam1
hg19 <- dbConnect(MySQL(),user="genome", db="hg19",
host="genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
## [1] 11107
#Connecting to hg19 and listing tables
allTables[1:5]
## [1] "HInv" "HInvGeneMrna" "acembly" "acemblyClass"
## [5] "acemblyPep"
#Get dimensions of a specific table,(colnames )
dbListFields(hg19,"affyU133Plus2")
## [1] "bin" "matches" "misMatches" "repMatches" "nCount"
## [6] "qNumInsert" "qBaseInsert" "tNumInsert" "tBaseInsert" "strand"
## [11] "qName" "qSize" "qStart" "qEnd" "tName"
## [16] "tSize" "tStart" "tEnd" "blockCount" "blockSizes"
## [21] "qStarts" "tStarts"
#Get dimensions of a specific table,(length)
dbGetQuery(hg19, "select count(*) from affyU133Plus2")
## count(*)
## 1 58463
#Read from the table
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData[,c(1,3)], 2)
## bin misMatches
## 1 585 4
## 2 585 17
#https://stackoverflow.com/questions/4084028/how-to-close-resultset-in-rmysql
# Select a specific subset
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- dbFetch(query) #获取数据
affyMisSmall <- fetch(query, n=10) #获取子集合中前10行数据
dbClearResult(query) #关闭查询
## [1] TRUE
quantile(affyMis$misMatches) #百分位函数
## 0% 25% 50% 75% 100%
## 1 1 2 2 3
dim(affyMisSmall)
## [1] 10 22
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
created = h5createFile("example.h5")
created
## [1] FALSE
created = h5createGroup("example.h5","foo")
created = h5createGroup("example.h5","baa")
created = h5createGroup("example.h5","foo/foobaa")
h5ls("example.h5")
## group name otype dclass dim
## 0 / baa H5I_GROUP
## 1 / df H5I_DATASET COMPOUND 5
## 2 / df1 H5I_DATASET COMPOUND 5
## 3 / foo H5I_GROUP
## 4 /foo A H5I_DATASET INTEGER 5 x 2
## 5 /foo foobaa H5I_GROUP
## 6 /foo/foobaa B H5I_DATASET FLOAT 5 x 2 x 2
# Write to groups
A = matrix(1:10,nr=5,nc=2)
h5write(A, "example.h5","foo/A")
B = array(seq(0.1,2.0,by=0.1),dim=c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5","foo/foobaa/B")
h5ls("example.h5")
## group name otype dclass dim
## 0 / baa H5I_GROUP
## 1 / df H5I_DATASET COMPOUND 5
## 2 / df1 H5I_DATASET COMPOUND 5
## 3 / foo H5I_GROUP
## 4 /foo A H5I_DATASET INTEGER 5 x 2
## 5 /foo foobaa H5I_GROUP
## 6 /foo/foobaa B H5I_DATASET FLOAT 5 x 2 x 2
# Write a data set
df1 = data.frame(1L:5L,seq(0,1,length.out=5),
c("ab","cde","fghi","a","s"),
stringsAsFactors=FALSE)
h5write(df1, "example.h5","df1")
h5ls("example.h5")
1 Register an application with the Github API here https://github.com/settings/applications. Access the API to get information on your instructors repositories (hint: this is the url you want “https://api.github.com/users/jtleek/repos”). Use this data to find the time that the datasharing repo was created. What time was it created?
This tutorial may be useful (https://github.com/hadley/httr/blob/master/demo/oauth2-github.r). You may also need to run the code in the base R package and not R studio.
#tutorial code
library(httr)
# 1. Find OAuth settings for github:
# http://developer.github.com/v3/oauth/
oauth_endpoints("github")
## <oauth_endpoint>
## authorize: https://github.com/login/oauth/authorize
## access: https://github.com/login/oauth/access_token
# 2. To make your own application, register at
# https://github.com/settings/developers. Use any URL for the homepage URL
# (http://github.com is fine) and http://localhost:1410 as the callback url
#
# Replace your key and secret below.
myapp <- oauth_app("github",
key = "275a4e65fd6c3ddbe46e",
secret = "64d283aff66e6816446d91b2bca4c9e9007fd9c6")
# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
# 4. Use API
gtoken <- config(token = github_token)
req <- GET("https://api.github.com/rate_limit", gtoken)
#' stop_for_status
#'
#' Converts http errors to R errors or warnings - these should always be used whenever you’re creating requests inside a function, so that the user knows why a request has failed.
#'
#' @param x a response, or numeric http code.
#' @param task The text of the message: either NULL or a character vector.
#'
#' @return RETURN_DESCRIPTION
#' @examples
#' # ADD_EXAMPLES_HERE
stop_for_status(req)
content(req)
## $resources
## $resources$core
## $resources$core$limit
## [1] 5000
##
## $resources$core$remaining
## [1] 4997
##
## $resources$core$reset
## [1] 1523108309
##
##
## $resources$search
## $resources$search$limit
## [1] 30
##
## $resources$search$remaining
## [1] 30
##
## $resources$search$reset
## [1] 1523105035
##
##
## $resources$graphql
## $resources$graphql$limit
## [1] 5000
##
## $resources$graphql$remaining
## [1] 5000
##
## $resources$graphql$reset
## [1] 1523108575
##
##
##
## $rate
## $rate$limit
## [1] 5000
##
## $rate$remaining
## [1] 4997
##
## $rate$reset
## [1] 1523108309
# # OR:
# req <- with_config(gtoken, GET("https://api.github.com/rate_limit"))
# stop_for_status(req)
# content(req)
# 本题解答
library(httr)
library(RJSONIO)
oauth_endpoints("github")
## <oauth_endpoint>
## authorize: https://github.com/login/oauth/authorize
## access: https://github.com/login/oauth/access_token
myapp <- oauth_app("github",
key = "275a4e65fd6c3ddbe46e",
secret = "64d283aff66e6816446d91b2bca4c9e9007fd9c6")
# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
# 4. Use API
gtoken <- config(token = github_token)
req <- GET("https://api.github.com/users/jtleek/repos", gtoken)
# stop_for_status(req)
jtleek_repos_RJSONIO <- RJSONIO::fromJSON(toJSON(content(req))) #list
data <- lapply(jtleek_repos_RJSONIO,
function(x) {
if (x$name =="datasharing"){
time_data <- x$created_at
print(time_data)
}
})
## [1] "2013-11-07T13:25:07Z"
2, The sqldf package allows for execution of SQL commands on R data frames. We will use the sqldf package to practice the queries we might send with the dbSendQuery command in RMySQL.
Download the American Community Survey data and load it into an R object called acs
https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv
Which of the following commands will select only the data for the probability weights pwgtp1 with ages less than 50?
if (!file.exists("data")) {
dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
download.file(fileUrl, destfile = "./data/ss06pid.csv", method = "curl")
dataDownloaded <- data()
#library(readr)
#library(sqldf)
#acs <- read_csv("data/ss06pid.csv", col_names = TRUE)
#select only the data for the probability weights pwgtp1 with ages less than 50
#data <- sqldf("select pwgtp1 from acs where AGEP < 50")
#head(acs[acs[["AGEP"]]<50, c("pwgtp1","AGEP")]) #基本方法
#head(data) #查询方法
3, Using the same data frame you created in the previous problem, what is the equivalent function to unique(acs$AGEP)
#head(unique(acs$AGEP))
# head(sqldf("select distinct AGEP from acs"))
4, How many characters are in the 10th, 20th, 30th and 100th lines of HTML from this page:
http://biostat.jhsph.edu/~jleek/contact.html
(Hint: the nchar() function in R may be helpful)
con <- url("http://biostat.jhsph.edu/~jleek/contact.html")
htmlCode <- readLines(con = con)
str(htmlCode)
## chr [1:180] "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" ...
nchar(htmlCode[c(10, 20, 30, 100)])
## [1] 45 31 7 25
5, Read this data set into R and report the sum of the numbers in the fourth of the nine columns.
https://d396qusza40orc.cloudfront.net/getdata%2Fwksst8110.for
Original source of the data: http://www.cpc.ncep.noaa.gov/data/indices/wksst8110.for
(Hint this is a fixed width file format)
ff <- tempfile()
cat(file = ff, "123456", "987654", sep = "\n")
read.fwf(ff, widths = c(1,2,3)) #> 1 23 456 \ 9 87 654
## V1 V2 V3
## 1 1 23 456
## 2 9 87 654
read.fwf(ff, widths = c(1,-2,3))# 负号表示去掉
## V1 V2
## 1 1 456
## 2 9 654
#"123456", "987654"
read.fwf(ff, widths = list(c(1,0, 2,3), c(2,2,2)))# 0表示NA list表示读成一行。
## V1 V2 V3 V4 V5 V6 V7
## 1 1 NA 23 456 98 76 54
#本题解法
data <- read.fwf("./data/wksst8110.for", skip = 4, header = FALSE, widths = c(-28,4))
head(data)
## V1
## 1 25.1
## 2 25.2
## 3 25.3
## 4 25.5
## 5 25.8
## 6 26.1
sum(as.numeric(data[, "V1"]))
## [1] 32426.7