Data Science(part II)

Getting and Cleaning Data

概述

Raw data -> Processing script -> tidy data

下载

if (!file.exists("data")) {
    dir.create("data")
}
fileUrl <- "yoururl"
download.file(fileUrl, destfile = "./data/XXX.csv", method = "curl")
list.files("./data")
dateDownloaded <- date()

读取本地文件

读取excle文件

library(xlsx)
cameraData <- read.xlsx("./data/cameras.xlsx", sheetIndex = 1, header = TRUE)
head(cameraData)
# read.xlsx2更快不过选行读取时会不稳定 支持底层读取 如字体等
library(XLConnect)
wb <- loadWorkbook("XLConnectExample1.xlsx", create = TRUE)
createSheet(wb, name = "chickSheet")
writeWorksheet(wb, ChickWeight, sheet = "chickSheet", startRow = 3, startCol = 4)
saveWorkbook(wb)
# 支持区域操作 生成报告 图片等

读取XML文件

library(XML)
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
# 读取xml结构
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
# 提取节点
rootNode <- xmlRoot(doc)
# 提取根节点名
xmlName(rootNode)
# 提取子节点名
names(rootNode)
# 提取节点数值
xmlSApply(rootNode, xmlValue)
# 提取节点下属性名为name的数值
xpathSApply(rootNode, "//name", xmlValue)

读取json文件

library(jsonlite)
# 读取json文件
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
# 列出文件名
names(jsonData)
# 可嵌套截取
jsonData$owner$login
# 可将R对象写成json文件
myjson <- toJSON(iris, pretty = TRUE)

读取MySQL数据库

library(RMySQL)
# 读取数据库
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb, "show databases;")
# 断开链接
dbDisconnect(ucscDb)
# 读取指定数据库
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19", host = "genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
# mysql语句查询
dbGetQuery(hg19, "select count(*) from affyU133Plus2")
# 选择子集
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)
quantile(affyMis$misMatches)

读取HDF5数据

library(rhdf5)
created = h5createFile("example.h5")
created = h5createGroup("example.h5", "foo")
created = h5createGroup("example.h5", "baa")
created = h5createGroup("example.h5", "foo/foobaa")
h5ls("example.h5")
A = matrix(1:10, nr = 5, nc = 2)
h5write(A, "example.h5", "foo/A")
B = array(seq(0.1, 2, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")
df = data.frame(1L:5L, seq(0, 1, length.out = 5), c("ab", "cde", "fghi", "a", 
    "s"), stringsAsFactors = FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")
readA = h5read("example.h5", "foo/A")
readB = h5read("example.h5", "foo/foobaa/B")
readdf = h5read("example.h5", "df")

读取网页数据

con = url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode = readLines(con)
close(con)
htmlCode
library(XML)
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
html <- htmlTreeParse(url, useInternalNodes = T)
xpathSApply(html, "//title", xmlValue)
library(httr)
html2 = GET(url)
content2 = content(html2, as = "text")
parsedHtml = htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
GET("http://httpbin.org/basic-auth/user/passwd")
GET("http://httpbin.org/basic-auth/user/passwd", authenticate("user", "passwd"))
google = handle("http://google.com")
pg1 = GET(handle = google, path = "/")
pg2 = GET(handle = google, path = "search")

读取API

myapp = oauth_app("twitter", key = "yourConsumerKeyHere", secret = "yourConsumerSecretHere")
sig = sign_oauth1.0(myapp, token = "yourTokenHere", token_secret = "yourTokenSecretHere")
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)
json1 = content(homeTL)
json2 = jsonlite::fromJSON(toJSON(json1))

读取其他资源

数据截取与排序

library(plyr)
arrange(X, var1)
arrange(X, desc(var1))

数据总结

library(Hmisc)
data$zipGroups = cut2(data$zipCode, g = 4)
table(data$zipGroups)
library(plyr)
# mutate进行数据替换或生成
data2 = mutate(data, zipGroups = cut2(zipCode, g = 4))
table(data2$zipGroups)

数据整理

数据操作data.table包

DT[, list(mean(x), sum(z))]
DT[, table(y)]
DT[, `:=`(w, z^2)]
DT[, `:=`(m, {
    tmp <- (x + z)
    log2(tmp + 5)
})]
DT[, `:=`(a, x > 0)]
DT[, `:=`(b, mean(x + w)), by = a]
DT <- data.table(x = sample(letters[1:3], 1e+05, TRUE))
DT[, .N, by = x]

文本处理

firstElement <- function(x) {
    x[1]
}
sapply(splitNames, firstElement)

日期处理