# Raw data -> Processing script -> tidy data
# Download a remote CSV into ./data and record when it was fetched.

# Create the target directory only when nothing named "data" exists yet.
if (!file.exists("data")) dir.create("data")

# Placeholder address: replace with the real location of the raw data.
fileUrl <- "yoururl"
download.file(url = fileUrl, destfile = "./data/XXX.csv", method = "curl")

# Print the directory contents so the download can be checked by eye.
list.files(path = "./data")

# Keep the retrieval time next to the data.
dateDownloaded <- date()
# Notes on read.table() / read.csv():
#   - read.csv() defaults to sep = "," and header = TRUE
#   - quote:      sets the quoting characters
#   - na.strings: sets the strings that are read as missing values
#   - nrows:      sets how many rows are read
#   - skip:       number of lines skipped at the start of the file

# Reading an Excel sheet with the xlsx package.
library(xlsx)
cameraData <- read.xlsx("./data/cameras.xlsx", sheetIndex = 1, header = TRUE)
head(cameraData)
# read.xlsx2() is faster, but can be unstable when only selected rows are
# read. The package also supports low-level access such as fonts.
library(XLConnect)
# Open the workbook file, passing create = TRUE.
wb <- loadWorkbook(filename = "XLConnectExample1.xlsx", create = TRUE)
# Add a sheet and write the ChickWeight data set into it, with the
# top-left corner placed at row 3, column 4.
createSheet(object = wb, name = "chickSheet")
writeWorksheet(
  object = wb,
  data = ChickWeight,
  sheet = "chickSheet",
  startRow = 3,
  startCol = 4
)
# Persist the workbook to disk.
saveWorkbook(object = wb)
# XLConnect supports region-level operations, report generation,
# images and more.
library(XML)
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
# Read the XML structure. The argument name is spelled out in full
# (useInternalNodes) instead of relying on partial argument matching.
doc <- xmlTreeParse(fileUrl, useInternalNodes = TRUE)
# Extract the root node of the document.
rootNode <- xmlRoot(doc)
# Name of the root node.
xmlName(rootNode)
# Names of the root's child nodes.
names(rootNode)
# Value of each child node of the root.
xmlSApply(rootNode, xmlValue)
# Value of every element called "name" anywhere in the document
# (the XPath "//name" selects elements, not attributes).
xpathSApply(rootNode, "//name", xmlValue)
library(jsonlite)
# Fetch and parse the JSON served at this GitHub API address.
jsonData <- fromJSON(txt = "https://api.github.com/users/jtleek/repos")
# List the names of the parsed result.
names(jsonData)
# Nested components can be extracted by chaining.
jsonData[["owner"]][["login"]]
# R objects can be converted to JSON as well.
myjson <- toJSON(x = iris, pretty = TRUE)
library(RMySQL)
# Connect to the server and list the available databases.
ucscDb <- dbConnect(
  MySQL(),
  user = "genome",
  host = "genome-mysql.cse.ucsc.edu"
)
result <- dbGetQuery(ucscDb, "show databases;")
# Close the connection as soon as the query has returned.
dbDisconnect(ucscDb)

# Connect to one specific database (hg19) and list its tables.
hg19 <- dbConnect(
  MySQL(),
  user = "genome",
  db = "hg19",
  host = "genome-mysql.cse.ucsc.edu"
)
allTables <- dbListTables(hg19)
length(allTables)
# Run a SQL statement directly.
dbGetQuery(hg19, "select count(*) from affyU133Plus2")

# Select a subset with a query, then fetch the rows into R.
query <- dbSendQuery(
  hg19,
  "select * from affyU133Plus2 where misMatches between 1 and 3"
)
affyMis <- fetch(query)
# Release the result set opened by dbSendQuery(); the original never
# cleared it.
dbClearResult(query)
quantile(affyMis$misMatches)
# Close the hg19 connection too, mirroring the handling of ucscDb.
dbDisconnect(hg19)
library(rhdf5)
# Create an HDF5 file and a small hierarchy of groups inside it.
created <- h5createFile("example.h5")
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "baa")
created <- h5createGroup("example.h5", "foo/foobaa")
h5ls("example.h5")

# Write a 5 x 2 matrix under "foo". nrow/ncol are spelled out instead of
# relying on partial matching of `nr` / `nc`.
A <- matrix(1:10, nrow = 5, ncol = 2)
h5write(A, "example.h5", "foo/A")

# Write a 5 x 2 x 2 array, tagged with a "scale" attribute, under
# "foo/foobaa".
B <- array(seq(0.1, 2, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")

# Write a data frame at the top level of the file.
df <- data.frame(1L:5L, seq(0, 1, length.out = 5), c("ab", "cde", "fghi", "a",
  "s"), stringsAsFactors = FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")

# Read the three objects back.
readA <- h5read("example.h5", "foo/A")
readB <- h5read("example.h5", "foo/foobaa/B")
readdf <- h5read("example.h5", "df")
# Read the raw HTML of a web page through a URL connection.
con <- url(description = "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode <- readLines(con = con)
# Close the connection once the lines have been read.
close(con)
# Print the downloaded lines.
htmlCode
library(XML)
# Parse the page with the XML package and extract its <title> text.
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
# TRUE is written out: `T` is an ordinary variable that can be reassigned.
html <- htmlTreeParse(url, useInternalNodes = TRUE)
xpathSApply(html, "//title", xmlValue)
library(httr)
# Request the page with httr, take the body as text, parse it and
# extract the <title> text. `url` must already hold the page address.
html2 <- GET(url = url)
content2 <- content(html2, as = "text")
parsedHtml <- htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)

# The same endpoint requested first without and then with credentials.
GET(url = "http://httpbin.org/basic-auth/user/passwd")
GET(
  url = "http://httpbin.org/basic-auth/user/passwd",
  config = authenticate(user = "user", password = "passwd")
)

# One handle is created and passed to both requests.
google <- handle(url = "http://google.com")
pg1 <- GET(handle = google, path = "/")
pg2 <- GET(handle = google, path = "search")

# Sign a request with OAuth 1.0; all four credentials are placeholders.
myapp <- oauth_app(
  appname = "twitter",
  key = "yourConsumerKeyHere",
  secret = "yourConsumerSecretHere"
)
sig <- sign_oauth1.0(
  app = myapp,
  token = "yourTokenHere",
  token_secret = "yourTokenSecretHere"
)
homeTL <- GET(
  url = "https://api.twitter.com/1.1/statuses/home_timeline.json",
  config = sig
)
# Round-trip the response content through JSON text.
json1 <- content(homeTL)
json2 <- jsonlite::fromJSON(toJSON(json1))
# 图片
# GIS
# 声音
# Subsetting and sorting helpers listed in the notes:
#   - `$`   (listed without a description)
#   - seq:   generates sequences
#   - `[`:   subsets by row, by column, or by condition
#   - which: returns row numbers
#   - sort, order
library(plyr)
# NOTE(review): X and var1 are not defined in this file. Presumably a data
# frame and one of its columns; confirm before running.
# Order X by var1.
arrange(X, var1)
# Order X by var1, descending.
arrange(X, desc(var1))
# Summarising helpers listed in the notes:
#   - head / tail:    look at the data
#   - summary / str:  summarise the data
#   - quantile:       summarise a vector by quantiles
#   - table:          summarise a vector by element frequency
#   - sum(is.na(data)), any(is.na(data)), all(data$x > 0):
#                     summarise abnormal values
#   - colSums(is.na(data)):               row / column sums
#   - table(data$x %in% c("21212")):      count occurrences of given values
#   - xtabs / ftable:                     build contingency tables
#   - print(object.size(fakeData), units = "Mb"): show the size of the data
#   - cut:            create a categorical variable by setting breaks
library(Hmisc)
# NOTE(review): `data` (with a zipCode column) is not defined in this file;
# it must exist before this runs.
# Group zipCode with cut2(), asking for g = 4 groups.
data$zipGroups <- cut2(data$zipCode, g = 4)
table(data$zipGroups)
library(plyr)
# mutate() replaces or creates columns; here it builds the same grouping
# in a new data frame.
data2 <- mutate(data, zipGroups = cut2(zipCode, g = 4))
table(data2$zipGroups)
# Reshaping and merging helpers listed in the notes:
#   - melt:    melts data (reshape2 package)
#   - dcast:   grouped summary returned as a data frame
#   - acast:   grouped summary returned as a vector / array
#   - arrange: sort by the named variables
#   - merge:   merge data on the specified vectors
#   - join:    can also be used for merging
#   - data.frame / key: a key can designate a factor variable so the rows
#     of a group are extracted quickly
#     (NOTE(review): the notes write "data.frame" here; keys look like a
#     data.table feature, so confirm the intended heading.)

# data.table() and `:=` below come from data.table, which the original
# never loaded.
library(data.table)

# NOTE(review): the next statements use a DT with columns x, y and z that
# is not created anywhere in this file; the DT built further down only has
# a character column x. Confirm where the earlier DT comes from.
# Summaries computed inside the table.
DT[, list(mean(x), sum(z))]
DT[, table(y)]
# `:=` creates a new variable and can perform simple calculations.
DT[, w := z^2]
# A braced block may be used when the new column needs several steps.
DT[, m := {
  tmp <- (x + z)
  log2(tmp + 5)
}]
DT[, a := x > 0]
# Grouped assignment: the mean is computed within each value of a.
DT[, b := mean(x + w), by = a]

# .N counts the rows of each group.
DT <- data.table(x = sample(letters[1:3], 1e+05, TRUE))
DT[, .N, by = x]
# String helpers listed in the notes: tolower, toupper, strsplit.

#' Return the first element of a vector or list.
#'
#' @param x A vector or list.
#' @return `x[1]`, i.e. a length-one subset of `x` of the same type. For a
#'   zero-length atomic vector this is `NA`; for a list it is a
#'   one-element list.
firstElement <- function(x) {
  x[1]
}
# Apply firstElement to every element of splitNames.
# NOTE(review): splitNames is not defined in this file. Presumably it is
# the result of a strsplit() call; confirm.
sapply(X = splitNames, FUN = firstElement)
# sub gsub
# grep(返回行号) grepl(返回逻辑值)
# stringr
# paste0 不带空格
# str_trim 去除空格
# ^ 匹配开头
# $ 匹配结尾
# [] 匹配大小写 ^在开头表示非
# . 匹配任意字符
# | 匹配或
# () 匹配与
# ? 匹配可选择
# * 匹配任意
# + 匹配至少一个
# {} 匹配其中最小最大 一个值表示精确匹配 m,表示至少m次匹配
# \1 匹配前面指代
# format 处理日期格式
# %d 日 %a 周缩写 %A 周 %m 月 %b 月缩写 %B 月全名 %y 2位年 %Y 4位年
# weekdays 显示星期
# months 显示月份
# julian 显示自1970年1月1日以来的天数
# ymd mdy dmy ymd_hms
# Sys.timezone