if (!file.exists("data")) {
dir.create("data")
}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD&bom=true&format=true"
download.file(fileUrl, destfile = "./data/cameras.xlsx", method = "curl")
dataDownloaded <- data()
#使用readr
#Rstudio自身的import dataset功能用的就是readr
library(readr)
camerasData <- read_csv("data/cameras.xlsx", col_types = "cccccc")
head(camerasData, 2)
## # A tibble: 2 x 6
## address direction street crossStreet intersection `Location 1`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 S CATON AV~ N/B Caton ~ Benson Ave Caton Ave & B~ (39.2693779962~
## 2 S CATON AV~ S/B Caton ~ Benson Ave Caton Ave & B~ (39.2693157898~
#read specific rows and colummns -- col_types
#读取前两行
camerasData_1 <- read_csv("data/cameras.xlsx", col_types = "cc")
#排除某些行cols( ?= col_skip)
camerasData_2 <- read_csv("data/cameras.xlsx",
col_types = cols(address = col_skip(), street = col_skip()
)
)
head(camerasData_2, 2)
#使用xlsx包,
#安装xlsz包之前,安装rJava和xlsxjars
library(xlsx)
#此处报错java.lang.IllegalArgumentException: Your InputStream was neither an OLE2 str
#解决方法:将cameras.xlsx改成cameras1.xls,然后用excel打开,再改成cameras2.xlsx
#推荐用readr包
cameraDataSubset <- read.xlsx("data/cameras2.xlsx",
sheetIndex = 1,
colIndex = 2:3,
rowIndex = 1:4
)
cameraDataSubset
## direction street
## 1 N/B Caton Ave
## 2 S/B Caton Ave
## 3 E/B Wilkens Ave
data
#see all the datasets that are available in the packages
data()
#a more complete list
data(package = .packages(TRUE))
# access the data in any of these datasets
openData <- data("GVTable3", package = "AlgDesign")
head(openData)
read.table(read.csv)
library(learningr)
# system.file func locateS files that are inside a package
# system.file("文件夹","data数据集","包名称")
deer_file <- system.file(
"extdata",
"RedDeerEndocranialVolume.dlm",
package = "learningr"
)
## fill = TRUE makes read.table substitute NA value for the missing fields
#' FUNCTION_TITLE
#' read.table 参数
#' @param file DESCRIPTION.
#' @param header 列名.
#' @param sep 间隔符号.
#' @param quote DESCRIPTION.
#' @param dec DESCRIPTION.
#' @param numerals DESCRIPTION.
#' @param row.names 行名称.
#' @param col.names 列名称.
#' @param as.is DESCRIPTION.
#' @param na.strings 传递缺失值SQL,na.strings = "NULL",SAS orStata, na.strings = "." ,Excel na.strings = c("", "#N/A", "#DIV/0!", "#NUM!")
#' @param colClasses 列类别.
#' @param nrows 读取行数.
#' @param skip 跳过列.
#' @param check.names DESCRIPTION.
#' @param fill 空值区域用NA代替.
#' @param strip.white DESCRIPTION.
#' @param blank.lines.skip DESCRIPTION.
#' @param comment.char DESCRIPTION.
#' @param allowEscapes DESCRIPTION.
#' @param flush DESCRIPTION.
#' @param stringsAsFactors DESCRIPTION.
#' @param fileEncoding DESCRIPTION.
#' @param encoding DESCRIPTION.
#' @param text DESCRIPTION.
#' @param skipNul DESCRIPTION.
#'
#' @return RETURN_DESCRIPTION
#' @examples
#' # ADD_EXAMPLES_HERE
deer_data <- read.table(deer_file, header = TRUE, fill = TRUE)
str(deer_data, vec.len = 1) #vec.len alters the amount of output
## 'data.frame': 33 obs. of 8 variables:
## $ SkullID : Factor w/ 33 levels "A4","B11","B12",..: 14 2 ...
## $ VolCT : int 389 389 ...
## $ VolBead : int 375 370 ...
## $ VolLWH : int 1484 1722 ...
## $ VolFinarelli: int 337 377 ...
## $ VolCT2 : int NA NA ...
## $ VolBead2 : int NA NA ...
## $ VolLWH2 : int NA NA ...
write.table(write.csv)
write.csv(deer_data, "./data/deer_data.csv", row.names = FALSE)
#write(文件名,文件路径,写入的参数设置)
readLines WriteLines
the_tempest <- readLines("./data/testread.txt")
the_tempest[1] <- "keep moving"
writeLines(
the_tempest,
"./data/testwrite.txt"
)
# using internal nodes (that is, objects
# are stored with C code, the default) or #R nodes.
## using internal nodes (can use XPath 查询)
library(XML)
xml_file <- system.file("extdata", "options.xml", package = "learningr")
r_options <- xmlParse(xml_file)
# head(r_options) #Error in `[.XMLInternalDocument`(x, seq_len(n))
xpathSApply(r_options, "//variable[contains(@name, 'warn')]")
## [[1]]
## <variable name="nwarnings" type="numeric">
## <value>50</value>
## </variable>
##
## [[2]]
## <variable name="warn" type="numeric">
## <value>0</value>
## </variable>
##
## [[3]]
## <variable name="warning_length" type="numeric">
## <value>1000</value>
## </variable>
## R-level nodes ("head" "str" can work)
#xmlParse(xml_file, useInternalNodes = FALSE )
#xmlTreeParse(xml_file) #the same
#/node Top level node
#//node at any level
#node[@attr-name] Node with an attribute name attr-name= "bob"
rootNode <- xmlRoot(r_options)
xmlName(rootNode)
## [1] "root"
head(names(rootNode), 2)
## variable variable
## "variable" "variable"
fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternalNodes = TRUE)
scores <- xpathSApply(doc, "//div[@class ='score']", xmlValue)
head(scores, 4)
## [1] "20-0" "24-10" "44-7" "26-9"
library(stringr)
teams <- xpathSApply(doc,"//div[@class='game-info']", xmlValue)
teams_deal <- vector(mode = "character", length = length(teams))
n <- 1
for (i in teams){
teams_deal[n] <- str_split(i, " ")[[1]][3]
n <- n + 1
}
head(teams_deal, 4)
## [1] "Bengals" "Browns" "Jaguars" "Steelers"
RJSONIO and rjson#use RJSONIO and rjson packages
library(RJSONIO)
library(rjson)
jamaican_city_file <- system.file(
"extdata",
"Jamaican Cities.json",
package = "learningr")
#::are used to distinguish which package each function should be taken from
jamaican_cities_RJSONIO <- RJSONIO::fromJSON(jamaican_city_file)
jamaican_cities_rjson <- rjson::fromJSON(file = jamaican_city_file
#JSON spec doesn’t allow infinite or NaN values,
## RJSONIO maps NaN and NA to JSON’s null value
## but preserves positive and negative infinity,
special_numbers <- c(NaN, NA, Inf, -Inf)
RJSONIO::toJSON(special_numbers)
## [1] "[ null, null, Infinity, - Infinity ]"
#rjson converts all these values to strings
rjson::toJSON(special_numbers)
## [1] "[\"NaN\",\"NA\",\"Inf\",\"-Inf\"]"
jsonlitelibrary(jsonlite)
jsonData <- jsonlite::fromJSON("https://api.github.com/users/jtleek/repos")
head(names(jsonData))
## [1] "id" "name" "full_name" "owner" "private" "html_url"
head(jsonData$name)
## [1] "advdatasci" "advdatasci-project" "advdatasci-swirl"
## [4] "advdatasci15" "advdatasci16" "advdatasci_swirl"
head(names(jsonData$owner))
## [1] "login" "id" "avatar_url" "gravatar_id" "url"
## [6] "html_url"
toJSON
library(data.table)
DF = data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DF,3)
## x y z
## 1 -0.03499954 a -0.5006231
## 2 -0.89593855 a -0.1174095
## 3 0.73458340 a -0.2020225
DT = data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DT,3)
## x y z
## 1: 0.1270484 a 0.6307191
## 2: 2.2383929 a 0.2983171
## 3: 0.9396463 a 0.1547205
## See all the data tables in memory
tables()
## NAME NROW NCOL MB COLS KEY
## [1,] DT 9 3 1 x,y,z
## Total: 1MB
## Subsetting rows
DT[2,]
## x y z
## 1: 2.238393 a 0.2983171
DT[DT$y=="a",]
## x y z
## 1: 0.1270484 a 0.6307191
## 2: 2.2383929 a 0.2983171
## 3: 0.9396463 a 0.1547205
## Subsetting rows
DT[c(2,3)]
## x y z
## 1: 2.2383929 a 0.2983171
## 2: 0.9396463 a 0.1547205
## Subsetting columns
DT[,c(2,3)]
## y z
## 1: a 0.63071906
## 2: a 0.29831714
## 3: a 0.15472053
## 4: b 0.59425949
## 5: b -0.28911627
## 6: b 1.62790927
## 7: c -0.09997734
## 8: c 1.34110247
## 9: c 0.46024374
{
x = 1
y = 2
}
k = {print(10); 5}
## [1] 10
print(k)
## [1] 5
## Calculating values for variables with expressions
DT[,list(mean(x),sum(z))]
## V1 V2
## 1: 0.3121628 4.718178
DT[,table(y)]
## y
## a b c
## 3 3 3
# adding new columns
DT[,W:=z^2]
DT
## x y z W
## 1: 0.1270484 a 0.63071906 0.397806538
## 2: 2.2383929 a 0.29831714 0.088993113
## 3: 0.9396463 a 0.15472053 0.023938444
## 4: 0.9460164 b 0.59425949 0.353144347
## 5: 0.4465410 b -0.28911627 0.083588217
## 6: -0.8062434 b 1.62790927 2.650088584
## 7: -0.5899296 c -0.09997734 0.009995468
## 8: -0.6806829 c 1.34110247 1.798555841
## 9: 0.1886763 c 0.46024374 0.211824304
DT2 <- DT
DT[, y:= 2]
head(DT,n=3)
## x y z W
## 1: 0.1270484 2 0.6307191 0.39780654
## 2: 2.2383929 2 0.2983171 0.08899311
## 3: 0.9396463 2 0.1547205 0.02393844
##复制版本也跟着变化
head(DT2,n=3)
## x y z W
## 1: 0.1270484 2 0.6307191 0.39780654
## 2: 2.2383929 2 0.2983171 0.08899311
## 3: 0.9396463 2 0.1547205 0.02393844
## Multiple operations
DT[,m:= {tmp <- (x+z); log2(tmp+5)}]
DT
## x y z W m
## 1: 0.1270484 2 0.63071906 0.397806538 2.525510
## 2: 2.2383929 2 0.29831714 0.088993113 2.913935
## 3: 0.9396463 2 0.15472053 0.023938444 2.607476
## 4: 0.9460164 2 0.59425949 0.353144347 2.709351
## 5: 0.4465410 2 -0.28911627 0.083588217 2.366651
## 6: -0.8062434 2 1.62790927 2.650088584 2.541432
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719
## 8: -0.6806829 2 1.34110247 1.798555841 2.500909
## 9: 0.1886763 2 0.46024374 0.211824304 2.497975
## plyr like operations
DT[,a:=x>0]
DT
## x y z W m a
## 1: 0.1270484 2 0.63071906 0.397806538 2.525510 TRUE
## 2: 2.2383929 2 0.29831714 0.088993113 2.913935 TRUE
## 3: 0.9396463 2 0.15472053 0.023938444 2.607476 TRUE
## 4: 0.9460164 2 0.59425949 0.353144347 2.709351 TRUE
## 5: 0.4465410 2 -0.28911627 0.083588217 2.366651 TRUE
## 6: -0.8062434 2 1.62790927 2.650088584 2.541432 FALSE
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719 FALSE
## 8: -0.6806829 2 1.34110247 1.798555841 2.500909 FALSE
## 9: 0.1886763 2 0.46024374 0.211824304 2.497975 TRUE
DT[, b:= mean(x+W),by=a]
DT
## x y z W m a b
## 1: 0.1270484 2 0.63071906 0.397806538 2.525510 TRUE 1.007603
## 2: 2.2383929 2 0.29831714 0.088993113 2.913935 TRUE 1.007603
## 3: 0.9396463 2 0.15472053 0.023938444 2.607476 TRUE 1.007603
## 4: 0.9460164 2 0.59425949 0.353144347 2.709351 TRUE 1.007603
## 5: 0.4465410 2 -0.28911627 0.083588217 2.366651 TRUE 1.007603
## 6: -0.8062434 2 1.62790927 2.650088584 2.541432 FALSE 0.793928
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719 FALSE 0.793928
## 8: -0.6806829 2 1.34110247 1.798555841 2.500909 FALSE 0.793928
## 9: 0.1886763 2 0.46024374 0.211824304 2.497975 TRUE 1.007603
.N An integer, length 1, containing the number of elements of a factor level
set.seed(123);
DT <- data.table(x=sample(letters[1:3], 1E5, TRUE))
DT[, .N, by=x] #计数
## x N
## 1: a 33387
## 2: c 33201
## 3: b 33412
##选取直接速度比data.frame快
DT <- data.table(x=rep(c("a","b","c"),each=100), y=rnorm(300))
setkey(DT, x)
head(DT['a'], 10) #选取x = a的行
## x y
## 1: a 0.25958973
## 2: a 0.91751072
## 3: a -0.72231834
## 4: a -0.80828402
## 5: a -0.14135202
## 6: a 2.25701345
## 7: a -2.37955015
## 8: a -0.45425393
## 9: a -0.06007418
## 10: a 0.86090061
##合并
DT1 <- data.table(x=c('a', 'a', 'b', 'dt1'), y=1:4)
DT2 <- data.table(x=c('a', 'b', 'dt2'), z=5:7)
setkey(DT1, x); setkey(DT2, x)
merge(DT1, DT2) #合并x的值相等的行
## x y z
## 1: a 1 5
## 2: a 2 5
## 3: b 3 6
library(data.table)
big_df <- data.frame(x=rnorm(1E6), y=rnorm(1E6))
file <- tempfile()
write.table(big_df, file=file, row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE)
system.time(fread(file))
## user system elapsed
## 1.73 0.01 1.86
system.time(read.table(file, header=TRUE, sep="\t"))
## user system elapsed
## 6.80 0.13 7.02
1,The American Community Survey distributes downloadable data about United States communities.Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here :load the data into R. The code book, describing the variable names is here:
How many properties are worth $1,000,000 or more?
#下载数据
if (!file.exists("data")) {
dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(fileUrl, destfile = "./data/Fss06hid.csv", method = "curl")
dataDownloaded <- data()
#读取数据
library(readr)
housesData <- read_csv("./data/Fss06hid.csv", col_types = cols_only(VAL = col_integer()), col_names = TRUE)
head(housesData)
## # A tibble: 6 x 1
## VAL
## <int>
## 1 17
## 2 NA
## 3 18
## 4 19
## 5 20
## 6 15
#找出数据大于$1,000,000即VAL == 24
## 去除NA并选择子集
nrow(housesData[!is.na(housesData$VAL) & housesData$VAL == 24, ])
## [1] 53
2,Use the data you loaded from Question 1. Consider the variable FES in the code book. Which of the “tidy data” principles does this variable violate?
Each tidy data table contains information about only one type of observation.
Tidy data has one variable per column.
Tidy data has one observation per row.
Each variable in a tidy data set has been transformed to be interpretable.
3, Download the Excel spreadsheet on Natural Gas Aquisition Program here: Read rows 18-23 and columns 7-15 into R and assign the result to a variable called:dat
What is the value of:sum(dat$Zip*dat$Ext,na.rm=T) (original data source: (http://catalog.data.gov/dataset/natural-gas-acquisition-program)
if (!file.exists("data")) {
dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx"
download.file(fileUrl, destfile = "./data/gov_NGAP.xlsx", method = "curl")
dataDownloaded <- data()
#使用xlsx包,
library(xlsx)
NGAP_dataSubset <- read.xlsx("data/gov_NGAP.xlsx",
sheetIndex = 1,
colIndex = 7:15,
rowIndex = 18:23
)
dat <- NGAP_dataSubset
dat
## Zip CuCurrent PaCurrent PoCurrent Contact Ext Fax email
## 1 74136 0 1 0 918-491-6998 0 918-491-6659 NA
## 2 30329 1 0 0 404-321-5711 NA <NA> NA
## 3 74136 1 0 0 918-523-2516 0 918-523-2522 NA
## 4 80203 0 1 0 303-864-1919 0 <NA> NA
## 5 80120 1 0 0 345-098-8890 456 <NA> NA
## Status
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
sum(dat$Zip*dat$Ext,na.rm=T)
## [1] 36534720
4, Read the XML data on Baltimore restaurants from here: How many restaurants have zipcode 21231?
library(XML)
fileUrl <- "./data/Frestaurants.xml"
doc <- xmlParse(fileUrl)
xmltop = xmlRoot(doc) #根目录
art = xmltop[[1]] #1级目录
#一级目录里每一个子目录x,取x的第二个参数
data <- xmlSApply(art, function(x) xmlSApply(x[[2]], xmlValue))
length(data[data == "21231"])
## [1] 127
5,The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here using the fread() command load the data into an R object. The following are ways to calculate the average value of the variable broken down by sex. Using the data.table package, which will deliver the fastest user
if (!file.exists("data")) {
dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
download.file(fileUrl, destfile = "./data/ss06pid.csv", method = "curl")
dataDownloaded <- data()
require(data.table)
file <- "./data/ss06pid.csv"
DT <- fread(file)
tapply(DT$pwgtp15, DT$SEX, mean)
sapply(split(DT$pwgtp15, DT$SEX), mean)
mean(DT$pwgtp15,by = DT$SEX)
mean(DT[DT$SEX==1,]$pwgtp15); mean(DT[DT$SEX==2,]$pwgtp15)
rowMeans(DT)[DT$SEX==1];rowMeans(DT)[DT$SEX==2]
DT[,mean(pwgtp15),by = SEX]