1 Download the file to load

if (!file.exists("data")) {
  dir.create("data")
}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD&bom=true&format=true"
download.file(fileUrl, destfile = "./data/cameras.xlsx", method = "curl")
dataDownloaded <- data()
#使用readr
#Rstudio自身的import dataset功能用的就是readr
library(readr)
camerasData <- read_csv("data/cameras.xlsx", col_types = "cccccc")
head(camerasData, 2)
## # A tibble: 2 x 6
##   address     direction street  crossStreet intersection   `Location 1`   
##   <chr>       <chr>     <chr>   <chr>       <chr>          <chr>          
## 1 S CATON AV~ N/B       Caton ~ Benson Ave  Caton Ave & B~ (39.2693779962~
## 2 S CATON AV~ S/B       Caton ~ Benson Ave  Caton Ave & B~ (39.2693157898~
#read specific rows and colummns -- col_types
#读取前两行
camerasData_1 <- read_csv("data/cameras.xlsx", col_types = "cc")

#排除某些行cols( ?= col_skip)
camerasData_2 <- read_csv("data/cameras.xlsx",
                          col_types = cols(address = col_skip(), street = col_skip()
                                           )
                          )
head(camerasData_2, 2)
#使用xlsx包,
#安装xlsz包之前,安装rJava和xlsxjars
library(xlsx)
#此处报错java.lang.IllegalArgumentException: Your InputStream was neither an OLE2 str
#解决方法:将cameras.xlsx改成cameras1.xls,然后用excel打开,再改成cameras2.xlsx
#推荐用readr包
cameraDataSubset <- read.xlsx("data/cameras2.xlsx", 
                              sheetIndex = 1, 
                              colIndex = 2:3, 
                              rowIndex = 1:4
                              )
cameraDataSubset
##   direction      street
## 1       N/B   Caton Ave
## 2       S/B   Caton Ave
## 3       E/B Wilkens Ave

2 access datasets provided with R packages

data

#see all the datasets that are available in the packages
data()

#a more complete list
data(package = .packages(TRUE))

# access the data in any of these datasets
openData <- data("GVTable3", package = "AlgDesign")
head(openData)

3 Reading Text Files

3.1 CSV and Tab-Delimited Files

read.table(read.csv)

library(learningr)
# system.file func locateS files that are inside a package
# system.file("文件夹","data数据集","包名称")
deer_file <- system.file(
  "extdata",
  "RedDeerEndocranialVolume.dlm",
  package = "learningr"
)
## fill = TRUE makes read.table substitute NA value for the missing fields
#' FUNCTION_TITLE
#' read.table 参数
#' @param file DESCRIPTION.
#' @param header 列名.
#' @param sep 间隔符号.
#' @param quote DESCRIPTION.
#' @param dec DESCRIPTION.
#' @param numerals DESCRIPTION.
#' @param row.names 行名称.
#' @param col.names 列名称.
#' @param as.is DESCRIPTION.
#' @param na.strings 传递缺失值SQL,na.strings = "NULL",SAS orStata, na.strings  =  "." ,Excel na.strings = c("", "#N/A", "#DIV/0!", "#NUM!")
#' @param colClasses 列类别.
#' @param nrows 读取行数.
#' @param skip 跳过列.
#' @param check.names DESCRIPTION.
#' @param fill 空值区域用NA代替.
#' @param strip.white DESCRIPTION.
#' @param blank.lines.skip DESCRIPTION.
#' @param comment.char DESCRIPTION.
#' @param allowEscapes DESCRIPTION.
#' @param flush DESCRIPTION.
#' @param stringsAsFactors DESCRIPTION.
#' @param fileEncoding DESCRIPTION.
#' @param encoding DESCRIPTION.
#' @param text DESCRIPTION.
#' @param skipNul DESCRIPTION.
#'
#' @return RETURN_DESCRIPTION
#' @examples
#' # ADD_EXAMPLES_HERE
deer_data <- read.table(deer_file, header = TRUE, fill = TRUE)

str(deer_data, vec.len = 1) #vec.len alters the amount of output
## 'data.frame':    33 obs. of  8 variables:
##  $ SkullID     : Factor w/ 33 levels "A4","B11","B12",..: 14 2 ...
##  $ VolCT       : int  389 389 ...
##  $ VolBead     : int  375 370 ...
##  $ VolLWH      : int  1484 1722 ...
##  $ VolFinarelli: int  337 377 ...
##  $ VolCT2      : int  NA NA ...
##  $ VolBead2    : int  NA NA ...
##  $ VolLWH2     : int  NA NA ...

write.table(write.csv)

write.csv(deer_data, "./data/deer_data.csv", row.names = FALSE)

#write(文件名,文件路径,写入的参数设置)

3.2 Unstructured Text Files

readLines WriteLines

the_tempest <- readLines("./data/testread.txt")
the_tempest[1] <- "keep moving"
writeLines(
  the_tempest,   
  "./data/testwrite.txt"
)

4 XML and HTML Files

4.1 XML

# using internal nodes (that is, objects 
# are stored with C code, the default) or #R nodes.
## using internal nodes (can use XPath 查询)

library(XML)
xml_file <- system.file("extdata", "options.xml", package = "learningr")
r_options <- xmlParse(xml_file)
# head(r_options)  #Error in `[.XMLInternalDocument`(x, seq_len(n))
xpathSApply(r_options, "//variable[contains(@name, 'warn')]")
## [[1]]
## <variable name="nwarnings" type="numeric">
##   <value>50</value>
## </variable> 
## 
## [[2]]
## <variable name="warn" type="numeric">
##   <value>0</value>
## </variable> 
## 
## [[3]]
## <variable name="warning_length" type="numeric">
##   <value>1000</value>
## </variable>
## R-level nodes ("head" "str" can work)

#xmlParse(xml_file, useInternalNodes = FALSE )
#xmlTreeParse(xml_file)      #the same

#/node  Top level node
#//node  at any level
#node[@attr-name] Node with an attribute name attr-name= "bob"
rootNode <- xmlRoot(r_options)
xmlName(rootNode)
## [1] "root"
head(names(rootNode), 2)
##   variable   variable 
## "variable" "variable"

4.2 HTML

fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternalNodes = TRUE)
scores <- xpathSApply(doc, "//div[@class ='score']", xmlValue)
head(scores, 4)
## [1] "20-0"  "24-10" "44-7"  "26-9"
library(stringr)
teams <- xpathSApply(doc,"//div[@class='game-info']", xmlValue)
teams_deal <- vector(mode = "character", length = length(teams))
n <- 1
for (i in teams){
    teams_deal[n] <- str_split(i, " ")[[1]][3]
    n <- n + 1
}
head(teams_deal, 4)
## [1] "Bengals"  "Browns"   "Jaguars"  "Steelers"

5 JSON and YAML

5.1 RJSONIO and rjson

#use RJSONIO and rjson packages
library(RJSONIO)
library(rjson)
jamaican_city_file <- system.file(
    "extdata",
    "Jamaican Cities.json",
    package = "learningr")
#::are used to distinguish which package each function should be taken from
jamaican_cities_RJSONIO <- RJSONIO::fromJSON(jamaican_city_file)
jamaican_cities_rjson <- rjson::fromJSON(file = jamaican_city_file
#JSON spec doesn’t allow infinite or NaN values, 

## RJSONIO maps NaN and NA to JSON’s null value 
## but preserves positive and negative infinity,
special_numbers <- c(NaN, NA, Inf, -Inf)
RJSONIO::toJSON(special_numbers)
## [1] "[ null, null,   Infinity,  - Infinity ]"
#rjson converts all these values to strings
rjson::toJSON(special_numbers)
## [1] "[\"NaN\",\"NA\",\"Inf\",\"-Inf\"]"

5.2 jsonlite

library(jsonlite)
jsonData <- jsonlite::fromJSON("https://api.github.com/users/jtleek/repos")
head(names(jsonData))
## [1] "id"        "name"      "full_name" "owner"     "private"   "html_url"
head(jsonData$name)
## [1] "advdatasci"         "advdatasci-project" "advdatasci-swirl"  
## [4] "advdatasci15"       "advdatasci16"       "advdatasci_swirl"
head(names(jsonData$owner))
## [1] "login"       "id"          "avatar_url"  "gravatar_id" "url"        
## [6] "html_url"

5.3 Writing data frames to JSON

toJSON

6 data.table

6.1 what is data.table?

  • Inherets from data.frame
    • All functions that accept data.frame work on data.table
  • Written in C so it is much faster
  • Much, much faster at subsetting, group, and updating
library(data.table)
DF = data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DF,3)
##             x y          z
## 1 -0.03499954 a -0.5006231
## 2 -0.89593855 a -0.1174095
## 3  0.73458340 a -0.2020225
DT = data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DT,3)
##            x y         z
## 1: 0.1270484 a 0.6307191
## 2: 2.2383929 a 0.2983171
## 3: 0.9396463 a 0.1547205
## See all the data tables in memory
tables()
##      NAME NROW NCOL MB COLS  KEY
## [1,] DT      9    3  1 x,y,z    
## Total: 1MB
## Subsetting rows
DT[2,]
##           x y         z
## 1: 2.238393 a 0.2983171
DT[DT$y=="a",]
##            x y         z
## 1: 0.1270484 a 0.6307191
## 2: 2.2383929 a 0.2983171
## 3: 0.9396463 a 0.1547205
## Subsetting rows
DT[c(2,3)]
##            x y         z
## 1: 2.2383929 a 0.2983171
## 2: 0.9396463 a 0.1547205
## Subsetting columns
DT[,c(2,3)]
##    y           z
## 1: a  0.63071906
## 2: a  0.29831714
## 3: a  0.15472053
## 4: b  0.59425949
## 5: b -0.28911627
## 6: b  1.62790927
## 7: c -0.09997734
## 8: c  1.34110247
## 9: c  0.46024374

6.2 Column subsetting in data.table

  • The subsetting function is modified for data.table
  • The argument you pass after the comma is called an “expression”
  • In R an expression is a collection of statements enclosed in curley brackets
{
  x = 1
  y = 2
}
k = {print(10); 5}
## [1] 10
print(k)
## [1] 5
## Calculating values for variables with expressions
DT[,list(mean(x),sum(z))]
##           V1       V2
## 1: 0.3121628 4.718178
DT[,table(y)]
## y
## a b c 
## 3 3 3
# adding new columns
DT[,W:=z^2]
DT
##             x y           z           W
## 1:  0.1270484 a  0.63071906 0.397806538
## 2:  2.2383929 a  0.29831714 0.088993113
## 3:  0.9396463 a  0.15472053 0.023938444
## 4:  0.9460164 b  0.59425949 0.353144347
## 5:  0.4465410 b -0.28911627 0.083588217
## 6: -0.8062434 b  1.62790927 2.650088584
## 7: -0.5899296 c -0.09997734 0.009995468
## 8: -0.6806829 c  1.34110247 1.798555841
## 9:  0.1886763 c  0.46024374 0.211824304
DT2 <- DT
DT[, y:= 2]
head(DT,n=3)
##            x y         z          W
## 1: 0.1270484 2 0.6307191 0.39780654
## 2: 2.2383929 2 0.2983171 0.08899311
## 3: 0.9396463 2 0.1547205 0.02393844
##复制版本也跟着变化
head(DT2,n=3)
##            x y         z          W
## 1: 0.1270484 2 0.6307191 0.39780654
## 2: 2.2383929 2 0.2983171 0.08899311
## 3: 0.9396463 2 0.1547205 0.02393844
## Multiple operations
DT[,m:= {tmp <- (x+z); log2(tmp+5)}]
DT
##             x y           z           W        m
## 1:  0.1270484 2  0.63071906 0.397806538 2.525510
## 2:  2.2383929 2  0.29831714 0.088993113 2.913935
## 3:  0.9396463 2  0.15472053 0.023938444 2.607476
## 4:  0.9460164 2  0.59425949 0.353144347 2.709351
## 5:  0.4465410 2 -0.28911627 0.083588217 2.366651
## 6: -0.8062434 2  1.62790927 2.650088584 2.541432
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719
## 8: -0.6806829 2  1.34110247 1.798555841 2.500909
## 9:  0.1886763 2  0.46024374 0.211824304 2.497975
## plyr like operations
DT[,a:=x>0]
DT
##             x y           z           W        m     a
## 1:  0.1270484 2  0.63071906 0.397806538 2.525510  TRUE
## 2:  2.2383929 2  0.29831714 0.088993113 2.913935  TRUE
## 3:  0.9396463 2  0.15472053 0.023938444 2.607476  TRUE
## 4:  0.9460164 2  0.59425949 0.353144347 2.709351  TRUE
## 5:  0.4465410 2 -0.28911627 0.083588217 2.366651  TRUE
## 6: -0.8062434 2  1.62790927 2.650088584 2.541432 FALSE
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719 FALSE
## 8: -0.6806829 2  1.34110247 1.798555841 2.500909 FALSE
## 9:  0.1886763 2  0.46024374 0.211824304 2.497975  TRUE
DT[, b:= mean(x+W),by=a]
DT
##             x y           z           W        m     a        b
## 1:  0.1270484 2  0.63071906 0.397806538 2.525510  TRUE 1.007603
## 2:  2.2383929 2  0.29831714 0.088993113 2.913935  TRUE 1.007603
## 3:  0.9396463 2  0.15472053 0.023938444 2.607476  TRUE 1.007603
## 4:  0.9460164 2  0.59425949 0.353144347 2.709351  TRUE 1.007603
## 5:  0.4465410 2 -0.28911627 0.083588217 2.366651  TRUE 1.007603
## 6: -0.8062434 2  1.62790927 2.650088584 2.541432 FALSE 0.793928
## 7: -0.5899296 2 -0.09997734 0.009995468 2.107719 FALSE 0.793928
## 8: -0.6806829 2  1.34110247 1.798555841 2.500909 FALSE 0.793928
## 9:  0.1886763 2  0.46024374 0.211824304 2.497975  TRUE 1.007603

6.3 Special variables

.N An integer, length 1, containing the number of elements of a factor level

set.seed(123);
DT <- data.table(x=sample(letters[1:3], 1E5, TRUE))
DT[, .N, by=x] #计数
##    x     N
## 1: a 33387
## 2: c 33201
## 3: b 33412

6.4 Keys

##选取直接速度比data.frame快
DT <- data.table(x=rep(c("a","b","c"),each=100), y=rnorm(300))
setkey(DT, x)
head(DT['a'], 10) #选取x = a的行
##     x           y
##  1: a  0.25958973
##  2: a  0.91751072
##  3: a -0.72231834
##  4: a -0.80828402
##  5: a -0.14135202
##  6: a  2.25701345
##  7: a -2.37955015
##  8: a -0.45425393
##  9: a -0.06007418
## 10: a  0.86090061
##合并
DT1 <- data.table(x=c('a', 'a', 'b', 'dt1'), y=1:4)
DT2 <- data.table(x=c('a', 'b', 'dt2'), z=5:7)
setkey(DT1, x); setkey(DT2, x)
merge(DT1, DT2) #合并x的值相等的行
##    x y z
## 1: a 1 5
## 2: a 2 5
## 3: b 3 6

fread()

library(data.table)
big_df <- data.frame(x=rnorm(1E6), y=rnorm(1E6))
file <- tempfile()
write.table(big_df, file=file, row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE)
system.time(fread(file))
##    user  system elapsed 
##    1.73    0.01    1.86
system.time(read.table(file, header=TRUE, sep="\t"))
##    user  system elapsed 
##    6.80    0.13    7.02

Week 1 Quiz

1,The American Community Survey distributes downloadable data about United States communities.Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here :load the data into R. The code book, describing the variable names is here:

How many properties are worth $1,000,000 or more?

#下载数据
if (!file.exists("data")) {
    dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(fileUrl, destfile = "./data/Fss06hid.csv", method = "curl")
dataDownloaded <- data()
#读取数据
library(readr)
housesData <- read_csv("./data/Fss06hid.csv", col_types = cols_only(VAL = col_integer()), col_names = TRUE)
head(housesData)
## # A tibble: 6 x 1
##     VAL
##   <int>
## 1    17
## 2    NA
## 3    18
## 4    19
## 5    20
## 6    15
#找出数据大于$1,000,000即VAL == 24
## 去除NA并选择子集
nrow(housesData[!is.na(housesData$VAL) & housesData$VAL == 24, ])
## [1] 53

2,Use the data you loaded from Question 1. Consider the variable FES in the code book. Which of the “tidy data” principles does this variable violate?

  • Each tidy data table contains information about only one type of observation.

  • Tidy data has one variable per column.

  • Tidy data has one observation per row.

  • Each variable in a tidy data set has been transformed to be interpretable.

3, Download the Excel spreadsheet on Natural Gas Aquisition Program here: Read rows 18-23 and columns 7-15 into R and assign the result to a variable called:dat

What is the value of:sum(dat$Zip*dat$Ext,na.rm=T) (original data source: (http://catalog.data.gov/dataset/natural-gas-acquisition-program)

if (!file.exists("data")) {
  dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx"
download.file(fileUrl, destfile = "./data/gov_NGAP.xlsx", method = "curl")
dataDownloaded <- data()
#使用xlsx包,
library(xlsx)
NGAP_dataSubset <- read.xlsx("data/gov_NGAP.xlsx", 
                              sheetIndex = 1, 
                              colIndex = 7:15, 
                              rowIndex = 18:23
                              )
dat <- NGAP_dataSubset
dat
##     Zip CuCurrent PaCurrent PoCurrent      Contact Ext          Fax email
## 1 74136         0         1         0 918-491-6998   0 918-491-6659    NA
## 2 30329         1         0         0 404-321-5711  NA         <NA>    NA
## 3 74136         1         0         0 918-523-2516   0 918-523-2522    NA
## 4 80203         0         1         0 303-864-1919   0         <NA>    NA
## 5 80120         1         0         0 345-098-8890 456         <NA>    NA
##   Status
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1
sum(dat$Zip*dat$Ext,na.rm=T)
## [1] 36534720

4, Read the XML data on Baltimore restaurants from here: How many restaurants have zipcode 21231?

library(XML)
fileUrl <- "./data/Frestaurants.xml"
doc <- xmlParse(fileUrl)
xmltop = xmlRoot(doc)  #根目录
art = xmltop[[1]] #1级目录
#一级目录里每一个子目录x,取x的第二个参数
data <- xmlSApply(art, function(x) xmlSApply(x[[2]], xmlValue)) 
length(data[data == "21231"])
## [1] 127

5,The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here using the fread() command load the data into an R object. The following are ways to calculate the average value of the variable broken down by sex. Using the data.table package, which will deliver the fastest user

if (!file.exists("data")) {
  dir.create("data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
download.file(fileUrl, destfile = "./data/ss06pid.csv", method = "curl")
dataDownloaded <- data()
require(data.table)
file <- "./data/ss06pid.csv"
DT <- fread(file)

tapply(DT$pwgtp15, DT$SEX, mean)

sapply(split(DT$pwgtp15, DT$SEX), mean)

mean(DT$pwgtp15,by = DT$SEX)

mean(DT[DT$SEX==1,]$pwgtp15); mean(DT[DT$SEX==2,]$pwgtp15)

rowMeans(DT)[DT$SEX==1];rowMeans(DT)[DT$SEX==2]

DT[,mean(pwgtp15),by = SEX]