Chapter 8 Matrices and Dataframes
# Create a dataframe of boat sale data named bsale
bsale <- data.frame(name = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
color = c("black", "green", "pink", "blue", "blue",
"green", "green", "yellow", "black", "black"),
age = c(143, 53, 356, 23, 647, 24, 532, 43, 66, 86),
price = c(53, 87, 54, 66, 264, 32, 532, 58, 99, 132),
cost = c(52, 80, 20, 100, 189, 12, 520, 68, 80, 100),
stringsAsFactors = FALSE)
# Avoid converting strings to factors!
# Explore the bsale dataset:
head(bsale) # Show the first few rows
  name color age price cost
1 a black 143 53 52
2 b green 53 87 80
3 c pink 356 54 20
4 d blue 23 66 100
5 e blue 647 264 189
6 f green 24 32 12
str(bsale) # Show the structure of bsale
'data.frame': 10 obs. of 5 variables:
$ name : chr "a" "b" "c" "d" ...
$ color: chr "black" "green" "pink" "blue" ...
$ age : num 143 53 356 23 647 24 532 43 66 86
$ price: num 53 87 54 66 264 32 532 58 99 132
$ cost : num 52 80 20 100 189 12 520 68 80 100
names(bsale) # The column names
[1] "name" "color" "age" "price" "cost"
nrow(bsale) # The number of rows
[1] 10
mean(bsale$age) # The mean boat age
[1] 197.3
table(bsale$color) # How many boats of each color?
black blue green pink yellow
3 2 3 1 1
max(bsale$price) # The highest selling price
[1] 532
# Adding new columns
bsale$id <- 1:nrow(bsale)
bsale$age.decades <- bsale$age / 10
bsale$profit <- bsale$price - bsale$cost
# The mean price of green boats
with(bsale, mean(price[color == "green"]))
[1] 217
# The names of boats older than 100 years
with(bsale, name[age > 100])
[1] "a" "c" "e" "g"
# What percent of black boats had a positive profit?
with(subset(bsale, color == "black"), mean(profit > 0))
[1] 1
# Save only the price and cost columns in a new dataframe
bsale.2 <- bsale[c("price", "cost")]
# Change the names of the columns to "p" and "c"
names(bsale.2) <- c("p", "c")
# Use subset() to create a dataframe called old.black.bsale containing only data from black boats older than 50 years
old.black.bsale <- subset(bsale, color == "black" & age > 50)
x <- 1:5
y <- 6:10
z <- 11:15
# Create a matrix where x, y and z are columns
cbind(x, y, z)
     x  y  z
[1,] 1 6 11
[2,] 2 7 12
[3,] 3 8 13
[4,] 4 9 14
[5,] 5 10 15
# Create a matrix where x, y and z are rows
rbind(x, y, z)
  [,1] [,2] [,3] [,4] [,5]
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
# Combining numeric and character columns in a matrix coerces everything to character:
cbind(c(1, 2, 3, 4, 5),
      c("a", "b", "c", "d", "e"))
     [,1] [,2]
[1,] "1" "a"
[2,] "2" "b"
[3,] "3" "c"
[4,] "4" "d"
[5,] "5" "e"
# Create a matrix of the integers 1:10, with 5 rows and 2 columns
matrix(data = 1:10,
nrow = 5,
       ncol = 2)
     [,1] [,2]
[1,] 1 6
[2,] 2 7
[3,] 3 8
[4,] 4 9
[5,] 5 10
# The same integers with 2 rows and 5 columns, filled by column (the default)
matrix(data = 1:10,
       nrow = 2,
       ncol = 5)
     [,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
# With 2 rows and 5 columns, but fill by row instead of columns
matrix(data = 1:10,
nrow = 2,
ncol = 5,
       byrow = TRUE)
     [,1] [,2] [,3] [,4] [,5]
[1,] 1 2 3 4 5
[2,] 6 7 8 9 10
# Create a dataframe of survey data
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23))
survey
  index sex age
1 1 m 99
2 2 m 46
3 3 m 23
4 4 f 54
5 5 f 23
str(survey)
'data.frame': 5 obs. of 3 variables:
$ index: num 1 2 3 4 5
$ sex : Factor w/ 2 levels "f","m": 2 2 2 1 1
$ age : num 99 46 23 54 23
#R converted sex to a factor with only two possible levels
# Create a dataframe of survey data without factors
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23),
stringsAsFactors = FALSE)
# Check the structure to confirm there are no factors
str(survey)
'data.frame': 5 obs. of 3 variables:
$ index: num 1 2 3 4 5
$ sex : chr "m" "m" "m" "f" ...
$ age : num 99 46 23 54 23
#To see a complete list of all the datasets included in the datasets package, run the code: library(help = "datasets")
library(help = "datasets")
##8.3 Matrix and dataframe functions
# head() shows the first few rows
head(ChickWeight)
  weight Time Chick Diet
1 42 0 1 1
2 51 2 1 1
3 59 4 1 1
4 64 6 1 1
5 76 8 1 1
6 93 10 1 1
# tail() shows the last few rows
tail(ChickWeight)
    weight Time Chick Diet
573 155 12 50 4
574 175 14 50 4
575 205 16 50 4
576 234 18 50 4
577 264 20 50 4
578 264 21 50 4
# View() opens the entire dataframe in a new window
View(ChickWeight)
# Print summary statistics of ToothGrowth to the console
summary(ToothGrowth)
      len        supp        dose
Min. : 4.20 OJ:30 Min. :0.500
1st Qu.:13.07 VC:30 1st Qu.:0.500
Median :19.25 Median :1.000
Mean :18.81 Mean :1.167
3rd Qu.:25.27 3rd Qu.:2.000
Max. :33.90 Max. :2.000
str(ToothGrowth) # Show the structure
'data.frame': 60 obs. of 3 variables:
$ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
$ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
$ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
names(ToothGrowth) # The column names
[1] "len" "supp" "dose"
ToothGrowth$len # The len column as a vector
[1] 4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3
[15] 22.5 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5
[29] 23.3 29.5 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3
[43] 23.6 26.4 20.0 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9
[57] 26.4 27.3 29.4 23.0
mean(ToothGrowth$len) # The mean tooth length
[1] 18.81333
table(ToothGrowth$supp) # How many observations of each supplement?
OJ VC
30 30
ToothGrowth[1:6, c("len", "supp")] # Rows 1-6 of the len and supp columns
  len supp
1 4.2 VC
2 11.5 VC
3 7.3 VC
4 5.8 VC
5 6.4 VC
6 10.0 VC
# Create a new dataframe called survey
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"age" = c(24, 25, 42, 56, 22))
survey
  index age
1 1 24
2 2 25
3 3 42
4 4 56
5 5 22
# Add a new column called sex to survey
survey$sex <- c("m", "m", "f", "f", "m")
#survey with new sex column
survey
  index age sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
#Change name of 1st column of df to "a": names(df)[1] <- "a"
#Change name of 2nd column of df to "b": names(df)[2] <- "b"
# Change the name of the first column of survey to "participant.number"
names(survey)[1] <- "participant.number"
survey
  participant.number age sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
# Change the column name from age to years
names(survey)[names(survey) == "age"] <- "years"
survey
  participant.number years sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
##8.5 Slicing dataframes
# Return row 1:          df[1, ]
# Return column 5:       df[, 5]
# Rows 1:5 of column 2:  df[1:5, 2]
# Show the rows 1-6 and column 1 of ToothGrowth
ToothGrowth[1:6, 1]
[1] 4.2 11.5 7.3 5.8 6.4 10.0
# Rows 1-3 of the len and dose columns
ToothGrowth[1:3, c("len", "dose")]
  len dose
1 4.2 0.5
2 11.5 0.5
3 7.3 0.5
# The first row of ToothGrowth
ToothGrowth[1, ]
  len supp dose
1 4.2 VC 0.5
# The entire supp column
ToothGrowth$supp
[1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
[24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
[47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
Levels: OJ VC
# Create a new df with only the rows of ToothGrowth where supp equals VC
ToothGrowth.VC <- ToothGrowth[ToothGrowth$supp == "VC", ]
# Create a new df with only the rows of ToothGrowth where supp equals OJ and dose < 1
ToothGrowth.OJ.a <- ToothGrowth[ToothGrowth$supp == "OJ" &
ToothGrowth$dose < 1, ]
# Show rows of ToothGrowth where len < 20 AND supp == "OJ" AND dose >= 1
subset(x = ToothGrowth,
subset = len < 20 &
supp == "OJ" &
       dose >= 1)
  len supp dose
41 19.7 OJ 1
49 14.5 OJ 1
# Get rows of ToothGrowth where len > 30 AND supp == "VC", but only return the len and dose columns
subset(x = ToothGrowth,
subset = len > 30 & supp == "VC",
       select = c(len, dose))
  len dose
23 33.9 2
26 32.5 2
##8.6 Combining slicing with functions
# The mean tooth length of Guinea pigs given OJ
# Step 1: Create a subsetted dataframe called oj
oj <- subset(x = ToothGrowth,
subset = supp == "OJ")
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
[1] 20.66333
# Get the same solution using logical indexing
# Step 1: Create a subsetted dataframe called oj
oj <- ToothGrowth[ToothGrowth$supp == "OJ",]
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
[1] 20.66333
# Or do it in a single line using with()
with(ToothGrowth, mean(len[supp == "OJ"]))
[1] 20.66333
#The with() function saves you typing when you use multiple columns from a dataframe. You name the dataframe (or any other object in R) once at the start of the call, and R then looks up every name in the following expression inside that dataframe.
#create a dataframe called health
health <- data.frame("age" = c(32, 24, 43, 19, 43),
"height" = c(1.75, 1.65, 1.50, 1.92, 1.80),
"weight" = c(70, 65, 62, 79, 85))
#Add a new column called bmi which represents a person's body mass index (BMI)
health$bmi <- health$weight / health$height ^ 2
health$bmi
[1] 22.85714 23.87511 27.55556 21.43012 26.23457
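# For comparison, the same BMI calculation written with the with() idiom described above:
with(health, weight / height ^ 2)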
Chapter 9 Importing, saving and managing data
[1] "C:/Users/L5M/Desktop"
#change the working directory to diferent path
#setwd(dir= )
#see all the objects defined in the current workspace
ls()
 [1] "bsale" "bsale.2" "health"
[4] "oj" "old.black.bsale" "survey"
[7] "ToothGrowth.OJ.a" "ToothGrowth.VC" "x"
[10] "y" "z"
# Create objects to save later
study1.df <- data.frame(id = 1:5,
                        sex = c("m", "m", "f", "f", "m"),
                        score = c(51, 20, 67, 52, 42))
score.by.sex <- aggregate(score ~ sex,
                          FUN = mean,
                          data = study1.df)
study1.htest <- t.test(score ~ sex,
                       data = study1.df)
# Save three objects as a new .RData file in the data folder of my current working directory
#save(study1.df, score.by.sex, study1.htest, file = "data/study1.RData")
#To save all the objects in your workspace as a .RData file, use the "save.image()" function
#To load an .RData file, that is, to import all of the objects contained in the .RData file into your current workspace, use the load() function
#To remove objects from your workspace, use the "rm()" function
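# A minimal sketch of these three functions (commented out; the file names below are placeholders, not files from this project):
#save.image(file = "myworkspace.RData")  # save every object in the workspace
#load(file = "data/study1.RData")        # import the saved objects back into the workspace
#rm(score.by.sex)                        # remove one object
#rm(list = ls())                         # remove every object in the workspace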
# Read a text file from the web
fromweb <- read.table(file = 'http://goo.gl/jTNf6P',
sep = '\t',
header = TRUE)
# Print the result
fromweb
          message randomdata
1 Congratulations! 1
2 you 2
3 just 3
4 downloaded 4
5 this 5
6 table 6
7 from 7
8 the 8
9 web! 9
Exploratory Data Analysis Checklist
library(readr)
ozone <- read_csv("C:/Users/L5M/Desktop/HU/EDA/EDA_R_project/US EPA data 2017.csv")
# Rewrite the names of the columns to remove any spaces
names(ozone) <- make.names(names(ozone))
# Check the number of rows and columns
nrow(ozone)
[1] 66869
ncol(ozone)
[1] 55
# Examine the structure
str(ozone)
Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 66869 obs. of 55 variables:
$ State.Code : chr "01" "01" "01" "01" ...
$ County.Code : chr "003" "003" "003" "003" ...
$ Site.Num : chr "0010" "0010" "0010" "0010" ...
$ Parameter.Code : num 44201 44201 44201 44201 88101 ...
$ POC : num 1 1 1 1 1 1 1 1 1 1 ...
$ Latitude : num 30.5 30.5 30.5 30.5 30.5 ...
$ Longitude : num -87.9 -87.9 -87.9 -87.9 -87.9 ...
$ Datum : chr "NAD83" "NAD83" "NAD83" "NAD83" ...
$ Parameter.Name : chr "Ozone" "Ozone" "Ozone" "Ozone" ...
$ Sample.Duration : chr "1 HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" ...
$ Pollutant.Standard : chr "Ozone 1-hour 1979" "Ozone 8-Hour 1997" "Ozone 8-Hour 2008" "Ozone 8-hour 2015" ...
$ Metric.Used : chr "Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)" "Daily maximum of 8 hour running average of observed hourly values" "Daily maximum of 8 hour running average of observed hourly values" "Daily maximum of 8-hour running average" ...
$ Method.Name : chr "INSTRUMENTAL - ULTRA VIOLET ABSORPTION" NA NA NA ...
$ Year : num 2017 2017 2017 2017 2017 ...
$ Units.of.Measure : chr "Parts per million" "Parts per million" "Parts per million" "Parts per million" ...
$ Event.Type : chr "No Events" "No Events" "No Events" "No Events" ...
$ Observation.Count : num 5240 5426 5426 3852 110 ...
$ Observation.Percent : num 93 91 91 90 90 90 90 90 97 97 ...
$ Completeness.Indicator : chr "Y" "Y" "Y" "Y" ...
$ Valid.Day.Count : num 227 222 222 221 110 110 110 110 118 118 ...
$ Required.Day.Count : num 245 245 245 245 122 122 122 122 122 122 ...
$ Exceptional.Data.Count : num 0 0 0 0 0 0 0 0 0 0 ...
$ Null.Data.Count : num 640 0 0 0 12 12 12 12 6 6 ...
$ Primary.Exceedance.Count : num 0 0 0 1 0 0 NA NA 0 0 ...
$ Secondary.Exceedance.Count : num 0 0 0 1 0 0 NA NA 0 0 ...
$ Certification.Indicator : chr "Certified" "Certified" "Certified" "Certified" ...
$ Num.Obs.Below.MDL : num 0 0 0 0 0 0 0 0 0 0 ...
$ Arithmetic.Mean : num 0.0443 0.0401 0.0401 0.0399 7.3918 ...
$ Arithmetic.Standard.Dev : num 0.0122 0.0116 0.0116 0.0116 3.5723 ...
$ X1st.Max.Value : num 0.078 0.073 0.073 0.073 19.7 19.7 19.7 19.7 19 19 ...
$ X1st.Max.DateTime : POSIXct, format: "2017-05-08 15:00:00" "2017-05-08 14:00:00" ...
$ X2nd.Max.Value : num 0.078 0.07 0.07 0.07 18.9 18.9 18.9 18.9 17.4 17.4 ...
$ X2nd.Max.DateTime : POSIXct, format: "2017-05-09 14:00:00" "2017-05-09 09:00:00" ...
$ X3rd.Max.Value : num 0.072 0.067 0.067 0.067 18.9 18.9 18.9 18.9 16.8 16.8 ...
$ X3rd.Max.DateTime : POSIXct, format: "2017-05-10 14:00:00" "2017-05-10 09:00:00" ...
$ X4th.Max.Value : num 0.067 0.064 0.064 0.064 16 16 16 16 16.3 16.3 ...
$ X4th.Max.DateTime : POSIXct, format: "2017-04-25 15:00:00" "2017-05-07 13:00:00" ...
$ X1st.Max.Non.Overlapping.Value: num NA NA NA NA NA NA NA NA NA NA ...
$ X1st.NO.Max.DateTime : POSIXct, format: NA NA ...
$ X2nd.Max.Non.Overlapping.Value: num NA NA NA NA NA NA NA NA NA NA ...
$ X2nd.NO.Max.DateTime : POSIXct, format: NA NA ...
$ X99th.Percentile : num 0.072 0.067 0.067 0.067 18.9 18.9 18.9 18.9 17.4 17.4 ...
$ X98th.Percentile : num 0.067 0.062 0.062 0.062 18.9 18.9 18.9 18.9 16.8 16.8 ...
$ X95th.Percentile : num 0.063 0.058 0.058 0.058 15.4 15.4 15.4 15.4 15 15 ...
$ X90th.Percentile : num 0.06 0.054 0.054 0.054 11.8 11.8 11.8 11.8 12.5 12.5 ...
$ X75th.Percentile : num 0.053 0.049 0.049 0.048 8.9 8.9 8.9 8.9 10.4 10.4 ...
$ X50th.Percentile : num 0.044 0.04 0.04 0.04 7 7 7 7 7.2 7.2 ...
$ X10th.Percentile : num 0.027 0.024 0.024 0.024 3.8 3.8 3.8 3.8 3.8 3.8 ...
$ Local.Site.Name : chr "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" ...
$ Address : chr "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" ...
$ State.Name : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ County.Name : chr "Baldwin" "Baldwin" "Baldwin" "Baldwin" ...
$ City.Name : chr "Fairhope" "Fairhope" "Fairhope" "Fairhope" ...
$ CBSA.Name : chr "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" ...
$ Date.of.Last.Change : Date, format: "2018-07-18" "2018-07-18" ...
- attr(*, "spec")=
.. cols(
.. `State Code` = col_character(),
.. `County Code` = col_character(),
.. `Site Num` = col_character(),
.. `Parameter Code` = col_double(),
.. POC = col_double(),
.. Latitude = col_double(),
.. Longitude = col_double(),
.. Datum = col_character(),
.. `Parameter Name` = col_character(),
.. `Sample Duration` = col_character(),
.. `Pollutant Standard` = col_character(),
.. `Metric Used` = col_character(),
.. `Method Name` = col_character(),
.. Year = col_double(),
.. `Units of Measure` = col_character(),
.. `Event Type` = col_character(),
.. `Observation Count` = col_double(),
.. `Observation Percent` = col_double(),
.. `Completeness Indicator` = col_character(),
.. `Valid Day Count` = col_double(),
.. `Required Day Count` = col_double(),
.. `Exceptional Data Count` = col_double(),
.. `Null Data Count` = col_double(),
.. `Primary Exceedance Count` = col_double(),
.. `Secondary Exceedance Count` = col_double(),
.. `Certification Indicator` = col_character(),
.. `Num Obs Below MDL` = col_double(),
.. `Arithmetic Mean` = col_double(),
.. `Arithmetic Standard Dev` = col_double(),
.. `1st Max Value` = col_double(),
.. `1st Max DateTime` = col_datetime(format = ""),
.. `2nd Max Value` = col_double(),
.. `2nd Max DateTime` = col_datetime(format = ""),
.. `3rd Max Value` = col_double(),
.. `3rd Max DateTime` = col_datetime(format = ""),
.. `4th Max Value` = col_double(),
.. `4th Max DateTime` = col_datetime(format = ""),
.. `1st Max Non Overlapping Value` = col_double(),
.. `1st NO Max DateTime` = col_datetime(format = ""),
.. `2nd Max Non Overlapping Value` = col_double(),
.. `2nd NO Max DateTime` = col_datetime(format = ""),
.. `99th Percentile` = col_double(),
.. `98th Percentile` = col_double(),
.. `95th Percentile` = col_double(),
.. `90th Percentile` = col_double(),
.. `75th Percentile` = col_double(),
.. `50th Percentile` = col_double(),
.. `10th Percentile` = col_double(),
.. `Local Site Name` = col_character(),
.. Address = col_character(),
.. `State Name` = col_character(),
.. `County Name` = col_character(),
.. `City Name` = col_character(),
.. `CBSA Name` = col_character(),
.. `Date of Last Change` = col_date(format = "")
.. )
# Look at the top of the data
head(ozone)
# A tibble: 6 x 55
State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 01 003 0010 44201 1 30.5 -87.9
2 01 003 0010 44201 1 30.5 -87.9
3 01 003 0010 44201 1 30.5 -87.9
4 01 003 0010 44201 1 30.5 -87.9
5 01 003 0010 88101 1 30.5 -87.9
6 01 003 0010 88101 1 30.5 -87.9
# ... with 48 more variables: Datum <chr>, Parameter.Name <chr>,
# Sample.Duration <chr>, Pollutant.Standard <chr>, Metric.Used <chr>,
# Method.Name <chr>, Year <dbl>, Units.of.Measure <chr>,
# Event.Type <chr>, Observation.Count <dbl>, Observation.Percent <dbl>,
# Completeness.Indicator <chr>, Valid.Day.Count <dbl>,
# Required.Day.Count <dbl>, Exceptional.Data.Count <dbl>,
# Null.Data.Count <dbl>, Primary.Exceedance.Count <dbl>,
# Secondary.Exceedance.Count <dbl>, Certification.Indicator <chr>,
# Num.Obs.Below.MDL <dbl>, Arithmetic.Mean <dbl>,
# Arithmetic.Standard.Dev <dbl>, X1st.Max.Value <dbl>,
# X1st.Max.DateTime <dttm>, X2nd.Max.Value <dbl>,
# X2nd.Max.DateTime <dttm>, X3rd.Max.Value <dbl>,
# X3rd.Max.DateTime <dttm>, X4th.Max.Value <dbl>,
# X4th.Max.DateTime <dttm>, X1st.Max.Non.Overlapping.Value <dbl>,
# X1st.NO.Max.DateTime <dttm>, X2nd.Max.Non.Overlapping.Value <dbl>,
# X2nd.NO.Max.DateTime <dttm>, X99th.Percentile <dbl>,
# X98th.Percentile <dbl>, X95th.Percentile <dbl>,
# X90th.Percentile <dbl>, X75th.Percentile <dbl>,
# X50th.Percentile <dbl>, X10th.Percentile <dbl>, Local.Site.Name <chr>,
# Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
# CBSA.Name <chr>, Date.of.Last.Change <date>
# Check the bottom of the data too; a few columns are enough
tail(ozone[, c(6:7, 10)])
# A tibble: 6 x 3
Latitude Longitude Sample.Duration
<dbl> <dbl> <chr>
1 32.7 -115. 24 HOUR
2 32.7 -115. 24 HOUR
3 32.5 -115. 1 HOUR
4 32.5 -115. 8-HR RUN AVG BEGIN HOUR
5 32.5 -115. 8-HR RUN AVG BEGIN HOUR
6 32.5 -115. 8-HR RUN AVG BEGIN HOUR
# Take a look at the State.Code variable to see how many measurements were recorded in each state
table(ozone$State.Code)
01 02 04 05 06 08 09 10 11 12 13 15 16 17 18
677 547 2031 340 9174 2452 387 270 405 1832 1537 470 548 1274 2282
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
954 419 1075 599 912 952 1187 2051 1099 287 1142 1052 305 697 466
34 35 36 37 38 39 40 41 42 44 45 46 47 48 49
1273 813 2569 1961 535 2740 1427 1256 2356 709 682 338 743 3922 1337
50 51 53 54 55 56 72 78 80
626 842 1793 445 1127 1564 229 56 103
# State code 78 is the Virgin Islands; pull out its 56 records with dplyr's filter()
dplyr::filter(ozone, State.Code == "78")
# A tibble: 56 x 55
State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 78 010 0012 68105 1 17.7 -64.8
2 78 010 0012 68108 1 17.7 -64.8
3 78 010 0012 85101 1 17.7 -64.8
4 78 010 0012 85101 2 17.7 -64.8
5 78 010 0012 88101 1 17.7 -64.8
6 78 010 0012 88101 1 17.7 -64.8
7 78 010 0012 88101 1 17.7 -64.8
8 78 010 0012 88101 1 17.7 -64.8
9 78 010 0012 88501 3 17.7 -64.8
10 78 010 0012 88501 3 17.7 -64.8
# ... with 46 more rows, and 48 more variables: Datum <chr>,
# Parameter.Name <chr>, Sample.Duration <chr>, Pollutant.Standard <chr>,
# Metric.Used <chr>, Method.Name <chr>, Year <dbl>,
# Units.of.Measure <chr>, Event.Type <chr>, Observation.Count <dbl>,
# Observation.Percent <dbl>, Completeness.Indicator <chr>,
# Valid.Day.Count <dbl>, Required.Day.Count <dbl>,
# Exceptional.Data.Count <dbl>, Null.Data.Count <dbl>,
# Primary.Exceedance.Count <dbl>, Secondary.Exceedance.Count <dbl>,
# Certification.Indicator <chr>, Num.Obs.Below.MDL <dbl>,
# Arithmetic.Mean <dbl>, Arithmetic.Standard.Dev <dbl>,
# X1st.Max.Value <dbl>, X1st.Max.DateTime <dttm>, X2nd.Max.Value <dbl>,
# X2nd.Max.DateTime <dttm>, X3rd.Max.Value <dbl>,
# X3rd.Max.DateTime <dttm>, X4th.Max.Value <dbl>,
# X4th.Max.DateTime <dttm>, X1st.Max.Non.Overlapping.Value <dbl>,
# X1st.NO.Max.DateTime <dttm>, X2nd.Max.Non.Overlapping.Value <dbl>,
# X2nd.NO.Max.DateTime <dttm>, X99th.Percentile <dbl>,
# X98th.Percentile <dbl>, X95th.Percentile <dbl>,
# X90th.Percentile <dbl>, X75th.Percentile <dbl>,
# X50th.Percentile <dbl>, X10th.Percentile <dbl>, Local.Site.Name <chr>,
# Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
# CBSA.Name <chr>, Date.of.Last.Change <date>
# How many states are represented, and which ones?
length(unique(ozone$State.Name))
[1] 54
unique(ozone$State.Name)
[1] "Alabama" "Alaska" "Arizona"
[4] "Arkansas" "California" "Colorado"
[7] "Connecticut" "Delaware" "District Of Columbia"
[10] "Florida" "Georgia" "Hawaii"
[13] "Idaho" "Illinois" "Indiana"
[16] "Iowa" "Kansas" "Kentucky"
[19] "Louisiana" "Maine" "Maryland"
[22] "Massachusetts" "Michigan" "Minnesota"
[25] "Mississippi" "Missouri" "Montana"
[28] "Nebraska" "Nevada" "New Hampshire"
[31] "New Jersey" "New Mexico" "New York"
[34] "North Carolina" "North Dakota" "Ohio"
[37] "Oklahoma" "Oregon" "Pennsylvania"
[40] "Rhode Island" "South Carolina" "South Dakota"
[43] "Tennessee" "Texas" "Utah"
[46] "Vermont" "Virginia" "Washington"
[49] "West Virginia" "Wisconsin" "Wyoming"
[52] "Puerto Rico" "Virgin Islands" "Country Of Mexico"
# Distribution of the measured values with summary() and quantile()
# (likely the Arithmetic.Mean column; note it mixes units across different parameters)
summary(ozone$Arithmetic.Mean)
   Min. 1st Qu. Median  Mean 3rd Qu.     Max.
 -14.63    0.04   0.40 43.99    7.72 50128.95
quantile(ozone$Arithmetic.Mean)
        0%        25%        50%        75%        100%
-14.626471   0.036608   0.400000   7.723881 50128.946050
Data Transformation
library(nycflights13)
library(dplyr)
# The flights dataset
flights
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
##dplyr
#Pick observations by their values (filter()).
#Reorder the rows (arrange()).
#Pick variables by their names (select()).
#Create new variables with functions of existing variables (mutate()).
#Collapse many values down to a single summary (summarise()).
##filter
#filter() allows you to subset observations based on their values. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame.
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
# A tibble: 719 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 12 25 456 500 -4 649
2 2013 12 25 524 515 9 805
3 2013 12 25 542 540 2 832
4 2013 12 25 546 550 -4 1022
5 2013 12 25 556 600 -4 730
6 2013 12 25 557 600 -3 743
7 2013 12 25 557 600 -3 818
8 2013 12 25 559 600 -1 855
9 2013 12 25 559 600 -1 849
10 2013 12 25 600 600 0 850
# ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
near(sqrt(2) ^ 2, 2)
[1] TRUE
near(1 / 49 * 49, 1)
[1] TRUE
#Computers use finite precision arithmetic, so remember that every number you see is an approximation. Instead of relying on ==, use near()
#Finds all flights that departed in November or December
filter(flights, month == 11 | month == 12)
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 11 1 5 2359 6 352
2 2013 11 1 35 2250 105 123
3 2013 11 1 455 500 -5 641
4 2013 11 1 539 545 -6 856
5 2013 11 1 542 545 -3 831
6 2013 11 1 549 600 -11 912
7 2013 11 1 550 600 -10 705
8 2013 11 1 554 600 -6 659
9 2013 11 1 554 600 -6 826
10 2013 11 1 554 600 -6 749
# ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#"x %in% y" will select every row where x is one of the values in y
nov_dec <- filter(flights, month %in% c(11, 12))
filter(flights, !(arr_delay > 120 | dep_delay > 120))
# A tibble: 316,050 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Equivalently (De Morgan's law, stated below):
filter(flights, arr_delay <= 120, dep_delay <= 120)
# A tibble: 316,050 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#De Morgan’s law: !(x & y) is the same as !x | !y, and !(x | y) is the same as !x & !y
# Missing values
NA > 5
[1] NA
10 == NA
[1] NA
NA + 10
[1] NA
x <- NA
is.na(x)
[1] TRUE
df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
# A tibble: 1 x 1
x
<dbl>
1 3
#filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly
filter(df, is.na(x) | x > 1)
# A tibble: 2 x 1
x
<dbl>
1 NA
2 3
##Arrange
#arrange() works similarly to filter() except that instead of selecting rows, it changes their order
arrange(flights, year, month, day)
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#If you provide more than one column name, each additional column will be used to break ties in the values of the preceding columns
#desc() to re-order by a column in descending order
arrange(flights, desc(dep_delay))
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 9 641 900 1301 1242
2 2013 6 15 1432 1935 1137 1607
3 2013 1 10 1121 1635 1126 1239
4 2013 9 20 1139 1845 1014 1457
5 2013 7 22 845 1600 1005 1044
6 2013 4 10 1100 1900 960 1342
7 2013 3 17 2321 810 911 135
8 2013 6 27 959 1900 899 1236
9 2013 7 22 2257 759 898 121
10 2013 12 5 756 1700 896 1058
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Missing values are always sorted at the end
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
# A tibble: 3 x 1
      x
  <dbl>
1     2
2     5
3    NA
arrange(df, desc(x))
# A tibble: 3 x 1
      x
  <dbl>
1     5
2     2
3    NA
## select
#select() allows you to rapidly zoom in on a useful subset of columns using operations based on the names of the variables
# Select columns by name
select(flights, year, month, day)
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# Select all columns between year and day (inclusive)
select(flights, year:day)
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
# A tibble: 336,776 x 16
dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
<int> <int> <dbl> <int> <int> <dbl>
1 517 515 2 830 819 11
2 533 529 4 850 830 20
3 542 540 2 923 850 33
4 544 545 -1 1004 1022 -18
5 554 600 -6 812 837 -25
6 554 558 -4 740 728 12
7 555 600 -5 913 854 19
8 557 600 -3 709 723 -14
9 557 600 -3 838 846 -8
10 558 600 -2 753 745 8
# ... with 336,766 more rows, and 10 more variables: carrier <chr>,
# flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
# distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#starts_with("abc"): matches names that begin with “abc”.
#ends_with("xyz"): matches names that end with “xyz”.
#contains("ijk"): matches names that contain “ijk”.
#matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.
#num_range("x", 1:3): matches x1, x2 and x3.
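# For example, two of these helpers applied to flights (a quick sketch):
select(flights, ends_with("delay"))  # dep_delay and arr_delay
select(flights, contains("time"))    # every column with "time" in its name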
rename(flights, tail_num = tailnum)
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Move a handful of variables to the start of the data frame with everything()
select(flights, time_hour, air_time, everything())
# A tibble: 336,776 x 19
time_hour air_time year month day dep_time sched_dep_time
<dttm> <dbl> <int> <int> <int> <int> <int>
1 2013-01-01 05:00:00 227 2013 1 1 517 515
2 2013-01-01 05:00:00 227 2013 1 1 533 529
3 2013-01-01 05:00:00 160 2013 1 1 542 540
4 2013-01-01 05:00:00 183 2013 1 1 544 545
5 2013-01-01 06:00:00 116 2013 1 1 554 600
6 2013-01-01 05:00:00 150 2013 1 1 554 558
7 2013-01-01 06:00:00 158 2013 1 1 555 600
8 2013-01-01 06:00:00 53 2013 1 1 557 600
9 2013-01-01 06:00:00 140 2013 1 1 557 600
10 2013-01-01 06:00:00 138 2013 1 1 558 600
# ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
# arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
# flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
# hour <dbl>, minute <dbl>
## Add new variables with mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 x 10
year month day dep_delay arr_delay distance air_time gain hours
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 -9 3.78
2 2013 1 1 4 20 1416 227 -16 3.78
3 2013 1 1 2 33 1089 160 -31 2.67
4 2013 1 1 -1 -18 1576 183 17 3.05
5 2013 1 1 -6 -25 762 116 19 1.93
6 2013 1 1 -4 12 719 150 -16 2.5
7 2013 1 1 -5 19 1065 158 -24 2.63
8 2013 1 1 -3 -14 229 53 11 0.883
9 2013 1 1 -3 -8 944 140 5 2.33
10 2013 1 1 -2 8 733 138 -10 2.3
# ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
#only want to keep the new variables
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 x 3
gain hours gain_per_hour
<dbl> <dbl> <dbl>
1 -9 3.78 -2.38
2 -16 3.78 -4.23
3 -31 2.67 -11.6
4 17 3.05 5.57
5 19 1.93 9.83
6 -16 2.5 -6.4
7 -24 2.63 -9.11
8 11 0.883 12.5
9 5 2.33 2.14
10 -10 2.3 -4.35
# ... with 336,766 more rows
#Arithmetic operators: +, -, *, /, ^. These are all vectorised, using the so called “recycling rules”. If one parameter is shorter than the other, it will be automatically extended to be the same length. This is most useful when one of the arguments is a single number: air_time / 60, hours * 60 + minute, etc.
#Arithmetic operators are also useful in conjunction with the aggregate functions you’ll learn about later. For example, x / sum(x) calculates the proportion of a total, and y - mean(y) computes the difference from the mean.
#Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y). Modular arithmetic is a handy tool because it allows you to break integers up into pieces. For example, in the flights dataset, you can compute hour and minute from dep_time with:
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
# A tibble: 336,776 x 3
dep_time hour minute
<int> <dbl> <dbl>
1 517 5 17
2 533 5 33
3 542 5 42
4 544 5 44
5 554 5 54
6 554 5 54
7 555 5 55
8 557 5 57
9 557 5 57
10 558 5 58
# ... with 336,766 more rows
#Offsets: lead() and lag() allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. x - lag(x)) or find when values change (x != lag(x)). They are most useful in conjunction with group_by(), which you’ll learn about shortly.
(x <- 1:10)
 [1] 1 2 3 4 5 6 7 8 9 10
lag(x)
 [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
 [1] 2 3 4 5 6 7 8 9 10 NA
x
 [1] 1 2 3 4 5 6 7 8 9 10
cumsum(x)
 [1] 1 3 6 10 15 21 28 36 45 55
cummean(x)
 [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
#Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
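# Quick examples of the other cumulative aggregates (results shown as comments):
cumprod(1:5)        # 1 2 6 24 120
cummin(c(3, 1, 2))  # 3 1 1
cummax(c(1, 3, 2))  # 1 3 3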
#ranking
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
[1] 1 2 2 NA 4 5
min_rank(desc(y))
[1] 5 3 3 NA 2 1
row_number(y)
[1] 1 2 3 NA 4 5
dense_rank(y)
[1] 1 2 2 NA 3 4
percent_rank(y)
[1] 0.00 0.25 0.25 NA 0.75 1.00
cume_dist(y)
[1] 0.2 0.6 0.6 NA 0.8 1.0
##Summaries
#summarise() collapses a data frame to a single row
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 1 x 1
delay
<dbl>
1 12.6
# group_by() + summarise() gives per-group summaries; na.rm = TRUE removes the missing values first
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day delay
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ... with 355 more rows
## %>% Pipe to combine multiple operations
by_dest <- group_by(flights, dest)
#Group flights by destination.
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
#Summarise to compute distance, average delay, and number of flights.
delay <- filter(delay, count > 20, dest != "HNL")
#Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
# delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there's more ability to make up delays in the air
#ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
# geom_point(aes(size = count), alpha = 1/3) +
# geom_smooth(se = FALSE)
# Remove the cancelled flights (those with a missing dep_delay or arr_delay)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 11.4
2 2013 1 2 13.7
3 2013 1 3 10.9
4 2013 1 4 8.97
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.56
9 2013 1 9 2.30
10 2013 1 10 2.84
# ... with 355 more rows
#Whenever you do any aggregation, it’s always a good idea to include either a count (n()), or a count of non-missing values (sum(!is.na(x))). That way you can check that you’re not drawing conclusions based on very small amounts of data
# The planes (identified by their tail number) that have the highest average delays
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
#ggplot(data = delays, mapping = aes(x = delay)) +
# geom_freqpoly(binwidth = 10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
#ggplot(data = delays, mapping = aes(x = n, y = delay)) +
# geom_point(alpha = 1/10)
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 1/10)
# Convert to a tibble so it prints nicely
batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarise(
ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE)
)
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
  geom_smooth(se = FALSE)
# The "best" batting averages all come from players with very few at-bats
batters %>%
  arrange(desc(ba))
# A tibble: 19,428 x 3
playerID ba ab
<chr> <dbl> <int>
1 abramge01 1 1
2 alberan01 1 1
3 allarko01 1 1
4 banisje01 1 1
5 bartocl01 1 1
6 bassdo01 1 1
7 birasst01 1 2
8 bruneju01 1 1
9 burnscb01 1 1
10 cammaer01 1 1
# ... with 19,418 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day avg_delay1 avg_delay2
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 12.7 32.5
2 2013 1 2 12.7 32.0
3 2013 1 3 5.73 27.7
4 2013 1 4 -1.93 28.3
5 2013 1 5 -1.53 22.6
6 2013 1 6 4.24 24.4
7 2013 1 7 -4.95 27.8
8 2013 1 8 -3.23 20.8
9 2013 1 9 -0.264 25.6
10 2013 1 10 -5.90 27.3
# ... with 355 more rows
# Why is distance to some destinations more variable than to others?
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
  arrange(desc(distance_sd))
# A tibble: 104 x 2
dest distance_sd
<chr> <dbl>
1 EGE 10.5
2 SAN 10.4
3 SFO 10.2
4 HNL 10.0
5 SEA 9.98
6 LAS 9.91
7 PDX 9.87
8 PHX 9.86
9 LAX 9.66
10 IND 9.46
# ... with 94 more rows
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day first last
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day first_dep last_dep
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
  filter(r %in% range(r))
# A tibble: 770 x 20
# Groups: year, month, day [365]
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 2356 2359 -3 425
3 2013 1 2 42 2359 43 518
4 2013 1 2 2354 2359 -5 413
5 2013 1 3 32 2359 33 504
6 2013 1 3 2349 2359 -10 434
7 2013 1 4 25 2359 26 505
8 2013 1 4 2358 2359 -1 429
9 2013 1 4 2358 2359 -1 436
10 2013 1 5 14 2359 15 503
# ... with 760 more rows, and 13 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, r <int>
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))
# A tibble: 104 x 2
dest carriers
<chr> <int>
1 ATL 7
2 BOS 7
3 CLT 7
4 ORD 7
5 TPA 7
6 AUS 6
7 DCA 6
8 DTW 6
9 IAD 6
10 MSP 6
# ... with 94 more rows
# Counts: how many flights went to each destination?
not_cancelled %>%
  count(dest)
# A tibble: 104 x 2
dest n
<chr> <int>
1 ABQ 254
2 ACK 264
3 ALB 418
4 ANC 8
5 ATL 16837
6 AUS 2411
7 AVL 261
8 BDL 412
9 BGR 358
10 BHM 269
# ... with 94 more rows
# “count” (sum) the total number of miles a plane flew
not_cancelled %>%
  count(tailnum, wt = distance)
# A tibble: 4,037 x 2
tailnum n
<chr> <dbl>
1 D942DN 3418
2 N0EGMQ 239143
3 N10156 109664
4 N102UW 25722
5 N103US 24619
6 N104UW 24616
7 N10575 139903
8 N105UW 23618
9 N107US 21677
10 N108UW 32070
# ... with 4,027 more rows
# How many flights left before 5am? (these usually indicate delayed flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(n_early = sum(dep_time < 500))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day n_early
<int> <int> <int> <int>
1 2013 1 1 0
2 2013 1 2 3
3 2013 1 3 4
4 2013 1 4 3
5 2013 1 5 3
6 2013 1 6 2
7 2013 1 7 2
8 2013 1 8 1
9 2013 1 9 3
10 2013 1 10 3
# ... with 355 more rows
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(hour_perc = mean(arr_delay > 60))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day hour_perc
<int> <int> <int> <dbl>
1 2013 1 1 0.0722
2 2013 1 2 0.0851
3 2013 1 3 0.0567
4 2013 1 4 0.0396
5 2013 1 5 0.0349
6 2013 1 6 0.0470
7 2013 1 7 0.0333
8 2013 1 8 0.0213
9 2013 1 9 0.0202
10 2013 1 10 0.0183
# ... with 355 more rows
# Grouping by multiple variables lets you roll a summary up one level at a time
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day flights
<int> <int> <int> <int>
1 2013 1 1 842
2 2013 1 2 943
3 2013 1 3 914
4 2013 1 4 915
5 2013 1 5 720
6 2013 1 6 832
7 2013 1 7 933
8 2013 1 8 899
9 2013 1 9 902
10 2013 1 10 932
# ... with 355 more rows
(per_month <- summarise(per_day, flights = sum(flights)))
# A tibble: 12 x 3
# Groups: year [1]
year month flights
<int> <int> <int>
1 2013 1 27004
2 2013 2 24951
3 2013 3 28834
4 2013 4 28330
5 2013 5 28796
6 2013 6 28243
7 2013 7 29425
8 2013 8 29327
9 2013 9 27574
10 2013 10 28889
11 2013 11 27268
12 2013 12 28135
(per_year <- summarise(per_month, flights = sum(flights)))
# A tibble: 1 x 2
year flights
<int> <int>
1 2013 336776
##ungroup
daily %>%
ungroup() %>% # no longer grouped by date
  summarise(flights = n()) # all flights
# A tibble: 1 x 1
flights
<int>
1 336776
#Find the worst members of each group
flights_sml %>%
group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)
# A tibble: 3,306 x 7
# Groups: year, month, day [365]
year month day dep_delay arr_delay distance air_time
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 853 851 184 41
2 2013 1 1 290 338 1134 213
3 2013 1 1 260 263 266 46
4 2013 1 1 157 174 213 60
5 2013 1 1 216 222 708 121
6 2013 1 1 255 250 589 115
7 2013 1 1 285 246 1085 146
8 2013 1 1 192 191 199 44
9 2013 1 1 379 456 1092 222
10 2013 1 2 224 207 550 94
# ... with 3,296 more rows
#Find all groups bigger than a threshold
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests
# A tibble: 332,577 x 19
# Groups: dest [77]
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 332,567 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#Standardise to compute per group metrics
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
  select(year:day, dest, arr_delay, prop_delay)
# A tibble: 131,106 x 6
# Groups: dest [77]
year month day dest arr_delay prop_delay
<int> <int> <int> <chr> <dbl> <dbl>
1 2013 1 1 IAH 11 0.000111
2 2013 1 1 IAH 20 0.000201
3 2013 1 1 MIA 33 0.000235
4 2013 1 1 ORD 12 0.0000424
5 2013 1 1 FLL 19 0.0000938
6 2013 1 1 ORD 8 0.0000283
7 2013 1 1 LAX 7 0.0000344
8 2013 1 1 DFW 31 0.000282
9 2013 1 1 ATL 12 0.0000400
10 2013 1 1 DTW 16 0.000116
# ... with 131,096 more rows
Tidy data
library(tidyverse)
#There are three interrelated rules which make a dataset tidy:
#1.Each variable must have its own column.
#2.Each observation must have its own row.
#3.Each value must have its own cell.
table1 # a tidy table
# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
table2 # each observation is spread over two rows
# A tibble: 12 x 4
country year type count
<chr> <int> <chr> <int>
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
table3 # two variables combined into one "rate" column
# A tibble: 6 x 3
country year rate
* <chr> <int> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
table4a # the cases, spread across two year columns
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 745 2666
2 Brazil 37737 80488
3 China 212258 213766
table4b # the matching populations
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 19987071 20595360
2 Brazil 172006362 174504898
3 China 1272915272 1280428583
# With tidy table1 it is easy to compute the rate per 10,000
table1 %>%
  mutate(rate = cases / population * 10000)
# A tibble: 6 x 5
country year cases population rate
<chr> <int> <int> <int> <dbl>
1 Afghanistan 1999 745 19987071 0.373
2 Afghanistan 2000 2666 20595360 1.29
3 Brazil 1999 37737 172006362 2.19
4 Brazil 2000 80488 174504898 4.61
5 China 1999 212258 1272915272 1.67
6 China 2000 213766 1280428583 1.67
# Or the total cases per year
table1 %>%
  count(year, wt = cases)
# A tibble: 2 x 2
year n
<int> <int>
1 1999 250740
2 2000 296920
# Visualise changes over time
#library(ggplot2)
#ggplot(table1, aes(year, cases)) +
# geom_line(aes(group = country), colour = "grey50") +
# geom_point(aes(colour = country))
##Gather
#some of the column names are not names of variables, but values of a variable. Take table4a: the column names 1999 and 2000 represent values of the year variable, and each row represents two observations, not one
table4a
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 745 2666
2 Brazil 37737 80488
3 China 212258 213766
#The name of the variable whose values form the column names. I call that the key, and here it is year.
#The name of the variable whose values are spread over the cells. I call that value, and here it’s the number of cases.
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")# A tibble: 6 x 3
country year cases
<chr> <chr> <int>
1 Afghanistan 1999 745
2 Brazil 1999 37737
3 China 1999 212258
4 Afghanistan 2000 2666
5 Brazil 2000 80488
6 China 2000 213766
#Gather table4b the same way; this time the variable stored in the cell values is population:
table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")# A tibble: 6 x 3
country year population
<chr> <chr> <int>
1 Afghanistan 1999 19987071
2 Brazil 1999 172006362
3 China 1999 1272915272
4 Afghanistan 2000 20595360
5 Brazil 2000 174504898
6 China 2000 1280428583
#To combine the tidied versions of table4a and table4b into a single tibble, we need to use dplyr::left_join()
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
# A tibble: 6 x 4
country year cases population
<chr> <chr> <int> <int>
1 Afghanistan 1999 745 19987071
2 Brazil 1999 37737 172006362
3 China 1999 212258 1272915272
4 Afghanistan 2000 2666 20595360
5 Brazil 2000 80488 174504898
6 China 2000 213766 1280428583
##Spreading
#Spreading is the opposite of gathering. You use it when an observation is scattered across multiple rows
table2
# A tibble: 12 x 4
country year type count
<chr> <int> <chr> <int>
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
#The column that contains variable names, the key column. Here, it’s type.
#The column that contains values from multiple variables, the value column. Here it’s count.
table2 %>%
  spread(key = type, value = count)
# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#gather() makes wide tables narrower and longer; spread() makes long tables shorter and wider.
##Separate
#pulls apart one column into multiple columns, by splitting wherever a separator character appears
table3
# A tibble: 6 x 3
country year rate
* <chr> <int> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
#The rate column contains both cases and population variables, and we need to split it into two variables
table3 %>%
separate(rate, into = c("cases", "population"))# A tibble: 6 x 4
country year cases population
<chr> <int> <chr> <chr>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#separate() will split values wherever it sees a non-alphanumeric character
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")# A tibble: 6 x 4
country year cases population
<chr> <int> <chr> <chr>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#If you wish to use a specific character to separate a column, you can pass the character to the sep argument of separate().
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#ask separate() to try and convert to better types using convert = TRUE
#pass a vector of integers to sep and separate() will interpret the integers as positions to split at. Positive values start at 1 on the far left of the strings; negative values start at -1 on the far right. When using integers to separate strings, the length of sep should be one less than the number of names in "into"
#Separate the last two digits of each year
table3 %>%
separate(year, into = c("century", "year"), sep = 2)# A tibble: 6 x 4
country century year rate
<chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
#unite() is the inverse of separate(): it combines multiple columns into a single column
#use unite() to rejoin the century and year columns
tidyr::table5
# A tibble: 6 x 4
country century year rate
* <chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
table5 %>%
  unite(new, century, year)
# A tibble: 6 x 3
country new rate
<chr> <chr> <chr>
1 Afghanistan 19_99 745/19987071
2 Afghanistan 20_00 2666/20595360
3 Brazil 19_99 37737/172006362
4 Brazil 20_00 80488/174504898
5 China 19_99 212258/1272915272
6 China 20_00 213766/1280428583
# The default separator is "_"; pass sep = "" to use none
table5 %>%
  unite(new, century, year, sep = "")
# A tibble: 6 x 3
country new rate
<chr> <chr> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
##Missing values
#Explicitly, i.e. flagged with NA.
#Implicitly, i.e. simply not present in the data.
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
#make the implicit missing value explicit by putting years in the columns
stocks %>%
  spread(year, return)
# A tibble: 4 x 3
qtr `2015` `2016`
<dbl> <dbl> <dbl>
1 1 1.88 NA
2 2 0.59 0.92
3 3 0.35 0.17
4 4 NA 2.66
#set na.rm = TRUE in gather() to turn explicit missing values implicit
stocks %>%
spread(year, return) %>%
  gather(year, return, `2015`:`2016`, na.rm = TRUE)
# A tibble: 6 x 3
qtr year return
<dbl> <chr> <dbl>
1 1 2015 1.88
2 2 2015 0.59
3 3 2015 0.35
4 2 2016 0.92
5 3 2016 0.17
6 4 2016 2.66
# complete() makes both missing values explicit
stocks %>%
  complete(year, qtr)
# A tibble: 8 x 3
year qtr return
<dbl> <dbl> <dbl>
1 2015 1 1.88
2 2015 2 0.59
3 2015 3 0.35
4 2015 4 NA
5 2016 1 NA
6 2016 2 0.92
7 2016 3 0.17
8 2016 4 2.66
#complete() takes a set of columns, and finds all unique combinations. It then ensures the original dataset contains all those values, filling in explicit NAs where necessary
#when a data source has primarily been used for data entry, missing values indicate that the previous value should be carried forward
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
#takes a set of columns where you want missing values to be replaced by the most recent non-missing value
treatment %>%
  fill(person)
# A tibble: 4 x 3
person treatment response
<chr> <dbl> <dbl>
1 Derrick Whitmore 1 7
2 Derrick Whitmore 2 10
3 Derrick Whitmore 3 9
4 Katherine Burke 1 4
Vectors
[1] "character"
[1] "integer"
[1] 3
#Logical vectors are the simplest type of atomic vector because they can take only three possible values: FALSE, TRUE, and NA
#comparisons
1:10 %% 3 == 0
 [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
c(TRUE, TRUE, FALSE, NA)
[1] TRUE TRUE FALSE NA
#Integer and double vectors are known collectively as numeric vectors
#To make an integer, place an L after the number
typeof(1)
[1] "double"
typeof(1L)
[1] "integer"
1.5L # R warns and returns a double; the L suffix only makes integers from whole numbers
[1] 1.5
x <- sqrt(2) ^ 2
x
[1] 2
x - 2
[1] 4.440892e-16
# when working with floating point numbers: most calculations include some approximation error. Instead of comparing floating point numbers using ==, you should use dplyr::near() which allows for some numerical tolerance
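# For example, with x <- sqrt(2) ^ 2 from above:
dplyr::near(x, 2) # TRUE, even though x == 2 is FALSE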
#Integers have one special value: NA, while doubles have four: NA, NaN, Inf and -Inf. All three special values NaN, Inf and -Inf can arise during division
c(-1, 0, 1) / 0
[1] -Inf NaN Inf
#Avoid using == to check for these other special values. Instead use the helper functions is.finite(), is.infinite(), and is.nan()
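# Checking the same division results with the helper functions (results as comments):
is.finite(c(-1, 0, 1) / 0)   # FALSE FALSE FALSE
is.infinite(c(-1, 0, 1) / 0) # TRUE FALSE TRUE
is.nan(c(-1, 0, 1) / 0)      # FALSE TRUE FALSE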
#Character
#each unique string is only stored in memory once, and every use of the string points to that representation. This reduces the amount of memory needed by duplicated strings. You can see this behaviour in practice with pryr::object_size()
x <- "This is a reasonably long string."
pryr::object_size(x)
152 B
y <- rep(x, 1000)
pryr::object_size(y)
8.14 kB
#y doesn't take up 1,000x as much memory as x, because each element of y is just a pointer to the same string. A pointer is 8 bytes, so 1,000 pointers plus one copy of the 152 B string come to roughly 8.14 kB
#Missing Value
#Each type of atomic vector has its own missing value
NA            # logical
[1] NA
NA_integer_   # integer
[1] NA
NA_real_      # double
[1] NA
NA_character_ # character
[1] NA
#Coercion
#There are two ways to convert, or coerce, one type of vector to another:
#1 Explicit coercion
# as.logical(), as.integer(), as.double(), or as.character()
#2 Implicit coercion
#EX: using a logical vector in a numeric context
#TRUE=1, FALSE=0
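# Explicit coercion, for example (results as comments):
as.integer(c("1", "2", "3")) # character to integer: 1 2 3
as.logical(c(0, 1, 1))       # double to logical: FALSE TRUE TRUE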
x <- sample(20, 100, replace = TRUE)
y <- x > 10
sum(y)  # how many are greater than 10?
[1] 48
mean(y) # what proportion are greater than 10?
[1] 0.48
#When you combine multiple types in a vector with c(), the most complex type always wins
typeof(c(TRUE, 1L))
[1] "integer"
typeof(c(1L, 1.5))
[1] "double"
typeof(c(1.5, "a"))
[1] "character"
#Each predicate also comes with a “scalar” version, like is_scalar_atomic(), which checks that the length is 1. This is useful, for example, if you want to check that an argument to your function is a single logical value.
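# A quick sketch of one scalar predicate (these live in the purrr package):
purrr::is_scalar_atomic(TRUE)    # TRUE: atomic and length 1
purrr::is_scalar_atomic(c(1, 2)) # FALSE: length 2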
#Vector recycling: implicitly coerce the length of vectors, because the shorter vector is repeated, or recycled, to the same length as the longer vector.
# Most functions in R are vectorised, meaning that they operate on a whole vector of numbers at once
sample(10) + 100
 [1] 109 102 106 110 105 107 108 104 101 103
runif(10) > 0.5
 [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
#R will expand the shortest vector to the same length as the longest, so called recycling
1:10 + 1:2
 [1] 2 4 4 6 6 8 8 10 10 12
1:10 + 1:3 # the longer length is not a multiple of the shorter, so R also warns
 [1] 2 4 6 5 7 9 8 10 12 11
#While vector recycling can be used to create very succinct, clever code, it can also silently conceal problems. For this reason, the vectorised functions in tidyverse will throw errors when you recycle anything other than a scalar. If you do want to recycle, you’ll need to do it yourself with rep():
tibble(x = 1:4, y = rep(1:2, 2))
# A tibble: 4 x 2
x y
<int> <int>
1 1 1
2 2 2
3 3 1
4 4 2
tibble(x = 1:4, y = rep(1:2, each = 2))
# A tibble: 4 x 2
x y
<int> <int>
1 1 1
2 2 1
3 3 2
4 4 2
# Name a vector during creation with c():
c(x = 1, y = 2, z = 4)
x y z
1 2 4
# or after the fact with purrr::set_names()
set_names(1:3, c("a", "b", "c"))
a b c
1 2 3
##Subset
#1.A numeric vector containing only integers. The integers must either be all positive, all negative, or zero. Subsetting with positive integers keeps the elements at those positions:
x <- c("one", "two", "three", "four", "five")
x[c(3, 2, 5)]
[1] "three" "two" "five"
x[c(1, 1, 5, 5, 5, 2)]
[1] "one" "one" "five" "five" "five" "two"
#Negative values drop the elements at the specified positions
x[c(-1, -3, -5)]
[1] "two" "four"
#2.Subsetting with a logical vector keeps all values corresponding to a TRUE value. This is most often useful in conjunction with the comparison functions
x <- c(10, 3, NA, 5, 8, 1, NA)
# All non-missing values of x
x[!is.na(x)]
[1] 10 3 5 8 1
# All even (or missing!) values of x
x[x %% 2 == 0]
[1] 10 NA 8 NA
#3.If you have a named vector, you can subset it with a character vector
x <- c(abc = 1, def = 2, xyz = 5)
x[c("xyz", "def")]xyz def
5 2
##20.5 Recursive vectors (lists)
#create a list
x <- list(1, 2, 3)
x
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
str(x)
List of 3
$ : num 1
$ : num 2
$ : num 3
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
List of 3
$ a: num 1
$ b: num 2
$ c: num 3
#lists can contain a mix of objects
y <- list("a", 1L, 1.5, TRUE)
str(y)
List of 4
$ : chr "a"
$ : int 1
$ : num 1.5
$ : logi TRUE
#or even contain other lists
z <- list(list(1, 2), list(3, 4))
str(z)
List of 2
$ :List of 2
..$ : num 1
..$ : num 2
$ :List of 2
..$ : num 3
..$ : num 4
x1 <- list(c(1, 2), c(3, 4))
x2 <- list(list(1, 2), list(3, 4))
x3 <- list(1, list(2, list(3)))
a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a[1:2])
List of 2
$ a: int [1:3] 1 2 3
$ b: chr "a string"
str(a[4]) #[ extracts a sub-list. The result will always be a list
List of 1
$ d:List of 2
..$ : num -1
..$ : num -5
#[[ extracts a single component from a list. It removes a level of hierarchy from the list.
str(a[[1]])
int [1:3] 1 2 3
str(a[[4]])
List of 2
$ : num -1
$ : num -5
# $ extracts like [[, but you don't need quotes
a$a
[1] 1 2 3
a[["a"]]
[1] 1 2 3
##Attributes
x <- 1:10
attr(x, "greeting")
NULL
attr(x, "greeting") <- "Hi!"
attr(x, "farewell") <- "Bye!"
attributes(x)
$greeting
[1] "Hi!"
$farewell
[1] "Bye!"
##Factors
# Factors are designed to represent categorical data that can take a fixed set of possible values. Factors are built on top of integers, and have a levels attribute
x <- factor(c("ab", "cd", "ab"), levels = c("ab", "cd", "ef"))
typeof(x)
[1] "integer"
attributes(x)
$levels
[1] "ab" "cd" "ef"
$class
[1] "factor"
#Dates in R are numeric vectors that represent the number of days since 1 January 1970
x <- as.Date("1971-01-01")
unclass(x)
[1] 365
typeof(x)
[1] "double"
attributes(x)
$class
[1] "Date"
#Date-times are numeric vectors with class POSIXct that represent the number of seconds since 1 January 1970. (In case you were wondering, “POSIXct” stands for “Portable Operating System Interface”, calendar time.)
x <- lubridate::ymd_hm("1970-01-01 01:00")
unclass(x)
[1] 3600
attr(,"tzone")
[1] "UTC"
typeof(x)
[1] "double"
attributes(x)
$class
[1] "POSIXct" "POSIXt"
$tzone
[1] "UTC"
#The tzone attribute is optional. It controls how the time is printed, not what absolute time it refers to
attr(x, "tzone") <- "US/Pacific"
x[1] "1969-12-31 17:00:00 PST"
[1] "1969-12-31 20:00:00 EST"
# Another type of date-times called POSIXlt. These are built on top of named lists
y <- as.POSIXlt(x)
typeof(y)
[1] "list"
attributes(y)
$names
[1] "sec" "min" "hour" "mday" "mon" "year" "wday"
[8] "yday" "isdst" "zone" "gmtoff"
$class
[1] "POSIXlt" "POSIXt"
$tzone
[1] "US/Eastern" "EST" "EDT"
#POSIXct’s are always easier to work with, so if you find you have a POSIXlt, convert it to a regular date-time with lubridate::as_datetime()
#Tibbles are augmented lists: they have class “tbl_df” + “tbl” + “data.frame”, and names (column) and row.names attributes
tb <- tibble::tibble(x = 1:5, y = 5:1)
typeof(tb)
[1] "list"
attributes(tb)
$names
[1] "x" "y"
$row.names
[1] 1 2 3 4 5
$class
[1] "tbl_df" "tbl" "data.frame"
#The difference between a tibble and a list is that all the elements of a data frame must be vectors with the same length. All functions that work with tibbles enforce this constraint.
#Traditional data.frames have a very similar structure
df <- data.frame(x = 1:5, y = 5:1)
typeof(df)
[1] "list"
attributes(df)
$names
[1] "x" "y"
$class
[1] "data.frame"
$row.names
[1] 1 2 3 4 5
---
title: "EDA Code Portfolio Wk1-6"
output:
flexdashboard::flex_dashboard:
orientation: columns
source_code: embed
---
```{r setup}
library(flexdashboard)
```
YaRrr!
=====================================
Column {.tabset}
-------------------------------------
### Ch 8
Chapter 8 Matrices and Dataframes
```{r echo = TRUE}
# Create a dataframe of boat sale data named bsale
bsale <- data.frame(name = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
color = c("black", "green", "pink", "blue", "blue",
"green", "green", "yellow", "black", "black"),
age = c(143, 53, 356, 23, 647, 24, 532, 43, 66, 86),
price = c(53, 87, 54, 66, 264, 32, 532, 58, 99, 132),
cost = c(52, 80, 20, 100, 189, 12, 520, 68, 80, 100),
stringsAsFactors = FALSE)
# Avoid converting strings to factors!
# Explore the bsale dataset:
head(bsale) # Show the first few rows
str(bsale) # Show the structure of the data
View(bsale) # Open the data in a new window
names(bsale) # Show the names of the columns
nrow(bsale) # Show how many rows are in the data
# Calculating statistics from column vectors
mean(bsale$age) # The mean age
table(bsale$color) # How many boats were there of each color?
max(bsale$price) # The maximum price
# Adding new columns
bsale$id <- 1:nrow(bsale)
bsale$age.decades <- bsale$age / 10
bsale$profit <- bsale$price - bsale$cost
# The mean price of green boats
with(bsale, mean(price[color == "green"]))
# The names of boats older than 100 years?
with(bsale, name[age > 100])
# What percent of black boats had a positive profit?
with(subset(bsale, color == "black"), mean(profit > 0))
# Save only the price and cost columns in a new dataframe
bsale.2 <- bsale[c("price", "cost")]
# Change the names of the columns to "p" and "c"
names(bsale.2) <- c("p", "c")
# Use subset() to create a dataframe called old.black.bsale containing only data from black boats older than 50 years
old.black.bsale <- subset(bsale, color == "black" & age > 50)
x <- 1:5
y <- 6:10
z <- 11:15
# Create a matrix where x, y and z are columns
cbind(x, y, z)
# Create a matrix where x, y and z are rows
rbind(x, y, z)
# Creating a matrix with numeric and character columns will make everything a character:
cbind(c(1, 2, 3, 4, 5),
c("a", "b", "c", "d", "e"))
# Create a matrix of the integers 1:10, with 5 rows and 2 columns
matrix(data = 1:10,
nrow = 5,
ncol = 2)
# with 2 rows and 5 columns
matrix(data = 1:10,
nrow = 2,
ncol = 5)
# With 2 rows and 5 columns, but fill by row instead of columns
matrix(data = 1:10,
nrow = 2,
ncol = 5,
byrow = TRUE)
# Create a dataframe of survey data
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23))
survey
# Show me the structure of the survey dataframe
str(survey)
#R converted sex to a factor with only two possible levels
# Create a dataframe of survey data without factors
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23),
stringsAsFactors = FALSE)
#check the structure to confirm no column was converted to a factor
str(survey)
#To see a complete list of all the datasets included in the datasets package, run the code: library(help = "datasets")
library(help = "datasets")
##8.3 Matrix and dataframe function
# head() shows the first few rows
head(ChickWeight)
# tail() shows the last few rows
tail(ChickWeight)
# View() opens the entire dataframe in a new window
View(ChickWeight)
# Print summary statistics of ToothGrowth to the console
summary(ToothGrowth)
# Print additional information about ToothGrowth to the console
str(ToothGrowth)
##8.4 Dataframe column names
# The names of columns in the ToothGrowth dataframe
names(ToothGrowth)
# Return the len column of ToothGrowth
ToothGrowth$len
# The mean of the len column of ToothGrowth?
mean(ToothGrowth$len)
# Table of the supp column of ToothGrowth.
table(ToothGrowth$supp)
#The len AND supp columns of ToothGrowth
head(ToothGrowth[c("len", "supp")])
# Create a new dataframe called survey
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"age" = c(24, 25, 42, 56, 22))
survey
# Add a new column called sex to survey
survey$sex <- c("m", "m", "f", "f", "m")
#survey with new sex column
survey
#Change name of 1st column of df to "a": names(df)[1] <- "a"
#Change name of 2nd column of df to "b": names(df)[2] <- "b"
# Change the name of the first column of survey to "participant.number"
names(survey)[1] <- "participant.number"
survey
# Change the column name from age to age.years
names(survey)[names(survey) == "age"] <- "age.years"
survey
##8.5 Slicing dataframes
# Return row 1 df[1, ]
# Return column 5 df[, 5]
# Rows 1:5 and column 2 df[1:5, 2]
# Show the rows 1-6 and column 1 of ToothGrowth
ToothGrowth[1:6, 1]
# Show the rows 1-3 and columns 1 and 3 of ToothGrowth
ToothGrowth[1:3, c(1,3)]
# Show the 1st row (and all columns) of ToothGrowth
ToothGrowth[1, ]
# Show the 2nd column (and all rows) of ToothGrowth
ToothGrowth[, 2]
# Create a new df with only the rows of ToothGrowth where supp equals VC
ToothGrowth.VC <- ToothGrowth[ToothGrowth$supp == "VC", ]
# Create a new df with only the rows of ToothGrowth where supp equals OJ and dose < 1
ToothGrowth.OJ.a <- ToothGrowth[ToothGrowth$supp == "OJ" &
ToothGrowth$dose < 1, ]
# Show rows of ToothGrowth where len < 20 AND supp == "OJ" AND dose >= 1
subset(x = ToothGrowth,
subset = len < 20 &
supp == "OJ" &
dose >= 1)
# Get rows of ToothGrowth where len > 30 AND supp == "VC", but only return the len and dose columns
subset(x = ToothGrowth,
subset = len > 30 & supp == "VC",
select = c(len, dose))
##8.6 Combining slicing with functions
# The mean tooth length of Guinea pigs given OJ
# Step 1: Create a subsetted dataframe called oj
oj <- subset(x = ToothGrowth,
subset = supp == "OJ")
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
# Get the same solution using logical indexing
# Step 1: Create a subsetted dataframe called oj
oj <- ToothGrowth[ToothGrowth$supp == "OJ",]
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
# Only referring to column vectors
mean(ToothGrowth$len[ToothGrowth$supp == "OJ"])
#The with() function saves typing when you use multiple columns from a dataframe: you name the dataframe once at the start of the call, and R then looks up every name in the following expression inside that dataframe first.
#create a dataframe called health
health <- data.frame("age" = c(32, 24, 43, 19, 43),
"height" = c(1.75, 1.65, 1.50, 1.92, 1.80),
"weight" = c(70, 65, 62, 79, 85))
#Add a new column called bmi which represents a person’s body mass index (BMI)
health$bmi <- health$weight / health$height ^ 2
#use with function
# Save typing by using with()
with(health, weight / height ^ 2)
```
### Ch 9
Chapter 9 Importing, saving and managing data
```{r echo = TRUE}
#print the current working directory
getwd()
#change the working directory to a different path
#setwd(dir= )
#see all the objects defined in the current workspace
ls()
# Create objects to save later
study1.df<-data.frame(id = 1:5,
sex = c("m", "m", "f", "f", "m"),
score = c(51, 20, 67, 52, 42))
score.by.sex<-aggregate(score ~ sex,
FUN = mean,
data = study1.df)
study1.htest<-t.test(score ~ sex,
data = study1.df)
# Save three objects as a new .RData file in the data folder of the current working directory
#save(study1.df, score.by.sex, study1.htest, file = "data/study1.RData")
#To save all the objects in your workspace as a .RData file, use the "save.image()" function
#To load an .RData file, that is, to import all of the objects contained in the .RData file into your current workspace, use the load() function
#To remove objects from your workspace, use the "rm()" function
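# A minimal sketch of those three calls, kept commented out so knitting this
# document doesn't overwrite files or clear the workspace (the file names
# here are examples only):
# save.image(file = "all_objects.RData")
# load(file = "data/study1.RData")
# rm(study1.df, score.by.sex)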
# Read a text file from the web
fromweb <- read.table(file = 'http://goo.gl/jTNf6P',
sep = '\t',
header = TRUE)
# Print the result
fromweb
a <- data.frame("sex" = c("m", "f", "m"),
"age" = c(19, 43, 25),
"favorite.movie" = c("Moon", "The Goonies", "Spice World"))
b <- mean(a$age)
c <- table(a$sex)
```
Exploratory Data Analysis with R
=====================================
Column {.tabset}
-------------------------------------
### Ch4
Exploratory Data Analysis Checklist
```{r echo = TRUE}
library(readr)
ozone<- read_csv("C:/Users/L5M/Desktop/HU/EDA/EDA_R_project/US EPA data 2017.csv")
names(ozone) <- make.names(names(ozone))
#make.names() rewrites the column names to remove any spaces
# check the number of rows and columns
nrow(ozone)
ncol(ozone)
#examine the classes of each of the columns
str(ozone)
#Look at the top 6 rows of data
head(ozone)
#Look at the bottom 6 rows of specific columns
tail(ozone[, c(6:7, 10)])
#take a look at the State.Code variable to see which state codes appear in the data
table(ozone$State.Code)
library(dplyr)
#use filter() to subset rows by a condition
filter(ozone, State.Code == "78")
#count the number of unique states
select(ozone, State.Name) %>% unique %>% nrow
#unique elements
unique(ozone$State.Name)
#statistical summary for a specific variable
summary(ozone$Arithmetic.Mean)
quantile(ozone$Arithmetic.Mean)
```
R for Data Science
=====================================
Column {.tabset}
-------------------------------------
### Ch5
Data Transformation
```{r echo = TRUE}
library(nycflights13)
library(tidyverse)
flights
##dplyr
#Pick observations by their values (filter()).
#Reorder the rows (arrange()).
#Pick variables by their names (select()).
#Create new variables with functions of existing variables (mutate()).
#Collapse many values down to a single summary (summarise()).
##filter
#filter() allows you to subset observations based on their values. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame.
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
#wrapping the assignment in parentheses both assigns and prints the result
near(sqrt(2) ^ 2, 2)
near(1 / 49 * 49, 1)
#Computers use finite precision arithmetic, so remember that every number you see is an approximation. Instead of relying on ==, use near()
#Finds all flights that departed in November or December
filter(flights, month == 11 | month == 12)
#"x %in% y" will select every row where x is one of the values in y
nov_dec <- filter(flights, month %in% c(11, 12))
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
#De Morgan’s law: !(x & y) is the same as !x | !y, and !(x | y) is the same as !x & !y
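# a quick sanity check of De Morgan's law on logical scalars
# (p and q are just illustrative values)
p <- TRUE; q <- FALSE
!(p & q) == (!p | !q)
!(p | q) == (!p & !q)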
# Missing Value
NA > 5
NA + 10
NA / 2
#to determine if a value is missing, use "is.na()"
x<- NA
is.na(x)
df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
#filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly
filter(df, is.na(x) | x > 1)
##Arrange
#arrange() works similarly to filter() except that instead of selecting rows, it changes their order
arrange(flights, year, month, day)
#If you provide more than one column name, each additional column is used to break ties in the values of the preceding columns
#desc() to re-order by a column in descending order
arrange(flights, desc(dep_delay))
#Missing values are always sorted at the end
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
arrange(df, desc(x))
## select
# allows you to rapidly zoom in on a useful subset using operations based on the names of the variables
# Select columns by name
select(flights, year, month, day)
# Select all columns between year and day (inclusive)
select(flights, year:day)
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
#starts_with("abc"): matches names that begin with “abc”.
#ends_with("xyz"): matches names that end with “xyz”.
#contains("ijk"): matches names that contain “ijk”.
#matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.
#num_range("x", 1:3): matches x1, x2 and x3.
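# a small illustration of the helpers above on the flights columns
select(flights, starts_with("dep"))
select(flights, ends_with("time"))
select(flights, contains("arr"))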
# rename() is a variant of select() that keeps all the variables
rename(flights, tail_num = tailnum)
# use everything() to move a handful of variables to the start of the data frame
select(flights, time_hour, air_time, everything())
## Add new variables with mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
#transmute() only keeps the new variables
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
#Arithmetic operators: +, -, *, /, ^. These are all vectorised, using the so called “recycling rules”. If one parameter is shorter than the other, it will be automatically extended to be the same length. This is most useful when one of the arguments is a single number: air_time / 60, hours * 60 + minute, etc.
#Arithmetic operators are also useful in conjunction with the aggregate functions you’ll learn about later. For example, x / sum(x) calculates the proportion of a total, and y - mean(y) computes the difference from the mean.
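# quick illustration of the aggregate-style arithmetic mentioned above
# (v is just a toy vector)
v <- c(10, 20, 70)
v / sum(v)   # each value as a proportion of the total
v - mean(v)  # each value's difference from the mean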
#Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y). Modular arithmetic is a handy tool because it allows you to break integers up into pieces. For example, in the flights dataset, you can compute hour and minute from dep_time with:
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
#Offsets: lead() and lag() allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. x - lag(x)) or find when values change (x != lag(x)). They are most useful in conjunction with group_by(), which you’ll learn about shortly.
(x <- 1:10)
lag(x)
lead(x)
x
cumsum(x)
cummean(x)
#Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
#ranking
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
min_rank(desc(y))
row_number(y)
dense_rank(y)
percent_rank(y)
cume_dist(y)
##Summarise
#summarise() collapses a data frame to a single row; na.rm = TRUE drops missing values before aggregating
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
#with group_by(), the same call returns one summary row per group
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## %>% Pipe to combine multiple operations
by_dest <- group_by(flights, dest)
#Group flights by destination.
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
#Summarise to compute distance, average delay, and number of flights.
delay <- filter(delay, count > 20, dest != "HNL")
#Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
# delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there's more ability to make up delays in the air
#ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
# geom_point(aes(size = count), alpha = 1/3) +
# geom_smooth(se = FALSE)
#remove the cancelled flights (those with NA delays)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
#Whenever you do any aggregation, it’s always a good idea to include either a count (n()), or a count of non-missing values (sum(!is.na(x))). That way you can check that you’re not drawing conclusions based on very small amounts of data
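# e.g. pair each mean with a count of the non-missing values it is based on
flights %>%
  group_by(dest) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n_nonmissing = sum(!is.na(arr_delay))
  )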
# The planes (identified by their tail number) that have the highest average delays
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
#ggplot(data = delays, mapping = aes(x = delay)) +
# geom_freqpoly(binwidth = 10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
#ggplot(data = delays, mapping = aes(x = n, y = delay)) +
# geom_point(alpha = 1/10)
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
# Convert to a tibble so it prints nicely
batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarise(
ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE)
)
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
geom_smooth(se = FALSE)
batters %>%
arrange(desc(ba))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
# Why is distance to some destinations more variable than to others?
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
not_cancelled %>%
count(dest)
# “count” (sum) the total number of miles a plane flew
not_cancelled %>%
count(tailnum, wt = distance)
# How many flights left before 5am? (these usually indicate delayed flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_perc = mean(arr_delay > 60))
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year <- summarise(per_month, flights = sum(flights)))
##ungroup
daily %>%
ungroup() %>% # no longer grouped by date
summarise(flights = n()) # all flights
#Find the worst members of each group
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay)) < 10)
#Find all groups bigger than a threshold
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests
#Standardise to compute per group metrics
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
```
### Ch12
Tidy data
```{r echo = TRUE}
library(tidyverse)
#There are three interrelated rules which make a dataset tidy:
#1.Each variable must have its own column.
#2.Each observation must have its own row.
#3.Each value must have its own cell.
table1 #tidy table
table2
table3
table4a
table4b
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
# Compute cases per year
table1 %>%
count(year, wt = cases)
# Visualise changes over time
#library(ggplot2)
#ggplot(table1, aes(year, cases)) +
# geom_line(aes(group = country), colour = "grey50") +
# geom_point(aes(colour = country))
##Gather
#some of the column names are not names of variables, but values of a variable. Take table4a: the column names 1999 and 2000 represent values of the year variable, and each row represents two observations, not one
table4a
#The name of the variable whose values form the column names. I call that the key, and here it is year.
#The name of the variable whose values are spread over the cells. I call that value, and here it’s the number of cases.
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
#the variable stored in the cell values:
table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
#To combine the tidied versions of table4a and table4b into a single tibble, we need to use dplyr::left_join()
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
##Spreading
#Spreading is the opposite of gathering. You use it when an observation is scattered across multiple rows
table2
#The column that contains variable names, the key column. Here, it’s type.
#The column that contains values from multiple variables, the value column. Here it’s count.
table2 %>%
spread(key = type, value = count)
#gather() makes wide tables narrower and longer; spread() makes long tables shorter and wider.
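# a quick round trip: spread() then gather() recovers the shape of table2
# (column order aside), showing the two verbs are inverses
table2 %>%
  spread(key = type, value = count) %>%
  gather(cases, population, key = "type", value = "count")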
##Separate
#pulls apart one column into multiple columns, by splitting wherever a separator character appears
table3
#The rate column contains both cases and population variables, and we need to split it into two variables
table3 %>%
separate(rate, into = c("cases", "population"))
#separate() will split values wherever it sees a non-alphanumeric character
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
#If you wish to use a specific character to separate a column, you can pass the character to the sep argument of separate().
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
#ask separate() to try and convert to better types using convert = TRUE
#pass a vector of integers to sep. separate() will interpret the integers as positions to split at. Positive values start at 1 on the far-left of the strings; negative value start at -1 on the far-right of the strings. When using integers to separate strings, the length of sep should be one less than the number of names in "into"
#Separate the last two digits of each year
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
#unite() is the inverse of separate(): it combines multiple columns into a single column
#use unite() to rejoin the century and year columns
tidyr::table5
table5 %>%
unite(new, century, year)
#use the sep argument
table5 %>%
unite(new, century, year, sep = "")
##Missing values
#Explicitly, i.e. flagged with NA.
#Implicitly, i.e. simply not present in the data.
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
#make the implicit missing value explicit by putting years in the columns
stocks %>%
spread(year, return)
#set na.rm = TRUE in gather() to turn explicit missing values implicit
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = TRUE)
stocks %>%
complete(year, qtr)
#complete() takes a set of columns, and finds all unique combinations. It then ensures the original dataset contains all those values, filling in explicit NAs where necessary
#when a data source has primarily been used for data entry, missing values indicate that the previous value should be carried forward
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
#fill() takes a set of columns where you want missing values to be replaced by the most recent non-missing value
treatment %>%
fill(person)
```
### Ch20
Vectors
```{r echo = TRUE}
library(tidyverse)
#2 properties of vectors: type and length
#Type
typeof(letters)
typeof(1:10)
#length
x <- list("a", "b", 1:10)
length(x)
#Logical vectors are the simplest type of atomic vector because they can take only three possible values: FALSE, TRUE, and NA
#comparisons
1:10 %% 3 == 0
#create by hand
c(TRUE, TRUE, FALSE, NA)
#Integer and double vectors are known collectively as numeric vectors
#To make an integer, place an L after the number
typeof(1)
typeof(1L)
1.5L
#Doubles are approximations.
x <- sqrt(2) ^ 2
x
x - 2
# when working with floating point numbers: most calculations include some approximation error. Instead of comparing floating point numbers using ==, you should use dplyr::near() which allows for some numerical tolerance
#Integers have one special value: NA, while doubles have four: NA, NaN, Inf and -Inf. All three special values NaN, Inf and -Inf can arise during division
c(-1, 0, 1) / 0
#Avoid using == to check for these other special values. Instead use the helper functions is.finite(), is.infinite(), and is.nan()
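# a quick look at how those helpers classify the special values
is.finite(c(-Inf, NA, NaN, 0))
is.infinite(c(-Inf, NA, NaN, 0))
is.nan(c(-Inf, NA, NaN, 0))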
#Character
#each unique string is only stored in memory once, and every use of the string points to that representation. This reduces the amount of memory needed by duplicated strings. You can see this behaviour in practice with pryr::object_size()
x <- "This is a reasonably long string."
pryr::object_size(x)
y <- rep(x, 1000)
pryr::object_size(y)
#y doesn’t take up 1,000x as much memory as x, because each element of y is just a pointer to the same string; a pointer is 8 bytes, so 1,000 pointers plus the one stored copy of the string come to roughly 8 kB
#Missing Value
#Each type of atomic vector has its own missing value
NA # logical
NA_integer_ # integer
NA_real_ # double
NA_character_ # character
#Coercion
#There are two ways to convert, or coerce, one type of vector to another:
#1 Explicit coercion
# as.logical(), as.integer(), as.double(), or as.character()
#2 Implicit coercion
#EX: using a logical vector in a numeric context
#TRUE=1, FALSE=0
x <- sample(20, 100, replace = TRUE)
y <- x > 10
sum(y) # how many are greater than 10
mean(y) # proportion are greater than 10
#when combining multiple types with c(), the most complex type always wins.
typeof(c(TRUE, 1L))
typeof(c(1L, 1.5))
typeof(c(1.5, "a"))
#Each predicate also comes with a “scalar” version, like is_scalar_atomic(), which checks that the length is 1. This is useful, for example, if you want to check that an argument to your function is a single logical value.
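# a minimal sketch of that idea: validate a single-flag argument with a
# scalar predicate (assumes rlang is available; check_verbose() is just an
# illustrative name, not a function from any package)
check_verbose <- function(verbose) {
  stopifnot(rlang::is_scalar_logical(verbose))
  verbose
}
check_verbose(TRUE)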
#Vector recycling: R implicitly coerces the length of vectors; the shorter vector is repeated, or recycled, to the length of the longer one.
#Most R functions are vectorised, meaning that they operate on whole vectors of numbers
sample(10) + 100
runif(10) > 0.5
#R will expand the shorter vector to the same length as the longer, so-called recycling
1:10 + 1:2
1:10 + 1:3
#While vector recycling can be used to create very succinct, clever code, it can also silently conceal problems. For this reason, the vectorised functions in tidyverse will throw errors when you recycle anything other than a scalar. If you do want to recycle, you’ll need to do it yourself with rep():
tibble(x = 1:4, y = rep(1:2, 2))
tibble(x = 1:4, y = rep(1:2, each = 2))
# Name a vector during creation with c():
c(x = 1, y = 2, z = 4)
#after the fact with purrr::set_names()
set_names(1:3, c("a", "b", "c"))
##Subset
#1.A numeric vector containing only integers. The integers must either be all positive, all negative, or zero. Subsetting with positive integers keeps the elements at those positions:
x <- c("one", "two", "three", "four", "five")
x[c(3, 2, 5)]
x[c(1, 1, 5, 5, 5, 2)]
#Negative values drop the elements at the specified positions
x[c(-1, -3, -5)]
#2.Subsetting with a logical vector keeps all values corresponding to a TRUE value. This is most often useful in conjunction with the comparison functions
x <- c(10, 3, NA, 5, 8, 1, NA)
# All non-missing values of x
x[!is.na(x)]
# All even (or missing!) values of x
x[x %% 2 == 0]
#3.If you have a named vector, you can subset it with a character vector
x <- c(abc = 1, def = 2, xyz = 5)
x[c("xyz", "def")]
##20.5 Recursive vectors (lists)
#create a list
x <- list(1, 2, 3)
x
str(x)
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
#a list can contain a mix of objects
y <- list("a", 1L, 1.5, TRUE)
str(y)
#or even contain other lists
z <- list(list(1, 2), list(3, 4))
str(z)
x1 <- list(c(1, 2), c(3, 4))
x2 <- list(list(1, 2), list(3, 4))
x3 <- list(1, list(2, list(3)))
a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a[1:2])
str(a[4]) #[ extracts a sub-list. The result will always be a list
#[[ extracts a single component from a list. It removes a level of hierarchy from the list.
str(a[[1]])
str(a[[4]])
# $ extracts like [[, but you don't need quotes
a$a
a[["a"]]
##Attributes
x <- 1:10
attr(x, "greeting")
attr(x, "greeting") <- "Hi!"
attr(x, "farewell") <- "Bye!"
attributes(x)
##Factors
# Factors are designed to represent categorical data that can take a fixed set of possible values. Factors are built on top of integers, and have a levels attribute
x <- factor(c("ab", "cd", "ab"), levels = c("ab", "cd", "ef"))
typeof(x)
attributes(x)
#Dates in R are numeric vectors that represent the number of days since 1 January 1970
x <- as.Date("1971-01-01")
unclass(x)
typeof(x)
attributes(x)
#Date-times are numeric vectors with class POSIXct that represent the number of seconds since 1 January 1970. (In case you were wondering, “POSIXct” stands for “Portable Operating System Interface”, calendar time.)
x <- lubridate::ymd_hm("1970-01-01 01:00")
unclass(x)
typeof(x)
attributes(x)
#The tzone attribute is optional. It controls how the time is printed, not what absolute time it refers to
attr(x, "tzone") <- "US/Pacific"
x
attr(x, "tzone") <- "US/Eastern"
x
# Another type of date-times called POSIXlt. These are built on top of named lists
y <- as.POSIXlt(x)
typeof(y)
attributes(y)
#POSIXct’s are always easier to work with, so if you find you have a POSIXlt, convert it to a regular date-time with lubridate::as_datetime()
#Tibbles are augmented lists: they have class “tbl_df” + “tbl” + “data.frame”, and names (column) and row.names attributes
tb <- tibble::tibble(x = 1:5, y = 5:1)
typeof(tb)
attributes(tb)
#The difference between a tibble and a list is that all the elements of a data frame must be vectors with the same length. All functions that work with tibbles enforce this constraint.
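# the same-length constraint in action: a mismatched column is an error
# (wrapped in try() so the chunk still knits)
try(tibble::tibble(x = 1:4, y = 1:3))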
#Traditional data.frames have a very similar structure
df <- data.frame(x = 1:5, y = 5:1)
typeof(df)
attributes(df)
#The class of tibble includes “data.frame” which means tibbles inherit the regular data frame behaviour by default
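# inheritance in action: base data.frame operations work on the tibble tb
nrow(tb)
names(tb)
tb$x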
```