Two packages to help you work with the structure of data:
tidyr
dplyr
tbl data frames: just like regular data frames, but they play better with the console window.
Examine any data set with the View() command (capital V); see the short sketch after the EDAWR setup below.
library(dplyr)
select(tb, child:elderly)
tb %>% select(child:elderly)
# install.packages("devtools")
# devtools::install_github("rstudio/EDAWR")
library(EDAWR)
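A quick sketch of both ideas with the tb data (as_tibble() is how current dplyr produces a tbl; output not shown):
tb_tbl <- as_tibble(tb)   # same data as tb, but prints compactly in the console
tb_tbl
View(tb_tbl)              # capital V: opens the full table in the data viewer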
# Spot Checking data
#
# ?storms
# ?cases
# ?pollution
# ?tb
# Storms
storms$storm
## [1] "Alberto" "Alex" "Allison" "Ana" "Arlene" "Arthur"
storms$wind
## [1] 110 45 65 40 50 45
storms$pressure
## [1] 1007 1009 1005 1013 1010 1010
storms$date
## [1] "2000-08-03" "1998-07-27" "1995-06-03" "1997-06-30" "1999-06-11"
## [6] "1996-06-17"
# Cases
cases$country
## [1] "FR" "DE" "US"
names(cases)[-1]
## [1] "2011" "2012" "2013"
unlist(cases[1:3, 2:4])
## 20111 20112 20113 20121 20122 20123 20131 20132 20133
## 7000 5800 15000 6900 6000 14000 7000 6200 13000
# Pollution -> Errors: a plain vector takes one index vector, not comma-separated indices
# pollution$city[1,3,5]        # error: incorrect number of dimensions
# pollution$city[c(1, 3, 5)]   # correct: elements 1, 3 and 5 (the three cities)
# pollution$amount[c(1, 3, 5)] # amounts for the "large" rows
# pollution$amount[c(2, 4, 6)] # amounts for the "small" rows
Variables in columns, observations in rows, each type in a table
Easy to access variables
Automatically preserves observations
Two main functions: gather() and spread()
GATHER:
Collapses multiple columns into two columns.
SPREAD:
Generates multiple columns from two columns.
Notes:
# install.packages("tidyr")
library(tidyr)
# Gather columns into key-value pairs.
# ?gather
# Key - former column names
# value - former cells
gather(cases, "year", "n", 2:4)
## country year n
## 1 FR 2011 7000
## 2 DE 2011 5800
## 3 US 2011 15000
## 4 FR 2012 6900
## 5 DE 2012 6000
## 6 US 2012 14000
## 7 FR 2013 7000
## 8 DE 2013 6200
## 9 US 2013 13000
# Spread a key-value pair across multiple columns.
# ?spread
spread(pollution, size, amount)
## city large small
## 1 Beijing 121 56
## 2 London 22 16
## 3 New York 23 14
separate()
separate() splits a column into multiple columns by a character string separator.
separate(storms, date, c("year", "month", "day"), sep = "-")
unite()
unite() combines multiple columns into a single column (here, storms2 is the result of the separate() call above).
unite(storms2, "date", year, month, day, sep = "-")
tidyr: a package that reshapes the layout of data sets.
Make observations from variables with gather()
Make variables from observations with spread()
Split and merge columns with unite() and separate()
Notes:
# install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# ?select
# ?filter
# ?arrange
# ?mutate
# ?summarise
# ?group_by
library(nycflights13)
# ?airlines
# ?airports
# ?flights
# ?planes
# ?weather
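For a quick first look at any of these tables, dplyr's glimpse() prints every column with its type and first few values, for example:
glimpse(flights)   # one line of output per column of the flights table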
Extract existing variables - select()
Extract existing observations - filter()
Derive new variables (from existing variables) - mutate()
Change the unit of analysis - summarise()
Notes:
select(storms, storm, pressure)
## Source: local data frame [6 x 2]
##
## storm pressure
## (chr) (int)
## 1 Alberto 1007
## 2 Alex 1009
## 3 Allison 1005
## 4 Ana 1013
## 5 Arlene 1010
## 6 Arthur 1010
select(storms, -storm)
## Source: local data frame [6 x 3]
##
## wind pressure date
## (int) (int) (date)
## 1 110 1007 2000-08-03
## 2 45 1009 1998-07-27
## 3 65 1005 1995-06-03
## 4 40 1013 1997-06-30
## 5 50 1010 1999-06-11
## 6 45 1010 1996-06-17
select(storms, wind:date)
## Source: local data frame [6 x 3]
##
## wind pressure date
## (int) (int) (date)
## 1 110 1007 2000-08-03
## 2 45 1009 1998-07-27
## 3 65 1005 1995-06-03
## 4 40 1013 1997-06-30
## 5 50 1010 1999-06-11
## 6 45 1010 1996-06-17
# see ?select for more
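?select also documents helpers such as starts_with(), contains(), and ends_with(); a small sketch with the storms columns used above:
select(storms, starts_with("w"))   # wind
select(storms, contains("ss"))     # pressure
select(storms, ends_with("e"))     # pressure and date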
Notes:
filter(storms, wind >= 50)
## Source: local data frame [3 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Alberto 110 1007 2000-08-03
## 2 Allison 65 1005 1995-06-03
## 3 Arlene 50 1010 1999-06-11
filter(storms, wind >= 50,
storm %in% c("Alberto", "Alex", "Allison"))
## Source: local data frame [2 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Alberto 110 1007 2000-08-03
## 2 Allison 65 1005 1995-06-03
# ?Comparison
# ?base::Logic
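Those comparison and logical operators can be combined inside filter(), for example:
filter(storms, wind >= 50 & pressure < 1010)   # both conditions must hold
filter(storms, wind < 45 | wind > 100)         # either condition may hold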
Notes:
mutate(storms, ratio = pressure / wind)
## Source: local data frame [6 x 5]
##
## storm wind pressure date ratio
## (chr) (int) (int) (date) (dbl)
## 1 Alberto 110 1007 2000-08-03 9.154545
## 2 Alex 45 1009 1998-07-27 22.422222
## 3 Allison 65 1005 1995-06-03 15.461538
## 4 Ana 40 1013 1997-06-30 25.325000
## 5 Arlene 50 1010 1999-06-11 20.200000
## 6 Arthur 45 1010 1996-06-17 22.444444
mutate(storms, ratio = pressure / wind, inverse = ratio^-1)
## Source: local data frame [6 x 6]
##
## storm wind pressure date ratio inverse
## (chr) (int) (int) (date) (dbl) (dbl)
## 1 Alberto 110 1007 2000-08-03 9.154545 0.10923535
## 2 Alex 45 1009 1998-07-27 22.422222 0.04459861
## 3 Allison 65 1005 1995-06-03 15.461538 0.06467662
## 4 Ana 40 1013 1997-06-30 25.325000 0.03948667
## 5 Arlene 50 1010 1999-06-11 20.200000 0.04950495
## 6 Arthur 45 1010 1996-06-17 22.444444 0.04455446
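mutate() also accepts vectorised window functions from dplyr such as min_rank(); a small sketch (output not shown):
storms %>%
  mutate(rank_wind = min_rank(desc(wind)),   # 1 = strongest wind in the data
         total_wind = cumsum(wind))          # running total down the rows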
Notes:
pollution %>% summarise(median = median(amount), variance = var(amount))
## median variance
## 1 22.5 1731.6
pollution %>% summarise(mean = mean(amount), sum = sum(amount), n = n())
## mean sum n
## 1 42 252 6
Summary functions all take a vector of values and return a single value.
n_distinct() - the number of distinct values in a vector
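A few more of these summary functions, applied to the pollution data (output not shown):
pollution %>% summarise(
  cities = n_distinct(city),   # number of distinct cities
  min    = min(amount),
  max    = max(amount)
)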
Notes:
arrange(storms, wind)
## Source: local data frame [6 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Ana 40 1013 1997-06-30
## 2 Alex 45 1009 1998-07-27
## 3 Arthur 45 1010 1996-06-17
## 4 Arlene 50 1010 1999-06-11
## 5 Allison 65 1005 1995-06-03
## 6 Alberto 110 1007 2000-08-03
arrange(storms, desc(wind))
## Source: local data frame [6 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Alberto 110 1007 2000-08-03
## 2 Allison 65 1005 1995-06-03
## 3 Arlene 50 1010 1999-06-11
## 4 Alex 45 1009 1998-07-27
## 5 Arthur 45 1010 1996-06-17
## 6 Ana 40 1013 1997-06-30
# Sort first by wind, then by date
arrange(storms, wind, date)
## Source: local data frame [6 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Ana 40 1013 1997-06-30
## 2 Arthur 45 1010 1996-06-17
## 3 Alex 45 1009 1998-07-27
## 4 Arlene 50 1010 1999-06-11
## 5 Allison 65 1005 1995-06-03
## 6 Alberto 110 1007 2000-08-03
Notes:
# Method 1
select(storms, storm, pressure)
## Source: local data frame [6 x 2]
##
## storm pressure
## (chr) (int)
## 1 Alberto 1007
## 2 Alex 1009
## 3 Allison 1005
## 4 Ana 1013
## 5 Arlene 1010
## 6 Arthur 1010
# Method 2
storms %>% select(storm, pressure)
## Source: local data frame [6 x 2]
##
## storm pressure
## (chr) (int)
## 1 Alberto 1007
## 2 Alex 1009
## 3 Allison 1005
## 4 Ana 1013
## 5 Arlene 1010
## 6 Arthur 1010
# Filter 1
filter(storms, wind >= 50)
## Source: local data frame [3 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Alberto 110 1007 2000-08-03
## 2 Allison 65 1005 1995-06-03
## 3 Arlene 50 1010 1999-06-11
# Filter 2
storms %>% filter(wind >= 50)
## Source: local data frame [3 x 4]
##
## storm wind pressure date
## (chr) (int) (int) (date)
## 1 Alberto 110 1007 2000-08-03
## 2 Allison 65 1005 1995-06-03
## 3 Arlene 50 1010 1999-06-11
# Combination
storms %>%
filter(wind >= 50) %>%
select(storm, pressure)
## Source: local data frame [3 x 2]
##
## storm pressure
## (chr) (int)
## 1 Alberto 1007
## 2 Allison 1005
## 3 Arlene 1010
storms %>%
mutate(ratio = pressure / wind) %>%
select(storm, ratio)
## Source: local data frame [6 x 2]
##
## storm ratio
## (chr) (dbl)
## 1 Alberto 9.154545
## 2 Alex 22.422222
## 3 Allison 15.461538
## 4 Ana 25.325000
## 5 Arlene 20.200000
## 6 Arthur 22.444444
Notes:
pollution %>% group_by(city)
## Source: local data frame [6 x 3]
## Groups: city [3]
##
## city size amount
## (chr) (chr) (dbl)
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
pollution %>% group_by(size)
## Source: local data frame [6 x 3]
## Groups: size [2]
##
## city size amount
## (chr) (chr) (dbl)
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
pollution %>%
group_by(city) %>%
summarise(mean = mean(amount), sum = sum(amount), n = n())
## Source: local data frame [3 x 4]
##
## city mean sum n
## (chr) (dbl) (dbl) (int)
## 1 Beijing 88.5 177 2
## 2 London 19.0 38 2
## 3 New York 18.5 37 2
pollution %>%
group_by(city) %>%
summarise(mean = mean(amount))
## Source: local data frame [3 x 2]
##
## city mean
## (chr) (dbl)
## 1 Beijing 88.5
## 2 London 19.0
## 3 New York 18.5
pollution %>%
group_by(size) %>%
summarise(mean = mean(amount))
## Source: local data frame [2 x 2]
##
## size mean
## (chr) (dbl)
## 1 large 55.33333
## 2 small 28.66667
pollution %>% ungroup()
## city size amount
## 1 New York large 23
## 2 New York small 14
## 3 London large 22
## 4 London small 16
## 5 Beijing large 121
## 6 Beijing small 56
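ungroup() matters mainly when a grouped computation should not carry its grouping forward; a small sketch (output not shown):
pollution %>%
  group_by(city) %>%
  mutate(share = amount / sum(amount)) %>%   # sum(amount) is computed per city
  ungroup()                                  # drop the grouping afterwards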
# Note: summarise() twice - each summarise() removes one layer of grouping,
# so the second call rolls the country/year totals up to one total per country
# tb %>%
#   group_by(country, year) %>%
#   summarise(cases = sum(cases)) %>%
#   summarise(cases = sum(cases))
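The same double-summarise pattern with the pollution data from above (output not shown):
pollution %>%
  group_by(city, size) %>%
  summarise(amount = sum(amount)) %>%   # result is now grouped by city only
  summarise(amount = sum(amount))       # one total per city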
# JOIN STATEMENTS
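The songs and artists tables used below are not defined in this section; the following is a minimal reconstruction inferred from the join results shown (the exact contents are assumptions):
# Hypothetical example tables, inferred from the join output below
songs <- data.frame(
  song = c("Across the Universe", "Come Together", "Hello, Goodbye", "Peggy Sue"),
  name = c("John", "John", "Paul", "Buddy")
)
artists <- data.frame(
  name  = c("John", "Paul"),
  plays = c("guitar", "bass")
)
# songs2 and artists2 split the artist name into first and last name columns
songs2 <- data.frame(
  song  = c("Across the Universe", "Come Together", "Hello, Goodbye", "Peggy Sue"),
  first = c("John", "John", "Paul", "Buddy"),
  last  = c("Lennon", "Lennon", "McCartney", "Holly")
)
artists2 <- data.frame(
  first = c("John", "Paul"),
  last  = c("Lennon", "McCartney"),
  plays = c("guitar", "bass")
)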
left_join(songs, artists, by = "name")
## song name plays
## 1 Across the Universe John guitar
## 2 Come Together John guitar
## 3 Hello, Goodbye Paul bass
## 4 Peggy Sue Buddy <NA>
left_join(songs2, artists2, by = c("first", "last"))
## song first last plays
## 1 Across the Universe John Lennon guitar
## 2 Come Together John Lennon guitar
## 3 Hello, Goodbye Paul McCartney bass
## 4 Peggy Sue Buddy Holly <NA>
inner_join(songs, artists, by = "name")
## song name plays
## 1 Across the Universe John guitar
## 2 Come Together John guitar
## 3 Hello, Goodbye Paul bass
semi_join(songs, artists, by = "name")
## song name
## 1 Across the Universe John
## 2 Come Together John
## 3 Hello, Goodbye Paul
anti_join(songs, artists, by = "name")
## song name
## 1 Peggy Sue Buddy