How to work with the structures of your data

Two packages to help you work with the structure of data.

GROUND RULES

Just like data frames, but play better with the console window.

Examine any data set with the View() command (Capital V)

library(dplyr)

select(tb, child:elderly)

tb %>% select(child:elderly)


EDWAR

# install.packages("devtools")
# devtools::install_github("rstudio/EDAWR")

library(EDAWR)


# Spot Checking data
#
# ?storms
# ?cases
# ?pollution
# ?tb


# Storms
storms$storm
## [1] "Alberto" "Alex"    "Allison" "Ana"     "Arlene"  "Arthur"
storms$wind
## [1] 110  45  65  40  50  45
storms$pressure
## [1] 1007 1009 1005 1013 1010 1010
storms$date
## [1] "2000-08-03" "1998-07-27" "1995-06-03" "1997-06-30" "1999-06-11"
## [6] "1996-06-17"
# CASES
cases$country
## [1] "FR" "DE" "US"
names(cases)[-1]
## [1] "2011" "2012" "2013"
unlist(cases[1:3, 2:4])
## 20111 20112 20113 20121 20122 20123 20131 20132 20133 
##  7000  5800 15000  6900  6000 14000  7000  6200 13000
# Pollution -> Errors
# pollution$city[1,3,5]
# pollution$amount[1,3,5]
# pollution$amount[2,4,6]

Recap: Tidy data

Variables in columns, observations in rows, each type in a table

tidyr

Two main functions: gather() and spread()

GATHER:

Collapses multiple columns into two columns:

SPREAD:

Generates multiple columns from two columns:


GATHER and SPREAD

Notes:

# install.packages("tidyr")


library(tidyr)

# Gather columns into key-value pairs.
# ?gather


# Key - former column names
# value - former cells


gather(cases, "year", "n", 2:4)
##   country year     n
## 1      FR 2011  7000
## 2      DE 2011  5800
## 3      US 2011 15000
## 4      FR 2012  6900
## 5      DE 2012  6000
## 6      US 2012 14000
## 7      FR 2013  7000
## 8      DE 2013  6200
## 9      US 2013 13000
# Spread a key-value pair across multiple columns.
# ?spread


spread(pollution, size, amount)
##       city large small
## 1  Beijing   121    56
## 2   London    22    16
## 3 New York    23    14

UNITE and SEPARATE

separate()

** unite**

Recap: tidyr

A package that reshapes the layout of data sets.


dplyr


UNITE and SEPARATE

Notes:

# install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# ?select
# ?filter
# ?arrange
# ?mutate
# ?summarise
# ?group_by


library(nycflights13)



# ?airlines
# ?airports
# ?flights
# ?planes
# ?weather

Ways to Access Information


SELECT Examples -> Extract existing variables

Notes:

select(storms, storm, pressure)
## Source: local data frame [6 x 2]
## 
##     storm pressure
##     (chr)    (int)
## 1 Alberto     1007
## 2    Alex     1009
## 3 Allison     1005
## 4     Ana     1013
## 5  Arlene     1010
## 6  Arthur     1010
select(storms, -storm)
## Source: local data frame [6 x 3]
## 
##    wind pressure       date
##   (int)    (int)     (date)
## 1   110     1007 2000-08-03
## 2    45     1009 1998-07-27
## 3    65     1005 1995-06-03
## 4    40     1013 1997-06-30
## 5    50     1010 1999-06-11
## 6    45     1010 1996-06-17
select(storms, wind:date)
## Source: local data frame [6 x 3]
## 
##    wind pressure       date
##   (int)    (int)     (date)
## 1   110     1007 2000-08-03
## 2    45     1009 1998-07-27
## 3    65     1005 1995-06-03
## 4    40     1013 1997-06-30
## 5    50     1010 1999-06-11
## 6    45     1010 1996-06-17
# see ?select for more

Useful select functions


FILTER Examples -> Extract existing observations

Notes:

filter(storms, wind >= 50)
## Source: local data frame [3 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
filter(storms, wind >= 50,
storm %in% c("Alberto", "Alex", "Allison"))
## Source: local data frame [2 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03

Logical Tests in R

?Comparison

?base::Logic

MUTATE Examples - Derive new variables (from existing variables)

Notes:

mutate(storms, ratio = pressure / wind)
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date     ratio
##     (chr) (int)    (int)     (date)     (dbl)
## 1 Alberto   110     1007 2000-08-03  9.154545
## 2    Alex    45     1009 1998-07-27 22.422222
## 3 Allison    65     1005 1995-06-03 15.461538
## 4     Ana    40     1013 1997-06-30 25.325000
## 5  Arlene    50     1010 1999-06-11 20.200000
## 6  Arthur    45     1010 1996-06-17 22.444444
mutate(storms, ratio = pressure / wind, inverse = ratio^-1)
## Source: local data frame [6 x 6]
## 
##     storm  wind pressure       date     ratio    inverse
##     (chr) (int)    (int)     (date)     (dbl)      (dbl)
## 1 Alberto   110     1007 2000-08-03  9.154545 0.10923535
## 2    Alex    45     1009 1998-07-27 22.422222 0.04459861
## 3 Allison    65     1005 1995-06-03 15.461538 0.06467662
## 4     Ana    40     1013 1997-06-30 25.325000 0.03948667
## 5  Arlene    50     1010 1999-06-11 20.200000 0.04950495
## 6  Arthur    45     1010 1996-06-17 22.444444 0.04455446

Useful mutate functions

NOTE: “Window” functions

SUMMARISE Examples - Change the unit of analysis

Notes:

pollution %>% summarise(median = median(amount), variance = var(amount))
##   median variance
## 1   22.5   1731.6
pollution %>% summarise(mean = mean(amount), sum = sum(amount), n = n())
##   mean sum n
## 1   42 252 6

Useful summary functions

“Summary” functions

ARRANGE - Examples

Notes:

arrange(storms, wind)
## Source: local data frame [6 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1     Ana    40     1013 1997-06-30
## 2    Alex    45     1009 1998-07-27
## 3  Arthur    45     1010 1996-06-17
## 4  Arlene    50     1010 1999-06-11
## 5 Allison    65     1005 1995-06-03
## 6 Alberto   110     1007 2000-08-03
arrange(storms, desc(wind))
## Source: local data frame [6 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
## 4    Alex    45     1009 1998-07-27
## 5  Arthur    45     1010 1996-06-17
## 6     Ana    40     1013 1997-06-30
# First Wind, then data
arrange(storms, wind, date)
## Source: local data frame [6 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1     Ana    40     1013 1997-06-30
## 2  Arthur    45     1010 1996-06-17
## 3    Alex    45     1009 1998-07-27
## 4  Arlene    50     1010 1999-06-11
## 5 Allison    65     1005 1995-06-03
## 6 Alberto   110     1007 2000-08-03

PIPE Operator “%>%” => “Then”

Notes:

# Method 1
select(storms, storm, pressure)
## Source: local data frame [6 x 2]
## 
##     storm pressure
##     (chr)    (int)
## 1 Alberto     1007
## 2    Alex     1009
## 3 Allison     1005
## 4     Ana     1013
## 5  Arlene     1010
## 6  Arthur     1010
# Method 2
storms %>% select(storm, pressure)
## Source: local data frame [6 x 2]
## 
##     storm pressure
##     (chr)    (int)
## 1 Alberto     1007
## 2    Alex     1009
## 3 Allison     1005
## 4     Ana     1013
## 5  Arlene     1010
## 6  Arthur     1010
# Filter 1
filter(storms, wind >= 50)
## Source: local data frame [3 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
# Filter 2
storms %>% filter(wind >= 50)
## Source: local data frame [3 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
# Combination
storms %>%
  filter(wind >= 50) %>%
  select(storm, pressure)
## Source: local data frame [3 x 2]
## 
##     storm pressure
##     (chr)    (int)
## 1 Alberto     1007
## 2 Allison     1005
## 3  Arlene     1010
storms %>%
  mutate(ratio = pressure / wind) %>%
  select(storm, ratio)
## Source: local data frame [6 x 2]
## 
##     storm     ratio
##     (chr)     (dbl)
## 1 Alberto  9.154545
## 2    Alex 22.422222
## 3 Allison 15.461538
## 4     Ana 25.325000
## 5  Arlene 20.200000
## 6  Arthur 22.444444

Unit Of Analysis

GROUP_BY / UNGROUP

Notes:

pollution %>% group_by(city)
## Source: local data frame [6 x 3]
## Groups: city [3]
## 
##       city  size amount
##      (chr) (chr)  (dbl)
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56
pollution %>% group_by(size)
## Source: local data frame [6 x 3]
## Groups: size [2]
## 
##       city  size amount
##      (chr) (chr)  (dbl)
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56
pollution %>% 
  group_by(city) %>%
  summarise(mean = mean(amount), sum = sum(amount), n = n())
## Source: local data frame [3 x 4]
## 
##       city  mean   sum     n
##      (chr) (dbl) (dbl) (int)
## 1  Beijing  88.5   177     2
## 2   London  19.0    38     2
## 3 New York  18.5    37     2
pollution %>% 
  group_by(city) %>% 
  summarise(mean = mean(amount))
## Source: local data frame [3 x 2]
## 
##       city  mean
##      (chr) (dbl)
## 1  Beijing  88.5
## 2   London  19.0
## 3 New York  18.5
pollution %>% 
  group_by(size) %>% 
  summarise(mean = mean(amount))
## Source: local data frame [2 x 2]
## 
##    size     mean
##   (chr)    (dbl)
## 1 large 55.33333
## 2 small 28.66667
pollution %>% ungroup()
##       city  size amount
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56
# Note: 2X summarise 
# tb %>%
#  group_by(country, year) %>%
#  summarise(cases = sum(cases)) %>%
#  summarise(cases = sum(cases))



# JOIN STATEMENTS

left_join(songs, artists, by = "name")
##                  song  name  plays
## 1 Across the Universe  John guitar
## 2       Come Together  John guitar
## 3      Hello, Goodbye  Paul   bass
## 4           Peggy Sue Buddy   <NA>
left_join(songs2, artists2, by = c("first", "last"))
##                  song first      last  plays
## 1 Across the Universe  John    Lennon guitar
## 2       Come Together  John    Lennon guitar
## 3      Hello, Goodbye  Paul McCartney   bass
## 4           Peggy Sue Buddy     Holly   <NA>
inner_join(songs, artists, by = "name")
##                  song name  plays
## 1 Across the Universe John guitar
## 2       Come Together John guitar
## 3      Hello, Goodbye Paul   bass
semi_join(songs, artists, by = "name")
##                  song name
## 1 Across the Universe John
## 2       Come Together John
## 3      Hello, Goodbye Paul
anti_join(songs, artists, by = "name")
##        song  name
## 1 Peggy Sue Buddy