In this lesson students will gain more practice with the tidyverse, with a particular focus on data wrangling with the tidyr package.
This lesson covers:
the tidyr package
gather()
spread()
unite()
separate()
left_join(), inner_join(), and right_join() from dplyr
Before you start, you will need to load the tidyverse.
library(tidyverse)
tidyr
The first example uses the helium dataset from the openintro package.
library(openintro)
data("helium")
## Look at data
head(helium)
## # A tibble: 6 × 3
## trial air helium
## <int> <int> <int>
## 1 1 25 25
## 2 2 23 16
## 3 3 18 25
## 4 4 16 14
## 5 5 35 23
## 6 6 15 29
This is not the format we want our data to be in! Let’s wrangle it!
### WRANGLE
football <- helium %>%
  gather(key = "air", value = "yards", -trial)
head(football)
## # A tibble: 6 × 3
## trial air yards
## <int> <chr> <int>
## 1 1 air 25
## 2 2 air 23
## 3 3 air 18
## 4 4 air 16
## 5 5 air 35
## 6 6 air 15
How does gather() work?
YOUR ANSWER HERE
Why do we need to gather?
ggplot(data=football, aes(x=air, y=yards, fill=air))+
geom_boxplot()
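Note that gather() has been superseded in newer versions of tidyr. If you are running tidyr 1.0.0 or later, the same reshape can be written with pivot_longer(). This is a sketch of the equivalent call (football2 is just a new name for illustration), not part of the original lesson code:
# equivalent reshape with pivot_longer() (tidyr >= 1.0.0)
football2 <- helium %>%
  pivot_longer(cols = -trial, names_to = "air", values_to = "yards")
head(football2)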
# generate stock market data
set.seed(1)
stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 20, 1),
Y = rnorm(10, 20, 2),
Z = rnorm(10, 20, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## 7 2009-01-07 20.48743 19.96762 19.37682
## 8 2009-01-08 20.73832 21.88767 14.11699
## 9 2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# gather
stocksG<-stocks%>%
gather(key=stock, value=price, -time )
head(stocksG)
## time stock price
## 1 2009-01-01 X 19.37355
## 2 2009-01-02 X 20.18364
## 3 2009-01-03 X 19.16437
## 4 2009-01-04 X 21.59528
## 5 2009-01-05 X 20.32951
## 6 2009-01-06 X 19.17953
ggplot(stocksG, aes(x=time, y=price, color=stock))+
geom_line()
stocksS<-stocksG%>%
spread(key=stock, value=price)
head(stocksS)
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
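Similarly, spread() has been superseded by pivot_wider() in newer tidyr. A minimal sketch of the equivalent call, assuming tidyr 1.0.0 or later (stocksW is just an illustrative name):
# equivalent reshape with pivot_wider()
stocksW <- stocksG %>%
  pivot_wider(names_from = stock, values_from = price)
head(stocksW)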
The next dataset, on tourism in Hawaii, comes from Kaggle.
## IMPORT THE DATA
hi <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/Hawaii_Tourism.csv",
header=TRUE)
## LOOK AT THE STRUCTURE
str(hi)
## 'data.frame': 121 obs. of 26 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ Units : chr "days" "days" "days" "days" ...
## $ X1999 : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## $ X2000 : num 8.9 7.2 7.9 7.4 6.6 5.7 5.9 5.3 6.8 5.4 ...
## $ X2001 : num 9.2 7.5 8.4 NA 6.8 5.9 6.3 NA 6.9 5.7 ...
## $ X2002 : num 9.4 7.7 8.5 NA 7 6.2 6.4 NA 7.2 5.9 ...
## $ X2003 : num 9.2 7.7 8.5 NA 6.9 6.1 6.3 NA 7.3 6.3 ...
## $ X2004 : num 9.1 7.3 8.2 8 6.9 5.9 6.2 5.5 7.5 6.2 ...
## $ X2005 : num 9.1 7.3 8.3 7.8 6.9 6 6.3 5.5 7.5 6.3 ...
## $ X2006 : num 9.2 7.3 8.3 7.9 6.9 6 6.2 5.5 7.4 6.3 ...
## $ X2007 : num 9.2 7.2 8.4 7.9 6.8 5.9 6.2 5.5 7.3 6.2 ...
## $ X2008 : num 9.4 7.3 8.6 7.8 7.1 6.1 6.5 5.6 7.8 6.3 ...
## $ X2009 : num 9.4 7.3 8.6 7.6 7.3 6.2 6.7 5.6 8 6.4 ...
## $ X2010 : num 9.4 7.4 8.5 7.7 7.4 6.2 6.7 5.8 8.1 6.5 ...
## $ X2011 : num 9.5 7.4 8.5 7.6 7.4 6.4 6.6 5.7 8.1 5.9 ...
## $ X2012 : num 9.4 7.3 8.4 7.5 7.3 6.2 6.5 5.8 8.1 6.3 ...
## $ X2013 : num 9.3 7.2 8.3 7.5 7 6 6.3 5.6 8.2 6.3 ...
## $ X2014 : num 9.2 7.2 8.3 7.6 6.8 5.9 6.1 5.7 8.2 6.5 ...
## $ X2015 : num 9.1 7.2 8.3 7.5 6.8 5.9 6.2 5.6 8.2 6.5 ...
## $ X2016 : num 9 7.2 8.2 7.5 6.8 5.9 6.2 5.7 8.1 6.6 ...
## $ X2017 : num 9 7.3 8.4 7.6 6.7 6 6.2 5.7 8 6.6 ...
## $ X2018 : num 9 7.3 8.5 7.7 6.9 6.1 6.4 5.8 8.1 6.7 ...
## $ X2019 : num 8.8 7.1 8.2 7.7 6.8 6 6.3 5.9 7.9 6.7 ...
## $ X2020 : num 10.6 7.6 NA 8.1 8.5 6.5 NA 6.1 9.5 7.1 ...
## $ X2021 : num 9.6 7.9 8.8 9.1 8.1 6.9 7.3 6.3 8.7 7.4 ...
names(hi)
## [1] "Group" "Indicator" "Units" "X1999" "X2000" "X2001"
## [7] "X2002" "X2003" "X2004" "X2005" "X2006" "X2007"
## [13] "X2008" "X2009" "X2010" "X2011" "X2012" "X2013"
## [19] "X2014" "X2015" "X2016" "X2017" "X2018" "X2019"
## [25] "X2020" "X2021"
## RENAME THE COLUMNS
colnames(hi)[4:26]<-1999:2021
names(hi)
## [1] "Group" "Indicator" "Units" "1999" "2000" "2001"
## [7] "2002" "2003" "2004" "2005" "2006" "2007"
## [13] "2008" "2009" "2010" "2011" "2012" "2013"
## [19] "2014" "2015" "2016" "2017" "2018" "2019"
## [25] "2020" "2021"
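Renaming by position works here because we know the year columns sit in positions 4 through 26. If you would rather not rely on positions, a pattern-based rename is one alternative; this sketch (run in place of the colnames() assignment above, not in addition to it) strips the leading X that read.csv added to the year columns:
# alternative: strip the leading "X" from the year columns by name
hi <- hi %>%
  rename_with(~ sub("^X", "", .x), starts_with("X"))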
Now use gather() to tidy these data into long format.
## TIDY YOUR DATA
hiG<-hi%>%
filter(Units=="days")%>%
select(-Units)%>%
gather(key=year, value=days, -c(Group, Indicator))
## LOOK AT THE NEW STRUCTURE
str(hiG)
## 'data.frame': 2691 obs. of 4 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ year : chr "1999" "1999" "1999" "1999" ...
## $ days : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
head(hiG)
## Group Indicator year days
## 1 All visitors by air LOS Statewide 1999 8.9
## 2 Hotel-only visitors LOS Statewide 1999 7.4
## 3 First-time visitors LOS Statewide 1999 8.1
## 4 Honeymoon visitors LOS Statewide 1999 7.3
## 5 All visitors by air LOS on Oahu 1999 6.4
## 6 Hotel-only visitors LOS on Oahu 1999 5.6
## YEAR NEEDS TO BE NUMERIC TO PLOT
hiG$year<-as.numeric(hiG$year)
str(hiG)
## 'data.frame': 2691 obs. of 4 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ year : num 1999 1999 1999 1999 1999 ...
## $ days : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
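If you use pivot_longer() instead of gather(), the year conversion can be folded into the same step. A sketch, assuming tidyr 1.1.0 or later for the names_transform argument (hiG2 is just an illustrative name):
# reshape and convert year to numeric in one step
hiG2 <- hi %>%
  filter(Units == "days") %>%
  select(-Units) %>%
  pivot_longer(cols = -c(Group, Indicator),
               names_to = "year", values_to = "days",
               names_transform = list(year = as.numeric))
str(hiG2)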
## SCATTERPLOT WITH TREND
ggplot(hiG, aes(x=year, y=days))+
geom_point()
## Warning: Removed 567 rows containing missing values (geom_point).
## TRY GEOM_JITTER
ggplot(hiG, aes(x=year, y=days))+
geom_jitter()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Removed 567 rows containing missing values (geom_point).
Which island is the most popular to visit?
Use dplyr to explore subgroups.
## WHAT ISLAND IS MOST POPULAR?
islands<-hiG%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
## INSERT CODE HERE ##
## WHAT'S WRONG WITH THIS??
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
## Warning: Removed 120 row(s) containing missing values (geom_path).
Oh no! What happened?!
We forgot about NAs!
## WHAT ISLAND IS MOST POPULAR?
islands<-hiG%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
## INSERT CODE HERE ##
## NOW THE MISSING VALUES ARE HANDLED
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
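To answer the question directly, you can also average across all years and rank the islands by length of stay. This is one possible approach, a sketch rather than the lesson's required code:
# average length of stay per island across all years
islands %>%
  group_by(Indicator) %>%
  summarise(overallAvg = mean(avgDays, na.rm = TRUE)) %>%
  arrange(desc(overallAvg))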
Try another example!
What are the trends in reasons for travel to Hawaii?
## ARE THERE TRENDS BY TYPE?
## INSERT CODE HERE ##
## INSERT CODE HERE ##
There are three types of joins that we will look at:
Inner Join: keeps only the rows whose key values exist in both dataframes
Left Join: keeps every row of the left dataframe and matches on columns from the right
Right Join: keeps every row of the right dataframe and matches on columns from the left
Tibbles are data frames that are easier to work with in the tidyverse. In this example we create tibbles for comic book characters and their publishers.
# a tibble is a type of a data frame
superheroes <- tibble::tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tibble::tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
# left join super hero and publisher
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
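The lesson also lists right_join(). It works like left_join() with the roles reversed: every row of the right dataframe (publishers) is kept. A quick sketch; here Image has no matching superheroes, so its row would be filled with NAs, and Hellboy drops out because Dark Horse Comics is not in publishers:
# right join super hero and publisher
rjsp <- right_join(superheroes, publishers)
rjsp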
The NBA salary and player-performance data also come from Kaggle.
## SALARY DATA for 2019-2020 season
salaries1920 <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/nba2019-20.csv",
header=TRUE)
## METRICS ON PLAYER PERFORMANCE
## 1996 to 2022
all_seasons <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/all_seasons.csv",
header=TRUE)
# SALARIES
## INSERT CODE HERE ##
str(salaries1920)
## 'data.frame': 528 obs. of 5 variables:
## $ team : chr "Golden State Warriors" "Oklahoma City Thunder" "Houston Rockets" "Washington Wizards" ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : chr "Stephen Curry" "Chris Paul" "Russell Westbrook" "John Wall" ...
## $ position: chr " PG" " PG" " PG" " PG" ...
## $ season : chr "2019-2020" "2019-2020" "2019-2020" "2019-2020" ...
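Notice in the structure above that the position values carry a leading space (e.g. " PG"). If you plan to group or filter by position later, it may help to trim that whitespace first; a small sketch, not required for the join below, which is by player name:
# trim stray whitespace from the position column
salaries1920$position <- trimws(salaries1920$position)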
# METRICS
## INSERT CODE HERE ##
str(all_seasons)
## 'data.frame': 12305 obs. of 22 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ player_name : chr "Dennis Rodman" "Dwayne Schintzius" "Earl Cureton" "Ed O'Bannon" ...
## $ team_abbreviation: chr "CHI" "LAC" "TOR" "DAL" ...
## $ age : num 36 28 39 24 34 38 25 28 29 28 ...
## $ player_height : num 198 216 206 203 206 ...
## $ player_weight : num 99.8 117.9 95.3 100.7 108.9 ...
## $ college : chr "Southeastern Oklahoma State" "Florida" "Detroit Mercy" "UCLA" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year : chr "1986" "1990" "1979" "1995" ...
## $ draft_round : chr "2" "1" "3" "1" ...
## $ draft_number : chr "27" "24" "58" "9" ...
## $ gp : int 55 15 9 64 27 52 80 77 71 82 ...
## $ pts : num 5.7 2.3 0.8 3.7 2.4 8.2 17.2 14.9 5.7 6.9 ...
## $ reb : num 16.1 1.5 1 2.3 2.4 2.7 4.1 8 1.6 1.5 ...
## $ ast : num 3.1 0.3 0.4 0.6 0.2 1 3.4 1.6 1.3 3 ...
## $ net_rating : num 16.1 12.3 -2.1 -8.7 -11.2 4.1 4.1 3.3 -0.3 -1.2 ...
## $ oreb_pct : num 0.186 0.078 0.105 0.06 0.109 0.034 0.035 0.095 0.036 0.018 ...
## $ dreb_pct : num 0.323 0.151 0.102 0.149 0.179 0.126 0.091 0.183 0.076 0.081 ...
## $ usg_pct : num 0.1 0.175 0.103 0.167 0.127 0.22 0.209 0.222 0.172 0.177 ...
## $ ts_pct : num 0.479 0.43 0.376 0.399 0.611 0.541 0.559 0.52 0.539 0.557 ...
## $ ast_pct : num 0.113 0.048 0.148 0.077 0.04 0.102 0.149 0.087 0.141 0.262 ...
## $ season : chr "1996-97" "1996-97" "1996-97" "1996-97" ...
We need to make an apples-to-apples comparison, so we filter the performance data down to the 2019-20 season and create a player column that matches the salary data for the join.
season1920<-all_seasons%>%
filter(season=="2019-20")%>%
select(-season)%>%
mutate(player=player_name)
str(season1920)
## 'data.frame': 529 obs. of 22 variables:
## $ X : int 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 ...
## $ player_name : chr "Aaron Holiday" "Maxi Kleber" "Max Strus" "Maurice Harkless" ...
## $ team_abbreviation: chr "IND" "DAL" "CHI" "NYK" ...
## $ age : num 23 28 24 27 29 25 23 23 30 34 ...
## $ player_height : num 183 208 196 201 190 ...
## $ player_weight : num 83.9 108.9 97.5 99.8 90.7 ...
## $ college : chr "UCLA" "None" "DePaul" "St. John's" ...
## $ country : chr "USA" "Germany" "USA" "USA" ...
## $ draft_year : chr "2018" "Undrafted" "Undrafted" "2012" ...
## $ draft_round : chr "1" "Undrafted" "Undrafted" "1" ...
## $ draft_number : chr "23" "Undrafted" "Undrafted" "15" ...
## $ gp : int 66 74 2 62 57 41 4 65 61 58 ...
## $ pts : num 9.5 9.1 2.5 5.8 3.1 4.9 0.5 4.7 7.2 5.9 ...
## $ reb : num 2.4 5.2 0.5 3.9 1.3 1.5 0.8 1.6 5.2 3.2 ...
## $ ast : num 3.4 1.2 0 1.1 3.2 0.5 0.3 1.2 2.5 1 ...
## $ net_rating : num 2.2 3.4 105.8 1.2 -1.2 ...
## $ oreb_pct : num 0.013 0.054 0.167 0.038 0.022 0.018 0 0.033 0.087 0.025 ...
## $ dreb_pct : num 0.077 0.136 0 0.117 0.067 0.1 0.158 0.049 0.195 0.132 ...
## $ usg_pct : num 0.182 0.134 0.158 0.105 0.128 0.152 0.08 0.108 0.173 0.12 ...
## $ ts_pct : num 0.521 0.605 0.727 0.572 0.463 0.65 0.25 0.539 0.613 0.587 ...
## $ ast_pct : num 0.188 0.065 0 0.062 0.297 0.076 0.071 0.081 0.21 0.076 ...
## $ player : chr "Aaron Holiday" "Maxi Kleber" "Max Strus" "Maurice Harkless" ...
## INSERT CODE HERE ##
joinNBA<-salaries1920%>%
left_join(season1920)
## Joining, by = "player"
#View(joinNBA)
ggplot(joinNBA, aes(x=pts, y=salary))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).
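The warnings tell us that some salaried players did not find a match in the performance data, so their metrics are NA after the join. One way to see who they are is an anti_join(); this is a sketch, and like the join above it assumes the shared column is player:
# which salaried players have no match in the 2019-20 performance data?
noMatch <- salaries1920 %>%
  anti_join(season1920, by = "player")
head(noMatch)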
unite
unite() combines several columns into a single column.
# let's create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a
dataUnite <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
str(dataUnite)
## 'data.frame': 15 obs. of 2 variables:
## $ datetime: chr "2016-01-01 4:15:35" "2016-01-02 7:21:6" "2016-01-03 1:37:10" "2016-01-04 2:41:42" ...
## $ event : chr "w" "x" "f" "g" ...
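The united datetime column is just a character string. If you want R to treat it as an actual date-time (for plotting or sorting), you can parse it; a minimal sketch using base R, where datetime_parsed is just an illustrative new column:
# parse the united string into a date-time object
dataUnite$datetime_parsed <- as.POSIXct(dataUnite$datetime,
                                        format = "%Y-%m-%d %H:%M:%S")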
separate() does the reverse: it splits one column into several.
# separate
dataSep <- dataUnite %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
dataSep
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a
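One thing to watch: separate() returns character columns, so hour, min, and second above are strings, not numbers. Passing convert = TRUE asks separate() to run type conversion on the new columns; a sketch of the same pipeline with that option (dataSep2 is just an illustrative name):
# separate and convert the new columns to numeric types
dataSep2 <- dataUnite %>%
  separate(datetime, c('date', 'time'), sep = ' ') %>%
  separate(time, c('hour', 'min', 'second'), sep = ':', convert = TRUE)
str(dataSep2)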