In this lesson students will gain more practice with the
tidyverse
with a particular focus on wrangling the
tidyr
package.
This lesson covers:
tidyr
Package
gather()
spread()
unite()
separate()
left_join
, inner_join
,
right_join
Before you start you will need to load the tidyverse
library(tidyverse)
tidyr
# generate stock market data
set.seed(1)
stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 20, 1),
Y = rnorm(10, 20, 2),
Z = rnorm(10, 20, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## 7 2009-01-07 20.48743 19.96762 19.37682
## 8 2009-01-08 20.73832 21.88767 14.11699
## 9 2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# gather
stocksG<-stocks%>%
gather(key=stock, value=price, -time )
head(stocksG)
## time stock price
## 1 2009-01-01 X 19.37355
## 2 2009-01-02 X 20.18364
## 3 2009-01-03 X 19.16437
## 4 2009-01-04 X 21.59528
## 5 2009-01-05 X 20.32951
## 6 2009-01-06 X 19.17953
ggplot(stocksG, aes(time, price, color=stock))+
geom_line()
stocksS<-stocksG%>%
spread(key=stock, value=price)
head(stocksS)
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
Length of stay (LOS) in days for visitors traveling to Hawaii by air grouped by accommodation choice, purpose of trip, whether it was their first trip or if they’re a repeat visitor, and island visited.
These data are hosted on Kaggle at:
https://www.kaggle.com/datasets/csafrit2/hawaii-travel-length-of-trip
## IMPORT THE DATA
hi <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/Hawaii_Tourism.csv",
header=TRUE)
## LOOK AT THE STRUCTURE
str(hi)
## 'data.frame': 121 obs. of 26 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ Units : chr "days" "days" "days" "days" ...
## $ X1999 : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## $ X2000 : num 8.9 7.2 7.9 7.4 6.6 5.7 5.9 5.3 6.8 5.4 ...
## $ X2001 : num 9.2 7.5 8.4 NA 6.8 5.9 6.3 NA 6.9 5.7 ...
## $ X2002 : num 9.4 7.7 8.5 NA 7 6.2 6.4 NA 7.2 5.9 ...
## $ X2003 : num 9.2 7.7 8.5 NA 6.9 6.1 6.3 NA 7.3 6.3 ...
## $ X2004 : num 9.1 7.3 8.2 8 6.9 5.9 6.2 5.5 7.5 6.2 ...
## $ X2005 : num 9.1 7.3 8.3 7.8 6.9 6 6.3 5.5 7.5 6.3 ...
## $ X2006 : num 9.2 7.3 8.3 7.9 6.9 6 6.2 5.5 7.4 6.3 ...
## $ X2007 : num 9.2 7.2 8.4 7.9 6.8 5.9 6.2 5.5 7.3 6.2 ...
## $ X2008 : num 9.4 7.3 8.6 7.8 7.1 6.1 6.5 5.6 7.8 6.3 ...
## $ X2009 : num 9.4 7.3 8.6 7.6 7.3 6.2 6.7 5.6 8 6.4 ...
## $ X2010 : num 9.4 7.4 8.5 7.7 7.4 6.2 6.7 5.8 8.1 6.5 ...
## $ X2011 : num 9.5 7.4 8.5 7.6 7.4 6.4 6.6 5.7 8.1 5.9 ...
## $ X2012 : num 9.4 7.3 8.4 7.5 7.3 6.2 6.5 5.8 8.1 6.3 ...
## $ X2013 : num 9.3 7.2 8.3 7.5 7 6 6.3 5.6 8.2 6.3 ...
## $ X2014 : num 9.2 7.2 8.3 7.6 6.8 5.9 6.1 5.7 8.2 6.5 ...
## $ X2015 : num 9.1 7.2 8.3 7.5 6.8 5.9 6.2 5.6 8.2 6.5 ...
## $ X2016 : num 9 7.2 8.2 7.5 6.8 5.9 6.2 5.7 8.1 6.6 ...
## $ X2017 : num 9 7.3 8.4 7.6 6.7 6 6.2 5.7 8 6.6 ...
## $ X2018 : num 9 7.3 8.5 7.7 6.9 6.1 6.4 5.8 8.1 6.7 ...
## $ X2019 : num 8.8 7.1 8.2 7.7 6.8 6 6.3 5.9 7.9 6.7 ...
## $ X2020 : num 10.6 7.6 NA 8.1 8.5 6.5 NA 6.1 9.5 7.1 ...
## $ X2021 : num 9.6 7.9 8.8 9.1 8.1 6.9 7.3 6.3 8.7 7.4 ...
## RENAME THE COLUMNS
colnames(hi)[4:26]<-1999:2021
gather()
## TIDY YOUR DATA
hiG<-hi%>%
filter(Units=="days")%>%
select(-Units)%>%
gather(key=year, value=days, -c(Group, Indicator))
## LOOK AT THE NEW STRUCTURE
str(hiG)
## 'data.frame': 2691 obs. of 4 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ year : chr "1999" "1999" "1999" "1999" ...
## $ days : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## YEAR NEEDS TO BE NUMERIC TO PLOT
hiG$year<-as.numeric(hiG$year)
## SCATTERPLOT WITH TREND
ggplot(hiG, aes(x=year, y=days))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Warning: Removed 567 rows containing missing values (geom_point).
## TRY GEOM_JITTER
ggplot(hiG, aes(x=year, y=days))+
geom_jitter()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Removed 567 rows containing missing values (geom_point).
Which island do visitors stay at the longest?
Use dplyr
to explore subgroups.
## WHAT ISLAND IS MOST POPULAR?
islands<-hiG%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
## Warning: Removed 120 row(s) containing missing values (geom_path).
## WHATS WRONG WITH THIS??
Oh no! What happened?!
We forgot about NAs!
islands<-hiG%>%
group_by(Indicator, year)%>%
filter(Indicator!="LOS Statewide")%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
Try another example!
What are the trends in reasons for travels to Hawaii?
## ARE THERE TRENDS BY TYPE?
groupHI<-hiG%>%
group_by(Group, year)%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Group'. You can override using the
## `.groups` argument.
ggplot(groupHI, aes(x=year, y=avgDays, color=Group))+
geom_line()
## Warning: Removed 54 row(s) containing missing values (geom_path).
There are three types of joins that we will look at:
Inner Join: Looks for only elements that exist in both dataframes
Left Join (or Right Join): Matches elements from the left dataframe to the right
Tibbles are like data frames that can be more easily manipulated in the tidyverse. In this example we create a tibble for comic book characters.
# a tibble is a type of a data frame
superheroes <- tibble::tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tibble::tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
# left join super hero and publisher
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
Suppose that you have two really interesting data sets that you want to use to gain more insight about a topic? How can you combine them?
In this example we will join two different NBA data sets.
All NBA Player’s salaries 2019-2020.
Scrape from ESPN (http://www.espn.com/nba/salaries).
Hosted on Kaggle at:
https://www.kaggle.com/datasets/junfenglim/nba-player-salaries-201920
## SALARY DATA for 2019-2020 season
salaries1920 <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/nba2019-20.csv",
header=TRUE)
The data set contains over two decades of data on each player who has been part of an NBA teams’ roster. It captures demographic variables such as age, height, weight and place of birth, biographical details like the team played for, draft year and round. In addition, it has basic box score statistics such as games played, average number of points, rebounds, assists, etc.
These data are hosted on Kaggle at:
https://www.kaggle.com/datasets/justinas/nba-players-data
## METRICS ON PLAYER PERFORMANCE
## 1996 to 2022
all_seasons <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/all_seasons.csv",
header=TRUE)
# SALARIES
str(salaries1920)
## 'data.frame': 528 obs. of 5 variables:
## $ team : chr "Golden State Warriors" "Oklahoma City Thunder" "Houston Rockets" "Washington Wizards" ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : chr "Stephen Curry" "Chris Paul" "Russell Westbrook" "John Wall" ...
## $ position: chr " PG" " PG" " PG" " PG" ...
## $ season : chr "2019-2020" "2019-2020" "2019-2020" "2019-2020" ...
# METRICS
str(all_seasons)
## 'data.frame': 12305 obs. of 22 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ player_name : chr "Dennis Rodman" "Dwayne Schintzius" "Earl Cureton" "Ed O'Bannon" ...
## $ team_abbreviation: chr "CHI" "LAC" "TOR" "DAL" ...
## $ age : num 36 28 39 24 34 38 25 28 29 28 ...
## $ player_height : num 198 216 206 203 206 ...
## $ player_weight : num 99.8 117.9 95.3 100.7 108.9 ...
## $ college : chr "Southeastern Oklahoma State" "Florida" "Detroit Mercy" "UCLA" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year : chr "1986" "1990" "1979" "1995" ...
## $ draft_round : chr "2" "1" "3" "1" ...
## $ draft_number : chr "27" "24" "58" "9" ...
## $ gp : int 55 15 9 64 27 52 80 77 71 82 ...
## $ pts : num 5.7 2.3 0.8 3.7 2.4 8.2 17.2 14.9 5.7 6.9 ...
## $ reb : num 16.1 1.5 1 2.3 2.4 2.7 4.1 8 1.6 1.5 ...
## $ ast : num 3.1 0.3 0.4 0.6 0.2 1 3.4 1.6 1.3 3 ...
## $ net_rating : num 16.1 12.3 -2.1 -8.7 -11.2 4.1 4.1 3.3 -0.3 -1.2 ...
## $ oreb_pct : num 0.186 0.078 0.105 0.06 0.109 0.034 0.035 0.095 0.036 0.018 ...
## $ dreb_pct : num 0.323 0.151 0.102 0.149 0.179 0.126 0.091 0.183 0.076 0.081 ...
## $ usg_pct : num 0.1 0.175 0.103 0.167 0.127 0.22 0.209 0.222 0.172 0.177 ...
## $ ts_pct : num 0.479 0.43 0.376 0.399 0.611 0.541 0.559 0.52 0.539 0.557 ...
## $ ast_pct : num 0.113 0.048 0.148 0.077 0.04 0.102 0.149 0.087 0.141 0.262 ...
## $ season : chr "1996-97" "1996-97" "1996-97" "1996-97" ...
We need to make an apples to apples comparison.
season1920<-all_seasons%>%
filter(season=="2019-20")%>%
select(-season)%>%
mutate(player=player_name)
joinNBA<-salaries1920%>%
left_join(season1920)
## Joining, by = "player"
str(joinNBA)
## 'data.frame': 528 obs. of 26 variables:
## $ team : chr "Golden State Warriors" "Oklahoma City Thunder" "Houston Rockets" "Washington Wizards" ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : chr "Stephen Curry" "Chris Paul" "Russell Westbrook" "John Wall" ...
## $ position : chr " PG" " PG" " PG" " PG" ...
## $ season : chr "2019-2020" "2019-2020" "2019-2020" "2019-2020" ...
## $ X : int 10869 10921 10888 NA NA 11041 10734 10743 10995 10777 ...
## $ player_name : chr "Stephen Curry" "Chris Paul" "Russell Westbrook" NA ...
## $ team_abbreviation: chr "GSW" "OKC" "HOU" NA ...
## $ age : num 32 35 31 NA NA 30 35 34 31 27 ...
## $ player_height : num 190 185 190 NA NA ...
## $ player_weight : num 83.9 79.4 90.7 NA NA ...
## $ college : chr "Davidson" "Wake Forest" "UCLA" NA ...
## $ country : chr "USA" "USA" "USA" NA ...
## $ draft_year : chr "2009" "2005" "2008" NA ...
## $ draft_round : chr "1" "1" "1" NA ...
## $ draft_number : chr "7" "4" "4" NA ...
## $ gp : int 5 70 57 NA NA 68 67 58 18 72 ...
## $ pts : num 20.8 17.6 27.2 NA NA 34.3 25.3 19.4 15.5 19.6 ...
## $ reb : num 5.2 5 7.9 NA NA 6.6 7.8 5 4.7 6.9 ...
## $ ast : num 6.6 6.7 7 NA NA 7.5 10.2 7.5 3.3 3.2 ...
## $ net_rating : num -15.3 6.9 3.1 NA NA 5.8 8.5 6 -2.6 3.1 ...
## $ oreb_pct : num 0.023 0.012 0.044 NA NA 0.026 0.028 0.015 0.032 0.027 ...
## $ dreb_pct : num 0.162 0.143 0.157 NA NA 0.139 0.191 0.115 0.128 0.172 ...
## $ usg_pct : num 0.286 0.228 0.33 NA NA 0.356 0.308 0.225 0.277 0.236 ...
## $ ts_pct : num 0.557 0.61 0.536 NA NA 0.626 0.577 0.59 0.476 0.556 ...
## $ ast_pct : num 0.355 0.324 0.34 NA NA 0.366 0.477 0.309 0.184 0.145 ...
ggplot(joinNBA, aes(x=pts, y=salary))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).
unite
# lets create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a
dataUnite <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
str(dataUnite)
## 'data.frame': 15 obs. of 2 variables:
## $ datetime: chr "2016-01-01 4:15:35" "2016-01-02 7:21:6" "2016-01-03 1:37:10" "2016-01-04 2:41:42" ...
## $ event : chr "w" "x" "f" "g" ...
# separate
dataSep <- dataUnite %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
dataSep
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a