In this lesson students will gain more practice with the
tidyverse
with a particular focus on wrangling the
tidyr
package.
This lesson covers:
tidyr
Package
pivot_longer()
(older version:
gather()
)pivot_wider()
(older version
spread()
)unite()
separate()
left_join
, inner_join
,
right_join
UPDATED: October 2024
Before you start you will need to load the tidyverse
library(tidyverse)
tidyr
library(openintro)
data("helium")
## Look at data
head(helium)
## # A tibble: 6 × 3
## trial air helium
## <int> <int> <int>
## 1 1 25 25
## 2 2 23 16
## 3 3 18 25
## 4 4 16 14
## 5 5 35 23
## 6 6 15 29
This is not the format we want our data to be in! Let’s wrangle it!
### WRANGLE
football<-helium%>%
pivot_longer(!trial, names_to = "airType", values_to="yards")
head(football)
## # A tibble: 6 × 3
## trial airType yards
## <int> <chr> <int>
## 1 1 air 25
## 2 1 helium 25
## 3 2 air 23
## 4 2 helium 16
## 5 3 air 18
## 6 3 helium 25
# AN OLDER FUNCTION FOR THIS IS GATHER
#football<-helium%>%
# gather("airType", "yards", -trial)
How does pivot_longer
work?
YOUR ANSWER HERE
Why do we need to have this format?
## ggplot
## INSERT CODE HERE ##
ggplot(data = football, aes(x = airType, y = yards, fill = airType))+
geom_boxplot()
# generate stock market data
set.seed(1)
stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 20, 1),
Y = rnorm(10, 20, 2),
Z = rnorm(10, 20, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## 7 2009-01-07 20.48743 19.96762 19.37682
## 8 2009-01-08 20.73832 21.88767 14.11699
## 9 2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# pivot_longer
## INSERT CODE HERE ##
stocksL<-stocks%>%
pivot_longer(-time, names_to="stock", values_to="price")
head(stocksL)
## # A tibble: 6 × 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X 19.4
## 2 2009-01-01 Y 23.0
## 3 2009-01-01 Z 23.7
## 4 2009-01-02 X 20.2
## 5 2009-01-02 Y 20.8
## 6 2009-01-02 Z 23.1
# OLD METHOD gather
#stocksG<-stocks%>%
# gather(key=stock, value=price, -time )
tidyr
and dplyr
stocks%>%
pivot_longer(-time, names_to="stock", values_to="price")%>%
group_by(stock)%>%
summarise(min=min(price),
max=max(price))
## # A tibble: 3 × 3
## stock min max
## <chr> <dbl> <dbl>
## 1 X 19.2 21.6
## 2 Y 15.6 23.0
## 3 Z 12.0 23.7
ggplot(stocksL, aes(time, price, color=stock))+
geom_line()
stocksW<-stocksL%>%
pivot_wider(names_from = stock, values_from = price)
head(stocksW)
## # A tibble: 6 × 4
## time X Y Z
## <date> <dbl> <dbl> <dbl>
## 1 2009-01-01 19.4 23.0 23.7
## 2 2009-01-02 20.2 20.8 23.1
## 3 2009-01-03 19.2 18.8 20.3
## 4 2009-01-04 21.6 15.6 12.0
## 5 2009-01-05 20.3 22.2 22.5
## 6 2009-01-06 19.2 19.9 19.8
## OLD CODE spread
#stocksS<-stocksG%>%
# spread(key=stock, value=price)
## IMPORT THE DATA
## TRY TO IMPORT FILE ##
### JUST IN CASE HERE IS THE LINK ###
hi <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/Hawaii_Tourism.csv", header=TRUE)
## LOOK AT THE STRUCTURE
## INSERT CODE HERE ##
str(hi)
## 'data.frame': 121 obs. of 26 variables:
## $ Group : chr "All visitors by air" "Hotel-only visitors" "First-time visitors" "Honeymoon visitors" ...
## $ Indicator: chr "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ Units : chr "days" "days" "days" "days" ...
## $ X1999 : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## $ X2000 : num 8.9 7.2 7.9 7.4 6.6 5.7 5.9 5.3 6.8 5.4 ...
## $ X2001 : num 9.2 7.5 8.4 NA 6.8 5.9 6.3 NA 6.9 5.7 ...
## $ X2002 : num 9.4 7.7 8.5 NA 7 6.2 6.4 NA 7.2 5.9 ...
## $ X2003 : num 9.2 7.7 8.5 NA 6.9 6.1 6.3 NA 7.3 6.3 ...
## $ X2004 : num 9.1 7.3 8.2 8 6.9 5.9 6.2 5.5 7.5 6.2 ...
## $ X2005 : num 9.1 7.3 8.3 7.8 6.9 6 6.3 5.5 7.5 6.3 ...
## $ X2006 : num 9.2 7.3 8.3 7.9 6.9 6 6.2 5.5 7.4 6.3 ...
## $ X2007 : num 9.2 7.2 8.4 7.9 6.8 5.9 6.2 5.5 7.3 6.2 ...
## $ X2008 : num 9.4 7.3 8.6 7.8 7.1 6.1 6.5 5.6 7.8 6.3 ...
## $ X2009 : num 9.4 7.3 8.6 7.6 7.3 6.2 6.7 5.6 8 6.4 ...
## $ X2010 : num 9.4 7.4 8.5 7.7 7.4 6.2 6.7 5.8 8.1 6.5 ...
## $ X2011 : num 9.5 7.4 8.5 7.6 7.4 6.4 6.6 5.7 8.1 5.9 ...
## $ X2012 : num 9.4 7.3 8.4 7.5 7.3 6.2 6.5 5.8 8.1 6.3 ...
## $ X2013 : num 9.3 7.2 8.3 7.5 7 6 6.3 5.6 8.2 6.3 ...
## $ X2014 : num 9.2 7.2 8.3 7.6 6.8 5.9 6.1 5.7 8.2 6.5 ...
## $ X2015 : num 9.1 7.2 8.3 7.5 6.8 5.9 6.2 5.6 8.2 6.5 ...
## $ X2016 : num 9 7.2 8.2 7.5 6.8 5.9 6.2 5.7 8.1 6.6 ...
## $ X2017 : num 9 7.3 8.4 7.6 6.7 6 6.2 5.7 8 6.6 ...
## $ X2018 : num 9 7.3 8.5 7.7 6.9 6.1 6.4 5.8 8.1 6.7 ...
## $ X2019 : num 8.8 7.1 8.2 7.7 6.8 6 6.3 5.9 7.9 6.7 ...
## $ X2020 : num 10.6 7.6 NA 8.1 8.5 6.5 NA 6.1 9.5 7.1 ...
## $ X2021 : num 9.6 7.9 8.8 9.1 8.1 6.9 7.3 6.3 8.7 7.4 ...
## RENAME THE COLUMNS
## INSERT CODE HERE ##
colnames(hi)[4:26]<-1999:2021
names(hi)
## [1] "Group" "Indicator" "Units" "1999" "2000" "2001"
## [7] "2002" "2003" "2004" "2005" "2006" "2007"
## [13] "2008" "2009" "2010" "2011" "2012" "2013"
## [19] "2014" "2015" "2016" "2017" "2018" "2019"
## [25] "2020" "2021"
pivot_longer()
## TIDY YOUR DATA
## INSERT CODE HERE ##
hiL<-hi%>%
filter(Units=="days")%>%
select(-Units)%>%
pivot_longer(-c(Group, Indicator), names_to = "year", values_to = "days")
## LOOK AT THE NEW STRUCTURE
str(hiL)
## tibble [2,691 × 4] (S3: tbl_df/tbl/data.frame)
## $ Group : chr [1:2691] "All visitors by air" "All visitors by air" "All visitors by air" "All visitors by air" ...
## $ Indicator: chr [1:2691] "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ year : chr [1:2691] "1999" "2000" "2001" "2002" ...
## $ days : num [1:2691] 8.9 8.9 9.2 9.4 9.2 9.1 9.1 9.2 9.2 9.4 ...
## YEAR NEEDS TO BE NUMERIC TO PLOT
## INSERT CODE HERE ##
hiL$year<-as.numeric(hiL$year)
str(hiL)
## tibble [2,691 × 4] (S3: tbl_df/tbl/data.frame)
## $ Group : chr [1:2691] "All visitors by air" "All visitors by air" "All visitors by air" "All visitors by air" ...
## $ Indicator: chr [1:2691] "LOS Statewide" "LOS Statewide" "LOS Statewide" "LOS Statewide" ...
## $ year : num [1:2691] 1999 2000 2001 2002 2003 ...
## $ days : num [1:2691] 8.9 8.9 9.2 9.4 9.2 9.1 9.1 9.2 9.2 9.4 ...
## SCATTERPLOT WITH TREND
## INSERT CODE HERE ##
ggplot(hiL, aes(x=year, y=days))+
geom_point()
## Warning: Removed 567 rows containing missing values (geom_point).
## TRY GEOM_JITTER
## INSERT CODE HERE ##
ggplot(hiL, aes(x=year, y=days))+
geom_jitter()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Removed 567 rows containing missing values (geom_point).
Which island is the most popular to visit?
Use dplyr
to explore subgroups.
## WHAT ISLAND IS MOST POPULAR?
islands<-hiL%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
## INSERT CODE HERE ##
## WHATS WRONG WITH THIS??
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
## Warning: Removed 120 row(s) containing missing values (geom_path).
Oh no! What happened?!
We forgot about NAs!
## WHAT ISLAND IS MOST POPULAR?
## TAKE 2
## INSERT CODE HERE ##
islands<-hiL%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Indicator'. You can override using the
## `.groups` argument.
## INSERT CODE HERE ##
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
Are there other interesting trends? Try looking at the reasons for
traveling to Hawaii. These are specified in the Group
variable.
## ARE THERE TRENDS BY TYPE?
## INSERT CODE HERE ##
## INSERT CODE HERE ##
There are three types of joins that we will look at:
Inner Join: Looks for only elements that exist in both dataframes
Left Join (or Right Join): Matches elements from the left dataframe to the right
Tibbles are like data frames that can be more easily manipulated in the tidyverse. In this example we create a tibble for comic book characters.
# a tibble is a type of a data frame
superheroes <- tibble::tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tibble::tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
# left join super hero and publisher
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
Data Set #1: NBA Salaries
All NBA Player’s salaries 2019-2020.
Scrape from ESPN (http://www.espn.com/nba/salaries).
Hosted on Kaggle at:
https://www.kaggle.com/datasets/junfenglim/nba-player-salaries-201920
## SALARY DATA for 2019-2020 season
salaries1920 <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/nba2019-20.csv", header=TRUE)
Data Set #2: NBA Players
The data set contains over two decades of data on each player who has been part of an NBA teams’ roster. It captures demographic variables such as age, height, weight and place of birth, biographical details like the team played for, draft year and round. In addition, it has basic box score statistics such as games played, average number of points, rebounds, assists, etc.
These data are hosted on Kaggle at:
https://www.kaggle.com/datasets/justinas/nba-players-data
## METRICS ON PLAYER PERFORMANCE
## 1996 to 2022
all_seasons <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA151/main/Data/all_seasons.csv", header=TRUE)
# SALARIES
## INSERT CODE HERE ##
str(salaries1920)
## 'data.frame': 528 obs. of 5 variables:
## $ team : chr "Golden State Warriors" "Oklahoma City Thunder" "Houston Rockets" "Washington Wizards" ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : chr "Stephen Curry" "Chris Paul" "Russell Westbrook" "John Wall" ...
## $ position: chr " PG" " PG" " PG" " PG" ...
## $ season : chr "2019-2020" "2019-2020" "2019-2020" "2019-2020" ...
# METRICS
## INSERT CODE HERE ##
str(all_seasons)
## 'data.frame': 12305 obs. of 22 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ player_name : chr "Dennis Rodman" "Dwayne Schintzius" "Earl Cureton" "Ed O'Bannon" ...
## $ team_abbreviation: chr "CHI" "LAC" "TOR" "DAL" ...
## $ age : num 36 28 39 24 34 38 25 28 29 28 ...
## $ player_height : num 198 216 206 203 206 ...
## $ player_weight : num 99.8 117.9 95.3 100.7 108.9 ...
## $ college : chr "Southeastern Oklahoma State" "Florida" "Detroit Mercy" "UCLA" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ draft_year : chr "1986" "1990" "1979" "1995" ...
## $ draft_round : chr "2" "1" "3" "1" ...
## $ draft_number : chr "27" "24" "58" "9" ...
## $ gp : int 55 15 9 64 27 52 80 77 71 82 ...
## $ pts : num 5.7 2.3 0.8 3.7 2.4 8.2 17.2 14.9 5.7 6.9 ...
## $ reb : num 16.1 1.5 1 2.3 2.4 2.7 4.1 8 1.6 1.5 ...
## $ ast : num 3.1 0.3 0.4 0.6 0.2 1 3.4 1.6 1.3 3 ...
## $ net_rating : num 16.1 12.3 -2.1 -8.7 -11.2 4.1 4.1 3.3 -0.3 -1.2 ...
## $ oreb_pct : num 0.186 0.078 0.105 0.06 0.109 0.034 0.035 0.095 0.036 0.018 ...
## $ dreb_pct : num 0.323 0.151 0.102 0.149 0.179 0.126 0.091 0.183 0.076 0.081 ...
## $ usg_pct : num 0.1 0.175 0.103 0.167 0.127 0.22 0.209 0.222 0.172 0.177 ...
## $ ts_pct : num 0.479 0.43 0.376 0.399 0.611 0.541 0.559 0.52 0.539 0.557 ...
## $ ast_pct : num 0.113 0.048 0.148 0.077 0.04 0.102 0.149 0.087 0.141 0.262 ...
## $ season : chr "1996-97" "1996-97" "1996-97" "1996-97" ...
We need to make an apples to apples comparison.
## INSERT CODE HERE ##
season1920<-all_seasons%>%
filter(season=="2019-20")%>%
select(-season)%>%
rename(player=player_name)
str(season1920)
## 'data.frame': 529 obs. of 21 variables:
## $ X : int 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 ...
## $ player : chr "Aaron Holiday" "Maxi Kleber" "Max Strus" "Maurice Harkless" ...
## $ team_abbreviation: chr "IND" "DAL" "CHI" "NYK" ...
## $ age : num 23 28 24 27 29 25 23 23 30 34 ...
## $ player_height : num 183 208 196 201 190 ...
## $ player_weight : num 83.9 108.9 97.5 99.8 90.7 ...
## $ college : chr "UCLA" "None" "DePaul" "St. John's" ...
## $ country : chr "USA" "Germany" "USA" "USA" ...
## $ draft_year : chr "2018" "Undrafted" "Undrafted" "2012" ...
## $ draft_round : chr "1" "Undrafted" "Undrafted" "1" ...
## $ draft_number : chr "23" "Undrafted" "Undrafted" "15" ...
## $ gp : int 66 74 2 62 57 41 4 65 61 58 ...
## $ pts : num 9.5 9.1 2.5 5.8 3.1 4.9 0.5 4.7 7.2 5.9 ...
## $ reb : num 2.4 5.2 0.5 3.9 1.3 1.5 0.8 1.6 5.2 3.2 ...
## $ ast : num 3.4 1.2 0 1.1 3.2 0.5 0.3 1.2 2.5 1 ...
## $ net_rating : num 2.2 3.4 105.8 1.2 -1.2 ...
## $ oreb_pct : num 0.013 0.054 0.167 0.038 0.022 0.018 0 0.033 0.087 0.025 ...
## $ dreb_pct : num 0.077 0.136 0 0.117 0.067 0.1 0.158 0.049 0.195 0.132 ...
## $ usg_pct : num 0.182 0.134 0.158 0.105 0.128 0.152 0.08 0.108 0.173 0.12 ...
## $ ts_pct : num 0.521 0.605 0.727 0.572 0.463 0.65 0.25 0.539 0.613 0.587 ...
## $ ast_pct : num 0.188 0.065 0 0.062 0.297 0.076 0.071 0.081 0.21 0.076 ...
## WHAT KIND OF JOIN SHOULD WE USE?
## INSERT CODE HERE ##
joinNBA<-salaries1920%>%
left_join(season1920)
## Joining, by = "player"
## GGPLOT2
## INSERT CODE HERE ##
ggplot(joinNBA, aes(x=pts, y=salary))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).
unite
# lets create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a
dataUnite <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
str(dataUnite)
## 'data.frame': 15 obs. of 2 variables:
## $ datetime: chr "2016-01-01 4:15:35" "2016-01-02 7:21:6" "2016-01-03 1:37:10" "2016-01-04 2:41:42" ...
## $ event : chr "w" "x" "f" "g" ...
# seperate
dataSep <- dataUnite %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
dataSep
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a