In this lesson students will gain more practice with the tidyverse
with a particular focus on wrangling the tidyr
package.
This lesson covers:
tidyr
Package
gather()
spread()
unite()
separate()
left_join
, inner_join
, right_join
Before you start you will need to load the tidyverse
library(tidyverse)
tidyr
# generate stock market data
set.seed(1)
stocks <- data.frame(
time = as.Date('2009-01-01') + 0:9,
X = rnorm(10, 20, 1),
Y = rnorm(10, 20, 2),
Z = rnorm(10, 20, 4)
)
stocks
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## 7 2009-01-07 20.48743 19.96762 19.37682
## 8 2009-01-08 20.73832 21.88767 14.11699
## 9 2009-01-09 20.57578 21.64244 18.08740
## 10 2009-01-10 19.69461 21.18780 21.67177
# gather
stocksG<-stocks%>%
gather(key=stock, value=price, -time )
head(stocksG)
## time stock price
## 1 2009-01-01 X 19.37355
## 2 2009-01-02 X 20.18364
## 3 2009-01-03 X 19.16437
## 4 2009-01-04 X 21.59528
## 5 2009-01-05 X 20.32951
## 6 2009-01-06 X 19.17953
ggplot(stocksG, aes(time, price, color=stock))+
geom_line()
stocksS<-stocksG%>%
spread(key=stock, value=price)
head(stocksS)
## time X Y Z
## 1 2009-01-01 19.37355 23.02356 23.67591
## 2 2009-01-02 20.18364 20.77969 23.12855
## 3 2009-01-03 19.16437 18.75752 20.29826
## 4 2009-01-04 21.59528 15.57060 12.04259
## 5 2009-01-05 20.32951 22.24986 22.47930
## 6 2009-01-06 19.17953 19.91013 19.77549
## IMPORT THE DATA
hi <- read.csv("~/Downloads/Hawaii_Tourism.csv")
## LOOK AT THE STRUCTURE
str(hi)
## 'data.frame': 121 obs. of 26 variables:
## $ Group : Factor w/ 17 levels "All visitors by air",..: 1 10 6 8 1 10 6 8 1 10 ...
## $ Indicator: Factor w/ 10 levels "","LOS in Hilo",..: 10 10 10 10 9 9 9 9 7 7 ...
## $ Units : Factor w/ 2 levels "","days": 2 2 2 2 2 2 2 2 2 2 ...
## $ X1999 : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## $ X2000 : num 8.9 7.2 7.9 7.4 6.6 5.7 5.9 5.3 6.8 5.4 ...
## $ X2001 : num 9.2 7.5 8.4 NA 6.8 5.9 6.3 NA 6.9 5.7 ...
## $ X2002 : num 9.4 7.7 8.5 NA 7 6.2 6.4 NA 7.2 5.9 ...
## $ X2003 : num 9.2 7.7 8.5 NA 6.9 6.1 6.3 NA 7.3 6.3 ...
## $ X2004 : num 9.1 7.3 8.2 8 6.9 5.9 6.2 5.5 7.5 6.2 ...
## $ X2005 : num 9.1 7.3 8.3 7.8 6.9 6 6.3 5.5 7.5 6.3 ...
## $ X2006 : num 9.2 7.3 8.3 7.9 6.9 6 6.2 5.5 7.4 6.3 ...
## $ X2007 : num 9.2 7.2 8.4 7.9 6.8 5.9 6.2 5.5 7.3 6.2 ...
## $ X2008 : num 9.4 7.3 8.6 7.8 7.1 6.1 6.5 5.6 7.8 6.3 ...
## $ X2009 : num 9.4 7.3 8.6 7.6 7.3 6.2 6.7 5.6 8 6.4 ...
## $ X2010 : num 9.4 7.4 8.5 7.7 7.4 6.2 6.7 5.8 8.1 6.5 ...
## $ X2011 : num 9.5 7.4 8.5 7.6 7.4 6.4 6.6 5.7 8.1 5.9 ...
## $ X2012 : num 9.4 7.3 8.4 7.5 7.3 6.2 6.5 5.8 8.1 6.3 ...
## $ X2013 : num 9.3 7.2 8.3 7.5 7 6 6.3 5.6 8.2 6.3 ...
## $ X2014 : num 9.2 7.2 8.3 7.6 6.8 5.9 6.1 5.7 8.2 6.5 ...
## $ X2015 : num 9.1 7.2 8.3 7.5 6.8 5.9 6.2 5.6 8.2 6.5 ...
## $ X2016 : num 9 7.2 8.2 7.5 6.8 5.9 6.2 5.7 8.1 6.6 ...
## $ X2017 : num 9 7.3 8.4 7.6 6.7 6 6.2 5.7 8 6.6 ...
## $ X2018 : num 9 7.3 8.5 7.7 6.9 6.1 6.4 5.8 8.1 6.7 ...
## $ X2019 : num 8.8 7.1 8.2 7.7 6.8 6 6.3 5.9 7.9 6.7 ...
## $ X2020 : num 10.6 7.6 NA 8.1 8.5 6.5 NA 6.1 9.5 7.1 ...
## $ X2021 : num 9.6 7.9 8.8 9.1 8.1 6.9 7.3 6.3 8.7 7.4 ...
## RENAME THE COLUMNS
colnames(hi)[4:26]<-1999:2021
gather()
## TIDY YOUR DATA
hiG<-hi%>%
filter(Units=="days")%>%
select(-Units)%>%
gather(key=year, value=days, -c(Group, Indicator))
## LOOK AT THE NEW STRUCTURE
str(hiG)
## 'data.frame': 2691 obs. of 4 variables:
## $ Group : Factor w/ 17 levels "All visitors by air",..: 1 10 6 8 1 10 6 8 1 10 ...
## $ Indicator: Factor w/ 10 levels "","LOS in Hilo",..: 10 10 10 10 9 9 9 9 7 7 ...
## $ year : chr "1999" "1999" "1999" "1999" ...
## $ days : num 8.9 7.4 8.1 7.3 6.4 5.6 5.9 5.3 6.7 5.6 ...
## YEAR NEEDS TO BE NUMERIC TO PLOT
hiG$year<-as.numeric(hiG$year)
## SCATTERPLOT WITH TREND
ggplot(hiG, aes(x=year, y=days))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Warning: Removed 567 rows containing missing values (geom_point).
## TRY GEOM_JITTER
ggplot(hiG, aes(x=year, y=days))+
geom_jitter()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 567 rows containing non-finite values (stat_smooth).
## Warning: Removed 567 rows containing missing values (geom_point).
Which island do visitors stay at the longest?
Use dplyr
to explore subgroups.
## WHAT ISLAND IS MOST POPULAR?
islands<-hiG%>%
filter(Indicator!="LOS Statewide")%>%
group_by(Indicator, year)%>%
summarise(avgDays=mean(days))
## `summarise()` has grouped output by 'Indicator'. You can override using the `.groups` argument.
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
## Warning: Removed 120 row(s) containing missing values (geom_path).
## WHATS WRONG WITH THIS??
Oh no! What happened?!
We forgot about NAs!
islands<-hiG%>%
group_by(Indicator, year)%>%
filter(Indicator!="LOS Statewide")%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Indicator'. You can override using the `.groups` argument.
ggplot(islands, aes(x=year, y=avgDays, color=Indicator))+
geom_line()
Try another example!
What are the trends in reasons for travels to Hawaii?
## ARE THERE TRENDS BY TYPE?
groupHI<-hiG%>%
group_by(Group, year)%>%
summarise(avgDays=mean(days, na.rm=TRUE))
## `summarise()` has grouped output by 'Group'. You can override using the `.groups` argument.
ggplot(groupHI, aes(x=year, y=avgDays, color=Group))+
geom_line()
## Warning: Removed 54 row(s) containing missing values (geom_path).
There are three types of joins that we will look at:
Inner Join: Looks for only elements that exist in both dataframes
Left Join (or Right Join): Matches elements from the left dataframe to the right
Tibbles are like data frames that can be more easily manipulated in the tidyverse. In this example we create a tibble for comic book characters.
# a tibble is a type of a data frame
superheroes <- tibble::tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tibble::tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
# inner join super hero and publisher
insp<-inner_join(superheroes, publishers)
## Joining, by = "publisher"
insp
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
# left join super hero and publisher
ljsp<-left_join(superheroes, publishers)
## Joining, by = "publisher"
ljsp
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## SALARY DATA for 2019-2020 season
salaries1920 <- read.csv("~/Downloads/nba2019-20.csv")
## METRICS ON PLAYER PERFORMANCE
## 1996 to 2022
all_seasons <- read.csv("~/Downloads/all_seasons.csv")
# SALARIES
str(salaries1920)
## 'data.frame': 528 obs. of 5 variables:
## $ team : Factor w/ 30 levels "Atlanta Hawks",..: 10 21 11 30 3 11 14 28 9 23 ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : Factor w/ 528 levels "Aaron Gordon",..: 457 73 439 255 295 221 323 312 37 483 ...
## $ position: Factor w/ 7 levels " C"," F"," G",..: 5 5 5 5 6 7 6 5 4 6 ...
## $ season : Factor w/ 1 level "2019-2020": 1 1 1 1 1 1 1 1 1 1 ...
# METRICS
str(all_seasons)
## 'data.frame': 12305 obs. of 22 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ player_name : Factor w/ 2463 levels "A.C. Green","A.J. Bramlett",..: 585 705 716 720 721 727 728 737 738 745 ...
## $ team_abbreviation: Factor w/ 36 levels "ATL","BKN","BOS",..: 6 14 33 8 17 12 15 15 1 18 ...
## $ age : num 36 28 39 24 34 38 25 28 29 28 ...
## $ player_height : num 198 216 206 203 206 ...
## $ player_weight : num 99.8 117.9 95.3 100.7 108.9 ...
## $ college : Factor w/ 347 levels " "," ",..: 255 85 75 299 315 110 275 58 324 155 ...
## $ country : Factor w/ 82 levels "Angola","Argentina",..: 79 79 79 79 79 79 79 79 79 79 ...
## $ draft_year : Factor w/ 47 levels "1963","1976",..: 11 15 4 20 10 6 19 15 17 16 ...
## $ draft_round : Factor w/ 9 levels "0","1","2","3",..: 3 2 4 2 2 3 2 2 9 3 ...
## $ draft_number : Factor w/ 76 levels "0","1","10","11",..: 27 24 61 75 3 29 3 27 76 38 ...
## $ gp : int 55 15 9 64 27 52 80 77 71 82 ...
## $ pts : num 5.7 2.3 0.8 3.7 2.4 8.2 17.2 14.9 5.7 6.9 ...
## $ reb : num 16.1 1.5 1 2.3 2.4 2.7 4.1 8 1.6 1.5 ...
## $ ast : num 3.1 0.3 0.4 0.6 0.2 1 3.4 1.6 1.3 3 ...
## $ net_rating : num 16.1 12.3 -2.1 -8.7 -11.2 4.1 4.1 3.3 -0.3 -1.2 ...
## $ oreb_pct : num 0.186 0.078 0.105 0.06 0.109 0.034 0.035 0.095 0.036 0.018 ...
## $ dreb_pct : num 0.323 0.151 0.102 0.149 0.179 0.126 0.091 0.183 0.076 0.081 ...
## $ usg_pct : num 0.1 0.175 0.103 0.167 0.127 0.22 0.209 0.222 0.172 0.177 ...
## $ ts_pct : num 0.479 0.43 0.376 0.399 0.611 0.541 0.559 0.52 0.539 0.557 ...
## $ ast_pct : num 0.113 0.048 0.148 0.077 0.04 0.102 0.149 0.087 0.141 0.262 ...
## $ season : Factor w/ 26 levels "1996-97","1997-98",..: 1 1 1 1 1 1 1 1 1 1 ...
We need to make an apples to apples comparison.
season1920<-all_seasons%>%
filter(season=="2019-20")%>%
select(-season)%>%
mutate(player=player_name)
joinNBA<-salaries1920%>%
left_join(season1920)
## Joining, by = "player"
str(joinNBA)
## 'data.frame': 528 obs. of 26 variables:
## $ team : Factor w/ 30 levels "Atlanta Hawks",..: 10 21 11 30 3 11 14 28 9 23 ...
## $ salary : int 40231758 38506482 38506482 38199000 38199000 38199000 37436858 34996296 34449964 32742000 ...
## $ player : Factor w/ 2469 levels "Aaron Gordon",..: 457 73 439 255 295 221 323 312 37 483 ...
## $ position : Factor w/ 7 levels " C"," F"," G",..: 5 5 5 5 6 7 6 5 4 6 ...
## $ season : Factor w/ 1 level "2019-2020": 1 1 1 1 1 1 1 1 1 1 ...
## $ X : int 10869 10921 10888 NA NA 11041 10734 10743 10995 10777 ...
## $ player_name : Factor w/ 2463 levels "A.C. Green","A.J. Bramlett",..: 2154 391 2028 NA NA 1031 1479 1435 205 2249 ...
## $ team_abbreviation: Factor w/ 36 levels "ATL","BKN","BOS",..: 11 25 12 NA NA 12 15 33 10 27 ...
## $ age : num 32 35 31 NA NA 30 35 34 31 27 ...
## $ player_height : num 190 185 190 NA NA ...
## $ player_weight : num 83.9 79.4 90.7 NA NA ...
## $ college : Factor w/ 347 levels " "," ",..: 68 322 299 NA NA 12 191 315 212 276 ...
## $ country : Factor w/ 82 levels "Angola","Argentina",..: 79 79 79 NA NA 79 79 79 79 79 ...
## $ draft_year : Factor w/ 47 levels "1963","1976",..: 34 30 33 NA NA 34 28 31 34 36 ...
## $ draft_round : Factor w/ 9 levels "0","1","2","3",..: 2 2 2 NA NA 2 2 2 2 2 ...
## $ draft_number : Factor w/ 76 levels "0","1","10","11",..: 68 41 41 NA NA 30 2 24 2 18 ...
## $ gp : int 5 70 57 NA NA 68 67 58 18 72 ...
## $ pts : num 20.8 17.6 27.2 NA NA 34.3 25.3 19.4 15.5 19.6 ...
## $ reb : num 5.2 5 7.9 NA NA 6.6 7.8 5 4.7 6.9 ...
## $ ast : num 6.6 6.7 7 NA NA 7.5 10.2 7.5 3.3 3.2 ...
## $ net_rating : num -15.3 6.9 3.1 NA NA 5.8 8.5 6 -2.6 3.1 ...
## $ oreb_pct : num 0.023 0.012 0.044 NA NA 0.026 0.028 0.015 0.032 0.027 ...
## $ dreb_pct : num 0.162 0.143 0.157 NA NA 0.139 0.191 0.115 0.128 0.172 ...
## $ usg_pct : num 0.286 0.228 0.33 NA NA 0.356 0.308 0.225 0.277 0.236 ...
## $ ts_pct : num 0.557 0.61 0.536 NA NA 0.626 0.577 0.59 0.476 0.556 ...
## $ ast_pct : num 0.355 0.324 0.34 NA NA 0.366 0.477 0.309 0.184 0.145 ...
ggplot(joinNBA, aes(x=pts, y=salary))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing missing values (geom_point).
unite
# lets create more fake data for dates and times
set.seed(1)
date <- as.Date('2016-01-01') + 0:14
hour <- sample(1:24, 15)
min <- sample(1:60, 15)
second <- sample(1:60, 15)
event <- sample(letters, 15)
data <- data.frame(date, hour, min, second, event)
data
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a
dataUnite <- data %>%
unite(datehour, date, hour, sep = ' ') %>%
unite(datetime, datehour, min, second, sep = ':')
str(dataUnite)
## 'data.frame': 15 obs. of 2 variables:
## $ datetime: chr "2016-01-01 4:15:35" "2016-01-02 7:21:6" "2016-01-03 1:37:10" "2016-01-04 2:41:42" ...
## $ event : Factor w/ 15 levels "a","b","f","g",..: 13 14 3 4 10 5 15 7 2 6 ...
# seperate
dataSep <- dataUnite %>%
separate(datetime, c('date', 'time'), sep = ' ') %>%
separate(time, c('hour', 'min', 'second'), sep = ':')
dataSep
## date hour min second event
## 1 2016-01-01 4 15 35 w
## 2 2016-01-02 7 21 6 x
## 3 2016-01-03 1 37 10 f
## 4 2016-01-04 2 41 42 g
## 5 2016-01-05 11 25 38 s
## 6 2016-01-06 14 46 47 j
## 7 2016-01-07 18 58 20 y
## 8 2016-01-08 22 54 28 n
## 9 2016-01-09 5 34 54 b
## 10 2016-01-10 16 42 44 m
## 11 2016-01-11 10 56 23 r
## 12 2016-01-12 6 44 59 t
## 13 2016-01-13 19 60 40 v
## 14 2016-01-14 23 33 51 o
## 15 2016-01-15 9 20 25 a