Overview
The tidyverse is a versatile collection of R packages for data science, and I will use a FiveThirtyEight NFL Elo dataset to explore it. This comprehensive collection offers a wide array of data frame tools, as well as tools to manipulate other forms of data such as strings and factors. In short, the tidyverse is the master key for data manipulation.
Loading all Tidyverse packages
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(purrr)
library(ggplot2)
library(readr)
library(stringr)
library(tidyr)
library(tibble)
library(forcats)
Loading library(tidyverse) already attaches all eight of these packages, so listing them out individually is just an intentional extra step to show what is included. It is pretty clear that someone on the naming team at tidyverse loves cats.
Loading the Data
# Raw CSV with the NFL Elo data, hosted on GitHub.
link <- "https://raw.githubusercontent.com/st3vejobs/tidyverse-CREATE/main/nfl_elo_latest.csv"
# Empty fields are read as NA. The `playoff` column is dropped before the
# incomplete rows are removed (presumably because it is blank for most games
# and would otherwise cause na.omit() to discard them -- confirm).
raw <- subset(read.csv(url(link), na.strings = ""), select = -playoff)
nfl <- na.omit(raw)
filter( )
If I only want to see Elo changes over time in the month of October, I can use filter( ) from the dplyr package in tidyverse.
# Keep only the games dated in October 2021. The bounds are compared against
# `date` as text, which follows calendar order for YYYY-MM-DD values
# (assumes read.csv left `date` as character -- confirm).
oct <- filter(nfl, date >= "2021-10-01" & date < "2021-11-01")
# Preview the first rows of the filtered data.
head(oct)
## date season neutral team1 team2 elo1_pre elo2_pre elo_prob1 elo_prob2
## 1 2021-10-03 2021 0 CHI DET 1467.762 1356.949 0.7334239 0.2665761
## 2 2021-10-03 2021 0 PHI KC 1429.401 1610.832 0.3384448 0.6615552
## 3 2021-10-03 2021 0 NYJ TEN 1332.623 1544.821 0.2999907 0.7000093
## 4 2021-10-03 2021 0 MIA IND 1493.752 1498.930 0.5852496 0.4147504
## 5 2021-10-03 2021 0 ATL WSH 1443.767 1439.248 0.5987316 0.4012684
## 6 2021-10-03 2021 0 BUF HOU 1648.028 1405.664 0.8543736 0.1456264
## elo1_post elo2_post qbelo1_pre qbelo2_pre qb1 qb2
## 1 1479.601 1345.110 1448.692 1338.897 Justin Fields Jared Goff
## 2 1412.912 1627.322 1438.790 1609.937 Jalen Hurts Patrick Mahomes
## 3 1353.423 1524.021 1347.976 1527.587 Zach Wilson Ryan Tannehill
## 4 1464.900 1527.782 1490.794 1472.099 Jacoby Brissett Carson Wentz
## 5 1423.865 1459.149 1427.519 1469.368 Matt Ryan Taylor Heinicke
## 6 1657.518 1396.174 1618.630 1373.595 Josh Allen Davis Mills
## qb1_value_pre qb2_value_pre qb1_adj qb2_adj qbelo_prob1 qbelo_prob2
## 1 68.14545 179.14471 -39.900119 7.118706 0.6645030 0.33549701
## 2 155.42411 256.62512 10.090350 12.944424 0.3404501 0.65954994
## 3 54.17359 194.79136 -29.758214 -0.309080 0.2953799 0.70462011
## 4 131.86768 106.60662 -1.358616 -17.281970 0.6316359 0.36836406
## 5 166.75088 67.18041 -4.318374 -96.459200 0.6499408 0.35005925
## 6 257.91961 81.09868 15.833292 -135.047374 0.9324797 0.06752034
## qb1_game_value qb2_game_value qb1_value_post qb2_value_post qbelo1_post
## 1 20.06013 180.20325 63.33692 179.25056 1463.958
## 2 321.96689 383.98856 172.07839 269.36146 1422.192
## 3 152.73856 29.35701 64.03009 178.24792 1368.952
## 4 124.27718 215.80361 131.10863 117.52632 1459.155
## 5 311.11366 318.85378 181.18716 92.34774 1405.523
## 6 212.93431 -219.03445 253.42108 51.08537 1622.784
## qbelo2_post score1 score2 quality importance total_rating
## 1 1323.631 24 14 3 16 10
## 2 1626.536 30 42 64 58 61
## 3 1506.611 27 24 12 16 14
## 4 1503.738 17 27 34 67 51
## 5 1491.364 30 34 7 31 19
## 6 1369.441 40 0 10 3 7
arrange( )
Now, let’s say I wanted to go back to the nfl dataframe and sort it by the importance of the game.
# Order the games from most to least important.
nfl_important <- nfl %>%
  arrange(desc(importance))
paste("The most important game so far this season was on November 14 where the Chiefs defeated the Raiders. ")
## [1] "The most important game so far this season was on November 14 where the Chiefs defeated the Raiders. "
# The first row is the highest-importance game.
head(nfl_important, 1)
## date season neutral team1 team2 elo1_pre elo2_pre elo_prob1 elo_prob2
## 1 2021-11-14 2021 0 OAK KC 1500.814 1609.987 0.4367708 0.5632292
## elo1_post elo2_post qbelo1_pre qbelo2_pre qb1 qb2
## 1 1472.279 1638.522 1487.522 1588.911 Derek Carr Patrick Mahomes
## qb1_value_pre qb2_value_pre qb1_adj qb2_adj qbelo_prob1 qbelo_prob2
## 1 192.6431 209.2709 3.533611 -13.41658 0.4643178 0.5356822
## qb1_game_value qb2_game_value qb1_value_post qb2_value_post qbelo1_post
## 1 178.9716 492.7609 191.276 237.6199 1456.924
## qbelo2_post score1 score2 quality importance total_rating
## 1 1619.51 14 41 65 98 82
Rename ( )
Two of the columns have names that are not arranged properly and they may be confusing. I will rename them to make them easier to understand. They are the quarterback-adjusted win probabilities for each team in a particular game.
# Give the quarterback-adjusted win-probability columns clearer names
# (new name on the left, existing name on the right).
nfl_important <- rename(
  nfl_important,
  qb1_adjusted_elo_prob = qbelo_prob1,
  qb2_adjusted_elo_prob = qbelo_prob2
)
Select ( )
Select is another subsetting tool that goes a bit deeper. I will create a subset of all Chicago Bears games and then I will select a few key columns.
# Every game involving Chicago, then split by whether the Bears appear in the
# team1 or the team2 columns.
bears <- subset(nfl_important, team1 == "CHI" | team2 == "CHI")
bearst1 <- subset(bears, team1 == "CHI")
bearst2 <- subset(bears, team2 == "CHI")

# select() can rename while it picks columns, so each side is mapped onto the
# same side-neutral column names in a single step.
bearsqb_elo_1 <- bearst1 %>%
  select(
    team = team1,
    elo_prob = elo_prob1,
    qb = qb1,
    qb_elo_prob = qb1_adjusted_elo_prob,
    importance
  )
bearsqb_elo_2 <- bearst2 %>%
  select(
    team = team2,
    elo_prob = elo_prob2,
    qb = qb2,
    qb_elo_prob = qb2_adjusted_elo_prob,
    importance
  )

# Stack both sides into one Bears-only table and print it.
bearsqb_elo <- rbind(bearsqb_elo_1, bearsqb_elo_2)
bearsqb_elo
## team elo_prob qb qb_elo_prob importance
## 24 CHI 0.4232868 Justin Fields 0.3971045 80
## 81 CHI 0.6226012 Justin Fields 0.5132856 51
## 84 CHI 0.6619565 Andy Dalton 0.5729006 49
## 133 CHI 0.7334239 Justin Fields 0.6645030 16
## 75 CHI 0.3080676 Andy Dalton 0.2659959 54
## 90 CHI 0.3560446 Justin Fields 0.3041971 47
## 91 CHI 0.3400144 Justin Fields 0.2682454 46
## 95 CHI 0.3035732 Justin Fields 0.3072571 38
## 115 CHI 0.1954160 Justin Fields 0.1522307 21
Relocate ( )
With the new dataframe, I would like to reorder the columns so that it is easier to view. Team may be unnecessary, but I will move it to the end just in case. I would like to list the QB first so it is easier to analyze the value of each quarterback.
# Bring `qb` (and `team`) to the front, then move `team` behind `importance`
# so it ends up as the last column.
bearsqb_elo <- bearsqb_elo %>%
  relocate(qb, team) %>%
  relocate(team, .after = importance)
Add_Column ( ), stringr commands, group_by ( )
Now, I will perform some additional manipulation on the dataframe.
# Base R calculation: how much the quarterback adjustment shifts the Bears'
# win probability (QB-adjusted probability minus the plain Elo probability).
qb_impact <- bearsqb_elo$qb_elo_prob - bearsqb_elo$elo_prob

# stringr: everything up to the last run of whitespace in the full name is
# taken as the first name; its leading capital letter is the initial.
first_names <- str_trim(str_extract(bearsqb_elo$qb, '.+\\s{1,}'))
first_initial <- str_extract(first_names, '^[A-Z]')

# add_column() can position the new columns directly via .before / .after,
# so no separate relocate() step is needed.
bearsqb_elo <- bearsqb_elo %>%
  add_column(qb_impact, .before = "importance") %>%
  add_column(first_initial, .after = "qb")

bearsqb_elo$qb <- as.factor(bearsqb_elo$qb)
# Note: group_by() only records the grouping and does not change the layout of
# the rows, so arrange() is what actually sorts them by quarterback. The result
# stays grouped by `qb`, as the "Groups:" line of the printed tibble shows.
bearsqb_elo <- bearsqb_elo %>%
  group_by(qb) %>%
  arrange(qb)
bearsqb_elo
## # A tibble: 9 × 7
## # Groups: qb [2]
## qb first_initial elo_prob qb_elo_prob qb_impact importance team
## <fct> <chr> <dbl> <dbl> <dbl> <int> <chr>
## 1 Andy Dalton A 0.662 0.573 -0.0891 49 CHI
## 2 Andy Dalton A 0.308 0.266 -0.0421 54 CHI
## 3 Justin Fields J 0.423 0.397 -0.0262 80 CHI
## 4 Justin Fields J 0.623 0.513 -0.109 51 CHI
## 5 Justin Fields J 0.733 0.665 -0.0689 16 CHI
## 6 Justin Fields J 0.356 0.304 -0.0518 47 CHI
## 7 Justin Fields J 0.340 0.268 -0.0718 46 CHI
## 8 Justin Fields J 0.304 0.307 0.00368 38 CHI
## 9 Justin Fields J 0.195 0.152 -0.0432 21 CHI
Mutate ( )
Mutate offers a more convenient way to do something base R can already do: add a column computed from existing columns. I will perform an example mutation below. I performed the base R version of this kind of calculation above with qb_impact.
# New column: the average of the plain and the QB-adjusted win probability.
bearsqb_elo <- mutate(
  bearsqb_elo,
  mean_elo_prob = (elo_prob + qb_elo_prob) / 2
)
As one can see, this provides a more streamlined approach to a similar calculation.
ggplot ( )
Here, I will use a new subset for the Chicago Bears, and this time I will plot Elo over time using ggplot2.
# Rebuild the Bears subset, this time keeping the game date and the pre- and
# post-game Elo ratings needed for the plot.
bears <- subset(nfl_important, team1 == "CHI" | team2 == "CHI")
bearst1 <- subset(bears, team1 == "CHI")
bearst2 <- subset(bears, team2 == "CHI")

# Pick and rename in one select() call so both sides share column names.
bearsqb_elo_1 <- bearst1 %>%
  select(
    date,
    team = team1,
    elo_pre = elo1_pre,
    elo_post = elo1_post,
    elo_prob = elo_prob1,
    qb = qb1,
    qb_elo_prob = qb1_adjusted_elo_prob,
    importance
  )
bearsqb_elo_2 <- bearst2 %>%
  select(
    date,
    team = team2,
    elo_pre = elo2_pre,
    elo_post = elo2_post,
    elo_prob = elo_prob2,
    qb = qb2,
    qb_elo_prob = qb2_adjusted_elo_prob,
    importance
  )

# `bears` is overwritten with the stacked, side-neutral table and printed.
bears <- rbind(bearsqb_elo_1, bearsqb_elo_2)
bears
## date team elo_pre elo_post elo_prob qb qb_elo_prob
## 24 2021-10-17 CHI 1513.175 1493.359 0.4232868 Justin Fields 0.3971045
## 81 2021-10-31 CHI 1480.761 1448.546 0.6226012 Justin Fields 0.5132856
## 84 2021-09-19 CHI 1478.535 1487.436 0.6619565 Andy Dalton 0.5729006
## 133 2021-10-03 CHI 1467.762 1479.601 0.7334239 Justin Fields 0.6645030
## 75 2021-09-12 CHI 1496.167 1478.535 0.3080676 Andy Dalton 0.2659959
## 90 2021-10-10 CHI 1479.601 1513.175 0.3560446 Justin Fields 0.3041971
## 91 2021-09-26 CHI 1487.436 1467.762 0.3400144 Justin Fields 0.2682454
## 95 2021-11-08 CHI 1448.546 1442.286 0.3035732 Justin Fields 0.3072571
## 115 2021-10-24 CHI 1493.359 1480.761 0.1954160 Justin Fields 0.1522307
## importance
## 24 80
## 81 51
## 84 49
## 133 16
## 75 54
## 90 47
## 91 46
## 95 38
## 115 21
# Bears Elo before and after each game. `x = date` is shared by every layer, so
# it is mapped once in ggplot(). The constant strings given to `color` create
# the two legend entries.
ggplot(bears, aes(x = date)) +
  geom_point(aes(y = elo_pre, color = "elo_pre"), size = 3) +
  geom_point(aes(y = elo_post, color = "elo_post"), size = 3) +
  ggtitle('Bears Elo Over Time') +
  labs(
    x = 'Date',
    y = 'Elo Rating',
    color = 'Legend'
  ) +
  # group = 1 puts all games in one group so a single line connects them
  # across the discrete x axis.
  geom_line(
    aes(y = elo_pre, group = 1, color = "elo_pre"),
    linetype = "dashed"
  ) +
  geom_line(
    aes(y = elo_post, group = 1, color = "elo_post"),
    linetype = "dashed"
  ) +
  # Center the title (the stray empty trailing argument has been removed).
  theme(plot.title = element_text(hjust = 0.5)) +
  # The dates are shown on a discrete axis; n.dodge = 3 staggers the tick
  # labels so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 3))
It is clear that tidyverse is one of the most valuable tools for data manipulation and analysis. String manipulation, dataframe tidying, dataframe manipulation, and plotting are all made so much better by tidyverse.