mkolp0-" date: “11/30/2019” output: html_document —
library(knitr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.0.1 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(formattable)
## Warning: package 'formattable' was built under R version 3.5.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.3
We use a fixed commit because this version has a data issue.
df <- read_csv("https://raw.githubusercontent.com/willoutcault/Data607-Data-Acquisition/975ba8d48e0c3d8590bd9afa79e75632152aabfc/runningbacks%20consolidated.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Team = col_character(),
## DisplayName = col_character(),
## GameClock = col_time(format = ""),
## PossessionTeam = col_character(),
## OffenseFormation = col_character(),
## OffensePersonnel = col_character(),
## DefensePersonnel = col_character(),
## PlayDirection = col_character(),
## TimeHandoff = col_datetime(format = ""),
## TimeSnap = col_datetime(format = ""),
## PlayerHeight = col_character(),
## Stadium = col_character()
## )
## See spec(...) for full column specifications.
df2 <- select(df, "DisplayName", "PlayerHeight","PossessionTeam", "PlayerWeight")
The height column has actually been misparsed as a date (e.g. 02-Jun for 6 feet 2 inches). we can use Lubridate to parse this out and recover, but we have to be a bit clever because the misparsing reads as Jun-00 for someone 6 feet exactly. So we replace the 00 with 13 using stringr and then take the month compoennt as feet, and the inches component we check if the day is 13 and take that as 0 otherwise we use days directly.
df2$playerHeightDate <- lubridate::parse_date_time(str_replace(df2$PlayerHeight,"00","13"), c("d-b","b-d"))
df3 <- df2
df3$feet <- lubridate::month(df2$playerHeightDate)
df3$inches <- ifelse(lubridate::day(df2$playerHeightDate)==13,0,lubridate::day(df2$playerHeightDate))
df4 <- df3 %>% mutate(Height = (feet*12) + inches) %>%
select("DisplayName", "Height", "PlayerWeight", "PossessionTeam")
df5 <- df4 %>%
mutate(bmi = 703*PlayerWeight/(Height^2))
kable(head(df5))
| DisplayName | Height | PlayerWeight | PossessionTeam | bmi |
|---|---|---|---|---|
| Matt Ryan | 76 | 217 | ATL | 26.41118 |
| Andy Levitre | 74 | 303 | ATL | 38.89865 |
| Alex Mack | 76 | 311 | ATL | 37.85197 |
| Brandon Fusco | 76 | 306 | ATL | 37.24342 |
| Julio Jones | 75 | 220 | ATL | 27.49511 |
| Logan Paulsen | 77 | 268 | ATL | 31.77669 |
df5 %>% ggplot(aes(bmi)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df6 <- df5 %>%
summarize(
meanbmi = mean(bmi),
sdbmi = sd(bmi)
)
df6
## # A tibble: 1 x 2
## meanbmi sdbmi
## <dbl> <dbl>
## 1 32.0 4.79
df5 <- df5 %>%
mutate(meanbmi = df6$meanbmi) %>%
mutate(sdbmi = df6$sdbmi) %>%
mutate(standardizedbmi = (bmi-meanbmi)/sdbmi)
head(df5)
## # A tibble: 6 x 8
## DisplayName Height PlayerWeight PossessionTeam bmi meanbmi sdbmi
## <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Matt Ryan 76 217 ATL 26.4 32.0 4.79
## 2 Andy Levit~ 74 303 ATL 38.9 32.0 4.79
## 3 Alex Mack 76 311 ATL 37.9 32.0 4.79
## 4 Brandon Fu~ 76 306 ATL 37.2 32.0 4.79
## 5 Julio Jones 75 220 ATL 27.5 32.0 4.79
## 6 Logan Paul~ 77 268 ATL 31.8 32.0 4.79
## # ... with 1 more variable: standardizedbmi <dbl>
p <- ggplot(df5, aes(x=standardizedbmi, y=PlayerWeight)) +
geom_point(alpha=0.75, size=3) +
stat_smooth(data=within(df5, PossessionTeam <- NULL), color="grey", size=.5,
method="lm", formula = y ~ poly(x, 2), se=FALSE) +
stat_smooth(size=1.5, method="lm", formula = y ~ poly(x, 2), se=FALSE) +
scale_color_identity() +
scale_x_continuous(name="Standardized BMI\n(# of standard deviations from mean)",
breaks=c(-2, 0, 2), limit=c(-2.5, 2.5), labels=c("-2", "0", "+2")) +
scale_y_continuous(name="Player Height") +
facet_wrap(~PossessionTeam, ncol=5, scales="free_x") +
theme_fivethirtyeight()
p