mkolp0-" date: “11/30/2019” output: html_document —

library(knitr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.0.1     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(formattable)
## Warning: package 'formattable' was built under R version 3.5.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.3

Read Data using readr

We use a fixed commit because this version has a data issue.

df <- read_csv("https://raw.githubusercontent.com/willoutcault/Data607-Data-Acquisition/975ba8d48e0c3d8590bd9afa79e75632152aabfc/runningbacks%20consolidated.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Team = col_character(),
##   DisplayName = col_character(),
##   GameClock = col_time(format = ""),
##   PossessionTeam = col_character(),
##   OffenseFormation = col_character(),
##   OffensePersonnel = col_character(),
##   DefensePersonnel = col_character(),
##   PlayDirection = col_character(),
##   TimeHandoff = col_datetime(format = ""),
##   TimeSnap = col_datetime(format = ""),
##   PlayerHeight = col_character(),
##   Stadium = col_character()
## )
## See spec(...) for full column specifications.

Select Factors of Interest

df2 <- select(df, "DisplayName", "PlayerHeight","PossessionTeam", "PlayerWeight")

Seperate Feet and Inches in Height Column

The height column has actually been misparsed as a date (e.g. 02-Jun for 6 feet 2 inches). we can use Lubridate to parse this out and recover, but we have to be a bit clever because the misparsing reads as Jun-00 for someone 6 feet exactly. So we replace the 00 with 13 using stringr and then take the month compoennt as feet, and the inches component we check if the day is 13 and take that as 0 otherwise we use days directly.

df2$playerHeightDate <- lubridate::parse_date_time(str_replace(df2$PlayerHeight,"00","13"), c("d-b","b-d"))
df3 <- df2
df3$feet <- lubridate::month(df2$playerHeightDate)
df3$inches <- ifelse(lubridate::day(df2$playerHeightDate)==13,0,lubridate::day(df2$playerHeightDate)) 

Create Column with Total Inches

df4 <- df3 %>% mutate(Height = (feet*12) + inches) %>%
  select("DisplayName", "Height", "PlayerWeight", "PossessionTeam")

Use BMI Forumla to Calculate Player’s BMI

df5 <- df4 %>%
  mutate(bmi = 703*PlayerWeight/(Height^2))
kable(head(df5))
DisplayName Height PlayerWeight PossessionTeam bmi
Matt Ryan 76 217 ATL 26.41118
Andy Levitre 74 303 ATL 38.89865
Alex Mack 76 311 ATL 37.85197
Brandon Fusco 76 306 ATL 37.24342
Julio Jones 75 220 ATL 27.49511
Logan Paulsen 77 268 ATL 31.77669
df5 %>% ggplot(aes(bmi)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate Standardized Player BMI

df6 <- df5 %>% 
  summarize(
    meanbmi = mean(bmi),
    sdbmi = sd(bmi)
  )

df6
## # A tibble: 1 x 2
##   meanbmi sdbmi
##     <dbl> <dbl>
## 1    32.0  4.79
df5 <- df5 %>% 
  mutate(meanbmi = df6$meanbmi) %>% 
  mutate(sdbmi = df6$sdbmi) %>% 
  mutate(standardizedbmi = (bmi-meanbmi)/sdbmi)

head(df5)
## # A tibble: 6 x 8
##   DisplayName Height PlayerWeight PossessionTeam   bmi meanbmi sdbmi
##   <chr>        <dbl>        <dbl> <chr>          <dbl>   <dbl> <dbl>
## 1 Matt Ryan       76          217 ATL             26.4    32.0  4.79
## 2 Andy Levit~     74          303 ATL             38.9    32.0  4.79
## 3 Alex Mack       76          311 ATL             37.9    32.0  4.79
## 4 Brandon Fu~     76          306 ATL             37.2    32.0  4.79
## 5 Julio Jones     75          220 ATL             27.5    32.0  4.79
## 6 Logan Paul~     77          268 ATL             31.8    32.0  4.79
## # ... with 1 more variable: standardizedbmi <dbl>

Standardized BMI by Player Weight

p <- ggplot(df5, aes(x=standardizedbmi, y=PlayerWeight)) + 
    geom_point(alpha=0.75, size=3) + 
    stat_smooth(data=within(df5, PossessionTeam <- NULL), color="grey", size=.5,
                method="lm", formula = y ~ poly(x, 2), se=FALSE) +
    stat_smooth(size=1.5, method="lm", formula = y ~ poly(x, 2), se=FALSE) +
    scale_color_identity() +
    scale_x_continuous(name="Standardized BMI\n(# of standard deviations from mean)",
                       breaks=c(-2, 0, 2), limit=c(-2.5, 2.5), labels=c("-2", "0", "+2")) +
    scale_y_continuous(name="Player Height") +
    facet_wrap(~PossessionTeam, ncol=5, scales="free_x") +
    theme_fivethirtyeight() 

p