Data621 - HW 1

Load Libraries

#setwd("Code/HW_1")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
#library(forecast)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.3     ✔ workflows    1.1.2
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.3     ✔ yardstick    1.1.0
## ✔ recipes      1.0.3     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(vip) # for variable importance
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

Load Data

test <- read_csv("moneyball-evaluation-data.csv")
## Rows: 259 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): INDEX, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATT...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
training <- read_csv("moneyball-training-data.csv")
## Rows: 2276 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (17): INDEX, TARGET_WINS, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Review Data

# Skim is a nice visualization tool
skimr::skim(training)
Data summary
Name training
Number of rows 2276
Number of columns 17
_______________________
Column type frequency:
numeric 17
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INDEX 0 1.00 1268.46 736.35 1 630.75 1270.5 1915.50 2535 ▇▇▇▇▇
TARGET_WINS 0 1.00 80.79 15.75 0 71.00 82.0 92.00 146 ▁▁▇▅▁
TEAM_BATTING_H 0 1.00 1469.27 144.59 891 1383.00 1454.0 1537.25 2554 ▁▇▂▁▁
TEAM_BATTING_2B 0 1.00 241.25 46.80 69 208.00 238.0 273.00 458 ▁▆▇▂▁
TEAM_BATTING_3B 0 1.00 55.25 27.94 0 34.00 47.0 72.00 223 ▇▇▂▁▁
TEAM_BATTING_HR 0 1.00 99.61 60.55 0 42.00 102.0 147.00 264 ▇▆▇▅▁
TEAM_BATTING_BB 0 1.00 501.56 122.67 0 451.00 512.0 580.00 878 ▁▁▇▇▁
TEAM_BATTING_SO 102 0.96 735.61 248.53 0 548.00 750.0 930.00 1399 ▁▆▇▇▁
TEAM_BASERUN_SB 131 0.94 124.76 87.79 0 66.00 101.0 156.00 697 ▇▃▁▁▁
TEAM_BASERUN_CS 772 0.66 52.80 22.96 0 38.00 49.0 62.00 201 ▃▇▁▁▁
TEAM_BATTING_HBP 2085 0.08 59.36 12.97 29 50.50 58.0 67.00 95 ▂▇▇▅▁
TEAM_PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.00 1518.0 1682.50 30132 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 105.70 61.30 0 50.00 107.0 150.00 343 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 553.01 166.36 0 476.00 536.5 611.00 3645 ▇▁▁▁▁
TEAM_PITCHING_SO 102 0.96 817.73 553.09 0 615.00 813.5 968.00 19278 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 246.48 227.77 65 127.00 159.0 249.25 1898 ▇▁▁▁▁
TEAM_FIELDING_DP 286 0.87 146.39 26.23 52 131.00 149.0 164.00 228 ▁▂▇▆▁
skimr::skim(test)
Data summary
Name test
Number of rows 259
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INDEX 0 1.00 1263.93 693.29 9 708.0 1249.0 1832.50 2525 ▆▆▆▇▅
TEAM_BATTING_H 0 1.00 1469.39 150.66 819 1387.0 1455.0 1548.00 2170 ▁▂▇▁▁
TEAM_BATTING_2B 0 1.00 241.32 49.52 44 210.0 239.0 278.50 376 ▁▂▇▇▂
TEAM_BATTING_3B 0 1.00 55.91 27.14 14 35.0 52.0 72.00 155 ▇▇▃▁▁
TEAM_BATTING_HR 0 1.00 95.63 56.33 0 44.5 101.0 135.50 242 ▆▅▇▃▁
TEAM_BATTING_BB 0 1.00 498.96 120.59 15 436.5 509.0 565.50 792 ▁▁▅▇▁
TEAM_BATTING_SO 18 0.93 709.34 243.11 0 545.0 686.0 912.00 1268 ▁▃▇▇▂
TEAM_BASERUN_SB 13 0.95 123.70 93.39 0 59.0 92.0 151.75 580 ▇▃▁▁▁
TEAM_BASERUN_CS 87 0.66 52.32 23.10 0 38.0 49.5 63.00 154 ▂▇▃▁▁
TEAM_BATTING_HBP 240 0.07 62.37 12.71 42 53.5 62.0 67.50 96 ▃▇▅▁▁
TEAM_PITCHING_H 0 1.00 1813.46 1662.91 1155 1426.5 1515.0 1681.00 22768 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 102.15 57.65 0 52.0 104.0 142.50 336 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 552.42 172.95 136 471.0 526.0 606.50 2008 ▆▇▁▁▁
TEAM_PITCHING_SO 18 0.93 799.67 634.31 0 613.0 745.0 938.00 9963 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 249.75 230.90 73 131.0 163.0 252.00 1568 ▇▁▁▁▁
TEAM_FIELDING_DP 31 0.88 146.06 25.88 69 131.0 148.0 164.00 204 ▁▂▇▇▂
# Lets plot the data (BOxplot)
training_long <- training %>%
  select(-INDEX) %>%
  melt()
## No id variables; using all as measure variables
training_long %>%
  filter(complete.cases(.)) %>%
  ggplot(aes(x= variable, y=value)) +
  geom_boxplot(fill="#FF5733") +
  scale_y_log10(labels = label_comma()) +
  coord_flip() +
  theme_minimal() +
  labs(y="Statistic's Value", x="Statistic",
       title="Distribution of Important Baseball Statistics")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

# Another way to see the Boxplots
p <- ggplot(training_long, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")
## Warning: Removed 3478 rows containing non-finite values (`stat_boxplot()`).

# Plot densities
p <- ggplot(training_long, aes(value)) 
p + geom_density() + facet_wrap(~variable, scale="free")
## Warning: Removed 3478 rows containing non-finite values (`stat_density()`).

#res <- cor(training, use="pairwise.complete.obs")
#round(res, 2)
#don't run. doesn't look good
#ggpairs(training, title="correlogram with ggpairs()") 

Feature engineering and Data cleaning

#Replace NA with median of Column

training <- training %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))

test <- test %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))
# Let's check if any NA left
training %>% 
  summarise_all(~sum(is.na(.))) %>%
  t()
##                  [,1]
## INDEX               0
## TARGET_WINS         0
## TEAM_BATTING_H      0
## TEAM_BATTING_2B     0
## TEAM_BATTING_3B     0
## TEAM_BATTING_HR     0
## TEAM_BATTING_BB     0
## TEAM_BATTING_SO     0
## TEAM_BASERUN_SB     0
## TEAM_BASERUN_CS     0
## TEAM_BATTING_HBP    0
## TEAM_PITCHING_H     0
## TEAM_PITCHING_HR    0
## TEAM_PITCHING_BB    0
## TEAM_PITCHING_SO    0
## TEAM_FIELDING_E     0
## TEAM_FIELDING_DP    0
# ANother way to check NA's
colSums(is.na(training))
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##                0                0                0                0 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##                0                0                0                0 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##                0                0                0                0 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##                0                0                0                0 
## TEAM_FIELDING_DP 
##                0
# One final way to seel all NA's
test %>% 
  summarise_all(~sum(is.na(.))) %>%
  t()
##                  [,1]
## INDEX               0
## TEAM_BATTING_H      0
## TEAM_BATTING_2B     0
## TEAM_BATTING_3B     0
## TEAM_BATTING_HR     0
## TEAM_BATTING_BB     0
## TEAM_BATTING_SO     0
## TEAM_BASERUN_SB     0
## TEAM_BASERUN_CS     0
## TEAM_BATTING_HBP    0
## TEAM_PITCHING_H     0
## TEAM_PITCHING_HR    0
## TEAM_PITCHING_BB    0
## TEAM_PITCHING_SO    0
## TEAM_FIELDING_E     0
## TEAM_FIELDING_DP    0
#lets remove INDEX. Not needed
training <- training %>%
  dplyr::select(-INDEX)

test <- test %>%
  dplyr::select(-INDEX)

Modelling

# Create a split object TEST and TRAINING
set.seed(42)
my_split <- initial_split(training, prop = 0.9)

# Build training data set
my_training <- my_split %>% training()

# Build testing data set
my_test <- my_split %>% testing()
# Tidymodels recipe to preprocess data
recipe1 <- recipe(TARGET_WINS ~ ., data = my_training) %>% 
                      step_BoxCox(all_numeric(), -all_outcomes()) %>% 
                      step_normalize(all_numeric(), -all_outcomes())
# Tidymodels define regression model
lm_model <- linear_reg() %>% 
            set_engine('lm') %>% 
            set_mode('regression')
# Define a Tidymodels WORKFLOW
workflow1 <- workflow() %>% 
                        add_model(lm_model) %>% 
                        add_recipe(recipe1)

my_fit <- workflow1 %>% 
                   last_fit(my_split)
## ! train/test split: preprocessor 1/1: Non-positive values in selected variable., No Box-Cox transformation cou...
# Obtain performance metrics on test data
my_fit %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      13.2   Preprocessor1_Model1
## 2 rsq     standard       0.245 Preprocessor1_Model1
# Predict on Test DATA
my_fit$.workflow[[1]] %>%
  predict(test)
## # A tibble: 259 × 1
##    .pred
##    <dbl>
##  1  62.4
##  2  65.6
##  3  75.3
##  4  82.8
##  5  66.3
##  6  67.1
##  7  78.2
##  8  74.0
##  9  70.8
## 10  74.1
## # … with 249 more rows

Analysis of Results

# Obtain test set predictions data frame
results <- my_fit %>% 
                 collect_predictions()
# View results
results
## # A tibble: 228 × 5
##    id               .pred  .row TARGET_WINS .config             
##    <chr>            <dbl> <int>       <dbl> <chr>               
##  1 train/test split  67.4     5          82 Preprocessor1_Model1
##  2 train/test split  65.9     7          80 Preprocessor1_Model1
##  3 train/test split  82.6    38          82 Preprocessor1_Model1
##  4 train/test split  88.5    46          85 Preprocessor1_Model1
##  5 train/test split  71.0    60         107 Preprocessor1_Model1
##  6 train/test split  58.9    80          53 Preprocessor1_Model1
##  7 train/test split  70.4    81          63 Preprocessor1_Model1
##  8 train/test split  76.7    96          57 Preprocessor1_Model1
##  9 train/test split  76.6   104          67 Preprocessor1_Model1
## 10 train/test split  81.3   117          86 Preprocessor1_Model1
## # … with 218 more rows
# Plot results
ggplot(data = results,
       mapping = aes(x = .pred, y = TARGET_WINS)) +
  geom_point(color = '#006EA1', alpha = 0.25) +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +
  labs(title = 'Linear Regression Results - Training/Test Set',
       x = 'Predicted Wind',
       y = 'Actual Wins')

# Lets prepare data for further analysis of variable importance
training_baked <- recipe1 %>% 
                        prep() %>% 
                        bake(new_data = my_training)
## Warning: Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Warning: No Box-Cox transformation could be estimated for: `TEAM_BATTING_3B`,
## `TEAM_BATTING_HR`, `TEAM_BATTING_BB`, `TEAM_BATTING_SO`, `TEAM_BASERUN_SB`,
## `TEAM_BASERUN_CS`, `TEAM_PITCHING_HR`, `TEAM_PITCHING_BB`, `TEAM_PITCHING_SO`
# View results
training_baked
## # A tibble: 2,048 × 16
##    TEAM_BATTI…¹ TEAM_…² TEAM_…³ TEAM_…⁴ TEAM_B…⁵ TEAM_…⁶ TEAM_…⁷ TEAM_…⁸ TEAM_…⁹
##           <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1       -0.707 -1.08    -0.934   0.137  0.671    0.904   0.0391   0.853 -0.0134
##  2        1.51  -0.0587   1.49   -1.13   0.00740 -1.69    2.42    -0.143 -0.0134
##  3       -2.56  -1.84    -0.392  -1.48  -1.28     0.0558  0.265   -0.143 -0.0134
##  4        0.157  1.01    -0.934   1.93   0.444    0.884  -0.567   -1.09  -2.56  
##  5        0.128 -0.296   -0.536   0.268 -0.437    0.773  -0.234    0.119 -0.0134
##  6       -1.04  -1.03    -0.825  -0.357 -0.0654   0.634   0.586    1.32  -0.0134
##  7        0.193  0.445   -1.01    0.351 -0.0573   0.466  -0.472    0.381 -0.0134
##  8        0.997  0.872    0.366   0.433  0.0559  -0.371  -0.234   -0.143 -0.0134
##  9       -0.604  0.0900  -0.536   1.08   0.865    0.933  -0.900   -1.45  -0.0134
## 10       -0.884 -0.692   -0.645  -1.10  -0.356   -0.858   0.0629  -0.143 -0.0134
## # … with 2,038 more rows, 7 more variables: TEAM_PITCHING_H <dbl>,
## #   TEAM_PITCHING_HR <dbl>, TEAM_PITCHING_BB <dbl>, TEAM_PITCHING_SO <dbl>,
## #   TEAM_FIELDING_E <dbl>, TEAM_FIELDING_DP <dbl>, TARGET_WINS <dbl>, and
## #   abbreviated variable names ¹​TEAM_BATTING_H, ²​TEAM_BATTING_2B,
## #   ³​TEAM_BATTING_3B, ⁴​TEAM_BATTING_HR, ⁵​TEAM_BATTING_BB, ⁶​TEAM_BATTING_SO,
## #   ⁷​TEAM_BASERUN_SB, ⁸​TEAM_BASERUN_CS, ⁹​TEAM_BATTING_HBP
# Lets fit again with prepared data
lm_fit <- lm_model %>% 
                fit(TARGET_WINS ~ ., data = training_baked)
# Let view variable importance on our prediction model
vip(lm_fit)