Data621 - HW 1

Load Libraries

#setwd("Code/HW_1")
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

#library(forecast)
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.3     ✔ workflows    1.1.2
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.3     ✔ yardstick    1.1.0
## ✔ recipes      1.0.3     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(vip) # for variable importance

## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi

Load Data

test <- read_csv("moneyball-evaluation-data.csv")

## Rows: 259 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): INDEX, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATT...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

training <- read_csv("moneyball-training-data.csv")

## Rows: 2276 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (17): INDEX, TARGET_WINS, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Review Data

# Skim is a nice visualization tool
skimr::skim(training)

Data summary
Name	training
Number of rows	2276
Number of columns	17
_______________________
Column type frequency:
numeric	17
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
INDEX	0	1.00	1268.46	736.35	1	630.75	1270.5	1915.50	2535	▇▇▇▇▇
TARGET_WINS	0	1.00	80.79	15.75	0	71.00	82.0	92.00	146	▁▁▇▅▁
TEAM_BATTING_H	0	1.00	1469.27	144.59	891	1383.00	1454.0	1537.25	2554	▁▇▂▁▁
TEAM_BATTING_2B	0	1.00	241.25	46.80	69	208.00	238.0	273.00	458	▁▆▇▂▁
TEAM_BATTING_3B	0	1.00	55.25	27.94	0	34.00	47.0	72.00	223	▇▇▂▁▁
TEAM_BATTING_HR	0	1.00	99.61	60.55	0	42.00	102.0	147.00	264	▇▆▇▅▁
TEAM_BATTING_BB	0	1.00	501.56	122.67	0	451.00	512.0	580.00	878	▁▁▇▇▁
TEAM_BATTING_SO	102	0.96	735.61	248.53	0	548.00	750.0	930.00	1399	▁▆▇▇▁
TEAM_BASERUN_SB	131	0.94	124.76	87.79	0	66.00	101.0	156.00	697	▇▃▁▁▁
TEAM_BASERUN_CS	772	0.66	52.80	22.96	0	38.00	49.0	62.00	201	▃▇▁▁▁
TEAM_BATTING_HBP	2085	0.08	59.36	12.97	29	50.50	58.0	67.00	95	▂▇▇▅▁
TEAM_PITCHING_H	0	1.00	1779.21	1406.84	1137	1419.00	1518.0	1682.50	30132	▇▁▁▁▁
TEAM_PITCHING_HR	0	1.00	105.70	61.30	0	50.00	107.0	150.00	343	▇▇▆▁▁
TEAM_PITCHING_BB	0	1.00	553.01	166.36	0	476.00	536.5	611.00	3645	▇▁▁▁▁
TEAM_PITCHING_SO	102	0.96	817.73	553.09	0	615.00	813.5	968.00	19278	▇▁▁▁▁
TEAM_FIELDING_E	0	1.00	246.48	227.77	65	127.00	159.0	249.25	1898	▇▁▁▁▁
TEAM_FIELDING_DP	286	0.87	146.39	26.23	52	131.00	149.0	164.00	228	▁▂▇▆▁

skimr::skim(test)

Data summary
Name	test
Number of rows	259
Number of columns	16
_______________________
Column type frequency:
numeric	16
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
INDEX	0	1.00	1263.93	693.29	9	708.0	1249.0	1832.50	2525	▆▆▆▇▅
TEAM_BATTING_H	0	1.00	1469.39	150.66	819	1387.0	1455.0	1548.00	2170	▁▂▇▁▁
TEAM_BATTING_2B	0	1.00	241.32	49.52	44	210.0	239.0	278.50	376	▁▂▇▇▂
TEAM_BATTING_3B	0	1.00	55.91	27.14	14	35.0	52.0	72.00	155	▇▇▃▁▁
TEAM_BATTING_HR	0	1.00	95.63	56.33	0	44.5	101.0	135.50	242	▆▅▇▃▁
TEAM_BATTING_BB	0	1.00	498.96	120.59	15	436.5	509.0	565.50	792	▁▁▅▇▁
TEAM_BATTING_SO	18	0.93	709.34	243.11	0	545.0	686.0	912.00	1268	▁▃▇▇▂
TEAM_BASERUN_SB	13	0.95	123.70	93.39	0	59.0	92.0	151.75	580	▇▃▁▁▁
TEAM_BASERUN_CS	87	0.66	52.32	23.10	0	38.0	49.5	63.00	154	▂▇▃▁▁
TEAM_BATTING_HBP	240	0.07	62.37	12.71	42	53.5	62.0	67.50	96	▃▇▅▁▁
TEAM_PITCHING_H	0	1.00	1813.46	1662.91	1155	1426.5	1515.0	1681.00	22768	▇▁▁▁▁
TEAM_PITCHING_HR	0	1.00	102.15	57.65	0	52.0	104.0	142.50	336	▇▇▆▁▁
TEAM_PITCHING_BB	0	1.00	552.42	172.95	136	471.0	526.0	606.50	2008	▆▇▁▁▁
TEAM_PITCHING_SO	18	0.93	799.67	634.31	0	613.0	745.0	938.00	9963	▇▁▁▁▁
TEAM_FIELDING_E	0	1.00	249.75	230.90	73	131.0	163.0	252.00	1568	▇▁▁▁▁
TEAM_FIELDING_DP	31	0.88	146.06	25.88	69	131.0	148.0	164.00	204	▁▂▇▇▂

# Lets plot the data (BOxplot)
training_long <- training %>%
  select(-INDEX) %>%
  melt()

## No id variables; using all as measure variables

training_long %>%
  filter(complete.cases(.)) %>%
  ggplot(aes(x= variable, y=value)) +
  geom_boxplot(fill="#FF5733") +
  scale_y_log10(labels = label_comma()) +
  coord_flip() +
  theme_minimal() +
  labs(y="Statistic's Value", x="Statistic",
       title="Distribution of Important Baseball Statistics")

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

# Another way to see the Boxplots
p <- ggplot(training_long, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")

## Warning: Removed 3478 rows containing non-finite values (`stat_boxplot()`).

# Plot densities
p <- ggplot(training_long, aes(value)) 
p + geom_density() + facet_wrap(~variable, scale="free")

## Warning: Removed 3478 rows containing non-finite values (`stat_density()`).

#res <- cor(training, use="pairwise.complete.obs")
#round(res, 2)

#don't run. doesn't look good
#ggpairs(training, title="correlogram with ggpairs()")

Feature engineering and Data cleaning

#Replace NA with median of Column

training <- training %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))

test <- test %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))

# Let's check if any NA left
training %>% 
  summarise_all(~sum(is.na(.))) %>%
  t()

##                  [,1]
## INDEX               0
## TARGET_WINS         0
## TEAM_BATTING_H      0
## TEAM_BATTING_2B     0
## TEAM_BATTING_3B     0
## TEAM_BATTING_HR     0
## TEAM_BATTING_BB     0
## TEAM_BATTING_SO     0
## TEAM_BASERUN_SB     0
## TEAM_BASERUN_CS     0
## TEAM_BATTING_HBP    0
## TEAM_PITCHING_H     0
## TEAM_PITCHING_HR    0
## TEAM_PITCHING_BB    0
## TEAM_PITCHING_SO    0
## TEAM_FIELDING_E     0
## TEAM_FIELDING_DP    0

# ANother way to check NA's
colSums(is.na(training))

##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##                0                0                0                0 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##                0                0                0                0 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##                0                0                0                0 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##                0                0                0                0 
## TEAM_FIELDING_DP 
##                0

# One final way to seel all NA's
test %>% 
  summarise_all(~sum(is.na(.))) %>%
  t()

##                  [,1]
## INDEX               0
## TEAM_BATTING_H      0
## TEAM_BATTING_2B     0
## TEAM_BATTING_3B     0
## TEAM_BATTING_HR     0
## TEAM_BATTING_BB     0
## TEAM_BATTING_SO     0
## TEAM_BASERUN_SB     0
## TEAM_BASERUN_CS     0
## TEAM_BATTING_HBP    0
## TEAM_PITCHING_H     0
## TEAM_PITCHING_HR    0
## TEAM_PITCHING_BB    0
## TEAM_PITCHING_SO    0
## TEAM_FIELDING_E     0
## TEAM_FIELDING_DP    0

#lets remove INDEX. Not needed
training <- training %>%
  dplyr::select(-INDEX)

test <- test %>%
  dplyr::select(-INDEX)

Modelling

# Create a split object TEST and TRAINING
set.seed(42)
my_split <- initial_split(training, prop = 0.9)

# Build training data set
my_training <- my_split %>% training()

# Build testing data set
my_test <- my_split %>% testing()

# Tidymodels recipe to preprocess data
recipe1 <- recipe(TARGET_WINS ~ ., data = my_training) %>% 
                      step_BoxCox(all_numeric(), -all_outcomes()) %>% 
                      step_normalize(all_numeric(), -all_outcomes())

# Tidymodels define regression model
lm_model <- linear_reg() %>% 
            set_engine('lm') %>% 
            set_mode('regression')

# Define a Tidymodels WORKFLOW
workflow1 <- workflow() %>% 
                        add_model(lm_model) %>% 
                        add_recipe(recipe1)

my_fit <- workflow1 %>% 
                   last_fit(my_split)

## ! train/test split: preprocessor 1/1: Non-positive values in selected variable., No Box-Cox transformation cou...

# Obtain performance metrics on test data
my_fit %>% collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      13.2   Preprocessor1_Model1
## 2 rsq     standard       0.245 Preprocessor1_Model1

# Predict on Test DATA
my_fit$.workflow[[1]] %>%
  predict(test)

## # A tibble: 259 × 1
##    .pred
##    <dbl>
##  1  62.4
##  2  65.6
##  3  75.3
##  4  82.8
##  5  66.3
##  6  67.1
##  7  78.2
##  8  74.0
##  9  70.8
## 10  74.1
## # … with 249 more rows

Analysis of Results

# Obtain test set predictions data frame
results <- my_fit %>% 
                 collect_predictions()
# View results
results

## # A tibble: 228 × 5
##    id               .pred  .row TARGET_WINS .config             
##    <chr>            <dbl> <int>       <dbl> <chr>               
##  1 train/test split  67.4     5          82 Preprocessor1_Model1
##  2 train/test split  65.9     7          80 Preprocessor1_Model1
##  3 train/test split  82.6    38          82 Preprocessor1_Model1
##  4 train/test split  88.5    46          85 Preprocessor1_Model1
##  5 train/test split  71.0    60         107 Preprocessor1_Model1
##  6 train/test split  58.9    80          53 Preprocessor1_Model1
##  7 train/test split  70.4    81          63 Preprocessor1_Model1
##  8 train/test split  76.7    96          57 Preprocessor1_Model1
##  9 train/test split  76.6   104          67 Preprocessor1_Model1
## 10 train/test split  81.3   117          86 Preprocessor1_Model1
## # … with 218 more rows

# Plot results
ggplot(data = results,
       mapping = aes(x = .pred, y = TARGET_WINS)) +
  geom_point(color = '#006EA1', alpha = 0.25) +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +
  labs(title = 'Linear Regression Results - Training/Test Set',
       x = 'Predicted Wind',
       y = 'Actual Wins')

# Lets prepare data for further analysis of variable importance
training_baked <- recipe1 %>% 
                        prep() %>% 
                        bake(new_data = my_training)

## Warning: Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.

## Warning: No Box-Cox transformation could be estimated for: `TEAM_BATTING_3B`,
## `TEAM_BATTING_HR`, `TEAM_BATTING_BB`, `TEAM_BATTING_SO`, `TEAM_BASERUN_SB`,
## `TEAM_BASERUN_CS`, `TEAM_PITCHING_HR`, `TEAM_PITCHING_BB`, `TEAM_PITCHING_SO`

# View results
training_baked

## # A tibble: 2,048 × 16
##    TEAM_BATTI…¹ TEAM_…² TEAM_…³ TEAM_…⁴ TEAM_B…⁵ TEAM_…⁶ TEAM_…⁷ TEAM_…⁸ TEAM_…⁹
##           <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1       -0.707 -1.08    -0.934   0.137  0.671    0.904   0.0391   0.853 -0.0134
##  2        1.51  -0.0587   1.49   -1.13   0.00740 -1.69    2.42    -0.143 -0.0134
##  3       -2.56  -1.84    -0.392  -1.48  -1.28     0.0558  0.265   -0.143 -0.0134
##  4        0.157  1.01    -0.934   1.93   0.444    0.884  -0.567   -1.09  -2.56  
##  5        0.128 -0.296   -0.536   0.268 -0.437    0.773  -0.234    0.119 -0.0134
##  6       -1.04  -1.03    -0.825  -0.357 -0.0654   0.634   0.586    1.32  -0.0134
##  7        0.193  0.445   -1.01    0.351 -0.0573   0.466  -0.472    0.381 -0.0134
##  8        0.997  0.872    0.366   0.433  0.0559  -0.371  -0.234   -0.143 -0.0134
##  9       -0.604  0.0900  -0.536   1.08   0.865    0.933  -0.900   -1.45  -0.0134
## 10       -0.884 -0.692   -0.645  -1.10  -0.356   -0.858   0.0629  -0.143 -0.0134
## # … with 2,038 more rows, 7 more variables: TEAM_PITCHING_H <dbl>,
## #   TEAM_PITCHING_HR <dbl>, TEAM_PITCHING_BB <dbl>, TEAM_PITCHING_SO <dbl>,
## #   TEAM_FIELDING_E <dbl>, TEAM_FIELDING_DP <dbl>, TARGET_WINS <dbl>, and
## #   abbreviated variable names ¹TEAM_BATTING_H, ²TEAM_BATTING_2B,
## #   ³TEAM_BATTING_3B, ⁴TEAM_BATTING_HR, ⁵TEAM_BATTING_BB, ⁶TEAM_BATTING_SO,
## #   ⁷TEAM_BASERUN_SB, ⁸TEAM_BASERUN_CS, ⁹TEAM_BATTING_HBP

# Lets fit again with prepared data
lm_fit <- lm_model %>% 
                fit(TARGET_WINS ~ ., data = training_baked)

# Let view variable importance on our prediction model
vip(lm_fit)