Load Libraries
#setwd("Code/HW_1")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#library(forecast)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.1 ✔ rsample 1.1.0
## ✔ dials 1.1.0 ✔ tune 1.0.1
## ✔ infer 1.0.3 ✔ workflows 1.1.2
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.3 ✔ yardstick 1.1.0
## ✔ recipes 1.0.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(vip) # for variable importance
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
Load Data
test <- read_csv("moneyball-evaluation-data.csv")
## Rows: 259 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): INDEX, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATT...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
training <- read_csv("moneyball-training-data.csv")
## Rows: 2276 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (17): INDEX, TARGET_WINS, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Review Data
# Skim is a nice visualization tool
skimr::skim(training)
Data summary
| Name |
training |
| Number of rows |
2276 |
| Number of columns |
17 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
17 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| INDEX |
0 |
1.00 |
1268.46 |
736.35 |
1 |
630.75 |
1270.5 |
1915.50 |
2535 |
▇▇▇▇▇ |
| TARGET_WINS |
0 |
1.00 |
80.79 |
15.75 |
0 |
71.00 |
82.0 |
92.00 |
146 |
▁▁▇▅▁ |
| TEAM_BATTING_H |
0 |
1.00 |
1469.27 |
144.59 |
891 |
1383.00 |
1454.0 |
1537.25 |
2554 |
▁▇▂▁▁ |
| TEAM_BATTING_2B |
0 |
1.00 |
241.25 |
46.80 |
69 |
208.00 |
238.0 |
273.00 |
458 |
▁▆▇▂▁ |
| TEAM_BATTING_3B |
0 |
1.00 |
55.25 |
27.94 |
0 |
34.00 |
47.0 |
72.00 |
223 |
▇▇▂▁▁ |
| TEAM_BATTING_HR |
0 |
1.00 |
99.61 |
60.55 |
0 |
42.00 |
102.0 |
147.00 |
264 |
▇▆▇▅▁ |
| TEAM_BATTING_BB |
0 |
1.00 |
501.56 |
122.67 |
0 |
451.00 |
512.0 |
580.00 |
878 |
▁▁▇▇▁ |
| TEAM_BATTING_SO |
102 |
0.96 |
735.61 |
248.53 |
0 |
548.00 |
750.0 |
930.00 |
1399 |
▁▆▇▇▁ |
| TEAM_BASERUN_SB |
131 |
0.94 |
124.76 |
87.79 |
0 |
66.00 |
101.0 |
156.00 |
697 |
▇▃▁▁▁ |
| TEAM_BASERUN_CS |
772 |
0.66 |
52.80 |
22.96 |
0 |
38.00 |
49.0 |
62.00 |
201 |
▃▇▁▁▁ |
| TEAM_BATTING_HBP |
2085 |
0.08 |
59.36 |
12.97 |
29 |
50.50 |
58.0 |
67.00 |
95 |
▂▇▇▅▁ |
| TEAM_PITCHING_H |
0 |
1.00 |
1779.21 |
1406.84 |
1137 |
1419.00 |
1518.0 |
1682.50 |
30132 |
▇▁▁▁▁ |
| TEAM_PITCHING_HR |
0 |
1.00 |
105.70 |
61.30 |
0 |
50.00 |
107.0 |
150.00 |
343 |
▇▇▆▁▁ |
| TEAM_PITCHING_BB |
0 |
1.00 |
553.01 |
166.36 |
0 |
476.00 |
536.5 |
611.00 |
3645 |
▇▁▁▁▁ |
| TEAM_PITCHING_SO |
102 |
0.96 |
817.73 |
553.09 |
0 |
615.00 |
813.5 |
968.00 |
19278 |
▇▁▁▁▁ |
| TEAM_FIELDING_E |
0 |
1.00 |
246.48 |
227.77 |
65 |
127.00 |
159.0 |
249.25 |
1898 |
▇▁▁▁▁ |
| TEAM_FIELDING_DP |
286 |
0.87 |
146.39 |
26.23 |
52 |
131.00 |
149.0 |
164.00 |
228 |
▁▂▇▆▁ |
Data summary
| Name |
test |
| Number of rows |
259 |
| Number of columns |
16 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
16 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| INDEX |
0 |
1.00 |
1263.93 |
693.29 |
9 |
708.0 |
1249.0 |
1832.50 |
2525 |
▆▆▆▇▅ |
| TEAM_BATTING_H |
0 |
1.00 |
1469.39 |
150.66 |
819 |
1387.0 |
1455.0 |
1548.00 |
2170 |
▁▂▇▁▁ |
| TEAM_BATTING_2B |
0 |
1.00 |
241.32 |
49.52 |
44 |
210.0 |
239.0 |
278.50 |
376 |
▁▂▇▇▂ |
| TEAM_BATTING_3B |
0 |
1.00 |
55.91 |
27.14 |
14 |
35.0 |
52.0 |
72.00 |
155 |
▇▇▃▁▁ |
| TEAM_BATTING_HR |
0 |
1.00 |
95.63 |
56.33 |
0 |
44.5 |
101.0 |
135.50 |
242 |
▆▅▇▃▁ |
| TEAM_BATTING_BB |
0 |
1.00 |
498.96 |
120.59 |
15 |
436.5 |
509.0 |
565.50 |
792 |
▁▁▅▇▁ |
| TEAM_BATTING_SO |
18 |
0.93 |
709.34 |
243.11 |
0 |
545.0 |
686.0 |
912.00 |
1268 |
▁▃▇▇▂ |
| TEAM_BASERUN_SB |
13 |
0.95 |
123.70 |
93.39 |
0 |
59.0 |
92.0 |
151.75 |
580 |
▇▃▁▁▁ |
| TEAM_BASERUN_CS |
87 |
0.66 |
52.32 |
23.10 |
0 |
38.0 |
49.5 |
63.00 |
154 |
▂▇▃▁▁ |
| TEAM_BATTING_HBP |
240 |
0.07 |
62.37 |
12.71 |
42 |
53.5 |
62.0 |
67.50 |
96 |
▃▇▅▁▁ |
| TEAM_PITCHING_H |
0 |
1.00 |
1813.46 |
1662.91 |
1155 |
1426.5 |
1515.0 |
1681.00 |
22768 |
▇▁▁▁▁ |
| TEAM_PITCHING_HR |
0 |
1.00 |
102.15 |
57.65 |
0 |
52.0 |
104.0 |
142.50 |
336 |
▇▇▆▁▁ |
| TEAM_PITCHING_BB |
0 |
1.00 |
552.42 |
172.95 |
136 |
471.0 |
526.0 |
606.50 |
2008 |
▆▇▁▁▁ |
| TEAM_PITCHING_SO |
18 |
0.93 |
799.67 |
634.31 |
0 |
613.0 |
745.0 |
938.00 |
9963 |
▇▁▁▁▁ |
| TEAM_FIELDING_E |
0 |
1.00 |
249.75 |
230.90 |
73 |
131.0 |
163.0 |
252.00 |
1568 |
▇▁▁▁▁ |
| TEAM_FIELDING_DP |
31 |
0.88 |
146.06 |
25.88 |
69 |
131.0 |
148.0 |
164.00 |
204 |
▁▂▇▇▂ |
# Lets plot the data (BOxplot)
training_long <- training %>%
select(-INDEX) %>%
melt()
## No id variables; using all as measure variables
training_long %>%
filter(complete.cases(.)) %>%
ggplot(aes(x= variable, y=value)) +
geom_boxplot(fill="#FF5733") +
scale_y_log10(labels = label_comma()) +
coord_flip() +
theme_minimal() +
labs(y="Statistic's Value", x="Statistic",
title="Distribution of Important Baseball Statistics")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

# Another way to see the Boxplots
p <- ggplot(training_long, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")
## Warning: Removed 3478 rows containing non-finite values (`stat_boxplot()`).

# Plot densities
p <- ggplot(training_long, aes(value))
p + geom_density() + facet_wrap(~variable, scale="free")
## Warning: Removed 3478 rows containing non-finite values (`stat_density()`).

#res <- cor(training, use="pairwise.complete.obs")
#round(res, 2)
#don't run. doesn't look good
#ggpairs(training, title="correlogram with ggpairs()")
Feature engineering and Data cleaning
#Replace NA with median of Column
training <- training %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))
test <- test %>% mutate_all(~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x))
# Let's check if any NA left
training %>%
summarise_all(~sum(is.na(.))) %>%
t()
## [,1]
## INDEX 0
## TARGET_WINS 0
## TEAM_BATTING_H 0
## TEAM_BATTING_2B 0
## TEAM_BATTING_3B 0
## TEAM_BATTING_HR 0
## TEAM_BATTING_BB 0
## TEAM_BATTING_SO 0
## TEAM_BASERUN_SB 0
## TEAM_BASERUN_CS 0
## TEAM_BATTING_HBP 0
## TEAM_PITCHING_H 0
## TEAM_PITCHING_HR 0
## TEAM_PITCHING_BB 0
## TEAM_PITCHING_SO 0
## TEAM_FIELDING_E 0
## TEAM_FIELDING_DP 0
# ANother way to check NA's
colSums(is.na(training))
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0 0 0 0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0 0 0 0
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 0 0 0 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0 0 0 0
## TEAM_FIELDING_DP
## 0
# One final way to seel all NA's
test %>%
summarise_all(~sum(is.na(.))) %>%
t()
## [,1]
## INDEX 0
## TEAM_BATTING_H 0
## TEAM_BATTING_2B 0
## TEAM_BATTING_3B 0
## TEAM_BATTING_HR 0
## TEAM_BATTING_BB 0
## TEAM_BATTING_SO 0
## TEAM_BASERUN_SB 0
## TEAM_BASERUN_CS 0
## TEAM_BATTING_HBP 0
## TEAM_PITCHING_H 0
## TEAM_PITCHING_HR 0
## TEAM_PITCHING_BB 0
## TEAM_PITCHING_SO 0
## TEAM_FIELDING_E 0
## TEAM_FIELDING_DP 0
#lets remove INDEX. Not needed
training <- training %>%
dplyr::select(-INDEX)
test <- test %>%
dplyr::select(-INDEX)
Modelling
# Create a split object TEST and TRAINING
set.seed(42)
my_split <- initial_split(training, prop = 0.9)
# Build training data set
my_training <- my_split %>% training()
# Build testing data set
my_test <- my_split %>% testing()
# Tidymodels recipe to preprocess data
recipe1 <- recipe(TARGET_WINS ~ ., data = my_training) %>%
step_BoxCox(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric(), -all_outcomes())
# Tidymodels define regression model
lm_model <- linear_reg() %>%
set_engine('lm') %>%
set_mode('regression')
# Define a Tidymodels WORKFLOW
workflow1 <- workflow() %>%
add_model(lm_model) %>%
add_recipe(recipe1)
my_fit <- workflow1 %>%
last_fit(my_split)
## ! train/test split: preprocessor 1/1: Non-positive values in selected variable., No Box-Cox transformation cou...
# Obtain performance metrics on test data
my_fit %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 13.2 Preprocessor1_Model1
## 2 rsq standard 0.245 Preprocessor1_Model1
# Predict on Test DATA
my_fit$.workflow[[1]] %>%
predict(test)
## # A tibble: 259 × 1
## .pred
## <dbl>
## 1 62.4
## 2 65.6
## 3 75.3
## 4 82.8
## 5 66.3
## 6 67.1
## 7 78.2
## 8 74.0
## 9 70.8
## 10 74.1
## # … with 249 more rows
Analysis of Results
# Obtain test set predictions data frame
results <- my_fit %>%
collect_predictions()
# View results
results
## # A tibble: 228 × 5
## id .pred .row TARGET_WINS .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 67.4 5 82 Preprocessor1_Model1
## 2 train/test split 65.9 7 80 Preprocessor1_Model1
## 3 train/test split 82.6 38 82 Preprocessor1_Model1
## 4 train/test split 88.5 46 85 Preprocessor1_Model1
## 5 train/test split 71.0 60 107 Preprocessor1_Model1
## 6 train/test split 58.9 80 53 Preprocessor1_Model1
## 7 train/test split 70.4 81 63 Preprocessor1_Model1
## 8 train/test split 76.7 96 57 Preprocessor1_Model1
## 9 train/test split 76.6 104 67 Preprocessor1_Model1
## 10 train/test split 81.3 117 86 Preprocessor1_Model1
## # … with 218 more rows
# Plot results
ggplot(data = results,
mapping = aes(x = .pred, y = TARGET_WINS)) +
geom_point(color = '#006EA1', alpha = 0.25) +
geom_abline(intercept = 0, slope = 1, color = 'orange') +
labs(title = 'Linear Regression Results - Training/Test Set',
x = 'Predicted Wind',
y = 'Actual Wins')

# Lets prepare data for further analysis of variable importance
training_baked <- recipe1 %>%
prep() %>%
bake(new_data = my_training)
## Warning: Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Non-positive values in selected variable.
## Warning: No Box-Cox transformation could be estimated for: `TEAM_BATTING_3B`,
## `TEAM_BATTING_HR`, `TEAM_BATTING_BB`, `TEAM_BATTING_SO`, `TEAM_BASERUN_SB`,
## `TEAM_BASERUN_CS`, `TEAM_PITCHING_HR`, `TEAM_PITCHING_BB`, `TEAM_PITCHING_SO`
# View results
training_baked
## # A tibble: 2,048 × 16
## TEAM_BATTI…¹ TEAM_…² TEAM_…³ TEAM_…⁴ TEAM_B…⁵ TEAM_…⁶ TEAM_…⁷ TEAM_…⁸ TEAM_…⁹
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.707 -1.08 -0.934 0.137 0.671 0.904 0.0391 0.853 -0.0134
## 2 1.51 -0.0587 1.49 -1.13 0.00740 -1.69 2.42 -0.143 -0.0134
## 3 -2.56 -1.84 -0.392 -1.48 -1.28 0.0558 0.265 -0.143 -0.0134
## 4 0.157 1.01 -0.934 1.93 0.444 0.884 -0.567 -1.09 -2.56
## 5 0.128 -0.296 -0.536 0.268 -0.437 0.773 -0.234 0.119 -0.0134
## 6 -1.04 -1.03 -0.825 -0.357 -0.0654 0.634 0.586 1.32 -0.0134
## 7 0.193 0.445 -1.01 0.351 -0.0573 0.466 -0.472 0.381 -0.0134
## 8 0.997 0.872 0.366 0.433 0.0559 -0.371 -0.234 -0.143 -0.0134
## 9 -0.604 0.0900 -0.536 1.08 0.865 0.933 -0.900 -1.45 -0.0134
## 10 -0.884 -0.692 -0.645 -1.10 -0.356 -0.858 0.0629 -0.143 -0.0134
## # … with 2,038 more rows, 7 more variables: TEAM_PITCHING_H <dbl>,
## # TEAM_PITCHING_HR <dbl>, TEAM_PITCHING_BB <dbl>, TEAM_PITCHING_SO <dbl>,
## # TEAM_FIELDING_E <dbl>, TEAM_FIELDING_DP <dbl>, TARGET_WINS <dbl>, and
## # abbreviated variable names ¹TEAM_BATTING_H, ²TEAM_BATTING_2B,
## # ³TEAM_BATTING_3B, ⁴TEAM_BATTING_HR, ⁵TEAM_BATTING_BB, ⁶TEAM_BATTING_SO,
## # ⁷TEAM_BASERUN_SB, ⁸TEAM_BASERUN_CS, ⁹TEAM_BATTING_HBP
# Lets fit again with prepared data
lm_fit <- lm_model %>%
fit(TARGET_WINS ~ ., data = training_baked)
# Let view variable importance on our prediction model
vip(lm_fit)
