library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the training data and drop its INDEX column.
df_train <- read.csv("moneyball-training-data.csv")
df_train <- df_train[, !names(df_train) %in% c("INDEX")]
head(df_train, 3)
# Load the evaluation data and drop its INDEX column. The subset has to be
# taken from df_eval itself: indexing df_train with df_eval's column mask
# would silently replace the evaluation set with training rows.
df_eval <- read.csv("moneyball-evaluation-data.csv")
df_eval <- df_eval[, !names(df_eval) %in% c("INDEX")]
head(df_eval, 3)
a. Summary of both datasets: the training dataset and the evaluation dataset.
# Quartiles, mean and range of the response variable.
summary(df_train[["TARGET_WINS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 71.00 82.00 80.79 92.00 146.00
Here, it can be observed that TARGET_WINS has a median of 82 and a mean of 80.79.
The maximum number of wins is 146. The interquartile range is 71–92, i.e., 21.
# Per-column descriptive statistics, ignoring missing values. vapply() visits
# the data frame one column at a time (no coercion of the whole frame to a
# matrix, as apply() would do) and enforces that every column yields exactly
# five numeric values.
summary_stats <- vapply(
  df_train,
  function(x) {
    c(
      min = min(x, na.rm = TRUE),
      median = median(x, na.rm = TRUE),
      mean = mean(x, na.rm = TRUE),
      max = max(x, na.rm = TRUE),
      standard_deviation = sd(x, na.rm = TRUE)
    )
  },
  numeric(5)
)
# Transpose so that each variable is a row and each statistic is a column.
summary_df <- t(summary_stats)
colnames(summary_df) <- c(
  "Minimum", "Median", "Mean", "Maximum", "Standard Deviation"
)
summary_df
## Minimum Median Mean Maximum Standard Deviation
## TARGET_WINS 0 82.0 80.79086 146 15.75215
## TEAM_BATTING_H 891 1454.0 1469.26977 2554 144.59120
## TEAM_BATTING_2B 69 238.0 241.24692 458 46.80141
## TEAM_BATTING_3B 0 47.0 55.25000 223 27.93856
## TEAM_BATTING_HR 0 102.0 99.61204 264 60.54687
## TEAM_BATTING_BB 0 512.0 501.55888 878 122.67086
## TEAM_BATTING_SO 0 750.0 735.60534 1399 248.52642
## TEAM_BASERUN_SB 0 101.0 124.76177 697 87.79117
## TEAM_BASERUN_CS 0 49.0 52.80386 201 22.95634
## TEAM_BATTING_HBP 29 58.0 59.35602 95 12.96712
## TEAM_PITCHING_H 1137 1518.0 1779.21046 30132 1406.84293
## TEAM_PITCHING_HR 0 107.0 105.69859 343 61.29875
## TEAM_PITCHING_BB 0 536.5 553.00791 3645 166.35736
## TEAM_PITCHING_SO 0 813.5 817.73045 19278 553.08503
## TEAM_FIELDING_E 65 159.0 246.48067 1898 227.77097
## TEAM_FIELDING_DP 52 149.0 146.38794 228 26.22639
b. Distribution plot of the variable ‘TARGET_WINS’.
# Bar chart with one bar per distinct TARGET_WINS value.
ggplot(df_train, mapping = aes(x = TARGET_WINS)) +
  geom_bar(colour = "blue", fill = "lightgreen") +
  labs(
    title = "Bar plot of the dependent variable TARGET_WINS",
    x = "Dependent variable TARGET_WINS"
  )
It can be seen that the distribution seems almost normal. Most of the data is around the mean.
# Reshape to long format: one row per (Variable, Value) pair, so that every
# variable can be drawn in its own facet. pivot_longer() replaces the
# superseded gather().
df_long <- pivot_longer(
  df_train,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# One panel per variable with independent axes. Two layers are drawn on top
# of each other: a histogram with bins of width 10 and a bar layer counting
# each distinct value.
ggplot(df_long, aes(x = Value)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  geom_bar(
    fill = "skyblue", color = "black",
    stat = "count", position = "identity"
  ) +
  facet_wrap(~ Variable, scales = "free") +
  labs(title = "Histograms and Bar plots for all variables") +
  theme_minimal()
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_count()`).
# Column-wise summary (quartiles, mean, NA counts) of df_eval.
df_eval |> summary()
## TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## Min. : 891 Min. : 69.0 Min. : 0.00 Min. : 0.00
## 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00 1st Qu.: 42.00
## Median :1454 Median :238.0 Median : 47.00 Median :102.00
## Mean :1469 Mean :241.2 Mean : 55.25 Mean : 99.61
## 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00 3rd Qu.:147.00
## Max. :2554 Max. :458.0 Max. :223.00 Max. :264.00
##
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0 1st Qu.: 38.0
## Median :512.0 Median : 750.0 Median :101.0 Median : 49.0
## Mean :501.6 Mean : 735.6 Mean :124.8 Mean : 52.8
## 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0 3rd Qu.: 62.0
## Max. :878.0 Max. :1399.0 Max. :697.0 Max. :201.0
## NA's :102 NA's :131 NA's :772
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## Min. :29.00 Min. : 1137 Min. : 0.0 Min. : 0.0
## 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0 1st Qu.: 476.0
## Median :58.00 Median : 1518 Median :107.0 Median : 536.5
## Mean :59.36 Mean : 1779 Mean :105.7 Mean : 553.0
## 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0 3rd Qu.: 611.0
## Max. :95.00 Max. :30132 Max. :343.0 Max. :3645.0
## NA's :2085
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
## Median : 813.5 Median : 159.0 Median :149.0
## Mean : 817.7 Mean : 246.5 Mean :146.4
## 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
## Max. :19278.0 Max. :1898.0 Max. :228.0
## NA's :102 NA's :286
It can be seen that there are missing values in the training data. Since the number of missing values is large, it would not be wise to drop all of the affected rows. Instead, the missing values can be filled with the mode, median or mean, depending on the nature of each feature.
c. Check if there is any correlation among the target and features.
# Load necessary libraries
library(corrplot)
## corrplot 0.92 loaded
# Keep every column whose name is not INDEX.
selected_vars <- df_train[, names(df_train) != "INDEX"]
# Each coefficient is computed from the rows where both variables of the
# pair are observed ("pairwise.complete.obs").
correlation_matrix <- cor(selected_vars, use = "pairwise.complete.obs")
# Show the full matrix.
print(correlation_matrix)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## TARGET_WINS 1.00000000 0.388767521 0.28910365 0.142608411
## TEAM_BATTING_H 0.38876752 1.000000000 0.56284968 0.427696575
## TEAM_BATTING_2B 0.28910365 0.562849678 1.00000000 -0.107305824
## TEAM_BATTING_3B 0.14260841 0.427696575 -0.10730582 1.000000000
## TEAM_BATTING_HR 0.17615320 -0.006544685 0.43539729 -0.635566946
## TEAM_BATTING_BB 0.23255986 -0.072464013 0.25572610 -0.287235841
## TEAM_BATTING_SO -0.03175071 -0.463853571 0.16268519 -0.669781188
## TEAM_BASERUN_SB 0.13513892 0.123567797 -0.19975724 0.533506448
## TEAM_BASERUN_CS 0.02240407 0.016705668 -0.09981406 0.348764919
## TEAM_BATTING_HBP 0.07350424 -0.029112176 0.04608475 -0.174247154
## TEAM_PITCHING_H -0.10993705 0.302693709 0.02369219 0.194879411
## TEAM_PITCHING_HR 0.18901373 0.072853119 0.45455082 -0.567836679
## TEAM_PITCHING_BB 0.12417454 0.094193027 0.17805420 -0.002224148
## TEAM_PITCHING_SO -0.07843609 -0.252656790 0.06479231 -0.258818931
## TEAM_FIELDING_E -0.17648476 0.264902478 -0.23515099 0.509778447
## TEAM_FIELDING_DP -0.03485058 0.155383321 0.29087998 -0.323074847
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## TARGET_WINS 0.176153200 0.23255986 -0.03175071
## TEAM_BATTING_H -0.006544685 -0.07246401 -0.46385357
## TEAM_BATTING_2B 0.435397293 0.25572610 0.16268519
## TEAM_BATTING_3B -0.635566946 -0.28723584 -0.66978119
## TEAM_BATTING_HR 1.000000000 0.51373481 0.72706935
## TEAM_BATTING_BB 0.513734810 1.00000000 0.37975087
## TEAM_BATTING_SO 0.727069348 0.37975087 1.00000000
## TEAM_BASERUN_SB -0.453578426 -0.10511564 -0.25448923
## TEAM_BASERUN_CS -0.433793868 -0.13698837 -0.21788137
## TEAM_BATTING_HBP 0.106181160 0.04746007 0.22094219
## TEAM_PITCHING_H -0.250145481 -0.44977762 -0.37568637
## TEAM_PITCHING_HR 0.969371396 0.45955207 0.66717889
## TEAM_PITCHING_BB 0.136927564 0.48936126 0.03700514
## TEAM_PITCHING_SO 0.184707564 -0.02075682 0.41623330
## TEAM_FIELDING_E -0.587339098 -0.65597081 -0.58466444
## TEAM_FIELDING_DP 0.448985348 0.43087675 0.15488939
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP
## TARGET_WINS 0.13513892 0.02240407 0.07350424
## TEAM_BATTING_H 0.12356780 0.01670567 -0.02911218
## TEAM_BATTING_2B -0.19975724 -0.09981406 0.04608475
## TEAM_BATTING_3B 0.53350645 0.34876492 -0.17424715
## TEAM_BATTING_HR -0.45357843 -0.43379387 0.10618116
## TEAM_BATTING_BB -0.10511564 -0.13698837 0.04746007
## TEAM_BATTING_SO -0.25448923 -0.21788137 0.22094219
## TEAM_BASERUN_SB 1.00000000 0.65524480 -0.06400498
## TEAM_BASERUN_CS 0.65524480 1.00000000 -0.07051390
## TEAM_BATTING_HBP -0.06400498 -0.07051390 1.00000000
## TEAM_PITCHING_H 0.07328505 -0.05200781 -0.02769699
## TEAM_PITCHING_HR -0.41651072 -0.42256605 0.10675878
## TEAM_PITCHING_BB 0.14641513 -0.10696124 0.04785137
## TEAM_PITCHING_SO -0.13712861 -0.21022274 0.22157375
## TEAM_FIELDING_E 0.50963090 0.04832189 0.04178971
## TEAM_FIELDING_DP -0.49707763 -0.21424801 -0.07120824
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TARGET_WINS -0.10993705 0.18901373 0.124174536
## TEAM_BATTING_H 0.30269371 0.07285312 0.094193027
## TEAM_BATTING_2B 0.02369219 0.45455082 0.178054204
## TEAM_BATTING_3B 0.19487941 -0.56783668 -0.002224148
## TEAM_BATTING_HR -0.25014548 0.96937140 0.136927564
## TEAM_BATTING_BB -0.44977762 0.45955207 0.489361263
## TEAM_BATTING_SO -0.37568637 0.66717889 0.037005141
## TEAM_BASERUN_SB 0.07328505 -0.41651072 0.146415134
## TEAM_BASERUN_CS -0.05200781 -0.42256605 -0.106961236
## TEAM_BATTING_HBP -0.02769699 0.10675878 0.047851371
## TEAM_PITCHING_H 1.00000000 -0.14161276 0.320676162
## TEAM_PITCHING_HR -0.14161276 1.00000000 0.221937505
## TEAM_PITCHING_BB 0.32067616 0.22193750 1.000000000
## TEAM_PITCHING_SO 0.26724807 0.20588053 0.488498653
## TEAM_FIELDING_E 0.66775901 -0.49314447 -0.022837561
## TEAM_FIELDING_DP -0.22865059 0.43917040 0.324457226
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TARGET_WINS -0.07843609 -0.17648476 -0.03485058
## TEAM_BATTING_H -0.25265679 0.26490248 0.15538332
## TEAM_BATTING_2B 0.06479231 -0.23515099 0.29087998
## TEAM_BATTING_3B -0.25881893 0.50977845 -0.32307485
## TEAM_BATTING_HR 0.18470756 -0.58733910 0.44898535
## TEAM_BATTING_BB -0.02075682 -0.65597081 0.43087675
## TEAM_BATTING_SO 0.41623330 -0.58466444 0.15488939
## TEAM_BASERUN_SB -0.13712861 0.50963090 -0.49707763
## TEAM_BASERUN_CS -0.21022274 0.04832189 -0.21424801
## TEAM_BATTING_HBP 0.22157375 0.04178971 -0.07120824
## TEAM_PITCHING_H 0.26724807 0.66775901 -0.22865059
## TEAM_PITCHING_HR 0.20588053 -0.49314447 0.43917040
## TEAM_PITCHING_BB 0.48849865 -0.02283756 0.32445723
## TEAM_PITCHING_SO 1.00000000 -0.02329178 0.02615804
## TEAM_FIELDING_E -0.02329178 1.00000000 -0.49768495
## TEAM_FIELDING_DP 0.02615804 -0.49768495 1.00000000
# Draw the correlation matrix as circles, hiding the diagonal.
corrplot(
  correlation_matrix,
  method = "circle",
  tl.cex = 0.7,
  tl.col = "black",
  diag = FALSE
)
It can be seen that TARGET_WINS is positively correlated with “TEAM_BATTING_H”, “TEAM_BATTING_2B”, “TEAM_BATTING_BB” and “TEAM_PITCHING_HR”, and negatively correlated with “TEAM_FIELDING_E”. In addition, “TEAM_BATTING_HR” is highly correlated with “TEAM_PITCHING_HR” (r ≈ 0.97). Similarly, correlations between the other variables can be read from the matrix.
d.
# Compact overview of df_eval: dimensions, column types and first values.
df_eval |> glimpse()
## Rows: 2,276
## Columns: 15
## $ TEAM_BATTING_H <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
# Compact overview of df_train: dimensions, column types and first values.
df_train |> glimpse()
## Rows: 2,276
## Columns: 16
## $ TARGET_WINS <int> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 68, 72, 7…
## $ TEAM_BATTING_H <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
There are missing values in the training as well as the evaluation data. They need to be imputed before building any regression model. It can be seen that the missing values are in the following features:
# Report the number of missing values for every column that has at least one.
na_counts <- vapply(df_train, function(col) sum(is.na(col)), integer(1))
na_counts <- na_counts[na_counts > 0]
# Print one "<column> : <count>" line per affected column.
for (na_line in sprintf("%s : %d", names(na_counts), na_counts)) {
  print(na_line)
}
## [1] "TEAM_BATTING_SO : 102"
## [1] "TEAM_BASERUN_SB : 131"
## [1] "TEAM_BASERUN_CS : 772"
## [1] "TEAM_BATTING_HBP : 2085"
## [1] "TEAM_PITCHING_SO : 102"
## [1] "TEAM_FIELDING_DP : 286"
Imputing missing values
# Columns whose missing values are replaced by the column median.
median_impute_cols <- c(
  "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_FIELDING_DP",
  "TEAM_BASERUN_CS", "TEAM_BASERUN_SB"
)
# Fill every NA with the median of the observed values in the same column;
# non-missing entries are left untouched.
df_train <- df_train |>
  mutate(across(
    all_of(median_impute_cols),
    \(x) replace(x, is.na(x), median(x, na.rm = TRUE))
  ))
Now the missing values in these features have been addressed. The one remaining feature, which has more than 70% of its values missing, needs to be dropped. So, we have the new dataframe as
# Remove the TEAM_BATTING_HBP column from the training frame.
df_train <- select(df_train, -TEAM_BATTING_HBP)
Similarly, we have to preprocess the evaluation data set too.
# Columns whose missing values are replaced by the column median.
median_impute_cols <- c(
  "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_FIELDING_DP",
  "TEAM_BASERUN_CS", "TEAM_BASERUN_SB"
)
# Fill every NA with the median of the observed values in the same df_eval
# column, then drop TEAM_BATTING_HBP so the columns match the training frame.
# NOTE(review): the medians come from df_eval itself; consider reusing the
# training medians so both sets are imputed with the same values.
df_eval <- df_eval |>
  mutate(across(
    all_of(median_impute_cols),
    \(x) replace(x, is.na(x), median(x, na.rm = TRUE))
  )) |>
  select(-TEAM_BATTING_HBP)
Sometimes normalizing the data gives better results, so we could normalise the features. But let’s first try building the model on the data as it is.
To take all the variables, we put . after ~ in lm().
# Full model: regress TARGET_WINS on every other column of df_train.
reg_model <- lm(formula = TARGET_WINS ~ ., data = df_train)
summary(reg_model)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.753 -8.626 0.120 8.395 58.561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.6421579 5.3902272 4.386 1.21e-05 ***
## TEAM_BATTING_H 0.0489152 0.0036949 13.239 < 2e-16 ***
## TEAM_BATTING_2B -0.0209575 0.0091783 -2.283 0.022501 *
## TEAM_BATTING_3B 0.0644788 0.0168040 3.837 0.000128 ***
## TEAM_BATTING_HR 0.0527325 0.0274915 1.918 0.055219 .
## TEAM_BATTING_BB 0.0104483 0.0058377 1.790 0.073621 .
## TEAM_BATTING_SO -0.0084323 0.0025461 -3.312 0.000941 ***
## TEAM_BASERUN_SB 0.0254236 0.0043565 5.836 6.12e-09 ***
## TEAM_BASERUN_CS -0.0110027 0.0157842 -0.697 0.485829
## TEAM_PITCHING_H -0.0008456 0.0003674 -2.302 0.021444 *
## TEAM_PITCHING_HR 0.0129626 0.0243894 0.531 0.595135
## TEAM_PITCHING_BB 0.0007798 0.0041571 0.188 0.851231
## TEAM_PITCHING_SO 0.0028156 0.0009219 3.054 0.002284 **
## TEAM_FIELDING_E -0.0195325 0.0024609 -7.937 3.23e-15 ***
## TEAM_FIELDING_DP -0.1217801 0.0129421 -9.410 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2261 degrees of freedom
## Multiple R-squared: 0.3154, Adjusted R-squared: 0.3111
## F-statistic: 74.4 on 14 and 2261 DF, p-value: < 2.2e-16
R-squared is 0.31, which is not good enough for prediction. So, we will modify the model and use only those features which are highly correlated with our target variable.
Selecting the best features for our model.
# Rank the predictors by their signed correlation with TARGET_WINS and keep
# the four largest. Because the ranking is signed, predictors with a strong
# negative correlation are never selected.
target_cor <- df_train |>
  select(-TARGET_WINS) |>
  map_dbl(cor, y = df_train$TARGET_WINS)
best_pred_names <- names(head(sort(target_cor, decreasing = TRUE), 4))
best_pred <- df_train[best_pred_names]
# Fit through a formula over named columns with data = df_train. With
# lm(df_train$TARGET_WINS ~ as.matrix(best_pred)) the model terms refer to
# global objects, so predict() evaluates them from the training data no
# matter what `newdata` is supplied.
best_formula <- reformulate(best_pred_names, response = "TARGET_WINS")
reg_mod <- lm(best_formula, data = df_train)
summary(reg_mod)
##
## Call:
## lm(formula = df_train$TARGET_WINS ~ as.matrix(best_pred))
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.658 -8.769 0.469 9.074 50.673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.400162 3.381685 -0.414 0.67888
## as.matrix(best_pred)TEAM_BATTING_H 0.046376 0.002566 18.070 < 2e-16 ***
## as.matrix(best_pred)TEAM_BATTING_2B -0.014324 0.008830 -1.622 0.10489
## as.matrix(best_pred)TEAM_BATTING_BB 0.031384 0.002732 11.489 < 2e-16 ***
## as.matrix(best_pred)TEAM_PITCHING_HR 0.016710 0.005897 2.834 0.00464 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.9 on 2271 degrees of freedom
## Multiple R-squared: 0.2223, Adjusted R-squared: 0.221
## F-statistic: 162.3 on 4 and 2271 DF, p-value: < 2.2e-16
R-squared is still only 0.22, which means that the model explains only about 22% of the variance in the target.
Now we can use the step wise regression.
# Backward elimination starting from the full model; step() prints the AIC
# table for every elimination round.
reduced_model <- step(object = reg_model, direction = "backward")
## Start: AIC=11716.38
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 6.0 386469 11714
## - TEAM_PITCHING_HR 1 48.3 386511 11715
## - TEAM_BASERUN_CS 1 83.1 386546 11715
## <none> 386463 11716
## - TEAM_BATTING_BB 1 547.5 387010 11718
## - TEAM_BATTING_HR 1 628.9 387091 11718
## - TEAM_BATTING_2B 1 891.2 387354 11720
## - TEAM_PITCHING_H 1 905.5 387368 11720
## - TEAM_PITCHING_SO 1 1594.2 388057 11724
## - TEAM_BATTING_SO 1 1874.9 388337 11725
## - TEAM_BATTING_3B 1 2516.6 388979 11729
## - TEAM_BASERUN_SB 1 5821.2 392284 11748
## - TEAM_FIELDING_E 1 10768.2 397231 11777
## - TEAM_FIELDING_DP 1 15134.0 401596 11802
## - TEAM_BATTING_H 1 29956.6 416419 11884
##
## Step: AIC=11714.42
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BASERUN_CS 1 84.8 386553 11713
## - TEAM_PITCHING_HR 1 90.0 386558 11713
## <none> 386469 11714
## - TEAM_BATTING_HR 1 742.4 387211 11717
## - TEAM_BATTING_2B 1 889.3 387358 11718
## - TEAM_PITCHING_H 1 1052.2 387521 11719
## - TEAM_BATTING_BB 1 1910.6 388379 11724
## - TEAM_BATTING_SO 1 2078.2 388547 11725
## - TEAM_BATTING_3B 1 2516.0 388984 11727
## - TEAM_PITCHING_SO 1 3247.0 389716 11732
## - TEAM_BASERUN_SB 1 6017.0 392486 11748
## - TEAM_FIELDING_E 1 10763.3 397232 11775
## - TEAM_FIELDING_DP 1 15128.1 401597 11800
## - TEAM_BATTING_H 1 29996.7 416465 11883
##
## Step: AIC=11712.92
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 86.4 386640 11711
## <none> 386553 11713
## - TEAM_BATTING_HR 1 793.8 387347 11716
## - TEAM_BATTING_2B 1 912.6 387466 11716
## - TEAM_PITCHING_H 1 1080.6 387634 11717
## - TEAM_BATTING_BB 1 2005.6 388559 11723
## - TEAM_BATTING_SO 1 2079.5 388633 11723
## - TEAM_BATTING_3B 1 2555.4 389109 11726
## - TEAM_PITCHING_SO 1 3269.0 389822 11730
## - TEAM_BASERUN_SB 1 5983.2 392536 11746
## - TEAM_FIELDING_E 1 10870.9 397424 11774
## - TEAM_FIELDING_DP 1 15186.6 401740 11799
## - TEAM_BATTING_H 1 29953.0 416506 11881
##
## Step: AIC=11711.43
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 386640 11711
## - TEAM_BATTING_2B 1 929.4 387569 11715
## - TEAM_PITCHING_H 1 1001.0 387641 11715
## - TEAM_BATTING_BB 1 1999.1 388639 11721
## - TEAM_BATTING_SO 1 2060.9 388701 11722
## - TEAM_BATTING_3B 1 2739.4 389379 11726
## - TEAM_PITCHING_SO 1 3328.3 389968 11729
## - TEAM_BASERUN_SB 1 5986.1 392626 11744
## - TEAM_BATTING_HR 1 8364.1 395004 11758
## - TEAM_FIELDING_E 1 10786.9 397427 11772
## - TEAM_FIELDING_DP 1 15152.3 401792 11797
## - TEAM_BATTING_H 1 30558.9 417199 11883
# Coefficients and fit statistics of the model kept by step().
reduced_model |> summary()
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.598 -8.593 0.085 8.445 58.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.3440443 5.2338369 4.269 2.04e-05 ***
## TEAM_BATTING_H 0.0490922 0.0036699 13.377 < 2e-16 ***
## TEAM_BATTING_2B -0.0213744 0.0091626 -2.333 0.019746 *
## TEAM_BATTING_3B 0.0665763 0.0166230 4.005 6.40e-05 ***
## TEAM_BATTING_HR 0.0674046 0.0096315 6.998 3.40e-12 ***
## TEAM_BATTING_BB 0.0115464 0.0033748 3.421 0.000634 ***
## TEAM_BATTING_SO -0.0085211 0.0024529 -3.474 0.000523 ***
## TEAM_BASERUN_SB 0.0249207 0.0042092 5.920 3.70e-09 ***
## TEAM_PITCHING_H -0.0007770 0.0003209 -2.421 0.015552 *
## TEAM_PITCHING_SO 0.0029662 0.0006719 4.415 1.06e-05 ***
## TEAM_FIELDING_E -0.0190100 0.0023919 -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894 0.0129296 -9.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared: 0.3151, Adjusted R-squared: 0.3117
## F-statistic: 94.68 on 11 and 2264 DF, p-value: < 2.2e-16
This is our final model. We could add interaction terms too, but they will be added to the model in the coming weeks.
As a rough rule of thumb, a model is considered good if its R-squared is around 0.80 or more, whereas for our model R-squared is only 0.31. Nevertheless, we can make predictions on the evaluation set and see which model is best for our purpose.
# Predictions of the full model for df_eval; show the first ten.
target_wins <- predict(reg_model, newdata = df_eval)
target_wins[seq_len(10)]
## 1 2 3 4 5 6 7 8
## 61.24500 76.98750 76.65953 73.16350 68.32659 70.32932 67.24805 72.43990
## 9 10
## 74.68282 67.58324
# First ten values returned by predict() for reg_mod with df_eval as newdata.
predict(reg_mod, newdata = df_eval)[seq_len(10)]
## 1 2 3 4 5 6 7 8
## 68.72638 82.25068 80.31951 75.70529 72.60349 70.49093 72.24339 71.43711
## 9 10
## 76.22122 69.93779
# First ten predictions of the stepwise-reduced model for df_eval.
predict(reduced_model, newdata = df_eval)[seq_len(10)]
## 1 2 3 4 5 6 7 8
## 61.14358 76.95670 76.52266 72.94207 68.18250 70.35402 67.35859 72.28528
## 9 10
## 74.44931 67.30660
# First ten observed TARGET_WINS values from the training data.
df_train[["TARGET_WINS"]][seq_len(10)]
## [1] 39 70 86 70 82 75 80 85 86 76
The predicted values of all three models differ from the original values, but the model with the highest R-squared will be selected.