library(ggplot2)
library(tidyr)
library(MASS)
library(dplyr)##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)## corrplot 0.84 loaded
library(naniar)Loading the train and test data sets csv files
train <- read.csv("https://raw.githubusercontent.com/Harpreet1984/DATA621/master/HW1/moneyball-training-data.csv")
test <- read.csv("https://raw.githubusercontent.com/Harpreet1984/DATA621/master/HW1/moneyball-evaluation-data.csv")
train$INDEX <- NULL
test$INDEX <- NULL
cleanNames <- function(train) {
name_list <- names(train)
name_list <- gsub("TEAM_", "", name_list)
names(train) <- name_list
train
}
train <- cleanNames(train)
test <- cleanNames(test)There are 2,276 rows and 16 columns (features). Of all 16 columns, 0 are discrete, 16 are continuous, and 0 are all missing. There are 3,478 missing values out of 36,416 data points with highest missings from Batters hit by pitch column
summary(train)## TARGET_WINS BATTING_H BATTING_2B BATTING_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
##
## BATTING_HR BATTING_BB BATTING_SO BASERUN_SB
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0
## Median :102.00 Median :512.0 Median : 750.0 Median :101.0
## Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0
## Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0
## NA's :102 NA's :131
## BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR
## Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0
## 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0
## Median : 49.0 Median :58.00 Median : 1518 Median :107.0
## Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7
## 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0
## Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0
## NA's :772 NA's :2085
## PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
## Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
## Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
## 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
## NA's :102 NA's :286
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
This shows very few variables are normally distributed.
Now the datpreparation
Here from the plots we can see outliers in PITCHING_H,PITCHING_BB and PITCHING_SO
Also, since BATTING_H is a combination of BATTING_2B, BATTING_3B, BATTING_HR (and also includes batted singles), we will create a new variable BATTING_1B equaling BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR and after creating this we will remove BATTING_H
vis_miss(train)As we can see from our chart, we have a number of missing values. Since HBP has 92% missing values, we will remove that entirely.
train1 <- transform(train)
train1["BATTING_1B"] <- NA
train1$BATTING_1B = train1$BATTING_H - train1$BATTING_HR - train1$BATTING_3B - train1$BATTING_2B
train1 <- select(train1, -BATTING_HBP)
train1 <- select(train1, -BATTING_H)
#for(i in 1:ncol(train1)){
# train1[is.na(train1[,i]), i] <- mean(train1[,i], na.rm = TRUE)
#}
head(train1)## TARGET_WINS BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## 1 39 194 39 13 143 842
## 2 70 219 22 190 685 1075
## 3 86 232 35 137 602 917
## 4 70 209 38 96 451 922
## 5 82 186 27 102 472 920
## 6 75 200 36 92 443 973
## BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO
## 1 NA NA 9364 84 927 5456
## 2 37 28 1347 191 689 1082
## 3 46 27 1377 137 602 917
## 4 43 30 1396 97 454 928
## 5 49 39 1297 102 472 920
## 6 107 59 1279 92 443 973
## FIELDING_E FIELDING_DP BATTING_1B
## 1 1011 NA 1199
## 2 193 155 908
## 3 175 153 973
## 4 164 156 1044
## 5 138 168 982
## 6 123 149 951
M<-cor(train1)
corrplot(M, method="number")full.model <- lm (TARGET_WINS ~ . , data=train1)
reduced.full.model<- step (full.model, direction = "backward")## Start: AIC=6723.18
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB +
## BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR +
## PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - PITCHING_BB 1 0.8 134324 6721.2
## - PITCHING_HR 1 7.2 134330 6721.3
## - BATTING_SO 1 55.2 134378 6721.8
## - BATTING_1B 1 56.5 134380 6721.8
## - BATTING_BB 1 81.0 134404 6722.1
## - BATTING_HR 1 89.3 134412 6722.2
## - PITCHING_H 1 98.0 134421 6722.3
## <none> 134323 6723.2
## - PITCHING_SO 1 264.1 134587 6724.1
## - BATTING_2B 1 654.6 134978 6728.4
## - BASERUN_CS 1 746.8 135070 6729.4
## - BASERUN_SB 1 1557.8 135881 6738.3
## - BATTING_3B 1 3598.5 137922 6760.5
## - FIELDING_DP 1 6742.5 141066 6794.0
## - FIELDING_E 1 22427.4 156751 6950.6
##
## Step: AIC=6721.19
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB +
## BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR +
## PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - PITCHING_HR 1 6.4 134330 6719.3
## - BATTING_SO 1 56.2 134380 6719.8
## - BATTING_HR 1 115.7 134440 6720.5
## - BATTING_1B 1 147.2 134471 6720.8
## <none> 134324 6721.2
## - PITCHING_H 1 197.5 134521 6721.4
## - PITCHING_SO 1 266.3 134590 6722.1
## - BASERUN_CS 1 746.5 135070 6727.4
## - BATTING_2B 1 1192.3 135516 6732.3
## - BASERUN_SB 1 1564.2 135888 6736.4
## - BATTING_3B 1 4777.8 139102 6771.1
## - FIELDING_DP 1 6744.6 141069 6792.0
## - BATTING_BB 1 12568.9 146893 6852.1
## - FIELDING_E 1 22491.7 156816 6949.2
##
## Step: AIC=6719.26
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB +
## BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO +
## FIELDING_E + FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - BATTING_SO 1 51.2 134382 6717.8
## - BATTING_1B 1 144.7 134475 6718.9
## <none> 134330 6719.3
## - PITCHING_H 1 202.0 134532 6719.5
## - PITCHING_SO 1 298.0 134628 6720.6
## - BASERUN_CS 1 742.6 135073 6725.5
## - BATTING_2B 1 1209.1 135539 6730.6
## - BASERUN_SB 1 1570.4 135901 6734.5
## - BATTING_3B 1 4772.6 139103 6769.1
## - BATTING_HR 1 5777.7 140108 6779.8
## - FIELDING_DP 1 6744.4 141075 6790.1
## - BATTING_BB 1 12606.9 146937 6850.6
## - FIELDING_E 1 22525.1 156855 6947.6
##
## Step: AIC=6717.83
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB +
## BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO + FIELDING_E +
## FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## <none> 134382 6717.8
## - BASERUN_CS 1 737.6 135119 6724.0
## - PITCHING_H 1 1355.1 135737 6730.7
## - BASERUN_SB 1 1575.6 135957 6733.2
## - BATTING_1B 1 1740.1 136122 6734.9
## - BATTING_2B 1 3312.6 137694 6752.0
## - FIELDING_DP 1 6779.2 141161 6789.0
## - BATTING_3B 1 6904.5 141286 6790.3
## - PITCHING_SO 1 7395.1 141777 6795.4
## - BATTING_BB 1 12619.7 147001 6849.2
## - BATTING_HR 1 21121.1 155503 6932.8
## - FIELDING_E 1 22552.0 156934 6946.4
summary(reduced.full.model)##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR +
## BATTING_BB + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO +
## FIELDING_E + FIELDING_DP + BATTING_1B, data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.5830 -6.7313 -0.1643 6.5323 27.8502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.446058 6.588864 8.870 < 2e-16 ***
## BATTING_2B -0.044326 0.007354 -6.028 2.10e-09 ***
## BATTING_3B 0.187121 0.021502 8.702 < 2e-16 ***
## BATTING_HR 0.123248 0.008097 15.221 < 2e-16 ***
## BATTING_BB 0.039484 0.003356 11.765 < 2e-16 ***
## BASERUN_SB 0.036034 0.008668 4.157 3.41e-05 ***
## BASERUN_CS 0.051768 0.018200 2.844 0.004511 **
## PITCHING_H 0.009070 0.002353 3.855 0.000121 ***
## PITCHING_SO -0.020827 0.002312 -9.006 < 2e-16 ***
## FIELDING_E -0.155970 0.009917 -15.728 < 2e-16 ***
## FIELDING_DP -0.113149 0.013121 -8.623 < 2e-16 ***
## BATTING_1B 0.025502 0.005837 4.369 1.34e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.548 on 1474 degrees of freedom
## (790 observations deleted due to missingness)
## Multiple R-squared: 0.4384, Adjusted R-squared: 0.4342
## F-statistic: 104.6 on 11 and 1474 DF, p-value: < 2.2e-16
significant.model <- lm (TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO + BASERUN_SB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B , data=train1)
reduced.significant.model<- step (significant.model, direction = "backward")## Start: AIC=8652.74
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO +
## BASERUN_SB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - PITCHING_SO 1 125 202787 8651.9
## <none> 202661 8652.7
## - BATTING_2B 1 517 203178 8655.4
## - BATTING_SO 1 3380 206042 8681.1
## - BATTING_1B 1 3440 206101 8681.6
## - FIELDING_DP 1 6946 209607 8712.6
## - BATTING_3B 1 13546 216207 8769.5
## - BASERUN_SB 1 20358 223019 8826.4
## - FIELDING_E 1 30721 233382 8909.7
## - BATTING_HR 1 39457 242119 8977.2
##
## Step: AIC=8651.88
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO +
## BASERUN_SB + FIELDING_E + FIELDING_DP + BATTING_1B
##
## Df Sum of Sq RSS AIC
## <none> 202787 8651.9
## - BATTING_2B 1 532 203318 8654.7
## - BATTING_1B 1 3451 206237 8680.8
## - FIELDING_DP 1 6967 209754 8711.9
## - BATTING_3B 1 13574 216361 8768.8
## - BATTING_SO 1 14521 217308 8776.8
## - BASERUN_SB 1 20238 223024 8824.4
## - FIELDING_E 1 31043 233829 8911.2
## - BATTING_HR 1 39475 242261 8976.3
summary(reduced.significant.model)##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR +
## BATTING_SO + BASERUN_SB + FIELDING_E + FIELDING_DP + BATTING_1B,
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.590 -7.615 0.266 7.397 30.783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.547640 5.911037 13.288 < 2e-16 ***
## BATTING_2B -0.014921 0.006819 -2.188 0.0288 *
## BATTING_3B 0.205199 0.018560 11.056 < 2e-16 ***
## BATTING_HR 0.150831 0.008000 18.853 < 2e-16 ***
## BATTING_SO -0.026701 0.002335 -11.435 < 2e-16 ***
## BASERUN_SB 0.076556 0.005671 13.499 < 2e-16 ***
## FIELDING_E -0.118339 0.007078 -16.719 < 2e-16 ***
## FIELDING_DP -0.099724 0.012591 -7.921 4.07e-15 ***
## BATTING_1B 0.024854 0.004459 5.574 2.86e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.54 on 1826 degrees of freedom
## (441 observations deleted due to missingness)
## Multiple R-squared: 0.3614, Adjusted R-squared: 0.3586
## F-statistic: 129.2 on 8 and 1826 DF, p-value: < 2.2e-16
Select top 10 high correlation predictor.
cors <- sapply(train1, cor, y=train1$TARGET_WINS)
mask <- (rank(-abs(cors)) <= 10 )
best10.pred <- train1[, mask]
best10.pred <- subset(best10.pred, select = c(-TARGET_WINS) )
summary(best10.pred)## BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## Min. : 69.0 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.:208.0 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0
## Median :238.0 Median : 47.00 Median :102.00 Median :512.0
## Mean :241.2 Mean : 55.25 Mean : 99.61 Mean :501.6
## 3rd Qu.:273.0 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0
## Max. :458.0 Max. :223.00 Max. :264.00 Max. :878.0
## PITCHING_H PITCHING_HR PITCHING_BB FIELDING_E
## Min. : 1137 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 1419 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 127.0
## Median : 1518 Median :107.0 Median : 536.5 Median : 159.0
## Mean : 1779 Mean :105.7 Mean : 553.0 Mean : 246.5
## 3rd Qu.: 1682 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 249.2
## Max. :30132 Max. :343.0 Max. :3645.0 Max. :1898.0
## BATTING_1B
## Min. : 709.0
## 1st Qu.: 990.8
## Median :1050.0
## Mean :1073.2
## 3rd Qu.:1129.0
## Max. :2112.0
full.model.best10 <- lm (TARGET_WINS ~ BATTING_2B+ BATTING_3B + BATTING_HR + BATTING_BB + PITCHING_H + PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B , data=train1)
reduced.model.best10<- step (full.model.best10, direction = "backward")## Start: AIC=11851.69
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB +
## PITCHING_H + PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - BATTING_BB 1 14 411954 11850
## - PITCHING_HR 1 116 412057 11850
## <none> 411940 11852
## - BATTING_2B 1 1662 413602 11859
## - PITCHING_H 1 2358 414299 11863
## - PITCHING_BB 1 2477 414417 11863
## - BATTING_HR 1 2783 414724 11865
## - FIELDING_E 1 8814 420755 11898
## - BATTING_3B 1 17481 429421 11944
## - BATTING_1B 1 41151 453091 12066
##
## Step: AIC=11849.77
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + PITCHING_H +
## PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B
##
## Df Sum of Sq RSS AIC
## - PITCHING_HR 1 103 412057 11848
## <none> 411954 11850
## - BATTING_2B 1 1665 413619 11857
## - PITCHING_H 1 2795 414749 11863
## - BATTING_HR 1 3080 415034 11865
## - PITCHING_BB 1 5071 417025 11876
## - FIELDING_E 1 9287 421241 11898
## - BATTING_3B 1 17535 429489 11943
## - BATTING_1B 1 41986 453940 12069
##
## Step: AIC=11848.34
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + PITCHING_H +
## PITCHING_BB + FIELDING_E + BATTING_1B
##
## Df Sum of Sq RSS AIC
## <none> 412057 11848
## - BATTING_2B 1 1661 413718 11856
## - PITCHING_H 1 2944 415001 11862
## - PITCHING_BB 1 5078 417135 11874
## - FIELDING_E 1 9853 421910 11900
## - BATTING_3B 1 17469 429526 11941
## - BATTING_HR 1 23141 435198 11971
## - BATTING_1B 1 42092 454149 12068
summary(reduced.model.best10)##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR +
## PITCHING_H + PITCHING_BB + FIELDING_E + BATTING_1B, data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.763 -8.861 0.095 8.860 55.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2713462 3.2775220 2.219 0.02662 *
## BATTING_2B 0.0226647 0.0074954 3.024 0.00252 **
## BATTING_3B 0.1495551 0.0152520 9.806 < 2e-16 ***
## BATTING_HR 0.0851691 0.0075466 11.286 < 2e-16 ***
## PITCHING_H -0.0013088 0.0003251 -4.026 5.87e-05 ***
## PITCHING_BB 0.0103207 0.0019522 5.287 1.36e-07 ***
## FIELDING_E -0.0166263 0.0022577 -7.364 2.48e-13 ***
## BATTING_1B 0.0484775 0.0031849 15.221 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.48 on 2268 degrees of freedom
## Multiple R-squared: 0.27, Adjusted R-squared: 0.2678
## F-statistic: 119.9 on 7 and 2268 DF, p-value: < 2.2e-16
Based on the above stats, Model1 with all the columns as feature is a best fit. As from all the three models it has the best RSquare, Adjusted RSquare (0.4342), F Stats and P values(less than 0.5). It also has the best RMSE score of 13.03416.
using the Model1 we can predict the dependent variable for the test data as follows
test1 <- transform(test)
test1["BATTING_1B"] <- NA
test1$BATTING_1B = test1$BATTING_H - test1$BATTING_HR - test1$BATTING_3B - test1$BATTING_2B
test1 <- select(test1, -BATTING_HBP)
test1 <- select(test1, -BATTING_H)
predictedValues <- predict (reduced.full.model, newdata=test1 )
summary (predictedValues)## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 58.65 75.74 81.39 81.14 86.90 113.36 89