## hits_allowed hits_made
## hits_allowed 1.0000000 -0.4390871
## hits_made -0.4390871 1.0000000
| category | mean | max |
|---|---|---|
| TARGET_WINS | 80.79086 | 146 |
| TEAM_BATTING_H | 1469.26977 | 2554 |
| TEAM_BATTING_2B | 241.24692 | 458 |
| TEAM_BATTING_3B | 55.25000 | 223 |
| TEAM_BATTING_HR | 99.61204 | 264 |
| TEAM_BATTING_BB | 501.55888 | 878 |
| TEAM_BATTING_SO | 735.60534 | 1399 |
| TEAM_BASERUN_SB | 124.76177 | 697 |
| TEAM_BASERUN_CS | 52.80386 | 201 |
| TEAM_BATTING_HBP | 59.35602 | 95 |
| TEAM_PITCHING_H | 1779.21046 | 30132 |
| TEAM_PITCHING_HR | 105.69859 | 343 |
| TEAM_PITCHING_BB | 553.00791 | 3645 |
| TEAM_PITCHING_SO | 817.73045 | 19278 |
| TEAM_FIELDING_E | 246.48067 | 1898 |
| TEAM_FIELDING_DP | 146.38794 | 228 |
## [1] 0.03060384
## [1] 0.03038121
## [1] 0.05366812
## [1] 0.05912852
## [1] 0.0005018284
## [1] 0.001442892
## [1] 0.03530215
## [1] 0.03262247
## [1] "TEAM_BATTING_2B"
## [1] "walks"
## [1] "homeruns"
## [1] "TEAM_FIELDING_E"
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.834 -8.846 0.075 8.553 66.299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.5825232 5.6642691 9.460 < 2e-16 ***
## TEAM_BATTING_2B 0.0604189 0.0077953 7.751 1.47e-14 ***
## TEAM_BATTING_3B 0.1839656 0.0167021 11.015 < 2e-16 ***
## TEAM_BASERUN_SB 0.0285153 0.0048300 5.904 4.19e-09 ***
## TEAM_BASERUN_CS -0.0022603 0.0177273 -0.128 0.8986
## TEAM_BATTING_HBP 0.1017990 0.0798753 1.274 0.2026
## TEAM_FIELDING_E -0.0185161 0.0026810 -6.906 6.73e-12 ***
## TEAM_FIELDING_DP -0.1111554 0.0143095 -7.768 1.29e-14 ***
## homeruns 0.0409544 0.0041357 9.903 < 2e-16 ***
## walks 0.0066878 0.0015437 4.332 1.55e-05 ***
## strikeouts -0.0028056 0.0005411 -5.185 2.38e-07 ***
## hits 0.0008156 0.0003286 2.482 0.0132 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.45 on 1923 degrees of freedom
## Multiple R-squared: 0.2549, Adjusted R-squared: 0.2507
## F-statistic: 59.82 on 11 and 1923 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.996 -9.950 0.694 10.088 74.327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.768551 0.514328 162.870 < 2e-16 ***
## TEAM_FIELDING_E -0.012718 0.001564 -8.134 7.37e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.28 on 1933 degrees of freedom
## Multiple R-squared: 0.03309, Adjusted R-squared: 0.03259
## F-statistic: 66.16 on 1 and 1933 DF, p-value: 7.367e-16
## Start: AIC=10554.19
## TARGET_WINS ~ TEAM_FIELDING_E
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_3B 1 33411 418009 10407
## + TEAM_BATTING_2B 1 29715 421706 10424
## + TEAM_BASERUN_SB 1 16937 434483 10482
## + walks 1 11417 440004 10507
## + homeruns 1 5573 445847 10532
## + strikeouts 1 4079 447341 10539
## + TEAM_FIELDING_DP 1 3544 447877 10541
## + hits 1 2101 449320 10547
## <none> 451421 10554
## + TEAM_BASERUN_CS 1 281 451139 10555
## + TEAM_BATTING_HBP 1 223 451197 10555
##
## Step: AIC=10407.39
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B
##
## Df Sum of Sq RSS AIC
## + homeruns 1 32513 385497 10253
## + TEAM_BATTING_2B 1 30131 387879 10265
## + walks 1 11654 406356 10355
## + hits 1 6656 411354 10378
## + TEAM_BASERUN_SB 1 4577 413433 10388
## + TEAM_FIELDING_DP 1 963 417047 10405
## + TEAM_BASERUN_CS 1 597 417413 10407
## <none> 418009 10407
## + TEAM_BATTING_HBP 1 275 417735 10408
## + strikeouts 1 67 417942 10409
##
## Step: AIC=10252.71
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_2B 1 10934.6 374562 10199
## + TEAM_BASERUN_SB 1 8805.1 376691 10210
## + TEAM_FIELDING_DP 1 8365.3 377131 10212
## + walks 1 4381.4 381115 10233
## + hits 1 1764.8 383732 10246
## + TEAM_BASERUN_CS 1 666.8 384830 10251
## + strikeouts 1 423.1 385073 10253
## <none> 385497 10253
## + TEAM_BATTING_HBP 1 291.5 385205 10253
##
## Step: AIC=10199.03
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B
##
## Df Sum of Sq RSS AIC
## + TEAM_BASERUN_SB 1 10706.1 363856 10145
## + TEAM_FIELDING_DP 1 10298.7 364263 10147
## + walks 1 3425.5 371136 10183
## + strikeouts 1 428.7 374133 10199
## + TEAM_BASERUN_CS 1 407.9 374154 10199
## <none> 374562 10199
## + TEAM_BATTING_HBP 1 319.8 374242 10199
## + hits 1 262.9 374299 10200
##
## Step: AIC=10144.92
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B + TEAM_BASERUN_SB
##
## Df Sum of Sq RSS AIC
## + TEAM_FIELDING_DP 1 7510.6 356345 10107
## + strikeouts 1 1323.0 362533 10140
## + walks 1 1261.0 362595 10140
## + hits 1 1252.3 362603 10140
## <none> 363856 10145
## + TEAM_BATTING_HBP 1 351.1 363505 10145
## + TEAM_BASERUN_CS 1 8.2 363848 10147
##
## Step: AIC=10106.56
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B + TEAM_BASERUN_SB + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## + walks 1 2916.59 353429 10093
## + strikeouts 1 2086.61 354259 10097
## + hits 1 1429.72 354915 10101
## <none> 356345 10107
## + TEAM_BATTING_HBP 1 298.01 356047 10107
## + TEAM_BASERUN_CS 1 5.57 356340 10108
##
## Step: AIC=10092.66
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B + TEAM_BASERUN_SB + TEAM_FIELDING_DP + walks
##
## Df Sum of Sq RSS AIC
## + strikeouts 1 4192.1 349237 10072
## + hits 1 444.1 352984 10092
## <none> 353429 10093
## + TEAM_BATTING_HBP 1 267.5 353161 10093
## + TEAM_BASERUN_CS 1 1.4 353427 10095
##
## Step: AIC=10071.57
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B + TEAM_BASERUN_SB + TEAM_FIELDING_DP + walks +
## strikeouts
##
## Df Sum of Sq RSS AIC
## + hits 1 1096.30 348140 10068
## <none> 349237 10072
## + TEAM_BATTING_HBP 1 277.38 348959 10072
## + TEAM_BASERUN_CS 1 1.73 349235 10074
##
## Step: AIC=10067.48
## TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B + homeruns +
## TEAM_BATTING_2B + TEAM_BASERUN_SB + TEAM_FIELDING_DP + walks +
## strikeouts + hits
##
## Df Sum of Sq RSS AIC
## <none> 348140 10068
## + TEAM_BATTING_HBP 1 294.081 347846 10068
## + TEAM_BASERUN_CS 1 3.212 348137 10070
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_3B +
## homeruns + TEAM_BATTING_2B + TEAM_BASERUN_SB + TEAM_FIELDING_DP +
## walks + strikeouts + hits, data = training_set)
##
## Coefficients:
## (Intercept) TEAM_FIELDING_E TEAM_BATTING_3B homeruns
## 59.5241126 -0.0183741 0.1839054 0.0410832
## TEAM_BATTING_2B TEAM_BASERUN_SB TEAM_FIELDING_DP walks
## 0.0603142 0.0282536 -0.1115420 0.0067407
## strikeouts hits
## -0.0027997 0.0008054
## [1] "our forward selection model:"
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_FIELDING_E + homeruns + TEAM_BASERUN_SB + TEAM_FIELDING_DP +
## walks + strikeouts + hits, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.738 -8.734 0.081 8.587 66.325
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.5241126 2.8710360 20.733 < 2e-16 ***
## TEAM_BATTING_2B 0.0603142 0.0077819 7.751 1.47e-14 ***
## TEAM_BATTING_3B 0.1839054 0.0167005 11.012 < 2e-16 ***
## TEAM_FIELDING_E -0.0183741 0.0026154 -7.025 2.95e-12 ***
## homeruns 0.0410832 0.0040172 10.227 < 2e-16 ***
## TEAM_BASERUN_SB 0.0282536 0.0046811 6.036 1.89e-09 ***
## TEAM_FIELDING_DP -0.1115420 0.0143026 -7.799 1.02e-14 ***
## walks 0.0067407 0.0015375 4.384 1.23e-05 ***
## strikeouts -0.0027997 0.0005410 -5.175 2.51e-07 ***
## hits 0.0008054 0.0003271 2.462 0.0139 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.45 on 1925 degrees of freedom
## Multiple R-squared: 0.2543, Adjusted R-squared: 0.2508
## F-statistic: 72.94 on 9 and 1925 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_FIELDING_E + homeruns + TEAM_BASERUN_SB + walks + strikeouts +
## hits, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.818 -9.094 0.425 8.873 64.818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.3852836 2.3605826 19.650 < 2e-16 ***
## TEAM_BATTING_2B 0.0566609 0.0078875 7.184 9.66e-13 ***
## TEAM_BATTING_3B 0.1883179 0.0169481 11.111 < 2e-16 ***
## TEAM_FIELDING_E -0.0193226 0.0026529 -7.284 4.71e-13 ***
## homeruns 0.0344643 0.0039871 8.644 < 2e-16 ***
## TEAM_BASERUN_SB 0.0348833 0.0046742 7.463 1.27e-13 ***
## walks 0.0039482 0.0015183 2.600 0.009382 **
## strikeouts -0.0021101 0.0005419 -3.894 0.000102 ***
## hits 0.0008347 0.0003322 2.513 0.012053 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.66 on 1926 degrees of freedom
## Multiple R-squared: 0.2308, Adjusted R-squared: 0.2276
## F-statistic: 72.22 on 8 and 1926 DF, p-value: < 2.2e-16
## [1] "root mean square error from one variable model"
## [1] 16.74202
## [1] "root mean square error from forward selection model"
## [1] 15.14482
## [1] "root mean square error from all variable model"
## [1] 15.18169
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(Metrics)))
suppressWarnings(suppressMessages(library(lmtest)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(e1071)))
suppressWarnings(suppressMessages(library(MASS)))
suppressWarnings(suppressMessages(library(corrplot)))
suppressWarnings(suppressMessages(library(kableExtra)))
suppressWarnings(suppressMessages(library(gridExtra)))
suppressWarnings(suppressMessages(library(dplyr)))
# Read the training data and the 2018 hits table from GitHub. Two copies of the
# freshly read training data are taken here, before any later step modifies
# baseball_stats_raw.
baseball_stats_raw <- read.csv(
  "https://raw.githubusercontent.com/WigodskyD/data-sets/master/moneyball-training-data.csv"
)
baseball_set_aside <- baseball_stats_raw
box_cox_set <- baseball_stats_raw
hit_average_2018 <- read.csv(
  "https://raw.githubusercontent.com/WigodskyD/data-sets/master/hitsaverage2018.csv"
)

# Correlation heat maps of every column pair: the complete-rows version is
# drawn first, the pairwise-complete version second.
correl.matrix <- cor(baseball_stats_raw, use = "pairwise.complete.obs")
correl.matrixb <- cor(baseball_stats_raw, use = "complete.obs")
corrplot(correl.matrixb, method = "color", type = "upper")
corrplot(correl.matrix, method = "color", type = "upper")

# Correlation of TARGET_WINS with three individual columns.
print("correlation for base runners caught stealing")
with(
  baseball_stats_raw,
  cor(TARGET_WINS, TEAM_BASERUN_CS, use = "complete.obs")
)
print("correlation for batters struck out")
with(
  baseball_stats_raw,
  cor(TARGET_WINS, TEAM_BATTING_SO, use = "complete.obs")
)
print("correlation for fielding errors")
with(
  baseball_stats_raw,
  cor(TARGET_WINS, TEAM_FIELDING_E, use = "complete.obs")
)

# Correlation matrix of the second and third columns of the 2018 table.
cor(hit_average_2018[, 2:3], use = "complete.obs")
# Box plots of the columns as read (nothing imputed yet), split into two
# panels by column position.
boxplot(
  baseball_stats_raw[c(2, 4, 5, 6, 9, 10, 11, 13, 17)],
  las = 2, cex.axis = .5
)
boxplot(
  baseball_stats_raw[c(3, 7, 8, 12, 14, 15, 16)],
  las = 2, cex.axis = .5
)

# Columns to impute and summarise: every column after the first. Deriving the
# set from the data frame keeps the vectors below the right length without
# hard-coded counts.
stat_columns <- names(baseball_stats_raw)[-1]

# Replace each missing value with the mean of the non-missing values in the
# same column.
for (column_name in stat_columns) {
  column_values <- baseball_stats_raw[[column_name]]
  column_values[is.na(column_values)] <- mean(column_values, na.rm = TRUE)
  baseball_stats_raw[[column_name]] <- column_values
}

# Per-column name, mean and maximum (computed after imputation). unname()
# keeps the column names from becoming row names of the table.
names_of_variable <- stat_columns
means_of_variable <- unname(
  vapply(baseball_stats_raw[stat_columns], mean, numeric(1), na.rm = TRUE)
)
maxes_of_variable <- unname(
  vapply(baseball_stats_raw[stat_columns], max, numeric(1), na.rm = TRUE)
)

# summary.info is retained as a plain row counter in case another chunk reads
# it; the table itself is built directly from the three vectors.
summary.info <- seq_along(stat_columns)
summary.table <- data.frame(
  category = names_of_variable,
  mean = means_of_variable,
  max = maxes_of_variable,
  stringsAsFactors = FALSE
)

# Render the table as striped HTML: first column white on red, the other two
# red on white.
kable(summary.table, "html") %>%
  kable_styling("striped", full_width = FALSE) %>%
  column_spec(1, bold = TRUE, color = "white", background = "#D7261E") %>%
  column_spec(2, bold = TRUE, color = "#D7261E", background = "white") %>%
  column_spec(3, bold = TRUE, color = "#D7261E", background = "white")
# Shared panel backgrounds for the two groups of plots below.
pitching_panel <- theme(panel.background = element_rect(fill = "#d3dded"))
batting_panel <- theme(panel.background = element_rect(fill = "#d3ede6"))

# TEAM_PITCHING_H against wins: full range on the left, y axis limited to
# 0-10000 on the right.
plot.a <- ggplot(
  baseball_stats_raw,
  aes(x = TARGET_WINS, y = TEAM_PITCHING_H)
) +
  geom_point(color = "blue") +
  pitching_panel
plot.b <- plot.a + ylim(0, 10000)
grid.arrange(plot.a, plot.b, nrow = 1)

# TEAM_PITCHING_SO against wins: full range on the left, y axis limited to
# 800-1200 on the right.
plot.a <- ggplot(
  baseball_stats_raw,
  aes(x = TARGET_WINS, y = TEAM_PITCHING_SO)
) +
  geom_point(color = "maroon") +
  pitching_panel
plot.b <- plot.a + ylim(800, 1200)
grid.arrange(plot.a, plot.b, nrow = 1)

# Wins (y) against four individual columns (x), one plot each.
ggplot(baseball_stats_raw, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) +
  geom_point() +
  batting_panel
ggplot(baseball_stats_raw, aes(x = TEAM_BATTING_BB, y = TARGET_WINS)) +
  geom_point() +
  batting_panel
ggplot(baseball_stats_raw, aes(x = TEAM_BATTING_SO, y = TARGET_WINS)) +
  geom_point() +
  batting_panel
ggplot(baseball_stats_raw, aes(x = TEAM_PITCHING_HR, y = TARGET_WINS)) +
  geom_point() +
  batting_panel
# Prepare the Box-Cox copy of the data. In each of the first 17 columns, zeros
# are overwritten with the column mean first; missing values are then filled
# with the column mean recomputed after that replacement.
box_cox_set[1:17] <- lapply(box_cox_set[1:17], function(column_values) {
  column_values[column_values == 0] <- mean(column_values, na.rm = TRUE)
  column_values[is.na(column_values)] <- mean(column_values, na.rm = TRUE)
  column_values
})

# Box-Cox profile for wins modelled on triples, using the prepared copy.
boxcox(TARGET_WINS ~ TEAM_BATTING_3B, data = box_cox_set)
# Element-wise helpers: `squarer` raises its argument to the power 2,
# `roottaker` raises it to the power 0.5. Both return a value the same length
# as `a`.
squarer <- function(a) {
  a^2
}
roottaker <- function(a) {
  a^.5
}
# For four predictors, print the adjusted R-squared of a one-variable fit on
# the predictor as-is, followed by the adjusted R-squared of a fit on its
# square.
#
# The squared term is written as I(x^2). Inside a model formula a bare `^` is
# the formula crossing operator, so `y ~ x^2` fits the same model as `y ~ x`;
# I() makes it arithmetic. Squaring inside the formula also means the columns
# of baseball_stats_raw are never squared and square-rooted in place, so the
# stored values are left exactly as they were.
lmnow <- lm(TARGET_WINS ~ TEAM_BATTING_HR, data = baseball_stats_raw)
summary(lmnow)$adj.r.squared
lmnow <- lm(TARGET_WINS ~ I(TEAM_BATTING_HR^2), data = baseball_stats_raw)
summary(lmnow)$adj.r.squared

lmnow <- lm(TARGET_WINS ~ TEAM_BATTING_BB, data = baseball_stats_raw)
summary(lmnow)$adj.r.squared
lmnow <- lm(TARGET_WINS ~ I(TEAM_BATTING_BB^2), data = baseball_stats_raw)
summary(lmnow)$adj.r.squared

lmnow <- lm(TARGET_WINS ~ TEAM_BATTING_SO, data = baseball_stats_raw)
summary(lmnow)$adj.r.squared
lmnow <- lm(TARGET_WINS ~ I(TEAM_BATTING_SO^2), data = baseball_stats_raw)
summary(lmnow)$adj.r.squared

lmnow <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data = baseball_stats_raw)
summary(lmnow)$adj.r.squared
lmnow <- lm(TARGET_WINS ~ I(TEAM_PITCHING_HR^2), data = baseball_stats_raw)
summary(lmnow)$adj.r.squared
# Add four combined columns, each the sum of a TEAM_BATTING_* column and the
# matching TEAM_PITCHING_* column.
baseball_stats_raw <- baseball_stats_raw %>%
  mutate(
    homeruns = TEAM_BATTING_HR + TEAM_PITCHING_HR,
    walks = TEAM_BATTING_BB + TEAM_PITCHING_BB,
    strikeouts = TEAM_BATTING_SO + TEAM_PITCHING_SO,
    hits = TEAM_BATTING_H + TEAM_PITCHING_H
  )

# Drop nine columns by position to form the modelling data, then preview it.
dropped_positions <- c(1, 3, 6, 7, 8, 12, 13, 14, 15)
regression_set <- baseball_stats_raw[, -dropped_positions]
head(regression_set)
# Reproducible split: 15% of the rows (rounded down) are drawn at random as the
# test set; the remaining rows form the training set.
set.seed(613)
row_positions <- seq_len(nrow(regression_set))
holdout_size <- floor(.15 * nrow(regression_set))
test_set_index <- sample(row_positions, size = holdout_size)
test_set <- regression_set[test_set_index, ]
training_set <- regression_set[-test_set_index, ]
# Screen each column of regression_set after the first with a one-variable
# regression of TARGET_WINS on it, recording the t statistic of the slope.
# NOTE(review): the screening fits on regression_set (all rows), not on
# training_set - confirm that including the held-out rows here is intended.
predictor_positions <- seq_len(ncol(regression_set))[-1]
t_values <- numeric(length(predictor_positions))
for (i in predictor_positions) {
  spare_model <- lm(regression_set$TARGET_WINS ~ regression_set[, i])
  # Second entry of the "t value" column is the slope's t statistic; it is NA
  # when the slope could not be estimated.
  t_values[i - 1] <- summary(spare_model)$coefficients[, "t value"][2]
}

# To find 4 starting points for forward selection, rank the predictors by
# absolute t statistic and print the names of the top four. The +1 maps a
# position in t_values back to its column in regression_set. Ranking with
# order() leaves t_values itself intact.
t_value_ranking <- order(abs(t_values), decreasing = TRUE)
colnames(regression_set)[t_value_ranking[1] + 1]
colnames(regression_set)[t_value_ranking[2] + 1]
colnames(regression_set)[t_value_ranking[3] + 1]
colnames(regression_set)[t_value_ranking[4] + 1]
# Upper bound of the search: wins regressed on every other training column.
full_model <- lm(TARGET_WINS ~ ., data = training_set)
summary(full_model)

# One-predictor starting models. Only the fourth is summarised and grown
# below; the equivalent calls for the other three are kept for reference:
#   summary(spare_modelN)
#   step(spare_modelN, scope = list(lower = spare_modelN, upper = full_model),
#        direction = "forward")
spare_model1 <- lm(TARGET_WINS ~ TEAM_BATTING_2B, data = training_set)
spare_model2 <- lm(TARGET_WINS ~ walks, data = training_set)
spare_model3 <- lm(TARGET_WINS ~ homeruns, data = training_set)
spare_model4 <- lm(TARGET_WINS ~ TEAM_FIELDING_E, data = training_set)
summary(spare_model4)

# Forward stepwise search from spare_model4 towards full_model; the trace and
# the resulting model are printed.
forward_scope <- list(lower = spare_model4, upper = full_model)
step(spare_model4, scope = forward_scope, direction = "forward")

# Fit the nine-predictor model written out explicitly and summarise it.
print("our forward selection model:")
forward_selection_model <- lm(
  TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_FIELDING_E +
    homeruns + TEAM_BASERUN_SB + TEAM_FIELDING_DP + walks + strikeouts + hits,
  data = training_set
)
summary(forward_selection_model)
# Scatter of wins against TEAM_FIELDING_DP.
ggplot(baseball_stats_raw, aes(x = TEAM_FIELDING_DP, y = TARGET_WINS)) +
  geom_point() +
  theme(panel.background = element_rect(fill = "#d3ede6"))

# One-variable fit of wins on TEAM_FIELDING_DP over regression_set; its
# summary is not printed (summary(lmtest) is left disabled).
lmtest <- lm(TARGET_WINS ~ TEAM_FIELDING_DP, data = regression_set)

# Refit the selected model with TEAM_FIELDING_DP left out. This overwrites
# forward_selection_model, so later code uses the eight-predictor version.
forward_selection_model <- lm(
  TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_FIELDING_E +
    homeruns + TEAM_BASERUN_SB + walks + strikeouts + hits,
  data = training_set
)
summary(forward_selection_model)
# Root mean square error on the held-out rows for three models. The observed
# values are taken from the first column of test_set; rmse() is symmetric in
# its two arguments, so the observed values are simply passed first.
actual_test_wins <- test_set[, 1]

print("root mean square error from one variable model")
results_from_spare <- predict(spare_model4, test_set)
rmse(actual_test_wins, results_from_spare)

print("root mean square error from forward selection model")
results_from_forward_selection <- predict(forward_selection_model, test_set)
rmse(actual_test_wins, results_from_forward_selection)

print("root mean square error from all variable model")
results_from_full <- predict(full_model, test_set)
rmse(actual_test_wins, results_from_full)
# Residuals of forward_selection_model plotted against their position in the
# residual vector. The axis titles name what each axis actually holds: the
# position on x and the residual on y.
model_residuals <- resid(forward_selection_model)
ggplot() +
  geom_point(
    aes(x = seq_along(model_residuals), y = model_residuals),
    color = "#576da8"
  ) +
  labs(x = "Observation index", y = "Residuals") +
  theme(panel.background = element_rect(fill = "#cfdaf7"))
# Read the evaluation file from GitHub.
baseball_evaluation_set <- read.csv(
  "https://raw.githubusercontent.com/WigodskyD/data-sets/master/moneyball-evaluation-data.csv"
)

# In columns 2 to 16, fill each missing value with the mean of the non-missing
# values of the same column (means come from the evaluation file itself).
baseball_evaluation_set[2:16] <- lapply(
  baseball_evaluation_set[2:16],
  function(column_values) {
    column_values[is.na(column_values)] <- mean(column_values, na.rm = TRUE)
    column_values
  }
)

# Add the same four batting + pitching totals used for the training data.
baseball_evaluation_set <- baseball_evaluation_set %>%
  mutate(
    homeruns = TEAM_BATTING_HR + TEAM_PITCHING_HR,
    walks = TEAM_BATTING_BB + TEAM_PITCHING_BB,
    strikeouts = TEAM_BATTING_SO + TEAM_PITCHING_SO,
    hits = TEAM_BATTING_H + TEAM_PITCHING_H
  )

# Drop nine columns by position, then predict with the final model.
evaluation_drop_positions <- c(1, 2, 5, 6, 7, 11, 12, 13, 14)
baseball_evaluation_set <- baseball_evaluation_set[, -evaluation_drop_positions]
evaluation_predictions <- predict(
  forward_selection_model,
  baseball_evaluation_set
)
# Writing the predictions to disk is left disabled:
#write.csv(evaluation_predictions,file = "C:/Users/dawig/Desktop/Data621/moneyball_predictions.csv")