library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Switch to the project directory so the relative file names used in this
# script ("combined.csv" here, "export.csv" at the end) resolve there.
# NOTE(review): setwd() ties the script to one machine's directory layout;
# consider project-relative paths instead.
setwd("~/PL3")
library(readr)
# Read the combined data set; column types are guessed by readr.
combined <- readr::read_csv(file = "combined.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Date = col_date(format = ""),
## HomeTeam = col_character(),
## AwayTeam = col_character()
## )
## See spec(...) for full column specifications.
# Modelling table: drop the first three columns (by position) and the two
# average-market-value columns, then keep only rows with no missing values.
# NOTE(review): the positional drop presumably removes Date, HomeTeam and
# AwayTeam -- confirm against the column order of combined.csv.
MLdata <- na.omit(
  select(
    combined,
    -1:-3,
    -average_market_value_away,
    -average_market_value_home
  )
)
# Reproducible 75/25 partition of MLdata on the outcome variable.
# The seed is fixed so that the partition is the same on every run.
set.seed(46)
# list = FALSE makes createDataPartition() return row indices that can be used
# directly for subsetting. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
split <- createDataPartition(
  MLdata$home_score_difference,
  p = 0.75,
  list = FALSE
)
# Rows selected by the partition form the training set, the rest the test set.
# NOTE(review): `split` and `train` share their names with base::split() and
# caret::train(); the names are kept because later code refers to them.
train <- MLdata[split, ]
test <- MLdata[-split, ]
# Resampling specification: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
# k-nearest-neighbours regression of home_score_difference on all other
# columns of the training set.
# The resampling argument of train() is spelled `trControl`. The previous
# lower-case `trcontrol` did not match it, so `ctrl` was never applied and
# train() fell back to its default bootstrap resampling. The recorded output
# below ("Bootstrapped (25 reps)") predates this fix.
# NOTE(review): no pre-processing is applied; kNN is distance-based, so
# consider centring/scaling the predictors.
fit.knn <- train(
  home_score_difference ~ .,
  data = train,
  method = "knn",
  trControl = ctrl
)
fit.knn
## k-Nearest Neighbors
##
## 278 samples
## 90 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.884326 0.1618779 1.503953
## 7 1.839607 0.1717020 1.467064
## 9 1.803513 0.1841629 1.440750
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
# RMSE = 1.75 (resampled training estimate; note that the summary printed above reports 1.80 for k = 9)
# kNN predictions for the training rows and for the held-out test rows.
train.predict.knn <- predict(fit.knn, newdata = train)
test.predict.knn <- predict(fit.knn, newdata = test)
# Root-mean-squared error of the kNN predictions on the test set.
RMSE(pred = test.predict.knn, obs = test[["home_score_difference"]])
## [1] 1.808535
# Test-set RMSE: 1.81
# Random-forest regression (caret method "rf") of home_score_difference on all
# other columns of the training set, using train()'s default resampling.
fit.rf <- train(
  home_score_difference ~ .,
  data = train,
  method = "rf"
)
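# RMSE = 1.61 (presumably the resampled training estimate; the random-forest fit summary is not printed in this file)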
# Random-forest predictions for the training rows and the held-out test rows.
train.predict.rf <- predict(fit.rf, newdata = train)
test.predict.rf <- predict(fit.rf, newdata = test)
# Root-mean-squared error of the random-forest predictions on the test set.
RMSE(pred = test.predict.rf, obs = test[["home_score_difference"]])
## [1] 1.685726
# Test-set RMSE: 1.69
# Ordinary linear regression of home_score_difference on all other columns of
# the training set, using train()'s default resampling.
# NOTE(review): the fit is rank-deficient (see the warnings recorded below),
# so some predictors are linear combinations of others.
fit.lm <- train(
  home_score_difference ~ .,
  data = train,
  method = "lm"
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
# Show the resampling summary of the linear model.
fit.lm
## Linear Regression
##
## 278 samples
## 90 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.643067 0.1100648 1.970173
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# RMSE = 2.56 (resampled training estimate; note that the summary printed above reports 2.64)
# Linear-model predictions for the training rows.
train.predict.lm <- predict(fit.lm, newdata = train)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
# Linear-model predictions for the held-out test rows.
test.predict.lm <- predict(fit.lm, newdata = test)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
# Root-mean-squared error of the linear-model predictions on the test set.
RMSE(pred = test.predict.lm, obs = test[["home_score_difference"]])
## [1] 2.00497
# Test-set RMSE: 2.00
# Linear-kernel support vector regression of home_score_difference on all
# other columns of the training set, using train()'s default resampling.
fit.svm <- train(
  home_score_difference ~ .,
  data = train,
  method = "svmLinear"
)
# Show the resampling summary of the SVM.
fit.svm
## Support Vector Machines with Linear Kernel
##
## 278 samples
## 90 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.203797 0.142527 1.724377
##
## Tuning parameter 'C' was held constant at a value of 1
# RMSE = 2.1 (resampled training estimate; note that the summary printed above reports 2.20)
# SVM predictions for the training rows and for the held-out test rows.
train.predict.svm <- predict(fit.svm, newdata = train)
test.predict.svm <- predict(fit.svm, newdata = test)
# Root-mean-squared error of the SVM predictions on the test set.
RMSE(pred = test.predict.svm, obs = test[["home_score_difference"]])
## [1] 1.950307
# Test-set RMSE: 1.95
# Let's export our predictions.
# Score every match that has a complete set of predictors with the random
# forest, label actual and predicted results, and write them to export.csv.

# predict() silently drops rows with missing predictors, so the prediction
# vector is shorter than `combined`. Select the scorable rows explicitly
# instead of relying on hard-coded row numbers (previously slice(11:380)),
# which mis-align or fail as soon as the incomplete rows are not exactly the
# first ten rows of a 380-row table.
predictor_cols <- setdiff(names(train), "home_score_difference")
has_predictors <- complete.cases(combined[, predictor_cols])
scored_matches <- combined[has_predictors, ]

predict.rf <- predict(fit.rf, scored_matches)
# Guard against any remaining mismatch between rows and predictions.
stopifnot(length(predict.rf) == nrow(scored_matches))

export <- scored_matches %>%
  cbind(predict.rf) %>%
  select(Date, HomeTeam, AwayTeam, home_score_difference, predict.rf) %>%
  mutate(
    # Actual result from the home side's point of view: win, draw or loss.
    home_result = ifelse(
      home_score_difference > 0, "W",
      ifelse(home_score_difference == 0, "D", "L")
    ),
    # Predicted result: a predicted margin above 0.5 is a home win, below
    # -0.5 a home loss, anything in between a draw. The labels are plain
    # constants; the earlier sample("D", ...) / sample("L", ...) calls
    # returned the same constant while needlessly consuming RNG state.
    predicted_result = ifelse(
      predict.rf > 0.5, "W",
      ifelse(predict.rf < -0.5, "L", "D")
    )
  )
# Written relative to the current working directory; row names are included
# (write.csv default), as before.
write.csv(export, "export.csv")