library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# NOTE(review): setwd() ties this script to one machine's directory layout;
# it is kept because the relative paths used for reading and writing files
# are resolved against this directory.
setwd("~/PL3")
library(readr)

# Load the combined match data set; column types are guessed by readr.
combined <- read_csv(file = "combined.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   HomeTeam = col_character(),
##   AwayTeam = col_character()
## )
## See spec(...) for full column specifications.
# Build the modelling table: drop the first three columns (by position) and
# the two average-market-value columns, then keep only complete rows.
MLdata <- combined %>%
  select(-1:-3, -average_market_value_away, -average_market_value_home) %>%
  na.omit()

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(46)

# 75/25 train/test partition on the outcome. `list = FALSE` returns the
# selected row indices as a matrix; spell out FALSE because `F` is an
# ordinary variable that can be reassigned.
split <- createDataPartition(
  MLdata$home_score_difference,
  p = 0.75,
  list = FALSE
)
train <- MLdata[split, ]
test <- MLdata[-split, ]

# Resampling scheme for model fitting: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

KNN

# k-nearest-neighbours regression of home_score_difference on all other
# columns of the training set.
#
# The resampling argument must be spelled `trControl`. The previous spelling
# `trcontrol` did not match that argument, so the 10-fold CV settings in
# `ctrl` were never applied and the fit used train()'s default bootstrap
# resampling instead.
fit.knn <- train(
  home_score_difference ~ .,
  data = train,
  method = "knn",
  trControl = ctrl
)
fit.knn
## k-Nearest Neighbors 
## 
## 278 samples
##  90 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE      Rsquared   MAE     
##   5  1.884326  0.1618779  1.503953
##   7  1.839607  0.1717020  1.467064
##   9  1.803513  0.1841629  1.440750
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.

Resampled RMSE = 1.80 (k = 9)

# KNN predictions for the training rows and for the held-out test rows.
train.predict.knn <- predict(fit.knn, newdata = train)
test.predict.knn <- predict(fit.knn, newdata = test)

# Root-mean-squared error of the KNN predictions on the test set.
RMSE(pred = test.predict.knn, obs = test[["home_score_difference"]])
## [1] 1.808535

RMSE = 1.8

RF

# Random-forest regression of home_score_difference on all other columns of
# the training set. `ctrl` (10-fold CV) is passed explicitly so this model is
# tuned and assessed under the resampling scheme defined for the analysis,
# rather than train()'s default bootstrap.
fit.rf <- train(
  home_score_difference ~ .,
  data = train,
  method = "rf",
  trControl = ctrl
)

RMSE = 1.61

# Random-forest predictions for the training rows and the held-out test rows.
train.predict.rf <- predict(fit.rf, newdata = train)
test.predict.rf <- predict(fit.rf, newdata = test)

# Root-mean-squared error of the random-forest predictions on the test set.
RMSE(pred = test.predict.rf, obs = test[["home_score_difference"]])
## [1] 1.685726

RMSE = 1.68

Linear

# Linear regression of home_score_difference on all other columns of the
# training set. `ctrl` (10-fold CV) is passed explicitly so this model is
# assessed under the resampling scheme defined for the analysis, rather than
# train()'s default bootstrap.
# NOTE(review): the fit is reported as rank-deficient (see warnings), so some
# predictors are linearly dependent - consider removing them.
fit.lm <- train(
  home_score_difference ~ .,
  data = train,
  method = "lm",
  trControl = ctrl
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
# Show the resampling summary of the linear model.
print(fit.lm)
## Linear Regression 
## 
## 278 samples
##  90 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   2.643067  0.1100648  1.970173
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Resampled RMSE = 2.64

# Linear-model predictions for the training rows and the held-out test rows.
train.predict.lm <- predict(fit.lm, newdata = train)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
test.predict.lm <- predict(fit.lm, newdata = test)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading

# Root-mean-squared error of the linear-model predictions on the test set.
RMSE(pred = test.predict.lm, obs = test[["home_score_difference"]])
## [1] 2.00497

RMSE = 2.00

SVM

# Linear-kernel support vector regression of home_score_difference on all
# other columns of the training set. `ctrl` (10-fold CV) is passed explicitly
# so this model is assessed under the resampling scheme defined for the
# analysis, rather than train()'s default bootstrap.
fit.svm <- train(
  home_score_difference ~ .,
  data = train,
  method = "svmLinear",
  trControl = ctrl
)
fit.svm
## Support Vector Machines with Linear Kernel 
## 
## 278 samples
##  90 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 278, 278, 278, 278, 278, 278, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   2.203797  0.142527  1.724377
## 
## Tuning parameter 'C' was held constant at a value of 1

Resampled RMSE = 2.20

# SVM predictions for the training rows and the held-out test rows.
train.predict.svm <- predict(fit.svm, newdata = train)
test.predict.svm <- predict(fit.svm, newdata = test)

# Root-mean-squared error of the SVM predictions on the test set.
RMSE(pred = test.predict.svm, obs = test[["home_score_difference"]])
## [1] 1.950307

RMSE = 1.95

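Random forest has the lowest test RMSE of the four models (1.69, versus 1.81 for KNN, 1.95 for SVM and 2.00 for the linear model), so we use it for the final predictions.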

Let’s export our predictions

# Score every match in `combined` that has complete predictor values and
# export actual vs. predicted results to export.csv.
#
# predict() on the caret model drops rows with missing predictor values, so
# the predictions have to be paired with exactly those rows. The previous
# hard-coded slice(11:380) only lined up when the incomplete rows were the
# first ten of 380; selecting the complete rows explicitly keeps predictions
# and matches aligned for any input.
predictor_cols <- setdiff(names(MLdata), "home_score_difference")
scored <- combined[complete.cases(combined[, predictor_cols]), ]
predict.rf <- predict(fit.rf, scored)

# Guard against silent misalignment between rows and predictions.
stopifnot(length(predict.rf) == nrow(scored))

export <- scored %>%
  mutate(predict.rf = predict.rf) %>%
  select(Date, HomeTeam, AwayTeam, home_score_difference, predict.rf) %>%
  mutate(
    # Actual result from the home side's point of view (NA stays NA).
    home_result = case_when(
      home_score_difference > 0 ~ "W",
      home_score_difference == 0 ~ "D",
      home_score_difference < 0 ~ "L"
    ),
    # Predicted result: a predicted margin within [-0.5, 0.5] counts as a
    # draw. The former sample("L", n, replace = TRUE) / sample("D", ...)
    # wrappers always returned the constant label, so plain labels are used.
    predicted_result = case_when(
      predict.rf > 0.5 ~ "W",
      predict.rf < -0.5 ~ "L",
      TRUE ~ "D"
    )
  )
write.csv(export, "export.csv")