December 20, 2018

We decided to use the FIFA World Cup Matches dataset in order to predict the winner of the 2018 World Cup.

Removing useless columns

# Drop identifier, officiating and timestamp columns that the rest of the
# analysis does not use. Each column is removed by assigning NULL to it.
unused_columns <- c(
  "RoundID", "MatchID", "Referee", "Assistant.1", "Assistant.2",
  "Datetime", "Home.Team.Initials", "Away.Team.Initials"
)
for (column_name in unused_columns) {
  Iaquinta[, column_name] <- NULL
}

Checking for missing values

# Count the missing (NA) cells across every column of the data frame.
sum(is.na(Iaquinta))
## [1] 22322

DISCUSSION ON MISSING VALUES

Handling missing values (1) - Missing values of numeric variables are replaced by the mean of the variable; missing values of categorical variables are replaced by the most frequent category of that variable

# Impute missing values column by column.
#
# Numeric columns: every NA is replaced by the mean of the observed values.
# All other columns: every NA is replaced by the most frequent observed value
# (ties go to the value that appears first in the column).
#
# x: a data frame (or tibble).
# Returns x with the imputed columns. A column that is entirely NA has no
# observed value to impute from and is returned unchanged.
AL <- function(x) {
  # seq_len() keeps the loop from running on a zero-column data frame.
  for (i in seq_len(ncol(x))) {
    column <- x[[i]]
    is_missing <- is.na(column)
    if (!any(is_missing) || all(is_missing)) {
      next
    }
    if (is.numeric(column)) {
      column[is_missing] <- mean(column, na.rm = TRUE)
    } else {
      # Count only observed values: NA must never be picked as the mode,
      # otherwise the missing cells would be "filled" with NA again.
      observed <- column[!is_missing]
      candidates <- unique(observed)
      counts <- tabulate(match(observed, candidates))
      column[is_missing] <- candidates[which.max(counts)]
    }
    x[[i]] <- column
  }
  x
}
# Apply the imputation to every column of the data set.
Iaquinta <- AL(Iaquinta)


# We had 22322 missing values before imputation.
# This method brings the number of missing values to 0.

Taking care of the levels: collapsing Stage into five categories

# List the raw Stage labels before they are collapsed.
levels(Iaquinta$Stage)
##  [1] ""                         "Final"                   
##  [3] "First round"              "Group 1"                 
##  [5] "Group 2"                  "Group 3"                 
##  [7] "Group 4"                  "Group 5"                 
##  [9] "Group 6"                  "Group A"                 
## [11] "Group B"                  "Group C"                 
## [13] "Group D"                  "Group E"                 
## [15] "Group F"                  "Group G"                 
## [17] "Group H"                  "Match for third place"   
## [19] "Play-off for third place" "Preliminary round"       
## [21] "Quarter-finals"           "Round of 16"             
## [23] "Semi-finals"              "Third place"
# Collapse the raw Stage labels into five tournament phases.
# The mapping is written by label rather than by position, so it does not
# depend on the order in which the factor levels happen to be stored.
# NOTE(review): as in the original recoding, the blank label "" is grouped
# with Prelim and the third-place matches are grouped with Semi_Final;
# confirm that this is intended.
stage_groups <- list(
  Prelim = c(
    "", "First round", "Preliminary round",
    paste("Group", 1:6), paste("Group", LETTERS[1:8])
  ),
  Final = "Final",
  Semi_Final = c(
    "Match for third place", "Play-off for third place",
    "Semi-finals", "Third place"
  ),
  Quarter_Final = "Quarter-finals",
  Round_16 = "Round of 16"
)

# Levels missing from the mapping would silently become NA, so stop instead.
unmapped_stages <- setdiff(levels(Iaquinta$Stage), unlist(stage_groups))
if (length(unmapped_stages) > 0) {
  stop(
    "Unmapped Stage levels: ", paste(unmapped_stages, collapse = ", "),
    call. = FALSE
  )
}

# Assigning a named list merges the listed labels under each list name; the
# resulting level order follows the order of the list.
levels(Iaquinta$Stage) <- stage_groups
levels(Iaquinta$Stage)
## [1] "Prelim"        "Final"         "Semi_Final"    "Quarter_Final"
## [5] "Round_16"

Recoding categorical variable using one hot encoding (dummy encoding)

# Build the dummy-variable encoder with Year as the formula response and
# every other column as a predictor, then apply it to the same data.
dummies_model <- dummyVars(Year ~ ., data = Iaquinta)
trainData_mat <- predict(dummies_model, newdata = Iaquinta)

# Turn the encoded matrix into a data frame and attach Year to it.
trainData <- data.frame(trainData_mat)
trainData[["Year"]] <- Iaquinta[["Year"]]

VISUALIZATION AND GRAPHS

# ggplot2 only needs to be attached once for all of the plots below.
library(ggplot2)

# Attendance density, one panel per stage.
ggplot(Iaquinta, aes(x = Attendance, fill = Stage)) +
  geom_density() +
  facet_wrap(~Stage)

# Full-time home goals density, one panel per stage.
ggplot(Iaquinta, aes(x = Home.Team.Goals, fill = Stage)) +
  geom_density() +
  facet_wrap(~Stage)

# Full-time away goals density, one panel per stage.
ggplot(Iaquinta, aes(x = Away.Team.Goals, fill = Stage)) +
  geom_density() +
  facet_wrap(~Stage)

# Attendance density, one panel per tournament year.
ggplot(Iaquinta, aes(x = Attendance, fill = Year)) +
  geom_density() +
  facet_wrap(~Year)

# Half-time home goals density, one panel per stage.
ggplot(Iaquinta, aes(x = Half.time.Home.Goals, fill = Stage)) +
  geom_density() +
  facet_wrap(~Stage)

# Half-time away goals density, one panel per stage.
ggplot(Iaquinta, aes(x = Half.time.Away.Goals, fill = Stage)) +
  geom_density() +
  facet_wrap(~Stage)

# Dodged bar chart of one column, filled by another.
#
# Iaquinta: data frame to plot.
# var1: column shown on the x axis, given by name or by position.
# var2: column used for the fill colour, given by name or by position.
# Returns the ggplot object.
Al22 <- function(Iaquinta, var1, var2) {
  # Resolve positions to column names so that the plot can refer to the
  # columns through ggplot2's `.data` pronoun instead of indexing the data
  # frame inside aes(); this also gives the axis and legend readable titles.
  x_col <- if (is.character(var1)) var1 else names(Iaquinta)[var1]
  fill_col <- if (is.character(var2)) var2 else names(Iaquinta)[var2]
  stopifnot(
    length(x_col) == 1L, !is.na(x_col),
    length(fill_col) == 1L, !is.na(fill_col)
  )

  ggplot(data = Iaquinta) +
    geom_bar(
      mapping = aes(x = .data[[x_col]], fill = .data[[fill_col]]),
      position = "dodge"
    ) +
    labs(x = x_col, fill = fill_col)
}

Al22(Iaquinta, 2, 2)

# Density plot of one column, filled by another.
#
# Iaquinta: data frame to plot.
# var1: column shown on the x axis, given by name or by position.
# var2: column used for the fill colour, given by name or by position.
# Returns the ggplot object.
Al23 <- function(Iaquinta, var1, var2) {
  # Resolve positions to column names so that the plot can refer to the
  # columns through ggplot2's `.data` pronoun instead of indexing the data
  # frame inside aes(); this also gives the axis and legend readable titles.
  x_col <- if (is.character(var1)) var1 else names(Iaquinta)[var1]
  fill_col <- if (is.character(var2)) var2 else names(Iaquinta)[var2]
  stopifnot(
    length(x_col) == 1L, !is.na(x_col),
    length(fill_col) == 1L, !is.na(fill_col)
  )

  # A density layer has no bar width to dodge by, so position = "dodge" only
  # produced a "Width not defined" warning. The curves are overlaid instead
  # and made semi-transparent so that overlapping groups stay visible.
  ggplot(data = Iaquinta) +
    geom_density(
      mapping = aes(x = .data[[x_col]], fill = .data[[fill_col]]),
      alpha = 0.5
    ) +
    labs(x = x_col, fill = fill_col)
}

Al23(Iaquinta, 2, 2)

# Print a density plot for every numeric column of a data frame.
#
# x: data frame whose numeric columns are plotted, one plot per column.
# Called for its side effect (printing the plots); returns NULL invisibly.
Al25 <- function(x) {
  # seq_len() keeps the loop from running on a zero-column data frame.
  for (i in seq_len(ncol(x))) {
    if (is.numeric(x[[i]])) {
      col_name <- names(x)[i]
      # Refer to the column through the `.data` pronoun rather than indexing
      # the data frame inside aes().
      print(
        ggplot(data = x) +
          geom_density(mapping = aes(x = .data[[col_name]])) +
          xlab(col_name)
      )
    }
  }
  invisible(NULL)
}
Al25(Iaquinta)

Model Tuning - Random Forest with 10-fold cross validation

# Tuning grid for the ranger random forest: two values of mtry and of
# min.node.size, with the split rule held at "gini".
myGrid <- expand.grid(
  mtry = 1:2,
  splitrule = "gini",
  min.node.size = 1:2
)

# Fix the RNG seed before resampling so that the fold assignment, and with it
# the reported accuracy, can be reproduced from one run to the next.
set.seed(2018)

# Fit Stage on all remaining columns with 10-fold cross-validation.
rf_Iaquinta10 <- train(
  Stage ~ .,
  data = Iaquinta,
  method = "ranger",
  trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE),
  tuneGrid = myGrid
)
## + Fold01: mtry=1, splitrule=gini, min.node.size=1 
## - Fold01: mtry=1, splitrule=gini, min.node.size=1 
## + Fold01: mtry=2, splitrule=gini, min.node.size=1 
## - Fold01: mtry=2, splitrule=gini, min.node.size=1 
## + Fold01: mtry=1, splitrule=gini, min.node.size=2 
## - Fold01: mtry=1, splitrule=gini, min.node.size=2 
## + Fold01: mtry=2, splitrule=gini, min.node.size=2 
## - Fold01: mtry=2, splitrule=gini, min.node.size=2 
## + Fold02: mtry=1, splitrule=gini, min.node.size=1 
## - Fold02: mtry=1, splitrule=gini, min.node.size=1 
## + Fold02: mtry=2, splitrule=gini, min.node.size=1 
## - Fold02: mtry=2, splitrule=gini, min.node.size=1 
## + Fold02: mtry=1, splitrule=gini, min.node.size=2 
## - Fold02: mtry=1, splitrule=gini, min.node.size=2 
## + Fold02: mtry=2, splitrule=gini, min.node.size=2 
## - Fold02: mtry=2, splitrule=gini, min.node.size=2 
## + Fold03: mtry=1, splitrule=gini, min.node.size=1 
## - Fold03: mtry=1, splitrule=gini, min.node.size=1 
## + Fold03: mtry=2, splitrule=gini, min.node.size=1 
## - Fold03: mtry=2, splitrule=gini, min.node.size=1 
## + Fold03: mtry=1, splitrule=gini, min.node.size=2 
## - Fold03: mtry=1, splitrule=gini, min.node.size=2 
## + Fold03: mtry=2, splitrule=gini, min.node.size=2 
## - Fold03: mtry=2, splitrule=gini, min.node.size=2 
## + Fold04: mtry=1, splitrule=gini, min.node.size=1 
## - Fold04: mtry=1, splitrule=gini, min.node.size=1 
## + Fold04: mtry=2, splitrule=gini, min.node.size=1 
## - Fold04: mtry=2, splitrule=gini, min.node.size=1 
## + Fold04: mtry=1, splitrule=gini, min.node.size=2 
## - Fold04: mtry=1, splitrule=gini, min.node.size=2 
## + Fold04: mtry=2, splitrule=gini, min.node.size=2 
## - Fold04: mtry=2, splitrule=gini, min.node.size=2 
## + Fold05: mtry=1, splitrule=gini, min.node.size=1 
## - Fold05: mtry=1, splitrule=gini, min.node.size=1 
## + Fold05: mtry=2, splitrule=gini, min.node.size=1 
## - Fold05: mtry=2, splitrule=gini, min.node.size=1 
## + Fold05: mtry=1, splitrule=gini, min.node.size=2 
## - Fold05: mtry=1, splitrule=gini, min.node.size=2 
## + Fold05: mtry=2, splitrule=gini, min.node.size=2 
## - Fold05: mtry=2, splitrule=gini, min.node.size=2 
## + Fold06: mtry=1, splitrule=gini, min.node.size=1 
## - Fold06: mtry=1, splitrule=gini, min.node.size=1 
## + Fold06: mtry=2, splitrule=gini, min.node.size=1 
## - Fold06: mtry=2, splitrule=gini, min.node.size=1 
## + Fold06: mtry=1, splitrule=gini, min.node.size=2 
## - Fold06: mtry=1, splitrule=gini, min.node.size=2 
## + Fold06: mtry=2, splitrule=gini, min.node.size=2 
## - Fold06: mtry=2, splitrule=gini, min.node.size=2 
## + Fold07: mtry=1, splitrule=gini, min.node.size=1 
## - Fold07: mtry=1, splitrule=gini, min.node.size=1 
## + Fold07: mtry=2, splitrule=gini, min.node.size=1 
## - Fold07: mtry=2, splitrule=gini, min.node.size=1 
## + Fold07: mtry=1, splitrule=gini, min.node.size=2 
## - Fold07: mtry=1, splitrule=gini, min.node.size=2 
## + Fold07: mtry=2, splitrule=gini, min.node.size=2 
## - Fold07: mtry=2, splitrule=gini, min.node.size=2 
## + Fold08: mtry=1, splitrule=gini, min.node.size=1 
## - Fold08: mtry=1, splitrule=gini, min.node.size=1 
## + Fold08: mtry=2, splitrule=gini, min.node.size=1 
## - Fold08: mtry=2, splitrule=gini, min.node.size=1 
## + Fold08: mtry=1, splitrule=gini, min.node.size=2 
## - Fold08: mtry=1, splitrule=gini, min.node.size=2 
## + Fold08: mtry=2, splitrule=gini, min.node.size=2 
## - Fold08: mtry=2, splitrule=gini, min.node.size=2 
## + Fold09: mtry=1, splitrule=gini, min.node.size=1 
## - Fold09: mtry=1, splitrule=gini, min.node.size=1 
## + Fold09: mtry=2, splitrule=gini, min.node.size=1 
## - Fold09: mtry=2, splitrule=gini, min.node.size=1 
## + Fold09: mtry=1, splitrule=gini, min.node.size=2 
## - Fold09: mtry=1, splitrule=gini, min.node.size=2 
## + Fold09: mtry=2, splitrule=gini, min.node.size=2 
## - Fold09: mtry=2, splitrule=gini, min.node.size=2 
## + Fold10: mtry=1, splitrule=gini, min.node.size=1 
## - Fold10: mtry=1, splitrule=gini, min.node.size=1 
## + Fold10: mtry=2, splitrule=gini, min.node.size=1 
## - Fold10: mtry=2, splitrule=gini, min.node.size=1 
## + Fold10: mtry=1, splitrule=gini, min.node.size=2 
## - Fold10: mtry=1, splitrule=gini, min.node.size=2 
## + Fold10: mtry=2, splitrule=gini, min.node.size=2 
## - Fold10: mtry=2, splitrule=gini, min.node.size=2 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = gini, min.node.size = 1 on full training set
# Print the cross-validation summary of the tuned model.
rf_Iaquinta10
## Random Forest 
## 
## 4572 samples
##   11 predictor
##    5 classes: 'Prelim', 'Final', 'Semi_Final', 'Quarter_Final', 'Round_16' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4115, 4114, 4115, 4113, 4115, 4116, ... 
## Resampling results across tuning parameters:
## 
##   mtry  min.node.size  Accuracy   Kappa
##   1     1              0.9534159  0    
##   1     2              0.9534159  0    
##   2     1              0.9534159  0    
##   2     2              0.9534159  0    
## 
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 1, splitrule = gini
##  and min.node.size = 1.
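# Best model: cross-validated accuracy of approximately 0.9534
# (mtry = 1, splitrule = gini, min.node.size = 1).
# Note that Kappa is 0 for every parameter combination, so this accuracy is
# no better than always predicting the most frequent class.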