December 20, 2018
# Remove the columns that are not kept for the analysis. Assigning NULL to
# existing columns of a data frame deletes them; doing it in one vectorised
# step replaces eight separate assignments.
Iaquinta[c(
  "RoundID", "MatchID", "Referee", "Assistant.1", "Assistant.2",
  "Datetime", "Home.Team.Initials", "Away.Team.Initials"
)] <- NULL

# Total number of missing (NA) cells left in the data frame.
sum(is.na(Iaquinta))
## [1] 22322
# Impute missing values column by column.
#
# Numeric columns have their NAs replaced by the column mean (computed
# without the NAs). Every other column has its NAs replaced by the most
# frequent observed value; ties go to the value that appears first.
#
# Args:
#   x: A data frame (or tibble).
#
# Returns:
#   `x` with the missing values filled in. Columns that contain no observed
#   value at all are returned unchanged, since there is nothing to impute
#   from.
AL <- function(x) {
  # seq_len() keeps the loop from running on a zero-column data frame.
  for (i in seq_len(ncol(x))) {
    # `[[` always yields the column vector, also for tibbles.
    col <- x[[i]]
    missing <- is.na(col)
    if (all(missing)) {
      next
    }
    if (is.numeric(col)) {
      col[missing] <- mean(col, na.rm = TRUE)
    } else {
      # Tabulate only the observed values: counting NA as a candidate would
      # let NA win the vote whenever it is the most frequent entry, leaving
      # the column un-imputed.
      observed <- col[!missing]
      candidates <- unique(observed)
      counts <- tabulate(
        match(observed, candidates),
        nbins = length(candidates)
      )
      col[missing] <- candidates[which.max(counts)]
    }
    x[[i]] <- col
  }
  x
}
# Replace the missing values in every column using AL().
Iaquinta <- AL(Iaquinta)
# Before imputation the data frame contained 22322 missing values.
# After applying AL() the number of missing values is reported as 0.
# Raw tournament-stage labels before recoding.
levels(Iaquinta$Stage)
## [1] "" "Final"
## [3] "First round" "Group 1"
## [5] "Group 2" "Group 3"
## [7] "Group 4" "Group 5"
## [9] "Group 6" "Group A"
## [11] "Group B" "Group C"
## [13] "Group D" "Group E"
## [15] "Group F" "Group G"
## [17] "Group H" "Match for third place"
## [19] "Play-off for third place" "Preliminary round"
## [21] "Quarter-finals" "Round of 16"
## [23] "Semi-finals" "Third place"

# Collapse the 24 raw labels into five stages. The mapping is keyed by label
# rather than by position, so it stays correct if the set or order of raw
# levels changes. The list order fixes the order of the new levels.
stage_map <- list(
  Prelim = c(
    "", "First round",
    "Group 1", "Group 2", "Group 3", "Group 4", "Group 5", "Group 6",
    "Group A", "Group B", "Group C", "Group D",
    "Group E", "Group F", "Group G", "Group H",
    "Preliminary round"
  ),
  Final = "Final",
  Semi_Final = c(
    "Match for third place", "Play-off for third place",
    "Semi-finals", "Third place"
  ),
  Quarter_Final = "Quarter-finals",
  Round_16 = "Round of 16"
)
# A level missing from the mapping would silently become NA, so fail loudly.
stopifnot(all(levels(Iaquinta$Stage) %in% unlist(stage_map)))
levels(Iaquinta$Stage) <- stage_map

levels(Iaquinta$Stage)
## [1] "Prelim" "Final" "Semi_Final" "Quarter_Final"
## [5] "Round_16"
# Expand the predictors into dummy (indicator) columns. `Year` is the
# left-hand side of the formula, so it is not part of the encoded matrix and
# is attached again afterwards.
dummies_model <- dummyVars(Year ~ ., data = Iaquinta)
trainData_mat <- predict(dummies_model, newdata = Iaquinta)
trainData <- data.frame(trainData_mat)
trainData$Year <- Iaquinta$Year
# Density plots of the numeric match variables, one panel per facet value.
# ggplot2 only needs to be attached once for all of them.
library(ggplot2)

# Attendance by stage.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Attendance, fill = Stage)) +
  facet_wrap(~Stage)

# Full-time home goals by stage.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Home.Team.Goals, fill = Stage)) +
  facet_wrap(~Stage)

# Full-time away goals by stage.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Away.Team.Goals, fill = Stage)) +
  facet_wrap(~Stage)

# Attendance by year.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Attendance, fill = Year)) +
  facet_wrap(~Year)

# Half-time home goals by stage.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Half.time.Home.Goals, fill = Stage)) +
  facet_wrap(~Stage)

# Half-time away goals by stage.
ggplot(data = Iaquinta) +
  geom_density(mapping = aes(x = Half.time.Away.Goals, fill = Stage)) +
  facet_wrap(~Stage)
# Dodged bar chart of one column, filled by another.
#
# Args:
#   Iaquinta: Data frame to plot.
#   var1: Column shown on the x axis, given as a position or a name.
#   var2: Column mapped to the fill colour, given as a position or a name.
#
# Returns:
#   The ggplot object (not printed by the function itself).
Al22 <- function(Iaquinta, var1, var2) {
  # Resolve positions to names so the columns can be referenced through the
  # `.data` pronoun. Indexing the data frame inside aes() labels the axis and
  # legend with the literal expression and fails for tibbles.
  x_col <- if (is.numeric(var1)) names(Iaquinta)[var1] else var1
  fill_col <- if (is.numeric(var2)) names(Iaquinta)[var2] else var2
  ggplot(data = Iaquinta) +
    geom_bar(
      mapping = aes(x = .data[[x_col]], fill = .data[[fill_col]]),
      position = "dodge"
    ) +
    labs(x = x_col, fill = fill_col)
}
Al22(Iaquinta, 2, 2)
# Density plot of one column, filled by another.
#
# Args:
#   Iaquinta: Data frame to plot.
#   var1: Column shown on the x axis, given as a position or a name.
#   var2: Column mapped to the fill colour, given as a position or a name.
#
# Returns:
#   The ggplot object (not printed by the function itself).
Al23 <- function(Iaquinta, var1, var2) {
  # Resolve positions to names so the columns can be referenced through the
  # `.data` pronoun, which also gives the axis and legend readable titles.
  x_col <- if (is.numeric(var1)) names(Iaquinta)[var1] else var1
  fill_col <- if (is.numeric(var2)) names(Iaquinta)[var2] else var2
  # No `position = "dodge"` here: dodging has no defined width for a density
  # curve and only produced the warning
  # "Width not defined. Set with `position_dodge(width = ?)`".
  ggplot(data = Iaquinta) +
    geom_density(
      mapping = aes(x = .data[[x_col]], fill = .data[[fill_col]])
    ) +
    labs(x = x_col, fill = fill_col)
}
Al23(Iaquinta, 2, 2)
# Print a density plot for every numeric column of a data frame.
#
# Args:
#   x: Data frame whose numeric columns are plotted.
#
# Returns:
#   NULL, invisibly; the function is called for the plots it prints.
Al25 <- function(x) {
  # Select the numeric columns up front. This also copes with a data frame
  # that has no columns, where `1:ncol(x)` would iterate over c(1, 0).
  numeric_cols <- names(x)[vapply(x, is.numeric, logical(1))]
  for (col in numeric_cols) {
    # print() is required: plots are not auto-printed inside a loop.
    print(
      ggplot(data = x) +
        geom_density(mapping = aes(x = .data[[col]])) +
        xlab(col)
    )
  }
  invisible(NULL)
}
Al25(Iaquinta)
# Candidate hyper-parameters for the ranger random forest: every combination
# of mtry in {1, 2} and min.node.size in {1, 2}, with the gini split rule.
myGrid <- expand.grid(
  mtry = 1:2,
  splitrule = "gini",
  min.node.size = 1:2
)

# Predict Stage from all remaining columns, scoring each grid row with
# 10-fold cross-validation and logging progress per fold.
rf_Iaquinta10 <- train(
  Stage ~ .,
  data = Iaquinta,
  method = "ranger",
  trControl = trainControl(
    method = "cv",
    number = 10,
    verboseIter = TRUE
  ),
  tuneGrid = myGrid
)
## + Fold01: mtry=1, splitrule=gini, min.node.size=1
## - Fold01: mtry=1, splitrule=gini, min.node.size=1
## + Fold01: mtry=2, splitrule=gini, min.node.size=1
## - Fold01: mtry=2, splitrule=gini, min.node.size=1
## + Fold01: mtry=1, splitrule=gini, min.node.size=2
## - Fold01: mtry=1, splitrule=gini, min.node.size=2
## + Fold01: mtry=2, splitrule=gini, min.node.size=2
## - Fold01: mtry=2, splitrule=gini, min.node.size=2
## + Fold02: mtry=1, splitrule=gini, min.node.size=1
## - Fold02: mtry=1, splitrule=gini, min.node.size=1
## + Fold02: mtry=2, splitrule=gini, min.node.size=1
## - Fold02: mtry=2, splitrule=gini, min.node.size=1
## + Fold02: mtry=1, splitrule=gini, min.node.size=2
## - Fold02: mtry=1, splitrule=gini, min.node.size=2
## + Fold02: mtry=2, splitrule=gini, min.node.size=2
## - Fold02: mtry=2, splitrule=gini, min.node.size=2
## + Fold03: mtry=1, splitrule=gini, min.node.size=1
## - Fold03: mtry=1, splitrule=gini, min.node.size=1
## + Fold03: mtry=2, splitrule=gini, min.node.size=1
## - Fold03: mtry=2, splitrule=gini, min.node.size=1
## + Fold03: mtry=1, splitrule=gini, min.node.size=2
## - Fold03: mtry=1, splitrule=gini, min.node.size=2
## + Fold03: mtry=2, splitrule=gini, min.node.size=2
## - Fold03: mtry=2, splitrule=gini, min.node.size=2
## + Fold04: mtry=1, splitrule=gini, min.node.size=1
## - Fold04: mtry=1, splitrule=gini, min.node.size=1
## + Fold04: mtry=2, splitrule=gini, min.node.size=1
## - Fold04: mtry=2, splitrule=gini, min.node.size=1
## + Fold04: mtry=1, splitrule=gini, min.node.size=2
## - Fold04: mtry=1, splitrule=gini, min.node.size=2
## + Fold04: mtry=2, splitrule=gini, min.node.size=2
## - Fold04: mtry=2, splitrule=gini, min.node.size=2
## + Fold05: mtry=1, splitrule=gini, min.node.size=1
## - Fold05: mtry=1, splitrule=gini, min.node.size=1
## + Fold05: mtry=2, splitrule=gini, min.node.size=1
## - Fold05: mtry=2, splitrule=gini, min.node.size=1
## + Fold05: mtry=1, splitrule=gini, min.node.size=2
## - Fold05: mtry=1, splitrule=gini, min.node.size=2
## + Fold05: mtry=2, splitrule=gini, min.node.size=2
## - Fold05: mtry=2, splitrule=gini, min.node.size=2
## + Fold06: mtry=1, splitrule=gini, min.node.size=1
## - Fold06: mtry=1, splitrule=gini, min.node.size=1
## + Fold06: mtry=2, splitrule=gini, min.node.size=1
## - Fold06: mtry=2, splitrule=gini, min.node.size=1
## + Fold06: mtry=1, splitrule=gini, min.node.size=2
## - Fold06: mtry=1, splitrule=gini, min.node.size=2
## + Fold06: mtry=2, splitrule=gini, min.node.size=2
## - Fold06: mtry=2, splitrule=gini, min.node.size=2
## + Fold07: mtry=1, splitrule=gini, min.node.size=1
## - Fold07: mtry=1, splitrule=gini, min.node.size=1
## + Fold07: mtry=2, splitrule=gini, min.node.size=1
## - Fold07: mtry=2, splitrule=gini, min.node.size=1
## + Fold07: mtry=1, splitrule=gini, min.node.size=2
## - Fold07: mtry=1, splitrule=gini, min.node.size=2
## + Fold07: mtry=2, splitrule=gini, min.node.size=2
## - Fold07: mtry=2, splitrule=gini, min.node.size=2
## + Fold08: mtry=1, splitrule=gini, min.node.size=1
## - Fold08: mtry=1, splitrule=gini, min.node.size=1
## + Fold08: mtry=2, splitrule=gini, min.node.size=1
## - Fold08: mtry=2, splitrule=gini, min.node.size=1
## + Fold08: mtry=1, splitrule=gini, min.node.size=2
## - Fold08: mtry=1, splitrule=gini, min.node.size=2
## + Fold08: mtry=2, splitrule=gini, min.node.size=2
## - Fold08: mtry=2, splitrule=gini, min.node.size=2
## + Fold09: mtry=1, splitrule=gini, min.node.size=1
## - Fold09: mtry=1, splitrule=gini, min.node.size=1
## + Fold09: mtry=2, splitrule=gini, min.node.size=1
## - Fold09: mtry=2, splitrule=gini, min.node.size=1
## + Fold09: mtry=1, splitrule=gini, min.node.size=2
## - Fold09: mtry=1, splitrule=gini, min.node.size=2
## + Fold09: mtry=2, splitrule=gini, min.node.size=2
## - Fold09: mtry=2, splitrule=gini, min.node.size=2
## + Fold10: mtry=1, splitrule=gini, min.node.size=1
## - Fold10: mtry=1, splitrule=gini, min.node.size=1
## + Fold10: mtry=2, splitrule=gini, min.node.size=1
## - Fold10: mtry=2, splitrule=gini, min.node.size=1
## + Fold10: mtry=1, splitrule=gini, min.node.size=2
## - Fold10: mtry=1, splitrule=gini, min.node.size=2
## + Fold10: mtry=2, splitrule=gini, min.node.size=2
## - Fold10: mtry=2, splitrule=gini, min.node.size=2
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 1, splitrule = gini, min.node.size = 1 on full training set
rf_Iaquinta10
## Random Forest
##
## 4572 samples
##   11 predictor
##    5 classes: 'Prelim', 'Final', 'Semi_Final', 'Quarter_Final', 'Round_16'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4115, 4114, 4115, 4113, 4115, 4116, ...
## Resampling results across tuning parameters:
##
##   mtry  min.node.size  Accuracy   Kappa
##   1     1              0.9534159  0
##   1     2              0.9534159  0
##   2     1              0.9534159  0
##   2     2              0.9534159  0
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 1, splitrule = gini
## and min.node.size = 1.
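# Selected model: cross-validated accuracy of 0.9534159. All four tuning
# combinations tie at this accuracy with Kappa = 0, which suggests the forest
# is doing no better than always predicting the most common stage.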