Data Loading

Loads necessary R packages for neural networks, data manipulation, visualization, and model training. Sets random seed to 42 for reproducibility. Reads the soccer dataset from CSV file and converts the outcome variable to a categorical factor with three levels for classification modeling.

library(nnet)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)

set.seed(42)

data <- read.csv("soccer_matches.csv", stringsAsFactors = FALSE)
data$outcome <- factor(data$outcome, levels = c("Away Win", "Draw", "Home Win"))

cat("Dataset:", nrow(data), "matches x", ncol(data), "variables\n")
## Dataset: 1500 matches x 20 variables
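
Before going further, a quick data-quality check is worth running; a minimal sketch, assuming missing values arrive as NA rather than as sentinel strings:

# Count missing values per column; all zeros means no imputation is needed
colSums(is.na(data))
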
kable(prop.table(table(data$outcome)) * 100, 
      col.names = c("Outcome", "Percentage"),
      caption = "Match Outcome Distribution")
Table: Match Outcome Distribution

Outcome    Percentage
--------   ----------
Away Win     32.13333
Draw         23.33333
Home Win     44.53333

The distribution reflects the well-documented home advantage: 44.5% home wins against 32.1% away wins and 23.3% draws.

Feature Engineering

Creates four new features that capture relative advantages between the teams: shot accuracy difference (share of shots on target, with a small constant in the denominator to avoid division by zero), possession difference, team quality difference based on league rank, and disciplinary difference with red cards weighted double.

# Share of shots on target; the +0.1 guards against division by zero when a team records no shots
data$shot_accuracy_diff <- (data$home_shots_on_target / (data$home_shots + 0.1)) - 
                           (data$away_shots_on_target / (data$away_shots + 0.1))
# Ball-control and quality gaps; positive values favour the home side
data$possession_diff <- data$home_possession - data$away_possession
data$rank_diff <- data$away_team_rank - data$home_team_rank
# Disciplinary gap, weighting red cards twice as heavily as yellows
data$total_cards_diff <- (data$home_yellow_cards + data$home_red_cards * 2) - 
                         (data$away_yellow_cards + data$away_red_cards * 2)
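
A quick sanity check on the engineered columns helps catch sign errors early; a sketch:

# Positive shot_accuracy_diff, possession_diff and rank_diff favour the home side;
# positive total_cards_diff means the home side picked up more cards
summary(data[, c("shot_accuracy_diff", "possession_diff", "rank_diff", "total_cards_diff")])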

Exploratory Analysis

Boxplots showing how key statistics (shots on target, possession, rank difference) vary across different match outcomes to identify predictive patterns.

data %>%
  select(outcome, home_shots_on_target, home_possession, rank_diff) %>%
  pivot_longer(cols = -outcome, names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = outcome, y = value, fill = outcome)) +
  geom_boxplot() +
  facet_wrap(~ metric, scales = "free_y") +
  labs(title = "Key Match Statistics by Outcome", x = "", y = "Value") +
  theme_minimal() +
  theme(legend.position = "none")
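
As a numeric complement to the boxplots, per-outcome means can be tabulated with the same pipeline style; a sketch:

# Mean of each key statistic within each outcome group
data %>%
  group_by(outcome) %>%
  summarise(home_sot = mean(home_shots_on_target),
            home_poss = mean(home_possession),
            rank_diff = mean(rank_diff)) %>%
  kable(digits = 2, caption = "Mean Key Statistics by Outcome")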

Data Preparation

Selects 19 features for modeling and splits the dataset into 70% training (1,051 matches) and 30% testing (449 matches) using stratified sampling to maintain outcome proportions. Features are then centered and scaled using statistics computed on the training set only, so no information leaks from the test set.

feature_cols <- c("home_possession", "home_shots", "home_shots_on_target", 
                 "home_corners", "home_fouls", "home_yellow_cards", "home_red_cards",
                 "away_shots", "away_shots_on_target", "away_corners", 
                 "away_fouls", "away_yellow_cards", "away_red_cards",
                 "home_team_rank", "away_team_rank",
                 "shot_accuracy_diff", "possession_diff", "rank_diff", "total_cards_diff")

model_data <- data[, c(feature_cols, "outcome")]

train_index <- createDataPartition(model_data$outcome, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

preproc <- preProcess(train_data[, feature_cols], method = c("center", "scale"))
train_scaled <- predict(preproc, train_data)
test_scaled <- predict(preproc, test_data)

cat("Training set:", nrow(train_data), "| Test set:", nrow(test_data), "\n")
## Training set: 1051 | Test set: 449
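
Stratification can be verified by comparing class proportions across the two splits; a sketch:

# Outcome proportions (%) should be nearly identical in both splits
round(rbind(Train = prop.table(table(train_data$outcome)),
            Test  = prop.table(table(test_data$outcome))) * 100, 1)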

Logistic Regression

Trains a multinomial logistic regression model on all features, makes predictions on the test set, and evaluates performance using a confusion matrix.

logit_model <- multinom(outcome ~ ., data = train_scaled, maxit = 500, trace = FALSE)

logit_pred <- predict(logit_model, newdata = test_scaled)
logit_cm <- confusionMatrix(logit_pred, test_scaled$outcome)

logit_accuracy <- logit_cm$overall['Accuracy']
cat("Logistic Regression Accuracy:", round(logit_accuracy * 100, 2), "%\n")
## Logistic Regression Accuracy: 61.92 %
kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(logit_cm$byClass[, "Precision"] * 100, 1),
  Recall = round(logit_cm$byClass[, "Recall"] * 100, 1),
  F1_Score = round(logit_cm$byClass[, "F1"], 3)
), row.names = FALSE, caption = "Logistic Regression Performance by Class")
Table: Logistic Regression Performance by Class

Class      Precision   Recall   F1_Score
--------   ---------   ------   --------
Away Win        62.6     70.8      0.664
Draw            36.8      6.7      0.113
Home Win        63.3     84.5      0.724
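
Because the inputs are standardized, the multinomial coefficients sit on a roughly comparable scale and can be inspected directly to see which features drive each class; a sketch:

# One row of coefficients per non-reference class (Draw and Home Win vs. the Away Win baseline)
round(coef(logit_model), 2)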

Neural Network

Trains a single-hidden-layer neural network with 10 hidden units and weight-decay regularization (decay = 0.01), makes predictions on the test set, and evaluates test-set performance.

nn_model <- nnet(outcome ~ ., 
                data = train_scaled, 
                size = 10,
                decay = 0.01,
                maxit = 500,
                trace = FALSE)

nn_pred <- predict(nn_model, newdata = test_scaled, type = "class")
nn_pred <- factor(nn_pred, levels = levels(test_scaled$outcome))
nn_cm <- confusionMatrix(nn_pred, test_scaled$outcome)

nn_accuracy <- nn_cm$overall['Accuracy']
cat("Neural Network Accuracy:", round(nn_accuracy * 100, 2), "%\n")
## Neural Network Accuracy: 51.89 %

Creates a formatted table showing how well the neural network performs for each outcome class (Away Win, Draw, Home Win).

kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(nn_cm$byClass[, "Precision"] * 100, 1),
  Recall = round(nn_cm$byClass[, "Recall"] * 100, 1),
  F1_Score = round(nn_cm$byClass[, "F1"], 3)
), row.names = FALSE, caption = "Neural Network Performance by Class")
Table: Neural Network Performance by Class

Class      Precision   Recall   F1_Score
--------   ---------   ------   --------
Away Win        57.7     54.9      0.562
Draw            27.8     21.0      0.239
Home Win        56.7     66.0      0.610
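
The single size/decay setting above is essentially a guess; caret can instead search a small grid with cross-validation. A sketch, assuming 5-fold CV is acceptable (the grid values are illustrative and the search adds noticeable runtime):

nn_grid <- expand.grid(size = c(3, 5, 10), decay = c(0.001, 0.01, 0.1))
nn_tuned <- train(outcome ~ ., data = train_scaled,
                  method = "nnet",
                  tuneGrid = nn_grid,
                  trControl = trainControl(method = "cv", number = 5),
                  maxit = 500, trace = FALSE)
nn_tuned$bestTune  # best size/decay combination found by CV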

Model Comparison

Comparison table showing accuracy and improvement over a uniform random baseline (33.3%) for both models. Logistic regression outperforms the neural network by roughly 10 percentage points (61.9% vs. 51.9%). Note that always predicting the majority class (Home Win) would already score 44.5%, a stricter baseline that both models still beat.

comparison <- data.frame(
  Model = c("Random Baseline", "Logistic Regression", "Neural Network"),
  Accuracy = c(33.33, round(logit_accuracy * 100, 2), round(nn_accuracy * 100, 2)),
  Improvement = c(0, round((logit_accuracy - 0.333) * 100, 2),
                  round((nn_accuracy - 0.333) * 100, 2))
)

kable(comparison, caption = "Model Performance Comparison")
Table: Model Performance Comparison

Model                 Accuracy   Improvement
-------------------   --------   -----------
Random Baseline          33.33          0.00
Logistic Regression      61.92         28.62
Neural Network           51.89         18.59

Bar chart comparing model accuracies with a horizontal line showing the random baseline, visually demonstrating that both models substantially beat random guessing.

# Capture the bar midpoints so the baseline label sits over the second bar regardless of bar spacing
bp <- barplot(comparison$Accuracy,
              names.arg = comparison$Model,
              main = "Model Accuracy Comparison",
              ylab = "Accuracy (%)",
              col = c("#d62728", "#1f77b4", "#ff7f0e"),
              ylim = c(0, 100))
abline(h = 33.33, col = "red", lty = 2, lwd = 2)
text(bp[2], 35, "Random Baseline", col = "red", cex = 0.9)