Data Loading

Loads necessary R packages for neural networks, data manipulation, visualization, and model training. Sets random seed to 42 for reproducibility. Reads the soccer dataset from CSV file and converts the outcome variable to a categorical factor with three levels for classification modeling.

library(nnet)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)

set.seed(42)

data <- read.csv("soccer_matches.csv", stringsAsFactors = FALSE)
data$outcome <- factor(data$outcome, levels = c("Away Win", "Draw", "Home Win"))

cat("Dataset:", nrow(data), "matches x", ncol(data), "variables\n")
## Dataset: 1500 matches x 20 variables
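
Before going further, a quick data-quality check is worth running; a minimal sketch, assuming missing values arrive as NA rather than as sentinel strings:

# Count missing values per column; all zeros means no imputation is needed
colSums(is.na(data))
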
kable(prop.table(table(data$outcome)) * 100, 
      col.names = c("Outcome", "Percentage"),
      caption = "Match Outcome Distribution")
Table: Match Outcome Distribution

Outcome    Percentage
--------   ----------
Away Win     32.13333
Draw         23.33333
Home Win     44.53333

The distribution reflects the well-documented home advantage: 44.5% home wins against 32.1% away wins and 23.3% draws.

Feature Engineering

Creates four new features that capture relative advantages between the teams: shot accuracy difference (share of shots on target, with a small constant in the denominator to avoid division by zero), possession difference, team quality difference based on league rank, and disciplinary difference with red cards weighted double.

# Share of shots on target; the +0.1 guards against division by zero when a team records no shots
data$shot_accuracy_diff <- (data$home_shots_on_target / (data$home_shots + 0.1)) - 
                           (data$away_shots_on_target / (data$away_shots + 0.1))
# Ball-control and quality gaps; positive values favour the home side
data$possession_diff <- data$home_possession - data$away_possession
data$rank_diff <- data$away_team_rank - data$home_team_rank
# Disciplinary gap, weighting red cards twice as heavily as yellows
data$total_cards_diff <- (data$home_yellow_cards + data$home_red_cards * 2) - 
                         (data$away_yellow_cards + data$away_red_cards * 2)
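
A quick sanity check on the engineered columns helps catch sign errors early; a sketch:

# Positive shot_accuracy_diff, possession_diff and rank_diff favour the home side;
# positive total_cards_diff means the home side picked up more cards
summary(data[, c("shot_accuracy_diff", "possession_diff", "rank_diff", "total_cards_diff")])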

Exploratory Analysis

Boxplots showing how key statistics (shots on target, possession, rank difference) vary across different match outcomes to identify predictive patterns.

data %>%
  select(outcome, home_shots_on_target, home_possession, rank_diff) %>%
  pivot_longer(cols = -outcome, names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = outcome, y = value, fill = outcome)) +
  geom_boxplot() +
  facet_wrap(~ metric, scales = "free_y") +
  labs(title = "Key Match Statistics by Outcome", x = "", y = "Value") +
  theme_minimal() +
  theme(legend.position = "none")
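
As a numeric complement to the boxplots, per-outcome means can be tabulated with the same pipeline style; a sketch:

# Mean of each key statistic within each outcome group
data %>%
  group_by(outcome) %>%
  summarise(home_sot = mean(home_shots_on_target),
            home_poss = mean(home_possession),
            rank_diff = mean(rank_diff)) %>%
  kable(digits = 2, caption = "Mean Key Statistics by Outcome")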

Data Preparation

Selects 19 features for modeling and splits the dataset into 70% training (1,051 matches) and 30% testing (449 matches) using stratified sampling to maintain outcome proportions. Features are then centered and scaled using statistics computed on the training set only, so no information leaks from the test set.

feature_cols <- c("home_possession", "home_shots", "home_shots_on_target", 
                 "home_corners", "home_fouls", "home_yellow_cards", "home_red_cards",
                 "away_shots", "away_shots_on_target", "away_corners", 
                 "away_fouls", "away_yellow_cards", "away_red_cards",
                 "home_team_rank", "away_team_rank",
                 "shot_accuracy_diff", "possession_diff", "rank_diff", "total_cards_diff")

model_data <- data[, c(feature_cols, "outcome")]

train_index <- createDataPartition(model_data$outcome, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

preproc <- preProcess(train_data[, feature_cols], method = c("center", "scale"))
train_scaled <- predict(preproc, train_data)
test_scaled <- predict(preproc, test_data)

cat("Training set:", nrow(train_data), "| Test set:", nrow(test_data), "\n")
## Training set: 1051 | Test set: 449
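
Stratification can be verified by comparing class proportions across the two splits; a sketch:

# Outcome proportions (%) should be nearly identical in both splits
round(rbind(Train = prop.table(table(train_data$outcome)),
            Test  = prop.table(table(test_data$outcome))) * 100, 1)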

Logistic Regression

Trains a multinomial logistic regression model on all features, makes predictions on the test set, and evaluates performance using a confusion matrix.

logit_model <- multinom(outcome ~ ., data = train_scaled, maxit = 500, trace = FALSE)

logit_pred <- predict(logit_model, newdata = test_scaled)
logit_cm <- confusionMatrix(logit_pred, test_scaled$outcome)

logit_accuracy <- logit_cm$overall['Accuracy']
cat("Logistic Regression Accuracy:", round(logit_accuracy * 100, 2), "%\n")
## Logistic Regression Accuracy: 61.92 %
kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(logit_cm$byClass[, "Precision"] * 100, 1),
  Recall = round(logit_cm$byClass[, "Recall"] * 100, 1),
  F1_Score = round(logit_cm$byClass[, "F1"], 3)
), row.names = FALSE, caption = "Logistic Regression Performance by Class")
Table: Logistic Regression Performance by Class

Class      Precision   Recall   F1_Score
--------   ---------   ------   --------
Away Win        62.6     70.8      0.664
Draw            36.8      6.7      0.113
Home Win        63.3     84.5      0.724
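
Because the inputs are standardized, the multinomial coefficients sit on a roughly comparable scale and can be inspected directly to see which features drive each class; a sketch:

# One row of coefficients per non-reference class (Draw and Home Win vs. the Away Win baseline)
round(coef(logit_model), 2)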

Neural Network

Trains a single-hidden-layer neural network with 10 hidden units and weight-decay regularization (decay = 0.01), makes predictions on the test set, and evaluates test-set performance.

nn_model <- nnet(outcome ~ ., 
                data = train_scaled, 
                size = 10,
                decay = 0.01,
                maxit = 500,
                trace = FALSE)

nn_pred <- predict(nn_model, newdata = test_scaled, type = "class")
nn_pred <- factor(nn_pred, levels = levels(test_scaled$outcome))
nn_cm <- confusionMatrix(nn_pred, test_scaled$outcome)

nn_accuracy <- nn_cm$overall['Accuracy']
cat("Neural Network Accuracy:", round(nn_accuracy * 100, 2), "%\n")
## Neural Network Accuracy: 51.89 %

Creates a formatted table showing how well the neural network performs for each outcome class (Away Win, Draw, Home Win).

kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(nn_cm$byClass[, "Precision"] * 100, 1),
  Recall = round(nn_cm$byClass[, "Recall"] * 100, 1),
  F1_Score = round(nn_cm$byClass[, "F1"], 3)
), row.names = FALSE, caption = "Neural Network Performance by Class")
Table: Neural Network Performance by Class

Class      Precision   Recall   F1_Score
--------   ---------   ------   --------
Away Win        57.7     54.9      0.562
Draw            27.8     21.0      0.239
Home Win        56.7     66.0      0.610
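
The single size/decay setting above is essentially a guess; caret can instead search a small grid with cross-validation. A sketch, assuming 5-fold CV is acceptable (the grid values are illustrative and the search adds noticeable runtime):

nn_grid <- expand.grid(size = c(3, 5, 10), decay = c(0.001, 0.01, 0.1))
nn_tuned <- train(outcome ~ ., data = train_scaled,
                  method = "nnet",
                  tuneGrid = nn_grid,
                  trControl = trainControl(method = "cv", number = 5),
                  maxit = 500, trace = FALSE)
nn_tuned$bestTune  # best size/decay combination found by CV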

Model Comparison

Comparison table showing accuracy and improvement over a uniform random baseline (33.3%) for both models. Logistic regression outperforms the neural network by roughly 10 percentage points (61.9% vs. 51.9%). Note that always predicting the majority class (Home Win) would already score 44.5%, a stricter baseline that both models still beat.

comparison <- data.frame(
  Model = c("Random Baseline", "Logistic Regression", "Neural Network"),
  Accuracy = c(33.33, round(logit_accuracy * 100, 2), round(nn_accuracy * 100, 2)),
  Improvement = c(0, round((logit_accuracy - 0.333) * 100, 2),
                  round((nn_accuracy - 0.333) * 100, 2))
)

kable(comparison, caption = "Model Performance Comparison")
Table: Model Performance Comparison

Model                 Accuracy   Improvement
-------------------   --------   -----------
Random Baseline          33.33          0.00
Logistic Regression      61.92         28.62
Neural Network           51.89         18.59

Bar chart comparing model accuracies with a horizontal line showing the random baseline, visually demonstrating that both models substantially beat random guessing.

# Capture the bar midpoints so the baseline label sits over the second bar regardless of bar spacing
bp <- barplot(comparison$Accuracy,
              names.arg = comparison$Model,
              main = "Model Accuracy Comparison",
              ylab = "Accuracy (%)",
              col = c("#d62728", "#1f77b4", "#ff7f0e"),
              ylim = c(0, 100))
abline(h = 33.33, col = "red", lty = 2, lwd = 2)
text(bp[2], 35, "Random Baseline", col = "red", cex = 0.9)