Loads the R packages needed for neural networks, data manipulation, visualization, and model training; sets the random seed to 42 for reproducibility; reads the soccer dataset from a CSV file; and converts the outcome variable to a three-level factor for classification modeling.
library(nnet)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
set.seed(42)
data <- read.csv("soccer_matches.csv", stringsAsFactors = FALSE)
data$outcome <- factor(data$outcome, levels = c("Away Win", "Draw", "Home Win"))
cat("Dataset:", nrow(data), "matches x", ncol(data), "variables\n")
## Dataset: 1500 matches x 20 variables
kable(round(prop.table(table(data$outcome)) * 100, 1),
      col.names = c("Outcome", "Percentage"),
      caption = "Match Outcome Distribution")
| Outcome | Percentage |
|---|---|
| Away Win | 32.1 |
| Draw | 23.3 |
| Home Win | 44.5 |
The dataset shows realistic soccer patterns with 44.5% home wins, 32.1% away wins, and 23.3% draws.
Creates four new features that capture relative advantages between teams: shot accuracy difference, possession difference, team quality difference, and disciplinary difference.
# Shot accuracy gap: share of shots on target, with +0.1 in the denominator to avoid division by zero
data$shot_accuracy_diff <- (data$home_shots_on_target / (data$home_shots + 0.1)) -
  (data$away_shots_on_target / (data$away_shots + 0.1))
# Possession gap and ranking gap (away rank minus home rank)
data$possession_diff <- data$home_possession - data$away_possession
data$rank_diff <- data$away_team_rank - data$home_team_rank
# Weighted discipline gap: red cards count double
data$total_cards_diff <- (data$home_yellow_cards + data$home_red_cards * 2) -
  (data$away_yellow_cards + data$away_red_cards * 2)
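A quick sanity check (a minimal sketch; output not shown) confirms each engineered difference straddles zero, as expected for relative metrics:
# Five-number summaries of the four engineered features
summary(data[, c("shot_accuracy_diff", "possession_diff",
                 "rank_diff", "total_cards_diff")])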
Plots boxplots showing how key statistics (shots on target, possession, rank difference) vary across match outcomes, to identify predictive patterns.
data %>%
  select(outcome, home_shots_on_target, home_possession, rank_diff) %>%
  pivot_longer(cols = -outcome, names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = outcome, y = value, fill = outcome)) +
  geom_boxplot() +
  facet_wrap(~ metric, scales = "free_y") +
  labs(title = "Key Match Statistics by Outcome", x = "", y = "Value") +
  theme_minimal() +
  theme(legend.position = "none")
Selects 19 features for modeling, splits the dataset into 70% training (1,051 matches) and 30% testing (449 matches) using stratified sampling to maintain outcome proportions, then centers and scales the features using parameters estimated on the training set only.
feature_cols <- c("home_possession", "home_shots", "home_shots_on_target",
                  "home_corners", "home_fouls", "home_yellow_cards", "home_red_cards",
                  "away_shots", "away_shots_on_target", "away_corners",
                  "away_fouls", "away_yellow_cards", "away_red_cards",
                  "home_team_rank", "away_team_rank",
                  "shot_accuracy_diff", "possession_diff", "rank_diff", "total_cards_diff")
model_data <- data[, c(feature_cols, "outcome")]
train_index <- createDataPartition(model_data$outcome, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
preproc <- preProcess(train_data[, feature_cols], method = c("center", "scale"))
train_scaled <- predict(preproc, train_data)
test_scaled <- predict(preproc, test_data)
cat("Training set:", nrow(train_data), "| Test set:", nrow(test_data), "\n")
## Training set: 1051 | Test set: 449
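As a quick check on the stratification (a minimal sketch; output not shown), outcome proportions in both splits should closely match the full-data distribution reported above:
# Outcome proportions (%) in the training and test sets
round(prop.table(table(train_data$outcome)) * 100, 1)
round(prop.table(table(test_data$outcome)) * 100, 1)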
Trains a multinomial logistic regression model on all features, makes predictions on the test set, and evaluates performance using a confusion matrix.
logit_model <- multinom(outcome ~ ., data = train_scaled, maxit = 500, trace = FALSE)
logit_pred <- predict(logit_model, newdata = test_scaled)
logit_cm <- confusionMatrix(logit_pred, test_scaled$outcome)
logit_accuracy <- logit_cm$overall['Accuracy']
cat("Logistic Regression Accuracy:", round(logit_accuracy * 100, 2), "%\n")
## Logistic Regression Accuracy: 61.92 %
kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(logit_cm$byClass[, 5] * 100, 1),
  Recall = round(logit_cm$byClass[, 6] * 100, 1),
  F1_Score = round(logit_cm$byClass[, 7], 3)
), row.names = FALSE, caption = "Logistic Regression Performance by Class")
| Class | Precision | Recall | F1_Score |
|---|---|---|---|
| Away Win | 62.6 | 70.8 | 0.664 |
| Draw | 36.8 | 6.7 | 0.113 |
| Home Win | 63.3 | 84.5 | 0.724 |
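The model recovers home and away wins reasonably well but rarely predicts draws (6.7% recall). To see which predictors drive each class, the fitted coefficients can be inspected directly (a minimal sketch; multinom reports one coefficient row per non-reference class, here Draw and Home Win relative to Away Win):
# Coefficients on the log-odds scale, relative to the reference class (Away Win)
round(coef(logit_model), 3)
# Odds ratios per one-standard-deviation change, since the features are scaled
round(exp(coef(logit_model)), 3)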
Trains a single-hidden-layer neural network with 10 hidden units and weight-decay regularization (decay = 0.01), makes predictions on the test set, and evaluates accuracy.
nn_model <- nnet(outcome ~ .,
                 data = train_scaled,
                 size = 10,
                 decay = 0.01,
                 maxit = 500,
                 trace = FALSE)
nn_pred <- predict(nn_model, newdata = test_scaled, type = "class")
nn_pred <- factor(nn_pred, levels = levels(test_scaled$outcome))
nn_cm <- confusionMatrix(nn_pred, test_scaled$outcome)
nn_accuracy <- nn_cm$overall['Accuracy']
cat("Neural Network Accuracy:", round(nn_accuracy * 100, 2), "%\n")
## Neural Network Accuracy: 51.89 %
Creates a formatted table showing how well the neural network performs for each outcome class (Away Win, Draw, Home Win).
kable(data.frame(
  Class = c("Away Win", "Draw", "Home Win"),
  Precision = round(nn_cm$byClass[, 5] * 100, 1),
  Recall = round(nn_cm$byClass[, 6] * 100, 1),
  F1_Score = round(nn_cm$byClass[, 7], 3)
), row.names = FALSE, caption = "Neural Network Performance by Class")
| Class | Precision | Recall | F1_Score |
|---|---|---|---|
| Away Win | 57.7 | 54.9 | 0.562 |
| Draw | 27.8 | 21.0 | 0.239 |
| Home Win | 56.7 | 66.0 | 0.610 |
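The hidden-layer size and decay were fixed a priori here; a tuned network might narrow the gap. A hedged sketch of a cross-validated grid search with caret's train (the grid values are illustrative assumptions, not results from this analysis):
# Illustrative 5-fold CV grid search over size and decay (not run above)
tune_grid <- expand.grid(size = c(3, 5, 10), decay = c(0.001, 0.01, 0.1))
nn_tuned <- train(outcome ~ ., data = train_scaled,
                  method = "nnet",
                  tuneGrid = tune_grid,
                  trControl = trainControl(method = "cv", number = 5),
                  maxit = 500, trace = FALSE)
nn_tuned$bestTune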
Compares accuracy and improvement over the random baseline (33.3%) for both models. Logistic regression outperforms the neural network by about 10 percentage points.
comparison <- data.frame(
  Model = c("Random Baseline", "Logistic Regression", "Neural Network"),
  Accuracy = c(33.33, round(logit_accuracy * 100, 2), round(nn_accuracy * 100, 2)),
  Improvement = c(0, round((logit_accuracy - 0.333) * 100, 2),
                  round((nn_accuracy - 0.333) * 100, 2))
)
kable(comparison, caption = "Model Performance Comparison")
| Model | Accuracy | Improvement |
|---|---|---|
| Random Baseline | 33.33 | 0.00 |
| Logistic Regression | 61.92 | 28.62 |
| Neural Network | 51.89 | 18.59 |
Plots a bar chart comparing model accuracies, with a horizontal dashed line marking the random baseline, visually demonstrating that both models substantially beat random guessing.
barplot(comparison$Accuracy,
        names.arg = comparison$Model,
        main = "Model Accuracy Comparison",
        ylab = "Accuracy (%)",
        col = c("#d62728", "#1f77b4", "#ff7f0e"),
        ylim = c(0, 100))
abline(h = 33.33, col = "red", lty = 2, lwd = 2)
text(2, 35, "Random Baseline", col = "red", cex = 0.9)
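To check whether the roughly 10-point accuracy gap reflects more than noise on 449 test matches, the two models' per-match correctness can be compared with a paired test (a minimal sketch using base R's mcnemar.test; not run above):
# Paired comparison: matches one model classifies correctly and the other misses
logit_correct <- logit_pred == test_scaled$outcome
nn_correct <- nn_pred == test_scaled$outcome
mcnemar.test(table(logit_correct, nn_correct))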