# Load necessary libraries
library(tidyverse)
# Import datasets (assuming they are in your current project directory)
season_stats <- read_csv("WomenSuperLeagueseason2022-23.csv")
Rows: 12 Columns: 16
── Column specification ───────────────────────────────
Delimiter: ","
chr (1): Squad
dbl (14): Rk, MP, W, D, L, GF, GA, GD, Pts, Pts/MP,...
num (1): Attendance
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
passing_stats <- read_csv("WomenSuperLeaguePassingstats2022-23.csv")
New names:
• `Cmp` -> `Cmp...5`
• `Att` -> `Att...6`
• `Cmp%` -> `Cmp%...7`
• `Cmp` -> `Cmp...10`
• `Att` -> `Att...11`
• `Cmp%` -> `Cmp%...12`
• `Cmp` -> `Cmp...13`
• `Att` -> `Att...14`
• `Cmp%` -> `Cmp%...15`
• `Cmp` -> `Cmp...16`
• `Att` -> `Att...17`
• `Cmp%` -> `Cmp%...18`
Rows: 12 Columns: 27
── Column specification ───────────────────────────────
Delimiter: ","
chr (1): Squad
dbl (26): Rank, # Pl, 90s, Cmp...5, Att...6, Cmp%.....
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pass_types <- read_csv("WomenSuperLeaguePassTypes2022-23.csv")
Rows: 12 Columns: 19
── Column specification ───────────────────────────────
Delimiter: ","
chr (1): Squad
dbl (18): Rank, # Pl, 90s, Att, Live, Dead, FK, TB,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the data
head(season_stats)
head(passing_stats)
head(pass_types)
# Standardize column names to lowercase for consistency
season_stats <- season_stats %>% rename_all(tolower)
passing_stats <- passing_stats %>% rename_all(tolower)
pass_types <- pass_types %>% rename_all(tolower)
# Let's preview the squad names to ensure they align
unique(season_stats$squad)
[1] "Club Crest\xa0Chelsea"
[2] "Club Crest\xa0Manchester Utd"
[3] "Club Crest\xa0Arsenal"
[4] "Club Crest\xa0Manchester City"
[5] "Club Crest\xa0Aston Villa"
[6] "Club Crest\xa0Everton"
[7] "Club Crest\xa0Liverpool"
[8] "Club Crest\xa0West Ham"
[9] "Club Crest\xa0Tottenham"
[10] "Club Crest\xa0Leicester City WFC"
[11] "Club Crest\xa0Brighton"
[12] "Club Crest\xa0Reading"
unique(passing_stats$squad)
[1] "Arsenal" "Aston Villa"
[3] "Brighton" "Chelsea"
[5] "Everton" "Leicester City WFC"
[7] "Liverpool" "Manchester City"
[9] "Manchester Utd" "Reading"
[11] "Tottenham" "West Ham"
unique(pass_types$squad)
[1] "Arsenal" "Aston Villa"
[3] "Brighton" "Chelsea"
[5] "Everton" "Leicester City WFC"
[7] "Liverpool" "Manchester City"
[9] "Manchester Utd" "Reading"
[11] "Tottenham" "West Ham"
# Final cleanup to extract just the team name
season_stats <- season_stats %>%
mutate(squad = str_extract(squad, "[A-Za-z ]+$"))
# Check the result
unique(season_stats$squad)
[1] "Chelsea" "Manchester Utd"
[3] "Arsenal" "Manchester City"
[5] "Aston Villa" "Everton"
[7] "Liverpool" "West Ham"
[9] "Tottenham" "Leicester City WFC"
[11] "Brighton" "Reading"
# Merge all datasets on 'squad'
merged_data <- season_stats %>%
inner_join(passing_stats, by = "squad") %>%
inner_join(pass_types, by = "squad")
# View the structure of the merged data
glimpse(merged_data)
Rows: 12
Columns: 60
$ rk <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ squad <chr> "Chelsea", "Manchester Utd", "Ars…
$ mp <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ w <dbl> 19, 18, 15, 15, 11, 9, 6, 6, 5, 5…
$ d <dbl> 1, 2, 2, 2, 4, 3, 5, 3, 3, 1, 4, 2
$ l <dbl> 2, 2, 5, 5, 7, 10, 11, 13, 14, 16…
$ gf <dbl> 66, 56, 49, 50, 47, 29, 24, 23, 3…
$ ga <dbl> 15, 12, 16, 25, 37, 36, 39, 44, 4…
$ gd <dbl> 51, 44, 33, 25, 10, -7, -15, -21,…
$ pts <dbl> 58, 56, 47, 47, 37, 30, 23, 21, 1…
$ `pts/mp` <dbl> 2.64, 2.55, 2.14, 2.14, 1.68, 1.3…
$ xg <dbl> 43.8, 35.4, 42.7, 48.0, 28.0, 20.…
$ xga <dbl> 17.8, 13.0, 17.6, 17.2, 24.6, 31.…
$ xgd <dbl> 26.0, 22.4, 25.1, 30.8, 3.3, -11.…
$ `xgd/90` <dbl> 1.18, 1.02, 1.14, 1.40, 0.15, -0.…
$ attendance <dbl> 5616, 9648, 13996, 6559, 4199, 25…
$ rank.x <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ `# pl.x` <dbl> 25, 24, 27, 24, 26, 23, 27, 26, 2…
$ `90s.x` <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ cmp...5 <dbl> 9939, 10120, 9705, 10460, 8431, 8…
$ att...6 <dbl> 12582, 12518, 12030, 12748, 10922…
$ `cmp%...7` <dbl> 79.0, 80.8, 80.7, 82.1, 77.2, 78.…
$ totdist <dbl> 191912, 186350, 184631, 189947, 1…
$ prgdist <dbl> 69839, 68111, 66769, 71843, 60024…
$ cmp...10 <dbl> 3509, 3904, 3540, 4203, 3562, 355…
$ att...11 <dbl> 4219, 4556, 4122, 4828, 4251, 421…
$ `cmp%...12` <dbl> 83.2, 85.7, 85.9, 87.1, 83.8, 84.…
$ cmp...13 <dbl> 5064, 4944, 4911, 5062, 3834, 443…
$ att...14 <dbl> 5924, 5665, 5656, 5779, 4575, 521…
$ `cmp%...15` <dbl> 85.5, 87.3, 86.8, 87.6, 83.8, 85.…
$ cmp...16 <dbl> 1216, 1044, 1119, 989, 882, 744, …
$ att...17 <dbl> 1961, 1741, 1793, 1665, 1603, 143…
$ `cmp%...18` <dbl> 62.0, 60.0, 62.4, 59.4, 55.0, 51.…
$ ast <dbl> 44, 41, 31, 38, 33, 20, 15, 12, 2…
$ xag <dbl> 29.8, 24.9, 29.2, 36.6, 19.0, 14.…
$ xa <dbl> 33.2, 29.3, 31.8, 36.0, 20.1, 14.…
$ `a-xag` <dbl> 14.2, 16.1, 1.8, 1.4, 14.0, 5.2, …
$ kp <dbl> 286, 259, 295, 349, 202, 172, 144…
$ `3-jan` <dbl> 814, 754, 817, 907, 691, 545, 441…
$ ppa <dbl> 265, 256, 261, 276, 161, 128, 113…
$ crspa <dbl> 45, 58, 45, 93, 44, 31, 42, 25, 3…
$ prgp <dbl> 1004, 979, 1026, 1062, 831, 716, …
$ rank.y <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ `# pl.y` <dbl> 25, 24, 27, 24, 26, 23, 27, 26, 2…
$ `90s.y` <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ att <dbl> 12582, 12518, 12030, 12748, 10922…
$ live <dbl> 11440, 11375, 10911, 11630, 9768,…
$ dead <dbl> 1076, 1074, 1046, 1088, 1086, 112…
$ fk <dbl> 239, 249, 217, 232, 224, 278, 204…
$ tb <dbl> 17, 12, 15, 11, 6, 5, 4, 2, 7, 1,…
$ sw <dbl> 43, 32, 50, 30, 38, 19, 21, 22, 4…
$ crs <dbl> 380, 406, 375, 519, 425, 268, 307…
$ ti <dbl> 546, 567, 536, 511, 553, 490, 578…
$ ck <dbl> 128, 116, 156, 179, 115, 100, 82,…
$ `in` <dbl> 66, 35, 51, 84, 46, 48, 48, 25, 2…
$ out <dbl> 24, 29, 62, 23, 37, 10, 21, 33, 1…
$ str <dbl> 5, 23, 12, 24, 18, 12, 3, 2, 18, …
$ cmp <dbl> 9939, 10120, 9705, 10460, 8431, 8…
$ off <dbl> 66, 69, 73, 30, 68, 32, 33, 46, 4…
$ blocks <dbl> 226, 212, 214, 218, 268, 277, 200…
library(corrplot)
corrplot 0.95 loaded
# Select numeric columns of interest
eda_vars <- merged_data %>%
select(squad, pts, xg, `cmp%...7`, kp, prgp, xag, xa, `3-jan`, ppa, crspa, att...6, prgdist)
cor_data <- eda_vars %>% select(-squad)
# Create and plot the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
corrplot::corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Fit a multiple linear regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)
# View the summary (coefficients, R², p-values)
summary(lm_pts)
Call:
lm(formula = pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)
Residuals:
Min 1Q Median 3Q Max
-5.9704 -2.0161 0.2005 2.6937 5.0021
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.43087 43.67086 0.239 0.8192
kp -0.18506 0.17667 -1.047 0.3352
prgp 0.13956 0.04537 3.076 0.0218 *
xag -0.17197 2.17614 -0.079 0.9396
xg 0.33561 1.04540 0.321 0.7591
`cmp%...7` -0.68601 0.77439 -0.886 0.4098
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.893 on 6 degrees of freedom
Multiple R-squared: 0.9534, Adjusted R-squared: 0.9145
F-statistic: 24.53 on 5 and 6 DF, p-value: 0.0006311
# Select key metrics for PCA
pca_vars <- merged_data %>%
select(kp, prgp, xag, xg, `cmp%...7`, prgdist)
# Run PCA (standardizing variables)
pca_res <- prcomp(pca_vars, scale. = TRUE)
# Summarize
summary(pca_res)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 2.3789 0.5208 0.1913 0.14666
Proportion of Variance 0.9432 0.0452 0.0061 0.00358
Cumulative Proportion 0.9432 0.9884 0.9945 0.99806
PC5 PC6
Standard deviation 0.09579 0.04991
Proportion of Variance 0.00153 0.00042
Cumulative Proportion 0.99958 1.00000
# Scree plot to pick number of components
plot(pca_res, type = "l", main = "PCA Scree Plot")

# Biplot of PC1 vs PC2
biplot(pca_res, scale = 0)

# View PCA loadings (contributions of each variable to the principal components)
pca_res$rotation[, 1] # PC1 loadings
kp prgp xag xg cmp%...7
-0.4106414 -0.4167559 -0.4132278 -0.4076769 -0.3860316
prgdist
-0.4143792
set.seed(42)
# Scale and cluster on the same PCA variables
scaled_vars <- scale(pca_vars)
km_res <- kmeans(scaled_vars, centers = 3) # try 2–4 clusters to see what makes sense
# Add cluster assignment back to data
merged_data$cluster <- factor(km_res$cluster)
# Visualize cluster membership
library(ggplot2)
ggplot(merged_data, aes(x = prgp, y = xg, color = cluster, label = squad)) +
geom_point(size = 4) +
geom_text(vjust = -0.75, size = 3) +
labs(title = "Team Clusters by Progressive Passes & xG",
x = "Progressive Passes",
y = "Expected Goals") +
theme_minimal()

# Fit the regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)
# Predict and plot
merged_data$predicted_pts <- predict(lm_pts)
ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
geom_point(size = 4, color = "#1D3557") +
geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
geom_text_repel(aes(label = squad), size = 4.5) +
labs(title = "Actual vs Predicted Points",
x = "Predicted Points (Linear Model)",
y = "Actual Points",
subtitle = "Based on key attacking metrics") +
theme_minimal(base_size = 14)
`geom_smooth()` using formula = 'y ~ x'

#load libraries
library(ggplot2)
library(reshape2)
# Select key variables
cor_vars <- merged_data %>%
select(pts, xg, xag, kp, prgp, `cmp%...7`, prgdist)
# Compute and reshape the correlation matrix
cor_matrix <- round(cor(cor_vars, use = "complete.obs"), 2)
cor_melted <- melt(cor_matrix)
# Plot with yellow midpoint
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white", size = 0.8) +
geom_text(aes(label = value), color = "black", size = 4) +
scale_fill_gradient2(low = "#E63946", mid = "#FFD700", high = "#1D3557",
midpoint = 0.6, limits = c(0, 1), space = "Lab") +
labs(title = "Correlation Heatmap of Key Performance Metrics",
x = "", y = "", fill = "Correlation") +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)

# Team Comparison Bar Chart: Key Passes
ggplot(merged_data, aes(x = reorder(squad, kp), y = kp, fill = cluster)) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = c("High-Performers" = "#E63946", "Mid-Tier" = "#F4A261", "Low-Performers" = "#457B9D")) +
labs(title = "Key Passes by Team", x = "Team", y = "Key Passes") +
theme_minimal(base_size = 14)

# Load PCA tools
library(ggplot2)
library(ggrepel)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Prepare PCA data
pca_vars <- merged_data %>%
select(kp, prgp, xag, xg, `cmp%...7`, prgdist)
# Perform PCA with scaling
pca_res <- prcomp(pca_vars, scale. = TRUE)
# Create a biplot using factoextra
fviz_pca_biplot(
pca_res,
label = "var", # show variable arrows
habillage = merged_data$cluster, # color by cluster
addEllipses = TRUE, # optional: adds confidence ellipses around clusters
col.var = "black", # arrow color
repel = TRUE, # prevent overlapping labels
labelsize = 5,
pointshape = 21,
geom = "point",
palette = c("#E63946", "#F4A261", "#457B9D")
) +
labs(title = "PCA Biplot: Women's Super League Playing Styles") +
theme_minimal(base_size = 14)
Too few points to calculate an ellipse

library(fmsb)
# Choose teams to compare
teams_to_compare <- c("Chelsea", "Aston Villa", "Reading")
# Select key metrics to display
radar_data <- merged_data %>%
filter(squad %in% teams_to_compare) %>%
select(squad, prgp, xg, xag, kp, `cmp%...7`) %>%
column_to_rownames("squad")
# Normalize: add max and min rows for radar plot bounds
radar_max <- apply(radar_data, 2, max)
radar_min <- apply(radar_data, 2, min)
radar_plot_data <- rbind(radar_max, radar_min, radar_data)
# Plot
colors <- c("#E63946", "#F4A261", "#457B9D")
radarchart(radar_plot_data,
axistype = 1,
pcol = colors,
pfcol = adjustcolor(colors, alpha.f = 0.25),
plwd = 2,
cglcol = "gray",
cglty = 1,
axislabcol = "gray",
vlcex = 0.9,
title = "Team Style Comparison Radar Chart")
legend("topright", legend = rownames(radar_data), col = colors, lty = 1, lwd = 2, bty = "n")

# Already fitted model: lm_pts
merged_data$predicted_pts <- predict(lm_pts)
ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
geom_point(size = 4, color = "#1D3557") +
geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
geom_text_repel(aes(label = squad), size = 4.5) +
labs(title = "Actual vs Predicted Points",
subtitle = paste("R-squared:", round(summary(lm_pts)$adj.r.squared, 2)),
x = "Predicted Points (Regression Model)",
y = "Actual Points") +
theme_minimal(base_size = 14)
`geom_smooth()` using formula = 'y ~ x'

library(knitr)
library(kableExtra)
Attaching package: ‘kableExtra’
The following object is masked from ‘package:dplyr’:
group_rows
summary_table <- merged_data %>%
select(squad, cluster, prgp, xg, kp, pts) %>%
arrange(desc(pts))
# Display as a styled table
kable(summary_table, format = "markdown", digits = 1,
col.names = c("Team", "Cluster", "Progressive Passes", "xG", "Key Passes", "Points")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
| Team |
Cluster |
Progressive Passes |
xG |
Key Passes |
Points |
| Chelsea |
High-Performers |
1004 |
43.8 |
286 |
58 |
| Manchester Utd |
High-Performers |
979 |
35.4 |
259 |
56 |
| Arsenal |
High-Performers |
1026 |
42.7 |
295 |
47 |
| Manchester City |
High-Performers |
1062 |
48.0 |
349 |
47 |
| Aston Villa |
Mid-Tier |
831 |
28.0 |
202 |
37 |
| Everton |
Mid-Tier |
716 |
20.5 |
172 |
30 |
| Liverpool |
Low-Performers |
565 |
21.2 |
144 |
23 |
| West Ham |
Low-Performers |
505 |
17.4 |
112 |
21 |
| Tottenham |
Mid-Tier |
607 |
19.9 |
155 |
18 |
| Leicester City WFC |
Low-Performers |
550 |
16.0 |
168 |
16 |
| Brighton |
Low-Performers |
554 |
23.4 |
160 |
16 |
| Reading |
Low-Performers |
504 |
16.5 |
140 |
11 |
write.csv(summary_table, "WSL_summary_table.csv", row.names = FALSE)
# Print all results from above visuals
# 1. Linear Regression Summary
cat("Linear Regression Summary: Predicting Points\n\n")
Linear Regression Summary: Predicting Points
summary(lm_pts)
Call:
lm(formula = pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)
Residuals:
Min 1Q Median 3Q Max
-5.9704 -2.0161 0.2005 2.6937 5.0021
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.43087 43.67086 0.239 0.8192
kp -0.18506 0.17667 -1.047 0.3352
prgp 0.13956 0.04537 3.076 0.0218 *
xag -0.17197 2.17614 -0.079 0.9396
xg 0.33561 1.04540 0.321 0.7591
`cmp%...7` -0.68601 0.77439 -0.886 0.4098
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.893 on 6 degrees of freedom
Multiple R-squared: 0.9534, Adjusted R-squared: 0.9145
F-statistic: 24.53 on 5 and 6 DF, p-value: 0.0006311
# 2. PCA Loadings
cat("\nPCA Loadings for Principal Component 1 (Attacking Style Dimension)\n")
PCA Loadings for Principal Component 1 (Attacking Style Dimension)
print(round(pca_res$rotation[, 1], 3))
kp prgp xag xg cmp%...7 prgdist
-0.411 -0.417 -0.413 -0.408 -0.386 -0.414
# 3. Cluster Membership by Team
cat("\nCluster Membership by Team\n")
Cluster Membership by Team
cluster_membership <- merged_data %>%
select(squad, cluster, pts, prgp, xg) %>%
arrange(cluster, desc(pts))
print(cluster_membership)
# 4. Cluster-Level Summary
cat("\nCluster-Level Summary: Average Team Statistics\n")
Cluster-Level Summary: Average Team Statistics
cluster_summary <- merged_data %>%
group_by(cluster) %>%
summarise(
Average_Points = round(mean(pts), 1),
Average_Progressive_Passes = round(mean(prgp), 1),
Average_xG = round(mean(xg), 1),
Average_Key_Passes = round(mean(kp), 1)
)
print(cluster_summary)
# 5. Top Performing Teams by Key Metrics
cat("\nTop 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes\n")
Top 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes
top_prgp <- merged_data %>% select(squad, prgp) %>% arrange(desc(prgp)) %>% head(3)
top_xg <- merged_data %>% select(squad, xg) %>% arrange(desc(xg)) %>% head(3)
top_kp <- merged_data %>% select(squad, kp) %>% arrange(desc(kp)) %>% head(3)
cat("\nTop Teams by Progressive Passes:\n")
Top Teams by Progressive Passes:
print(top_prgp)
cat("\nTop Teams by Expected Goals (xG):\n")
Top Teams by Expected Goals (xG):
print(top_xg)
cat("\nTop Teams by Key Passes:\n")
Top Teams by Key Passes:
print(top_kp)
# The end
---
title: "Progression Wins Games: A Tactical Analysis of Playing Styles in the Women’s Super League"
output: html_notebook
---

```{r}
# Load necessary libraries
library(tidyverse)

# Import datasets (assuming they are in current project directory)
season_stats <- read_csv("WomenSuperLeagueseason2022-23.csv")
passing_stats <- read_csv("WomenSuperLeaguePassingstats2022-23.csv")
pass_types <- read_csv("WomenSuperLeaguePassTypes2022-23.csv")

# Preview the data
head(season_stats)
head(passing_stats)
head(pass_types)
```
```{r}
# Standardize column names to lowercase for consistency
season_stats <- season_stats %>% rename_all(tolower)
passing_stats <- passing_stats %>% rename_all(tolower)
pass_types <- pass_types %>% rename_all(tolower)

# Let's preview the squad names to ensure they align
unique(season_stats$squad)
unique(passing_stats$squad)
unique(pass_types$squad)
```
```{r}
# Final cleanup to extract just the team name
season_stats <- season_stats %>%
  mutate(squad = str_extract(squad, "[A-Za-z ]+$"))

# Check the result
unique(season_stats$squad)
```

```{r}
# Merge all datasets on 'squad'
merged_data <- season_stats %>%
  inner_join(passing_stats, by = "squad") %>%
  inner_join(pass_types, by = "squad")

# View the structure of the merged data
glimpse(merged_data)
```

```{r}
library(corrplot)
```
```{r}
# Select numeric columns of interest
eda_vars <- merged_data %>%
  select(squad, pts, xg, `cmp%...7`, kp, prgp, xag, xa, `3-jan`, ppa, crspa, att...6, prgdist)

cor_data <- eda_vars %>% select(-squad)

# Create and plot the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
corrplot::corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)
```
```{r}
# Fit a multiple linear regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# View the summary (coefficients, R², p-values)
summary(lm_pts)
```
```{r}
# Select key metrics for PCA
pca_vars <- merged_data %>% 
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Run PCA (standardizing variables)
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Summarize
summary(pca_res)

# Screen plot to pick number of components
plot(pca_res, type = "l", main = "PCA Scree Plot")

# Biplot of PC1 vs PC2
biplot(pca_res, scale = 0)
```
```{r}
# View PCA loadings (contributions of each variable to the principal components)
pca_res$rotation[, 1]  # PC1 loadings
```
```{r}
set.seed(42)
# Scale and cluster on the same PCA variables
scaled_vars <- scale(pca_vars)
km_res <- kmeans(scaled_vars, centers = 3)  # try 2–4 clusters to see what makes sense

# Add cluster assignment back to data
merged_data$cluster <- factor(km_res$cluster)

# Visualize cluster membership
library(ggplot2)
ggplot(merged_data, aes(x = prgp, y = xg, color = cluster, label = squad)) +
  geom_point(size = 4) +
  geom_text(vjust = -0.75, size = 3) +
  labs(title = "Team Clusters by Progressive Passes & xG",
       x = "Progressive Passes",
       y = "Expected Goals") +
  theme_minimal()
```

```{r}
# Fit the regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# Predict and plot
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       x = "Predicted Points (Linear Model)",
       y = "Actual Points",
       subtitle = "Based on key attacking metrics") +
  theme_minimal(base_size = 14)
```
```{r}
#load libraries
library(ggplot2)
library(reshape2)

# Select key variables
cor_vars <- merged_data %>%
  select(pts, xg, xag, kp, prgp, `cmp%...7`, prgdist)

# Compute and reshape the correlation matrix
cor_matrix <- round(cor(cor_vars, use = "complete.obs"), 2)
cor_melted <- melt(cor_matrix)

# Plot with yellow midpoint
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white", size = 0.8) +
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradient2(low = "#E63946", mid = "#FFD700", high = "#1D3557",
                       midpoint = 0.6, limits = c(0, 1), space = "Lab") +
  labs(title = "Correlation Heatmap of Key Performance Metrics",
       x = "", y = "", fill = "Correlation") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    axis.text.y = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
```

```{r}
# Team Comparison Bar Chart: Key Passes
ggplot(merged_data, aes(x = reorder(squad, kp), y = kp, fill = cluster)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = c("High-Performers" = "#E63946", "Mid-Tier" = "#F4A261", "Low-Performers" = "#457B9D")) +
  labs(title = "Key Passes by Team", x = "Team", y = "Key Passes") +
  theme_minimal(base_size = 14)
```
```{r}
# Load PCA tools
library(ggplot2)
library(ggrepel)
library(factoextra)  

# Prepare PCA data
pca_vars <- merged_data %>%
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Perform PCA with scaling
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Create a biplot using factoextra
fviz_pca_biplot(
  pca_res,
  label = "var",          # show variable arrows
  habillage = merged_data$cluster,  # color by cluster
  addEllipses = TRUE,     # optional: adds confidence ellipses around clusters
  col.var = "black",      # arrow color
  repel = TRUE,           # prevent overlapping labels
  labelsize = 5,
  pointshape = 21,
  geom = "point",
  palette = c("#E63946", "#F4A261", "#457B9D")
) +
  labs(title = "PCA Biplot: Women's Super League Playing Styles") +
  theme_minimal(base_size = 14)
```
```{r}
library(fmsb)

# Choose teams to compare
teams_to_compare <- c("Chelsea", "Aston Villa", "Reading")

# Select key metrics to display
radar_data <- merged_data %>%
  filter(squad %in% teams_to_compare) %>%
  select(squad, prgp, xg, xag, kp, `cmp%...7`) %>%
  column_to_rownames("squad")

# Normalize: add max and min rows for radar plot bounds
radar_max <- apply(radar_data, 2, max)
radar_min <- apply(radar_data, 2, min)
radar_plot_data <- rbind(radar_max, radar_min, radar_data)

# Plot
colors <- c("#E63946", "#F4A261", "#457B9D")
radarchart(radar_plot_data,
           axistype = 1,
           pcol = colors,
           pfcol = adjustcolor(colors, alpha.f = 0.25),
           plwd = 2,
           cglcol = "gray",
           cglty = 1,
           axislabcol = "gray",
           vlcex = 0.9,
           title = "Team Style Comparison Radar Chart")

legend("topright", legend = rownames(radar_data), col = colors, lty = 1, lwd = 2, bty = "n")
```
```{r}
# Already fitted model: lm_pts
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       subtitle = paste("R-squared:", round(summary(lm_pts)$adj.r.squared, 2)),
       x = "Predicted Points (Regression Model)",
       y = "Actual Points") +
  theme_minimal(base_size = 14)
```
```{r}
library(knitr)
library(kableExtra)

summary_table <- merged_data %>%
  select(squad, cluster, prgp, xg, kp, pts) %>%
  arrange(desc(pts))

# Display as a styled table
kable(summary_table, format = "markdown", digits = 1,
      col.names = c("Team", "Cluster", "Progressive Passes", "xG", "Key Passes", "Points")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
```
```{r}
write.csv(summary_table, "WSL_summary_table.csv", row.names = FALSE)
```
```{r}
# Print all results from above visuals 
# 1. Linear Regression Summary
cat("Linear Regression Summary: Predicting Points\n\n")
summary(lm_pts)

# 2. PCA Loadings
cat("\nPCA Loadings for Principal Component 1 (Attacking Style Dimension)\n")
print(round(pca_res$rotation[, 1], 3))

# 3. Cluster Membership by Team
cat("\nCluster Membership by Team\n")
cluster_membership <- merged_data %>%
  select(squad, cluster, pts, prgp, xg) %>%
  arrange(cluster, desc(pts))
print(cluster_membership)

# 4. Cluster-Level Summary
cat("\nCluster-Level Summary: Average Team Statistics\n")
cluster_summary <- merged_data %>%
  group_by(cluster) %>%
  summarise(
    Average_Points = round(mean(pts), 1),
    Average_Progressive_Passes = round(mean(prgp), 1),
    Average_xG = round(mean(xg), 1),
    Average_Key_Passes = round(mean(kp), 1)
  )
print(cluster_summary)

# 5. Top Performing Teams by Key Metrics
cat("\nTop 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes\n")

top_prgp <- merged_data %>% select(squad, prgp) %>% arrange(desc(prgp)) %>% head(3)
top_xg   <- merged_data %>% select(squad, xg) %>% arrange(desc(xg)) %>% head(3)
top_kp   <- merged_data %>% select(squad, kp) %>% arrange(desc(kp)) %>% head(3)

cat("\nTop Teams by Progressive Passes:\n")
print(top_prgp)

cat("\nTop Teams by Expected Goals (xG):\n")
print(top_xg)

cat("\nTop Teams by Key Passes:\n")
print(top_kp)

# The end
```
