# Load necessary libraries
library(tidyverse)

# Import datasets (assuming they are in your current project directory)
season_stats <- read_csv("WomenSuperLeagueseason2022-23.csv")
Rows: 12 Columns: 16
── Column specification ───────────────────────────────
Delimiter: ","
chr  (1): Squad
dbl (14): Rk, MP, W, D, L, GF, GA, GD, Pts, Pts/MP,...
num  (1): Attendance

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
passing_stats <- read_csv("WomenSuperLeaguePassingstats2022-23.csv")
New names:
• `Cmp` -> `Cmp...5`
• `Att` -> `Att...6`
• `Cmp%` -> `Cmp%...7`
• `Cmp` -> `Cmp...10`
• `Att` -> `Att...11`
• `Cmp%` -> `Cmp%...12`
• `Cmp` -> `Cmp...13`
• `Att` -> `Att...14`
• `Cmp%` -> `Cmp%...15`
• `Cmp` -> `Cmp...16`
• `Att` -> `Att...17`
• `Cmp%` -> `Cmp%...18`
Rows: 12 Columns: 27
── Column specification ───────────────────────────────
Delimiter: ","
chr  (1): Squad
dbl (26): Rank, # Pl, 90s, Cmp...5, Att...6, Cmp%.....

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pass_types <- read_csv("WomenSuperLeaguePassTypes2022-23.csv")
Rows: 12 Columns: 19
── Column specification ───────────────────────────────
Delimiter: ","
chr  (1): Squad
dbl (18): Rank, # Pl, 90s, Att, Live, Dead, FK, TB,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the data
head(season_stats)
head(passing_stats)
head(pass_types)
# Standardize column names to lowercase for consistency
season_stats <- season_stats %>% rename_all(tolower)
passing_stats <- passing_stats %>% rename_all(tolower)
pass_types <- pass_types %>% rename_all(tolower)

# Let's preview the squad names to ensure they align
unique(season_stats$squad)
 [1] "Club Crest\xa0Chelsea"           
 [2] "Club Crest\xa0Manchester Utd"    
 [3] "Club Crest\xa0Arsenal"           
 [4] "Club Crest\xa0Manchester City"   
 [5] "Club Crest\xa0Aston Villa"       
 [6] "Club Crest\xa0Everton"           
 [7] "Club Crest\xa0Liverpool"         
 [8] "Club Crest\xa0West Ham"          
 [9] "Club Crest\xa0Tottenham"         
[10] "Club Crest\xa0Leicester City WFC"
[11] "Club Crest\xa0Brighton"          
[12] "Club Crest\xa0Reading"           
unique(passing_stats$squad)
 [1] "Arsenal"            "Aston Villa"       
 [3] "Brighton"           "Chelsea"           
 [5] "Everton"            "Leicester City WFC"
 [7] "Liverpool"          "Manchester City"   
 [9] "Manchester Utd"     "Reading"           
[11] "Tottenham"          "West Ham"          
unique(pass_types$squad)
 [1] "Arsenal"            "Aston Villa"       
 [3] "Brighton"           "Chelsea"           
 [5] "Everton"            "Leicester City WFC"
 [7] "Liverpool"          "Manchester City"   
 [9] "Manchester Utd"     "Reading"           
[11] "Tottenham"          "West Ham"          
# Final cleanup to extract just the team name
season_stats <- season_stats %>%
  mutate(squad = str_extract(squad, "[A-Za-z ]+$"))

# Check the result
unique(season_stats$squad)
 [1] "Chelsea"            "Manchester Utd"    
 [3] "Arsenal"            "Manchester City"   
 [5] "Aston Villa"        "Everton"           
 [7] "Liverpool"          "West Ham"          
 [9] "Tottenham"          "Leicester City WFC"
[11] "Brighton"           "Reading"           
# Merge all datasets on 'squad'
merged_data <- season_stats %>%
  inner_join(passing_stats, by = "squad") %>%
  inner_join(pass_types, by = "squad")

# View the structure of the merged data
glimpse(merged_data)
Rows: 12
Columns: 60
$ rk          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ squad       <chr> "Chelsea", "Manchester Utd", "Ars…
$ mp          <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ w           <dbl> 19, 18, 15, 15, 11, 9, 6, 6, 5, 5…
$ d           <dbl> 1, 2, 2, 2, 4, 3, 5, 3, 3, 1, 4, 2
$ l           <dbl> 2, 2, 5, 5, 7, 10, 11, 13, 14, 16…
$ gf          <dbl> 66, 56, 49, 50, 47, 29, 24, 23, 3…
$ ga          <dbl> 15, 12, 16, 25, 37, 36, 39, 44, 4…
$ gd          <dbl> 51, 44, 33, 25, 10, -7, -15, -21,…
$ pts         <dbl> 58, 56, 47, 47, 37, 30, 23, 21, 1…
$ `pts/mp`    <dbl> 2.64, 2.55, 2.14, 2.14, 1.68, 1.3…
$ xg          <dbl> 43.8, 35.4, 42.7, 48.0, 28.0, 20.…
$ xga         <dbl> 17.8, 13.0, 17.6, 17.2, 24.6, 31.…
$ xgd         <dbl> 26.0, 22.4, 25.1, 30.8, 3.3, -11.…
$ `xgd/90`    <dbl> 1.18, 1.02, 1.14, 1.40, 0.15, -0.…
$ attendance  <dbl> 5616, 9648, 13996, 6559, 4199, 25…
$ rank.x      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ `# pl.x`    <dbl> 25, 24, 27, 24, 26, 23, 27, 26, 2…
$ `90s.x`     <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ cmp...5     <dbl> 9939, 10120, 9705, 10460, 8431, 8…
$ att...6     <dbl> 12582, 12518, 12030, 12748, 10922…
$ `cmp%...7`  <dbl> 79.0, 80.8, 80.7, 82.1, 77.2, 78.…
$ totdist     <dbl> 191912, 186350, 184631, 189947, 1…
$ prgdist     <dbl> 69839, 68111, 66769, 71843, 60024…
$ cmp...10    <dbl> 3509, 3904, 3540, 4203, 3562, 355…
$ att...11    <dbl> 4219, 4556, 4122, 4828, 4251, 421…
$ `cmp%...12` <dbl> 83.2, 85.7, 85.9, 87.1, 83.8, 84.…
$ cmp...13    <dbl> 5064, 4944, 4911, 5062, 3834, 443…
$ att...14    <dbl> 5924, 5665, 5656, 5779, 4575, 521…
$ `cmp%...15` <dbl> 85.5, 87.3, 86.8, 87.6, 83.8, 85.…
$ cmp...16    <dbl> 1216, 1044, 1119, 989, 882, 744, …
$ att...17    <dbl> 1961, 1741, 1793, 1665, 1603, 143…
$ `cmp%...18` <dbl> 62.0, 60.0, 62.4, 59.4, 55.0, 51.…
$ ast         <dbl> 44, 41, 31, 38, 33, 20, 15, 12, 2…
$ xag         <dbl> 29.8, 24.9, 29.2, 36.6, 19.0, 14.…
$ xa          <dbl> 33.2, 29.3, 31.8, 36.0, 20.1, 14.…
$ `a-xag`     <dbl> 14.2, 16.1, 1.8, 1.4, 14.0, 5.2, …
$ kp          <dbl> 286, 259, 295, 349, 202, 172, 144…
$ `3-jan`     <dbl> 814, 754, 817, 907, 691, 545, 441…
$ ppa         <dbl> 265, 256, 261, 276, 161, 128, 113…
$ crspa       <dbl> 45, 58, 45, 93, 44, 31, 42, 25, 3…
$ prgp        <dbl> 1004, 979, 1026, 1062, 831, 716, …
$ rank.y      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
$ `# pl.y`    <dbl> 25, 24, 27, 24, 26, 23, 27, 26, 2…
$ `90s.y`     <dbl> 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ att         <dbl> 12582, 12518, 12030, 12748, 10922…
$ live        <dbl> 11440, 11375, 10911, 11630, 9768,…
$ dead        <dbl> 1076, 1074, 1046, 1088, 1086, 112…
$ fk          <dbl> 239, 249, 217, 232, 224, 278, 204…
$ tb          <dbl> 17, 12, 15, 11, 6, 5, 4, 2, 7, 1,…
$ sw          <dbl> 43, 32, 50, 30, 38, 19, 21, 22, 4…
$ crs         <dbl> 380, 406, 375, 519, 425, 268, 307…
$ ti          <dbl> 546, 567, 536, 511, 553, 490, 578…
$ ck          <dbl> 128, 116, 156, 179, 115, 100, 82,…
$ `in`        <dbl> 66, 35, 51, 84, 46, 48, 48, 25, 2…
$ out         <dbl> 24, 29, 62, 23, 37, 10, 21, 33, 1…
$ str         <dbl> 5, 23, 12, 24, 18, 12, 3, 2, 18, …
$ cmp         <dbl> 9939, 10120, 9705, 10460, 8431, 8…
$ off         <dbl> 66, 69, 73, 30, 68, 32, 33, 46, 4…
$ blocks      <dbl> 226, 212, 214, 218, 268, 277, 200…
library(corrplot)
corrplot 0.95 loaded
# Select numeric columns of interest
eda_vars <- merged_data %>%
  select(squad, pts, xg, `cmp%...7`, kp, prgp, xag, xa, `3-jan`, ppa, crspa, att...6, prgdist)

cor_data <- eda_vars %>% select(-squad)

# Create and plot the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
corrplot::corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Fit a multiple linear regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# View the summary (coefficients, R², p-values)
summary(lm_pts)

Call:
lm(formula = pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.9704 -2.0161  0.2005  2.6937  5.0021 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept) 10.43087   43.67086   0.239   0.8192  
kp          -0.18506    0.17667  -1.047   0.3352  
prgp         0.13956    0.04537   3.076   0.0218 *
xag         -0.17197    2.17614  -0.079   0.9396  
xg           0.33561    1.04540   0.321   0.7591  
`cmp%...7`  -0.68601    0.77439  -0.886   0.4098  
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.893 on 6 degrees of freedom
Multiple R-squared:  0.9534,    Adjusted R-squared:  0.9145 
F-statistic: 24.53 on 5 and 6 DF,  p-value: 0.0006311
# Select key metrics for PCA
pca_vars <- merged_data %>% 
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Run PCA (standardizing variables)
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Summarize
summary(pca_res)
Importance of components:
                          PC1    PC2    PC3     PC4
Standard deviation     2.3789 0.5208 0.1913 0.14666
Proportion of Variance 0.9432 0.0452 0.0061 0.00358
Cumulative Proportion  0.9432 0.9884 0.9945 0.99806
                           PC5     PC6
Standard deviation     0.09579 0.04991
Proportion of Variance 0.00153 0.00042
Cumulative Proportion  0.99958 1.00000
# Scree plot to pick number of components
plot(pca_res, type = "l", main = "PCA Scree Plot")


# Biplot of PC1 vs PC2
biplot(pca_res, scale = 0)

# View PCA loadings (contributions of each variable to the principal components)
pca_res$rotation[, 1]  # PC1 loadings
        kp       prgp        xag         xg   cmp%...7 
-0.4106414 -0.4167559 -0.4132278 -0.4076769 -0.3860316 
   prgdist 
-0.4143792 
set.seed(42)
# Scale and cluster on the same PCA variables
scaled_vars <- scale(pca_vars)
km_res <- kmeans(scaled_vars, centers = 3)  # try 2–4 clusters to see what makes sense

# Add cluster assignment back to data
merged_data$cluster <- factor(km_res$cluster)

# Visualize cluster membership
library(ggplot2)
ggplot(merged_data, aes(x = prgp, y = xg, color = cluster, label = squad)) +
  geom_point(size = 4) +
  geom_text(vjust = -0.75, size = 3) +
  labs(title = "Team Clusters by Progressive Passes & xG",
       x = "Progressive Passes",
       y = "Expected Goals") +
  theme_minimal()

# Fit the regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# Predict and plot
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       x = "Predicted Points (Linear Model)",
       y = "Actual Points",
       subtitle = "Based on key attacking metrics") +
  theme_minimal(base_size = 14)
`geom_smooth()` using formula = 'y ~ x'

#load libraries
library(ggplot2)
library(reshape2)

# Select key variables
cor_vars <- merged_data %>%
  select(pts, xg, xag, kp, prgp, `cmp%...7`, prgdist)

# Compute and reshape the correlation matrix
cor_matrix <- round(cor(cor_vars, use = "complete.obs"), 2)
cor_melted <- melt(cor_matrix)

# Plot with yellow midpoint
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white", size = 0.8) +
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradient2(low = "#E63946", mid = "#FFD700", high = "#1D3557",
                       midpoint = 0.6, limits = c(0, 1), space = "Lab") +
  labs(title = "Correlation Heatmap of Key Performance Metrics",
       x = "", y = "", fill = "Correlation") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    axis.text.y = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Team Comparison Bar Chart: Key Passes
ggplot(merged_data, aes(x = reorder(squad, kp), y = kp, fill = cluster)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = c("High-Performers" = "#E63946", "Mid-Tier" = "#F4A261", "Low-Performers" = "#457B9D")) +
  labs(title = "Key Passes by Team", x = "Team", y = "Key Passes") +
  theme_minimal(base_size = 14)

# Load PCA tools
library(ggplot2)
library(ggrepel)
library(factoextra)  
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Prepare PCA data
pca_vars <- merged_data %>%
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Perform PCA with scaling
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Create a biplot using factoextra
fviz_pca_biplot(
  pca_res,
  label = "var",          # show variable arrows
  habillage = merged_data$cluster,  # color by cluster
  addEllipses = TRUE,     # optional: adds confidence ellipses around clusters
  col.var = "black",      # arrow color
  repel = TRUE,           # prevent overlapping labels
  labelsize = 5,
  pointshape = 21,
  geom = "point",
  palette = c("#E63946", "#F4A261", "#457B9D")
) +
  labs(title = "PCA Biplot: Women's Super League Playing Styles") +
  theme_minimal(base_size = 14)
Too few points to calculate an ellipse

library(fmsb)

# Choose teams to compare
teams_to_compare <- c("Chelsea", "Aston Villa", "Reading")

# Select key metrics to display
radar_data <- merged_data %>%
  filter(squad %in% teams_to_compare) %>%
  select(squad, prgp, xg, xag, kp, `cmp%...7`) %>%
  column_to_rownames("squad")

# Normalize: add max and min rows for radar plot bounds
radar_max <- apply(radar_data, 2, max)
radar_min <- apply(radar_data, 2, min)
radar_plot_data <- rbind(radar_max, radar_min, radar_data)

# Plot
colors <- c("#E63946", "#F4A261", "#457B9D")
radarchart(radar_plot_data,
           axistype = 1,
           pcol = colors,
           pfcol = adjustcolor(colors, alpha.f = 0.25),
           plwd = 2,
           cglcol = "gray",
           cglty = 1,
           axislabcol = "gray",
           vlcex = 0.9,
           title = "Team Style Comparison Radar Chart")

legend("topright", legend = rownames(radar_data), col = colors, lty = 1, lwd = 2, bty = "n")

# Already fitted model: lm_pts
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       subtitle = paste("R-squared:", round(summary(lm_pts)$adj.r.squared, 2)),
       x = "Predicted Points (Regression Model)",
       y = "Actual Points") +
  theme_minimal(base_size = 14)
`geom_smooth()` using formula = 'y ~ x'

library(knitr)
library(kableExtra)

Attaching package: ‘kableExtra’

The following object is masked from ‘package:dplyr’:

    group_rows
summary_table <- merged_data %>%
  select(squad, cluster, prgp, xg, kp, pts) %>%
  arrange(desc(pts))

# Display as a styled table
kable(summary_table, format = "markdown", digits = 1,
      col.names = c("Team", "Cluster", "Progressive Passes", "xG", "Key Passes", "Points")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Team Cluster Progressive Passes xG Key Passes Points
Chelsea High-Performers 1004 43.8 286 58
Manchester Utd High-Performers 979 35.4 259 56
Arsenal High-Performers 1026 42.7 295 47
Manchester City High-Performers 1062 48.0 349 47
Aston Villa Mid-Tier 831 28.0 202 37
Everton Mid-Tier 716 20.5 172 30
Liverpool Low-Performers 565 21.2 144 23
West Ham Low-Performers 505 17.4 112 21
Tottenham Mid-Tier 607 19.9 155 18
Leicester City WFC Low-Performers 550 16.0 168 16
Brighton Low-Performers 554 23.4 160 16
Reading Low-Performers 504 16.5 140 11
write.csv(summary_table, "WSL_summary_table.csv", row.names = FALSE)
# Print all results from above visuals 
# 1. Linear Regression Summary
cat("Linear Regression Summary: Predicting Points\n\n")
Linear Regression Summary: Predicting Points
summary(lm_pts)

Call:
lm(formula = pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.9704 -2.0161  0.2005  2.6937  5.0021 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept) 10.43087   43.67086   0.239   0.8192  
kp          -0.18506    0.17667  -1.047   0.3352  
prgp         0.13956    0.04537   3.076   0.0218 *
xag         -0.17197    2.17614  -0.079   0.9396  
xg           0.33561    1.04540   0.321   0.7591  
`cmp%...7`  -0.68601    0.77439  -0.886   0.4098  
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.893 on 6 degrees of freedom
Multiple R-squared:  0.9534,    Adjusted R-squared:  0.9145 
F-statistic: 24.53 on 5 and 6 DF,  p-value: 0.0006311
# 2. PCA Loadings
cat("\nPCA Loadings for Principal Component 1 (Attacking Style Dimension)\n")

PCA Loadings for Principal Component 1 (Attacking Style Dimension)
print(round(pca_res$rotation[, 1], 3))
      kp     prgp      xag       xg cmp%...7  prgdist 
  -0.411   -0.417   -0.413   -0.408   -0.386   -0.414 
# 3. Cluster Membership by Team
cat("\nCluster Membership by Team\n")

Cluster Membership by Team
cluster_membership <- merged_data %>%
  select(squad, cluster, pts, prgp, xg) %>%
  arrange(cluster, desc(pts))
print(cluster_membership)

# 4. Cluster-Level Summary
cat("\nCluster-Level Summary: Average Team Statistics\n")

Cluster-Level Summary: Average Team Statistics
cluster_summary <- merged_data %>%
  group_by(cluster) %>%
  summarise(
    Average_Points = round(mean(pts), 1),
    Average_Progressive_Passes = round(mean(prgp), 1),
    Average_xG = round(mean(xg), 1),
    Average_Key_Passes = round(mean(kp), 1)
  )
print(cluster_summary)

# 5. Top Performing Teams by Key Metrics
cat("\nTop 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes\n")

Top 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes
top_prgp <- merged_data %>% select(squad, prgp) %>% arrange(desc(prgp)) %>% head(3)
top_xg   <- merged_data %>% select(squad, xg) %>% arrange(desc(xg)) %>% head(3)
top_kp   <- merged_data %>% select(squad, kp) %>% arrange(desc(kp)) %>% head(3)

cat("\nTop Teams by Progressive Passes:\n")

Top Teams by Progressive Passes:
print(top_prgp)

cat("\nTop Teams by Expected Goals (xG):\n")

Top Teams by Expected Goals (xG):
print(top_xg)

cat("\nTop Teams by Key Passes:\n")

Top Teams by Key Passes:
print(top_kp)

# The end
---
title: "Progression Wins Games: A Tactical Analysis of Playing Styles in the Women’s Super League"
output: html_notebook
---

```{r}
# Load necessary libraries
library(tidyverse)

# Import datasets (assuming they are in current project directory)
season_stats <- read_csv("WomenSuperLeagueseason2022-23.csv")
passing_stats <- read_csv("WomenSuperLeaguePassingstats2022-23.csv")
pass_types <- read_csv("WomenSuperLeaguePassTypes2022-23.csv")

# Preview the data
head(season_stats)
head(passing_stats)
head(pass_types)
```
```{r}
# Standardize column names to lowercase for consistency
season_stats <- season_stats %>% rename_all(tolower)
passing_stats <- passing_stats %>% rename_all(tolower)
pass_types <- pass_types %>% rename_all(tolower)

# Let's preview the squad names to ensure they align
unique(season_stats$squad)
unique(passing_stats$squad)
unique(pass_types$squad)
```
```{r}
# Final cleanup to extract just the team name
season_stats <- season_stats %>%
  mutate(squad = str_extract(squad, "[A-Za-z ]+$"))

# Check the result
unique(season_stats$squad)
```

```{r}
# Merge all datasets on 'squad'
merged_data <- season_stats %>%
  inner_join(passing_stats, by = "squad") %>%
  inner_join(pass_types, by = "squad")

# View the structure of the merged data
glimpse(merged_data)
```

```{r}
library(corrplot)
```
```{r}
# Select numeric columns of interest
eda_vars <- merged_data %>%
  select(squad, pts, xg, `cmp%...7`, kp, prgp, xag, xa, `3-jan`, ppa, crspa, att...6, prgdist)

cor_data <- eda_vars %>% select(-squad)

# Create and plot the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
corrplot::corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)
```
```{r}
# Fit a multiple linear regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# View the summary (coefficients, R², p-values)
summary(lm_pts)
```
```{r}
# Select key metrics for PCA
pca_vars <- merged_data %>% 
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Run PCA (standardizing variables)
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Summarize
summary(pca_res)

# Screen plot to pick number of components
plot(pca_res, type = "l", main = "PCA Scree Plot")

# Biplot of PC1 vs PC2
biplot(pca_res, scale = 0)
```
```{r}
# View PCA loadings (contributions of each variable to the principal components)
pca_res$rotation[, 1]  # PC1 loadings
```
```{r}
set.seed(42)
# Scale and cluster on the same PCA variables
scaled_vars <- scale(pca_vars)
km_res <- kmeans(scaled_vars, centers = 3)  # try 2–4 clusters to see what makes sense

# Add cluster assignment back to data
merged_data$cluster <- factor(km_res$cluster)

# Visualize cluster membership
library(ggplot2)
ggplot(merged_data, aes(x = prgp, y = xg, color = cluster, label = squad)) +
  geom_point(size = 4) +
  geom_text(vjust = -0.75, size = 3) +
  labs(title = "Team Clusters by Progressive Passes & xG",
       x = "Progressive Passes",
       y = "Expected Goals") +
  theme_minimal()
```

```{r}
# Fit the regression model
lm_pts <- lm(pts ~ kp + prgp + xag + xg + `cmp%...7`, data = merged_data)

# Predict and plot
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       x = "Predicted Points (Linear Model)",
       y = "Actual Points",
       subtitle = "Based on key attacking metrics") +
  theme_minimal(base_size = 14)
```
```{r}
#load libraries
library(ggplot2)
library(reshape2)

# Select key variables
cor_vars <- merged_data %>%
  select(pts, xg, xag, kp, prgp, `cmp%...7`, prgdist)

# Compute and reshape the correlation matrix
cor_matrix <- round(cor(cor_vars, use = "complete.obs"), 2)
cor_melted <- melt(cor_matrix)

# Plot with yellow midpoint
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white", size = 0.8) +
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradient2(low = "#E63946", mid = "#FFD700", high = "#1D3557",
                       midpoint = 0.6, limits = c(0, 1), space = "Lab") +
  labs(title = "Correlation Heatmap of Key Performance Metrics",
       x = "", y = "", fill = "Correlation") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
    axis.text.y = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
```

```{r}
# Team Comparison Bar Chart: Key Passes
ggplot(merged_data, aes(x = reorder(squad, kp), y = kp, fill = cluster)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = c("High-Performers" = "#E63946", "Mid-Tier" = "#F4A261", "Low-Performers" = "#457B9D")) +
  labs(title = "Key Passes by Team", x = "Team", y = "Key Passes") +
  theme_minimal(base_size = 14)
```
```{r}
# Load PCA tools
library(ggplot2)
library(ggrepel)
library(factoextra)  

# Prepare PCA data
pca_vars <- merged_data %>%
  select(kp, prgp, xag, xg, `cmp%...7`, prgdist)

# Perform PCA with scaling
pca_res <- prcomp(pca_vars, scale. = TRUE)

# Create a biplot using factoextra
fviz_pca_biplot(
  pca_res,
  label = "var",          # show variable arrows
  habillage = merged_data$cluster,  # color by cluster
  addEllipses = TRUE,     # optional: adds confidence ellipses around clusters
  col.var = "black",      # arrow color
  repel = TRUE,           # prevent overlapping labels
  labelsize = 5,
  pointshape = 21,
  geom = "point",
  palette = c("#E63946", "#F4A261", "#457B9D")
) +
  labs(title = "PCA Biplot: Women's Super League Playing Styles") +
  theme_minimal(base_size = 14)
```
```{r}
library(fmsb)

# Choose teams to compare
teams_to_compare <- c("Chelsea", "Aston Villa", "Reading")

# Select key metrics to display
radar_data <- merged_data %>%
  filter(squad %in% teams_to_compare) %>%
  select(squad, prgp, xg, xag, kp, `cmp%...7`) %>%
  column_to_rownames("squad")

# Normalize: add max and min rows for radar plot bounds
radar_max <- apply(radar_data, 2, max)
radar_min <- apply(radar_data, 2, min)
radar_plot_data <- rbind(radar_max, radar_min, radar_data)

# Plot
colors <- c("#E63946", "#F4A261", "#457B9D")
radarchart(radar_plot_data,
           axistype = 1,
           pcol = colors,
           pfcol = adjustcolor(colors, alpha.f = 0.25),
           plwd = 2,
           cglcol = "gray",
           cglty = 1,
           axislabcol = "gray",
           vlcex = 0.9,
           title = "Team Style Comparison Radar Chart")

legend("topright", legend = rownames(radar_data), col = colors, lty = 1, lwd = 2, bty = "n")
```
```{r}
# Already fitted model: lm_pts
merged_data$predicted_pts <- predict(lm_pts)

ggplot(merged_data, aes(x = predicted_pts, y = pts)) +
  geom_point(size = 4, color = "#1D3557") +
  geom_smooth(method = "lm", se = FALSE, color = "#E63946", linetype = "dashed") +
  geom_text_repel(aes(label = squad), size = 4.5) +
  labs(title = "Actual vs Predicted Points",
       subtitle = paste("R-squared:", round(summary(lm_pts)$adj.r.squared, 2)),
       x = "Predicted Points (Regression Model)",
       y = "Actual Points") +
  theme_minimal(base_size = 14)
```
```{r}
library(knitr)
library(kableExtra)

summary_table <- merged_data %>%
  select(squad, cluster, prgp, xg, kp, pts) %>%
  arrange(desc(pts))

# Display as a styled table
kable(summary_table, format = "markdown", digits = 1,
      col.names = c("Team", "Cluster", "Progressive Passes", "xG", "Key Passes", "Points")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
```
```{r}
write.csv(summary_table, "WSL_summary_table.csv", row.names = FALSE)
```
```{r}
# Print all results from above visuals 
# 1. Linear Regression Summary
cat("Linear Regression Summary: Predicting Points\n\n")
summary(lm_pts)

# 2. PCA Loadings
cat("\nPCA Loadings for Principal Component 1 (Attacking Style Dimension)\n")
print(round(pca_res$rotation[, 1], 3))

# 3. Cluster Membership by Team
cat("\nCluster Membership by Team\n")
cluster_membership <- merged_data %>%
  select(squad, cluster, pts, prgp, xg) %>%
  arrange(cluster, desc(pts))
print(cluster_membership)

# 4. Cluster-Level Summary
cat("\nCluster-Level Summary: Average Team Statistics\n")
cluster_summary <- merged_data %>%
  group_by(cluster) %>%
  summarise(
    Average_Points = round(mean(pts), 1),
    Average_Progressive_Passes = round(mean(prgp), 1),
    Average_xG = round(mean(xg), 1),
    Average_Key_Passes = round(mean(kp), 1)
  )
print(cluster_summary)

# 5. Top Performing Teams by Key Metrics
cat("\nTop 3 Teams by Progressive Passes, Expected Goals (xG), and Key Passes\n")

top_prgp <- merged_data %>% select(squad, prgp) %>% arrange(desc(prgp)) %>% head(3)
top_xg   <- merged_data %>% select(squad, xg) %>% arrange(desc(xg)) %>% head(3)
top_kp   <- merged_data %>% select(squad, kp) %>% arrange(desc(kp)) %>% head(3)

cat("\nTop Teams by Progressive Passes:\n")
print(top_prgp)

cat("\nTop Teams by Expected Goals (xG):\n")
print(top_xg)

cat("\nTop Teams by Key Passes:\n")
print(top_kp)

# The end
```
