library(readxl)
final_dataset_master <- read_excel("C:/Users/ashis/Downloads/final_dataset_master.xlsx")
View(final_dataset_master)
Load Libraries
# Libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(cluster)
library(tidyr)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
library(stats)
library(proxy)
## Warning: package 'proxy' was built under R version 4.4.3
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
Show how many rows/cols and column names
cat("Rows:", nrow(final_dataset_master), "Cols:", ncol(final_dataset_master), "\n\n")
## Rows: 13391 Cols: 36
colnames(final_dataset_master)
## [1] "normalized_name" "age" "player_height" "player_weight"
## [5] "college" "country" "draft_year" "draft_round"
## [9] "draft_number" "pts" "reb" "ast"
## [13] "season" "Pos.x" "MP.x" "G.x"
## [17] "eFG." "X3P" "X3PA" "X3P."
## [21] "X3PAr" "X2P" "X2PA" "X2P."
## [25] "FT" "FTA" "FT." "PER"
## [29] "TS." "TRB." "AST." "TOV."
## [33] "USG." "WS" "VORP" "BPM"
Data Cleaning
# Standardize column names to be safe
names(final_dataset_master) <- make.names(names(final_dataset_master))
# Helper: check if required columns exist
has_col <- function(x) x %in% names(final_dataset_master)
# Show a small preview
knitr::kable(head(final_dataset_master, 8))
| normalized_name | age | player_height | player_weight | college | country | draft_year | draft_round | draft_number | pts | reb | ast | season | Pos.x | MP.x | G.x | eFG. | X3P | X3PA | X3P. | X3PAr | X2P | X2PA | X2P. | FT | FTA | FT. | PER | TS. | TRB. | AST. | TOV. | USG. | WS | VORP | BPM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Allen Iverson | 26 | 183.58 | 74.61 | Georgetown | USA | 1996 | 1 | 1 | 31.1 | 3.8 | 4.6 | 2000-01 | SG | 42.0 | 71 | 0.447 | 1.4 | 4.3 | 0.320 | 0.169 | 9.4 | 21.2 | 0.441 | 8.2 | 10.1 | 0.814 | 24.0 | 0.518 | 5.2 | 23.0 | 10.0 | 35.9 | 11.8 | 6.1 | 6.1 |
| Jerry Stackhouse | 26 | 198.15 | 99.23 | North Carolina | USA | 1995 | 1 | 3 | 29.8 | 3.9 | 5.1 | 2000-01 | SG | 40.2 | 80 | 0.445 | 2.1 | 5.9 | 0.351 | 0.245 | 7.6 | 18.2 | 0.418 | 8.3 | 10.1 | 0.822 | 21.8 | 0.521 | 5.2 | 25.8 | 12.5 | 35.2 | 9.2 | 5.8 | 5.1 |
| Shaquille O’Neal | 29 | 216.31 | 142.56 | Louisiana State | USA | 1992 | 1 | 1 | 28.7 | 12.7 | 3.7 | 2000-01 | C | 39.5 | 74 | 0.572 | 0.0 | 0.0 | 0.000 | 0.001 | 11.0 | 19.2 | 0.573 | 6.7 | 13.1 | 0.513 | 30.2 | 0.574 | 18.1 | 18.8 | 10.5 | 31.6 | 14.9 | 7.1 | 7.7 |
| Kobe Bryant | 22 | 201.60 | 95.20 | None | USA | 1996 | 1 | 13 | 28.5 | 5.9 | 5.0 | 2000-01 | SG | 40.9 | 68 | 0.484 | 0.9 | 2.9 | 0.305 | 0.132 | 9.4 | 19.3 | 0.489 | 7.0 | 8.2 | 0.853 | 24.5 | 0.552 | 8.1 | 23.0 | 11.1 | 31.8 | 11.3 | 4.7 | 4.8 |
| Vince Carter | 24 | 198.64 | 102.34 | North Carolina | USA | 1998 | 1 | 5 | 27.6 | 5.5 | 3.9 | 2000-01 | SF | 39.7 | 75 | 0.509 | 2.2 | 5.3 | 0.408 | 0.240 | 8.0 | 16.8 | 0.477 | 5.1 | 6.7 | 0.765 | 25.0 | 0.551 | 7.9 | 19.2 | 8.2 | 30.7 | 12.9 | 7.2 | 7.6 |
| Chris Webber | 28 | 208.72 | 111.14 | Michigan | USA | 1993 | 1 | 1 | 27.1 | 11.1 | 4.2 | 2000-01 | PF | 40.5 | 70 | 0.481 | 0.0 | 0.4 | 0.071 | 0.017 | 11.2 | 23.0 | 0.488 | 4.6 | 6.6 | 0.703 | 24.7 | 0.516 | 14.8 | 20.6 | 9.6 | 31.6 | 11.0 | 5.3 | 5.5 |
| Tracy McGrady | 22 | 204.02 | 95.02 | None | USA | 1997 | 1 | 9 | 26.8 | 7.5 | 4.6 | 2000-01 | SG | 40.1 | 77 | 0.474 | 0.8 | 2.2 | 0.355 | 0.096 | 9.5 | 20.2 | 0.468 | 5.6 | 7.6 | 0.733 | 24.9 | 0.521 | 10.4 | 22.8 | 9.1 | 31.2 | 12.2 | 7.0 | 7.0 |
| Paul Pierce | 23 | 198.30 | 104.59 | Kansas | USA | 1998 | 1 | 10 | 25.3 | 6.4 | 3.1 | 2000-01 | SF | 38.0 | 82 | 0.503 | 1.8 | 4.7 | 0.383 | 0.254 | 6.6 | 13.8 | 0.478 | 6.7 | 9.0 | 0.745 | 22.3 | 0.563 | 9.8 | 16.9 | 12.5 | 30.6 | 10.4 | 5.0 | 4.4 |
DESCRIPTIVE ANALYSIS:- 1)Position-wise Average Stats
# Use 'Pos.x' or try 'Pos' if exists
pos_col <- ifelse(has_col("Pos.x"), "Pos.x",
ifelse(has_col("Pos"), "Pos", NA))
num_stats <- intersect(c("pts","reb","ast","MP.x","PER","USG.","TS.","WS","VORP","BPM"),
names(final_dataset_master))
if (!is.na(pos_col)) {
desc_by_pos <- final_dataset_master %>%
filter(!is.na(.data[[pos_col]])) %>%
group_by(Position = .data[[pos_col]]) %>%
summarise(across(all_of(num_stats), ~ round(mean(.x, na.rm = TRUE), 2)))
knitr::kable(desc_by_pos)
} else {
cat("Position column not found (looked for 'Pos.x' or 'Pos'). Skipping position-wise summary.\n")
}
| Position | pts | reb | ast | MP.x | PER | USG. | TS. | WS | VORP | BPM |
|---|---|---|---|---|---|---|---|---|---|---|
| C | 6.97 | 5.02 | 0.94 | 18.05 | 13.93 | 17.36 | 0.53 | 2.77 | 0.51 | -2.01 |
| PF | 8.02 | 4.55 | 1.22 | 19.70 | 13.29 | 18.31 | 0.52 | 2.70 | 0.58 | -1.70 |
| PG | 8.72 | 2.30 | 3.66 | 21.24 | 12.90 | 19.80 | 0.50 | 2.51 | 0.76 | -1.30 |
| SF | 8.53 | 3.36 | 1.51 | 21.14 | 11.94 | 18.30 | 0.51 | 2.46 | 0.62 | -1.56 |
| SG | 8.92 | 2.49 | 1.91 | 20.84 | 11.73 | 19.51 | 0.51 | 2.24 | 0.56 | -1.79 |
2)Top 10 Players per Season (scorers, rebounders, assisters)
season_col <- ifelse(has_col("season"), "season",
ifelse(has_col("Season"), "Season", NA))
if (!is.na(season_col) && has_col("normalized_name") && has_col("pts") &&
has_col("reb") && has_col("ast")) {
seasons <- sort(unique(final_dataset_master[[season_col]]))
seasons_to_show <- tail(seasons, 3)
for (s in seasons_to_show) {
cat("\n### Season:", s, "\n")
df_s <- final_dataset_master %>% filter(.data[[season_col]] == s)
cat("\nTop 10 Scorers:\n")
print(df_s %>%
arrange(desc(pts)) %>%
select(any_of(c("normalized_name", "team", "pts"))) %>%
head(10))
cat("\nTop 10 Rebounders:\n")
print(df_s %>%
arrange(desc(reb)) %>%
select(any_of(c("normalized_name", "team", "reb"))) %>%
head(10))
cat("\nTop 10 Assisters:\n")
print(df_s %>%
arrange(desc(ast)) %>%
select(any_of(c("normalized_name", "team", "ast"))) %>%
head(10))
}
} else {
cat("Missing required columns (season, normalized_name, pts, reb, ast).\n")
}
##
## ### Season: 2021-22
##
## Top 10 Scorers:
## # A tibble: 10 × 2
## normalized_name pts
## <chr> <dbl>
## 1 Joel Embiid 30.6
## 2 LeBron James 30.3
## 3 Kevin Durant 29.9
## 4 Giannis Antetokounmpo 29.9
## 5 Trae Young 28.4
## 6 Luka Doncic 28.4
## 7 DeMar DeRozan 27.9
## 8 Kyrie Irving 27.4
## 9 Ja Morant 27.4
## 10 Nikola Jokic 27.1
##
## Top 10 Rebounders:
## # A tibble: 10 × 2
## normalized_name reb
## <chr> <dbl>
## 1 Rudy Gobert 14.7
## 2 Nikola Jokic 13.8
## 3 Domantas Sabonis 12.1
## 4 Jaylen Hoard 12
## 5 Clint Capela 11.9
## 6 Joel Embiid 11.7
## 7 Giannis Antetokounmpo 11.6
## 8 Jonas Valanciunas 11.4
## 9 Jusuf Nurkic 11.1
## 10 Nikola Vucevic 11
##
## Top 10 Assisters:
## # A tibble: 10 × 2
## normalized_name ast
## <chr> <dbl>
## 1 Chris Paul 10.8
## 2 James Harden 10.3
## 3 Trae Young 9.7
## 4 Dejounte Murray 9.2
## 5 Luka Doncic 8.7
## 6 Darius Garland 8.6
## 7 Tyrese Haliburton 8.2
## 8 Nikola Jokic 7.9
## 9 LaMelo Ball 7.6
## 10 Kyle Lowry 7.5
##
## ### Season: 2022-23
##
## Top 10 Scorers:
## # A tibble: 10 × 2
## normalized_name pts
## <chr> <dbl>
## 1 Joel Embiid 33.1
## 2 Luka Doncic 32.4
## 3 Damian Lillard 32.2
## 4 Shai Gilgeous-Alexander 31.4
## 5 Giannis Antetokounmpo 31.1
## 6 Jayson Tatum 30.1
## 7 Stephen Curry 29.4
## 8 Kevin Durant 29.1
## 9 LeBron James 28.9
## 10 Donovan Mitchell 28.3
##
## Top 10 Rebounders:
## # A tibble: 10 × 2
## normalized_name reb
## <chr> <dbl>
## 1 Anthony Davis 12.5
## 2 Domantas Sabonis 12.3
## 3 Giannis Antetokounmpo 11.8
## 4 Nikola Jokic 11.8
## 5 Rudy Gobert 11.6
## 6 Steven Adams 11.5
## 7 Nikola Vucevic 11
## 8 Clint Capela 11
## 9 Joel Embiid 10.2
## 10 Jonas Valanciunas 10.2
##
## Top 10 Assisters:
## # A tibble: 10 × 2
## normalized_name ast
## <chr> <dbl>
## 1 James Harden 10.7
## 2 Tyrese Haliburton 10.4
## 3 Trae Young 10.2
## 4 Nikola Jokic 9.8
## 5 Chris Paul 8.9
## 6 LaMelo Ball 8.4
## 7 Skylar Mays 8.3
## 8 Ja Morant 8.1
## 9 Luka Doncic 8
## 10 Darius Garland 7.8
##
## ### Season: 2023-24
##
## Top 10 Scorers:
## # A tibble: 10 × 2
## normalized_name pts
## <chr> <dbl>
## 1 Joel Embiid 34.7
## 2 Luka Doncic 33.9
## 3 Giannis Antetokounmpo 30.4
## 4 Shai Gilgeous-Alexander 30.1
## 5 Jalen Brunson 28.7
## 6 Devin Booker 27.1
## 7 Kevin Durant 27.1
## 8 Jayson Tatum 26.9
## 9 De'Aaron Fox 26.6
## 10 Donovan Mitchell 26.6
##
## Top 10 Rebounders:
## # A tibble: 10 × 2
## normalized_name reb
## <chr> <dbl>
## 1 Domantas Sabonis 13.7
## 2 Rudy Gobert 12.9
## 3 Anthony Davis 12.6
## 4 Nikola Jokic 12.4
## 5 Jalen Duren 11.6
## 6 Giannis Antetokounmpo 11.5
## 7 Deandre Ayton 11.1
## 8 Joel Embiid 11
## 9 Jusuf Nurkic 11
## 10 Victor Wembanyama 10.6
##
## Top 10 Assisters:
## # A tibble: 10 × 2
## normalized_name ast
## <chr> <dbl>
## 1 Tyrese Haliburton 10.9
## 2 Trae Young 10.8
## 3 Luka Doncic 9.8
## 4 Nikola Jokic 9
## 5 James Harden 8.5
## 6 LeBron James 8.3
## 7 Domantas Sabonis 8.2
## 8 Ja Morant 8.1
## 9 Fred VanVleet 8.1
## 10 LaMelo Ball 8
3)Trend over Time (Average PTS per season)
if (!is.na(season_col) && has_col("pts")) {
trend <- final_dataset_master %>%
group_by(Season = .data[[season_col]]) %>%
summarise(Avg_PTS = mean(pts, na.rm = TRUE), N = n()) %>%
arrange(Season)
ggplot(trend, aes(x = as.character(Season), y = Avg_PTS, group = 1)) +
geom_line() + geom_point() +
theme_minimal() +
labs(title = "Average Points per Game by Season", x = "Season", y = "Average PTS") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
} else {
cat("Season or pts column missing; cannot compute trend.\n")
}
4)Descriptive Table (Combined summary table for each team or position.)
if (has_col("team")) {
desc_by_team <- final_dataset_master %>%
group_by(team) %>%
summarise(across(all_of(num_stats), ~ round(mean(.x, na.rm = TRUE), 2)), Count = n()) %>%
arrange(desc(Count))
knitr::kable(head(desc_by_team, 25))
} else if (!is.na(pos_col)) {
knitr::kable(head(desc_by_pos, 25))
} else {
cat("Neither 'team' nor position column found.\n")
}
| Position | pts | reb | ast | MP.x | PER | USG. | TS. | WS | VORP | BPM |
|---|---|---|---|---|---|---|---|---|---|---|
| C | 6.97 | 5.02 | 0.94 | 18.05 | 13.93 | 17.36 | 0.53 | 2.77 | 0.51 | -2.01 |
| PF | 8.02 | 4.55 | 1.22 | 19.70 | 13.29 | 18.31 | 0.52 | 2.70 | 0.58 | -1.70 |
| PG | 8.72 | 2.30 | 3.66 | 21.24 | 12.90 | 19.80 | 0.50 | 2.51 | 0.76 | -1.30 |
| SF | 8.53 | 3.36 | 1.51 | 21.14 | 11.94 | 18.30 | 0.51 | 2.46 | 0.62 | -1.56 |
| SG | 8.92 | 2.49 | 1.91 | 20.84 | 11.73 | 19.51 | 0.51 | 2.24 | 0.56 | -1.79 |
DIAGNOSTIC ANALYSIS:- 1)Correlation Between Minutes & Performance
min_col <- ifelse(has_col("MP.x"), "MP.x", ifelse(has_col("MP"), "MP", NA))
if (!is.na(min_col) && has_col("pts") && has_col("ast")) {
p1 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = pts)) +
geom_point(alpha = 0.4) + geom_smooth(method = "lm", se = TRUE) +
labs(x = min_col, y = "Points", title = paste("Minutes vs Points (corr =",
round(cor(final_dataset_master[[min_col]], final_dataset_master$pts, use = "complete.obs"), 2), ")")) +
theme_minimal()
print(p1)
p2 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = ast)) +
geom_point(alpha = 0.4) + geom_smooth(method = "lm", se = TRUE) +
labs(x = min_col, y = "Assists", title = paste("Minutes vs Assists (corr =",
round(cor(final_dataset_master[[min_col]], final_dataset_master$ast, use = "complete.obs"), 2), ")")) +
theme_minimal()
print(p2)
cat("Correlation (Minutes, Points):\n"); print(cor.test(final_dataset_master[[min_col]], final_dataset_master$pts, use = "complete.obs"))
cat("\nCorrelation (Minutes, Assists):\n"); print(cor.test(final_dataset_master[[min_col]], final_dataset_master$ast, use = "complete.obs"))
} else {
cat("Missing minutes or points/assists column.\n")
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Correlation (Minutes, Points):
##
## Pearson's product-moment correlation
##
## data: final_dataset_master[[min_col]] and final_dataset_master$pts
## t = 223.43, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8843501 0.8915156
## sample estimates:
## cor
## 0.8879867
##
##
## Correlation (Minutes, Assists):
##
## Pearson's product-moment correlation
##
## data: final_dataset_master[[min_col]] and final_dataset_master$ast
## t = 107.18, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6703204 0.6885551
## sample estimates:
## cor
## 0.6795427
2)ANOVA Across Positions
pos_col <- pos_col # your position column (example: "Position")
if (!is.na(pos_col) && pos_col %in% colnames(final_dataset_master) && "pts" %in% colnames(final_dataset_master)) {
nba_for_anova <- final_dataset_master %>%
filter(!is.na(.data[[pos_col]]), !is.na(pts))
# ANOVA Model
aov_model <- aov(as.formula(paste("pts ~", pos_col)), data = nba_for_anova)
print(summary(aov_model))
# --- Visualization 1: Boxplot ---
p1 <- ggplot(nba_for_anova, aes(x = .data[[pos_col]], y = pts, fill = .data[[pos_col]])) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
labs(title = "ANOVA: Points Across Positions",
x = "Player Position", y = "Points Per Game")
print(p1)
# --- Visualization 2: Mean ± SE ---
mean_se_table <- nba_for_anova %>%
group_by(.data[[pos_col]]) %>%
summarise(mean_pts = mean(pts),
se = sd(pts)/sqrt(n()))
p2 <- ggplot(mean_se_table,
aes(x = .data[[pos_col]], y = mean_pts, fill = .data[[pos_col]])) +
geom_col(alpha = 0.8) +
geom_errorbar(aes(ymin = mean_pts - se, ymax = mean_pts + se), width = 0.2) +
theme_minimal() +
labs(title = "Mean Points ± SE by Position",
x = "Position", y = "Mean PTS")
print(p2)
# --- Visualization 3: Density Plot ---
p3 <- ggplot(nba_for_anova, aes(x = pts, fill = .data[[pos_col]])) +
geom_density(alpha = 0.5) +
theme_minimal() +
labs(title = "Density Plot of Points by Position",
x = "Points", y = "Density")
print(p3)
} else {
cat("Position or pts missing; cannot run ANOVA.\n")
}
## Df Sum Sq Mean Sq F value Pr(>F)
## Pos.x 4 6616 1653.9 45.74 <2e-16 ***
## Residuals 13386 484022 36.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3)Regression: Impact of Variables on Points
predictors <- c("ast","reb","MP.x","X3P.","X2P.","FT.","USG.","PER")
predictors_available <- intersect(predictors, names(final_dataset_master))
if (length(predictors_available) >= 2 && has_col("pts")) {
formula_text <- paste("pts ~", paste(predictors_available, collapse = " + "))
lm_model <- lm(as.formula(formula_text), data = final_dataset_master)
print(summary(lm_model))
par(mfrow = c(2,2)); plot(lm_model); par(mfrow = c(1,1))
} else {
cat("Not enough predictors available. Available:", paste(predictors_available, collapse = ", "), "\n")
}
##
## Call:
## lm(formula = as.formula(formula_text), data = final_dataset_master)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.5396 -0.9444 -0.1399 0.7236 10.5421
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.646906 0.095594 -90.455 < 2e-16 ***
## ast 0.074307 0.013318 5.579 2.46e-08 ***
## reb 0.036242 0.010946 3.311 0.000932 ***
## MP.x 0.415495 0.003350 124.028 < 2e-16 ***
## X3P. 0.988811 0.102826 9.616 < 2e-16 ***
## X2P. 0.374843 0.164344 2.281 0.022573 *
## FT. -0.408880 0.088738 -4.608 4.11e-06 ***
## USG. 0.351234 0.003185 110.263 < 2e-16 ***
## PER 0.120612 0.003617 33.345 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.779 on 13382 degrees of freedom
## Multiple R-squared: 0.9136, Adjusted R-squared: 0.9136
## F-statistic: 1.77e+04 on 8 and 13382 DF, p-value: < 2.2e-16
if (exists("lm_model")) {
# Add predictions
reg_df <- final_dataset_master %>%
mutate(pred_pts = predict(lm_model, final_dataset_master),
residuals = lm_model$residuals) %>%
filter(!is.na(pred_pts), !is.na(pts))
# Actual vs Predicted Plot
ggplot(reg_df, aes(x = pts, y = pred_pts)) +
geom_point(alpha = 0.6, color = "blue") +
geom_smooth(method = "lm") +
theme_minimal() +
labs(title = "Actual vs Predicted Points",
x = "Actual PTS", y = "Predicted PTS")
# Residual Plot
ggplot(reg_df, aes(x = pred_pts, y = residuals)) +
geom_point(alpha = 0.6, color = "red") +
geom_hline(yintercept = 0, linetype = "dashed") +
theme_minimal() +
labs(title = "Residual Plot", x = "Predicted PTS", y = "Residuals")
# Residual Distribution
ggplot(reg_df, aes(x = residuals)) +
geom_histogram(bins = 40, alpha = 0.6, fill = "purple") +
theme_minimal() +
labs(title = "Distribution of Regression Residuals", x = "Residuals", y = "Count")
} else {
cat("Regression model not found; skipping regression visualization.\n")
}
4)K-Means Clustering
# Select numeric columns only
num_cols <- final_dataset_master %>% select(where(is.numeric))
# Replace NA values with median
num_cols <- num_cols %>% mutate_all(~ ifelse(is.na(.), median(., na.rm = TRUE), .))
# Normalize data
num_norm <- as.data.frame(scale(num_cols))
# K-Means with 3 clusters
set.seed(123)
kmeans_model <- kmeans(num_norm, centers = 3, nstart = 25)
# Add cluster labels
kmeans_df <- num_norm %>% mutate(Cluster = as.factor(kmeans_model$cluster))
# PCA for visualization
pca_km <- prcomp(num_norm, scale. = TRUE)
pca_data <- data.frame(PC1 = pca_km$x[,1],
PC2 = pca_km$x[,2],
Cluster = kmeans_df$Cluster)
# Visualization
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(size = 3, alpha = 0.8) +
theme_minimal() +
labs(title = "K-Means Clustering (k = 3)",
x = "Principal Component 1",
y = "Principal Component 2")
PREDICTIVE ANALYSIS:- 1)Predict Points (Linear Regression with train/test)
if (has_col("pts") && length(predictors_available) >= 2) {
model_df <- final_dataset_master %>% select(all_of(c("pts", predictors_available))) %>% na.omit()
set.seed(123)
idx <- sample(seq_len(nrow(model_df)), size = floor(0.8 * nrow(model_df)))
train_df <- model_df[idx, ]
test_df <- model_df[-idx, ]
lm_fit <- lm(pts ~ ., data = train_df)
preds <- predict(lm_fit, newdata = test_df)
rmse <- sqrt(mean((test_df$pts - preds)^2))
cat("Linear regression RMSE on test set:", round(rmse, 3), "\n")
print(summary(lm_fit))
} else {
cat("Insufficient data to train points prediction model.\n")
}
## Linear regression RMSE on test set: 1.81
##
## Call:
## lm(formula = pts ~ ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.5060 -0.9443 -0.1436 0.7196 10.4911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.628526 0.107832 -80.018 < 2e-16 ***
## ast 0.077700 0.014751 5.267 1.41e-07 ***
## reb 0.045818 0.012223 3.748 0.000179 ***
## MP.x 0.412386 0.003714 111.047 < 2e-16 ***
## X3P. 1.133206 0.114419 9.904 < 2e-16 ***
## X2P. 0.307272 0.185433 1.657 0.097538 .
## FT. -0.475194 0.098132 -4.842 1.30e-06 ***
## USG. 0.354561 0.003656 96.968 < 2e-16 ***
## PER 0.119451 0.004276 27.934 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.772 on 10703 degrees of freedom
## Multiple R-squared: 0.9138, Adjusted R-squared: 0.9138
## F-statistic: 1.419e+04 on 8 and 10703 DF, p-value: < 2.2e-16
2)KNN
library(class)
library(dplyr)
library(ggplot2)
# 1. Prepare dataset
required_cols <- c("Pos.x", "pts", "ast", "TRB.")
missing_cols <- setdiff(required_cols, colnames(final_dataset_master))
if (length(missing_cols) > 0) {
stop(paste("Missing required columns:", paste(missing_cols, collapse = ", ")))
}
final_dataset_master_knn <- final_dataset_master %>%
filter(!is.na(Pos.x), !is.na(pts), !is.na(ast), !is.na(TRB.))
# Select numeric features
features <- final_dataset_master_knn %>% select(pts, ast, TRB.)
# Manual normalization function
normalize <- function(x) {
if (min(x) == max(x)) return(rep(0.5, length(x)))
(x - min(x)) / (max(x) - min(x))
}
# Normalize
features_norm <- as.data.frame(lapply(features, normalize))
# Target variable
labels <- final_dataset_master_knn$Pos.x
# 2. Train-test split
set.seed(123)
index <- sample(1:nrow(features_norm), 0.8 * nrow(features_norm))
train_X <- features_norm[index, ]
test_X <- features_norm[-index, ]
train_y <- labels[index]
test_y <- labels[-index]
# 3. Run KNN Model
knn_pred <- knn(
train = train_X,
test = test_X,
cl = train_y,
k = 5
)
# Accuracy
accuracy <- mean(knn_pred == test_y)
print(paste("KNN Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "KNN Accuracy: 51.06 %"
# Confusion Matrix
print(table(Predicted = knn_pred, Actual = test_y))
## Actual
## Predicted C PF PG SF SG
## C 310 216 2 58 13
## PF 179 204 6 95 42
## PG 3 6 397 28 121
## SF 26 97 17 187 132
## SG 17 36 82 135 270
# 4. Visualization 1: Predicted Classes
plot_df <- data.frame(
pts = test_X$pts,
ast = test_X$ast,
pos_actual = test_y,
pos_pred = knn_pred
)
p1 <- ggplot(plot_df, aes(x = pts, y = ast, color = pos_pred)) +
geom_point(size = 3, alpha = 0.7) +
theme_minimal() +
labs(
title = "KNN Classification (Predicted Player Positions)",
x = "Normalized Points (PTS)",
y = "Normalized Assists (AST)",
color = "Predicted Position"
)
print(p1)
# 5. Visualization 2: Correct vs Incorrect
plot_df$correct <- ifelse(plot_df$pos_pred == plot_df$pos_actual,
"Correct", "Incorrect")
p2 <- ggplot(plot_df, aes(x = pts, y = ast, color = correct)) +
geom_point(size = 3, alpha = 0.9) +
scale_color_manual(values = c("Correct" = "green", "Incorrect" = "red")) +
theme_minimal() +
labs(
title = "KNN Accuracy: Correct vs Incorrect Predictions",
x = "Normalized Points (PTS)",
y = "Normalized Assists (AST)"
)
print(p2)
PRESCRIPTIVE ANALYSIS:- 1)What-If Analysis (simulate +10% minutes or +5% shooting)
if (exists("lm_fit")) {
sim_row <- na.omit(final_dataset_master %>% select(all_of(c("pts", predictors_available))) %>% slice(1))
sim_row <- as.data.frame(sim_row)
cat("Baseline prediction:\n"); print(predict(lm_fit, newdata = sim_row))
if ("MP.x" %in% predictors_available) {
sim_inc <- sim_row; sim_inc$MP.x <- sim_inc$MP.x * 1.10
cat("\n+10% Minutes prediction:\n"); print(predict(lm_fit, newdata = sim_inc))
}
if ("X3P." %in% predictors_available) {
sim_inc2 <- sim_row; sim_inc2$X3P. <- pmin(sim_inc2$X3P. * 1.05, 1)
cat("\n+5% 3P% prediction:\n"); print(predict(lm_fit, newdata = sim_inc2))
}
} else {
cat("No regression model available for what-if analysis.\n")
}
## Baseline prediction:
## 1
## 24.93009
##
## +10% Minutes prediction:
## 1
## 26.66211
##
## +5% 3P% prediction:
## 1
## 24.94822
2)Efficiency Ranking (custom score)
eff_cols <- intersect(c("pts","reb","ast","stl","blk","TOV.","PER","USG."),
names(final_dataset_master))
if (length(eff_cols) == 0) {
eff_cols <- intersect(c("pts","reb","ast","PER","USG."),
names(final_dataset_master))
}
if (length(eff_cols) > 0) {
# scale numeric columns manually (z-score)
scaled <- as.data.frame(scale(final_dataset_master[, eff_cols],
center = TRUE, scale = TRUE))
eff_score <- rowSums(replace(scaled, is.na(scaled), 0))
final_dataset_master$Efficiency_Score <- eff_score
knitr::kable(
head(
final_dataset_master %>%
arrange(desc(Efficiency_Score)) %>%
select(any_of(c("normalized_name", "team", "Efficiency_Score")))
, 20)
)
} else {
cat("No stats available to compute efficiency score.\n")
}
| normalized_name | Efficiency_Score |
|---|---|
| Russell Westbrook | 18.77880 |
| DeAndre Liggins | 18.21036 |
| Naz Mitrou-Long | 18.20062 |
| James Harden | 16.47019 |
| Nikola Jokic | 16.42779 |
| Joel Embiid | 16.38515 |
| Luka Doncic | 16.31151 |
| Giannis Antetokounmpo | 15.96270 |
| James Harden | 15.82265 |
| Jackie Butler | 15.69764 |
| Nikola Jokic | 15.41107 |
| Luka Doncic | 15.37704 |
| Giannis Antetokounmpo | 15.33708 |
| Russell Westbrook | 15.23959 |
| Nikola Jokic | 15.17979 |
| Luka Doncic | 15.06546 |
| Luka Doncic | 14.93069 |
| Russell Westbrook | 14.78594 |
| Russell Westbrook | 14.74759 |
| Giannis Antetokounmpo | 14.73592 |