NBA Player Stats

library(readxl)
# Load the master dataset from the local Excel workbook.
# NOTE(review): the absolute path is machine-specific; adjust it when running
# on another machine.
final_dataset_master <- read_excel("C:/Users/ashis/Downloads/final_dataset_master.xlsx")

# Open the spreadsheet-style viewer only in interactive sessions; View() has
# nothing useful to show (and may fail) when the script is knitted or run in
# batch mode.
if (interactive()) {
  View(final_dataset_master)
}

Load Libraries

# Libraries

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(GGally)

## Warning: package 'GGally' was built under R version 4.4.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

library(cluster)
library(tidyr)
library(scales)

## Warning: package 'scales' was built under R version 4.4.3

library(stats)
library(proxy)

## Warning: package 'proxy' was built under R version 4.4.3

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

Show the number of rows and columns, and list the column names

# Report the dataset dimensions on one line, followed by a blank line.
cat(sprintf(
  "Rows: %d Cols: %d \n\n",
  nrow(final_dataset_master),
  ncol(final_dataset_master)
))

## Rows: 13391 Cols: 36

# List every column name in the dataset.
names(final_dataset_master)

##  [1] "normalized_name" "age"             "player_height"   "player_weight"  
##  [5] "college"         "country"         "draft_year"      "draft_round"    
##  [9] "draft_number"    "pts"             "reb"             "ast"            
## [13] "season"          "Pos.x"           "MP.x"            "G.x"            
## [17] "eFG."            "X3P"             "X3PA"            "X3P."           
## [21] "X3PAr"           "X2P"             "X2PA"            "X2P."           
## [25] "FT"              "FTA"             "FT."             "PER"            
## [29] "TS."             "TRB."            "AST."            "TOV."           
## [33] "USG."            "WS"              "VORP"            "BPM"

Data Cleaning

# Standardize column names so every column is a syntactically valid R name.
# `unique = TRUE` keeps two source headers that sanitize to the same name
# (e.g. "FT%" and "FT.") from ending up as duplicate column names.

names(final_dataset_master) <- make.names(
  names(final_dataset_master),
  unique = TRUE
)

# Helper: check whether columns exist in a data frame.
#
# x    Character vector of column names to look up.
# data Data frame to inspect; defaults to `final_dataset_master`, looked up at
#      call time so the helper always sees the current columns.
#
# Returns a logical vector the same length as `x` (TRUE where present).

has_col <- function(x, data = final_dataset_master) {
  x %in% names(data)
}

# Preview the first eight rows as a formatted table

final_dataset_master %>%
  head(n = 8) %>%
  knitr::kable()

normalized_name	age	player_height	player_weight	college	country	draft_year	draft_round	draft_number	pts	reb	ast	season	Pos.x	MP.x	G.x	eFG.	X3P	X3PA	X3P.	X3PAr	X2P	X2PA	X2P.	FT	FTA	FT.	PER	TS.	TRB.	AST.	TOV.	USG.	WS	VORP	BPM
Allen Iverson	26	183.58	74.61	Georgetown	USA	1996	1	1	31.1	3.8	4.6	2000-01	SG	42.0	71	0.447	1.4	4.3	0.320	0.169	9.4	21.2	0.441	8.2	10.1	0.814	24.0	0.518	5.2	23.0	10.0	35.9	11.8	6.1	6.1
Jerry Stackhouse	26	198.15	99.23	North Carolina	USA	1995	1	3	29.8	3.9	5.1	2000-01	SG	40.2	80	0.445	2.1	5.9	0.351	0.245	7.6	18.2	0.418	8.3	10.1	0.822	21.8	0.521	5.2	25.8	12.5	35.2	9.2	5.8	5.1
Shaquille O’Neal	29	216.31	142.56	Louisiana State	USA	1992	1	1	28.7	12.7	3.7	2000-01	C	39.5	74	0.572	0.0	0.0	0.000	0.001	11.0	19.2	0.573	6.7	13.1	0.513	30.2	0.574	18.1	18.8	10.5	31.6	14.9	7.1	7.7
Kobe Bryant	22	201.60	95.20	None	USA	1996	1	13	28.5	5.9	5.0	2000-01	SG	40.9	68	0.484	0.9	2.9	0.305	0.132	9.4	19.3	0.489	7.0	8.2	0.853	24.5	0.552	8.1	23.0	11.1	31.8	11.3	4.7	4.8
Vince Carter	24	198.64	102.34	North Carolina	USA	1998	1	5	27.6	5.5	3.9	2000-01	SF	39.7	75	0.509	2.2	5.3	0.408	0.240	8.0	16.8	0.477	5.1	6.7	0.765	25.0	0.551	7.9	19.2	8.2	30.7	12.9	7.2	7.6
Chris Webber	28	208.72	111.14	Michigan	USA	1993	1	1	27.1	11.1	4.2	2000-01	PF	40.5	70	0.481	0.0	0.4	0.071	0.017	11.2	23.0	0.488	4.6	6.6	0.703	24.7	0.516	14.8	20.6	9.6	31.6	11.0	5.3	5.5
Tracy McGrady	22	204.02	95.02	None	USA	1997	1	9	26.8	7.5	4.6	2000-01	SG	40.1	77	0.474	0.8	2.2	0.355	0.096	9.5	20.2	0.468	5.6	7.6	0.733	24.9	0.521	10.4	22.8	9.1	31.2	12.2	7.0	7.0
Paul Pierce	23	198.30	104.59	Kansas	USA	1998	1	10	25.3	6.4	3.1	2000-01	SF	38.0	82	0.503	1.8	4.7	0.383	0.254	6.6	13.8	0.478	6.7	9.0	0.745	22.3	0.563	9.8	16.9	12.5	30.6	10.4	5.0	4.4

DESCRIPTIVE ANALYSIS: 1) Position-wise Average Stats

# Resolve the position column: prefer 'Pos.x', fall back to 'Pos', else NA.
# Plain if/else is used because the choice is a single scalar, not a vector.

pos_col <- if (has_col("Pos.x")) {
  "Pos.x"
} else if (has_col("Pos")) {
  "Pos"
} else {
  NA
}

# Stats to summarise, restricted to the columns actually present in the data.
num_stats <- intersect(
  c("pts", "reb", "ast", "MP.x", "PER", "USG.", "TS.", "WS", "VORP", "BPM"),
  names(final_dataset_master)
)

if (!is.na(pos_col)) {
  # Mean of each stat per position, rounded to two decimals. Rows with a
  # missing position are dropped; NAs within a stat are ignored.
  desc_by_pos <- final_dataset_master %>%
    filter(!is.na(.data[[pos_col]])) %>%
    group_by(Position = .data[[pos_col]]) %>%
    summarise(across(all_of(num_stats), ~ round(mean(.x, na.rm = TRUE), 2)))
  knitr::kable(desc_by_pos)
} else {
  cat("Position column not found (looked for 'Pos.x' or 'Pos'). Skipping position-wise summary.\n")
}

Position	pts	reb	ast	MP.x	PER	USG.	TS.	WS	VORP	BPM
C	6.97	5.02	0.94	18.05	13.93	17.36	0.53	2.77	0.51	-2.01
PF	8.02	4.55	1.22	19.70	13.29	18.31	0.52	2.70	0.58	-1.70
PG	8.72	2.30	3.66	21.24	12.90	19.80	0.50	2.51	0.76	-1.30
SF	8.53	3.36	1.51	21.14	11.94	18.30	0.51	2.46	0.62	-1.56
SG	8.92	2.49	1.91	20.84	11.73	19.51	0.51	2.24	0.56	-1.79

2)Top 10 Players per Season (scorers, rebounders, assisters)

# Resolve the season column: prefer 'season', fall back to 'Season', else NA.
season_col <- if (has_col("season")) {
  "season"
} else if (has_col("Season")) {
  "Season"
} else {
  NA
}

# Columns the leaderboards cannot be built without.
leaderboard_cols <- c("normalized_name", "pts", "reb", "ast")

if (!is.na(season_col) && all(has_col(leaderboard_cols))) {

  # Only the three most recent seasons (by sorted season label) are shown.
  seasons <- sort(unique(final_dataset_master[[season_col]]))
  seasons_to_show <- tail(seasons, 3)

  # Stat column -> heading used in the printed title. One loop replaces three
  # copies of the same arrange/select/head pipeline.
  leaderboards <- c(pts = "Scorers", reb = "Rebounders", ast = "Assisters")

  for (s in seasons_to_show) {
    cat("\n### Season:", s, "\n")

    df_s <- final_dataset_master %>% filter(.data[[season_col]] == s)

    for (stat in names(leaderboards)) {
      cat("\nTop 10 ", leaderboards[[stat]], ":\n", sep = "")
      # any_of() tolerates the optional 'team' column being absent.
      print(df_s %>%
              arrange(desc(.data[[stat]])) %>%
              select(any_of(c("normalized_name", "team", stat))) %>%
              head(10))
    }
  }

} else {
  cat("Missing required columns (season, normalized_name, pts, reb, ast).\n")
}

## 
## ### Season: 2021-22 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name         pts
##    <chr>                 <dbl>
##  1 Joel Embiid            30.6
##  2 LeBron James           30.3
##  3 Kevin Durant           29.9
##  4 Giannis Antetokounmpo  29.9
##  5 Trae Young             28.4
##  6 Luka Doncic            28.4
##  7 DeMar DeRozan          27.9
##  8 Kyrie Irving           27.4
##  9 Ja Morant              27.4
## 10 Nikola Jokic           27.1
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Rudy Gobert            14.7
##  2 Nikola Jokic           13.8
##  3 Domantas Sabonis       12.1
##  4 Jaylen Hoard           12  
##  5 Clint Capela           11.9
##  6 Joel Embiid            11.7
##  7 Giannis Antetokounmpo  11.6
##  8 Jonas Valanciunas      11.4
##  9 Jusuf Nurkic           11.1
## 10 Nikola Vucevic         11  
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 Chris Paul         10.8
##  2 James Harden       10.3
##  3 Trae Young          9.7
##  4 Dejounte Murray     9.2
##  5 Luka Doncic         8.7
##  6 Darius Garland      8.6
##  7 Tyrese Haliburton   8.2
##  8 Nikola Jokic        7.9
##  9 LaMelo Ball         7.6
## 10 Kyle Lowry          7.5
## 
## ### Season: 2022-23 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name           pts
##    <chr>                   <dbl>
##  1 Joel Embiid              33.1
##  2 Luka Doncic              32.4
##  3 Damian Lillard           32.2
##  4 Shai Gilgeous-Alexander  31.4
##  5 Giannis Antetokounmpo    31.1
##  6 Jayson Tatum             30.1
##  7 Stephen Curry            29.4
##  8 Kevin Durant             29.1
##  9 LeBron James             28.9
## 10 Donovan Mitchell         28.3
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Anthony Davis          12.5
##  2 Domantas Sabonis       12.3
##  3 Giannis Antetokounmpo  11.8
##  4 Nikola Jokic           11.8
##  5 Rudy Gobert            11.6
##  6 Steven Adams           11.5
##  7 Nikola Vucevic         11  
##  8 Clint Capela           11  
##  9 Joel Embiid            10.2
## 10 Jonas Valanciunas      10.2
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 James Harden       10.7
##  2 Tyrese Haliburton  10.4
##  3 Trae Young         10.2
##  4 Nikola Jokic        9.8
##  5 Chris Paul          8.9
##  6 LaMelo Ball         8.4
##  7 Skylar Mays         8.3
##  8 Ja Morant           8.1
##  9 Luka Doncic         8  
## 10 Darius Garland      7.8
## 
## ### Season: 2023-24 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name           pts
##    <chr>                   <dbl>
##  1 Joel Embiid              34.7
##  2 Luka Doncic              33.9
##  3 Giannis Antetokounmpo    30.4
##  4 Shai Gilgeous-Alexander  30.1
##  5 Jalen Brunson            28.7
##  6 Devin Booker             27.1
##  7 Kevin Durant             27.1
##  8 Jayson Tatum             26.9
##  9 De'Aaron Fox             26.6
## 10 Donovan Mitchell         26.6
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Domantas Sabonis       13.7
##  2 Rudy Gobert            12.9
##  3 Anthony Davis          12.6
##  4 Nikola Jokic           12.4
##  5 Jalen Duren            11.6
##  6 Giannis Antetokounmpo  11.5
##  7 Deandre Ayton          11.1
##  8 Joel Embiid            11  
##  9 Jusuf Nurkic           11  
## 10 Victor Wembanyama      10.6
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 Tyrese Haliburton  10.9
##  2 Trae Young         10.8
##  3 Luka Doncic         9.8
##  4 Nikola Jokic        9  
##  5 James Harden        8.5
##  6 LeBron James        8.3
##  7 Domantas Sabonis    8.2
##  8 Ja Morant           8.1
##  9 Fred VanVleet       8.1
## 10 LaMelo Ball         8

3)Trend over Time (Average PTS per season)

# Average points per game for each season, drawn as a line chart.
if (is.na(season_col) || !has_col("pts")) {
  cat("Season or pts column missing; cannot compute trend.\n")
} else {
  # One row per season: mean points (NAs ignored) and the number of rows.
  trend <- final_dataset_master %>%
    group_by(Season = .data[[season_col]]) %>%
    summarise(Avg_PTS = mean(pts, na.rm = TRUE), N = n()) %>%
    arrange(Season)

  # group = 1 joins all seasons with a single line on the discrete x axis.
  ggplot(trend, aes(x = as.character(Season), y = Avg_PTS, group = 1)) +
    geom_line() +
    geom_point() +
    labs(
      title = "Average Points per Game by Season",
      x = "Season",
      y = "Average PTS"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

4) Descriptive Table (combined summary table for each team or position)

# Combined summary table: per team when a 'team' column exists, otherwise the
# per-position table computed earlier (`desc_by_pos`).
if (has_col("team")) {
  desc_by_team <- final_dataset_master %>%
    group_by(team) %>%
    summarise(
      across(all_of(num_stats), \(col) round(mean(col, na.rm = TRUE), 2)),
      Count = n()
    ) %>%
    arrange(desc(Count))

  # Show at most the 25 teams with the most rows.
  desc_by_team %>%
    head(25) %>%
    knitr::kable()
} else if (!is.na(pos_col)) {
  desc_by_pos %>%
    head(25) %>%
    knitr::kable()
} else {
  cat("Neither 'team' nor position column found.\n")
}

Position	pts	reb	ast	MP.x	PER	USG.	TS.	WS	VORP	BPM
C	6.97	5.02	0.94	18.05	13.93	17.36	0.53	2.77	0.51	-2.01
PF	8.02	4.55	1.22	19.70	13.29	18.31	0.52	2.70	0.58	-1.70
PG	8.72	2.30	3.66	21.24	12.90	19.80	0.50	2.51	0.76	-1.30
SF	8.53	3.36	1.51	21.14	11.94	18.30	0.51	2.46	0.62	-1.56
SG	8.92	2.49	1.91	20.84	11.73	19.51	0.51	2.24	0.56	-1.79

DIAGNOSTIC ANALYSIS: 1) Correlation Between Minutes & Performance

# Resolve the minutes column: prefer 'MP.x', fall back to 'MP', else NA.
min_col <- if (has_col("MP.x")) {
  "MP.x"
} else if (has_col("MP")) {
  "MP"
} else {
  NA
}

if (!is.na(min_col) && has_col("pts") && has_col("ast")) {
  # Pearson correlations over complete pairs, computed once and reused in the
  # plot titles.
  corr_pts <- cor(
    final_dataset_master[[min_col]], final_dataset_master$pts,
    use = "complete.obs"
  )
  corr_ast <- cor(
    final_dataset_master[[min_col]], final_dataset_master$ast,
    use = "complete.obs"
  )

  # Scatter plot of minutes vs points with a linear fit and its confidence band.
  p1 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = pts)) +
    geom_point(alpha = 0.4) +
    geom_smooth(method = "lm", se = TRUE) +
    labs(
      x = min_col, y = "Points",
      title = paste("Minutes vs Points (corr =", round(corr_pts, 2), ")")
    ) +
    theme_minimal()
  print(p1)

  # Same plot for assists.
  p2 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = ast)) +
    geom_point(alpha = 0.4) +
    geom_smooth(method = "lm", se = TRUE) +
    labs(
      x = min_col, y = "Assists",
      title = paste("Minutes vs Assists (corr =", round(corr_ast, 2), ")")
    ) +
    theme_minimal()
  print(p2)

  # cor.test() has no `use` argument and drops incomplete pairs on its own,
  # so nothing is passed for missing-value handling here.
  cat("Correlation (Minutes, Points):\n")
  print(cor.test(final_dataset_master[[min_col]], final_dataset_master$pts))
  cat("\nCorrelation (Minutes, Assists):\n")
  print(cor.test(final_dataset_master[[min_col]], final_dataset_master$ast))
} else {
  cat("Missing minutes or points/assists column.\n")
}

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## Correlation (Minutes, Points):
## 
##  Pearson's product-moment correlation
## 
## data:  final_dataset_master[[min_col]] and final_dataset_master$pts
## t = 223.43, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8843501 0.8915156
## sample estimates:
##       cor 
## 0.8879867 
## 
## 
## Correlation (Minutes, Assists):
## 
##  Pearson's product-moment correlation
## 
## data:  final_dataset_master[[min_col]] and final_dataset_master$ast
## t = 107.18, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6703204 0.6885551
## sample estimates:
##       cor 
## 0.6795427

2)ANOVA Across Positions

# One-way ANOVA of points across positions, followed by three supporting
# plots. `pos_col` holds the position column name resolved earlier in the
# script (NA when no position column was found).

anova_ready <- !is.na(pos_col) &&
  all(c(pos_col, "pts") %in% names(final_dataset_master))

if (!anova_ready) {
  cat("Position or pts missing; cannot run ANOVA.\n")
} else {

  # Keep only rows where both the position and the points value are known.
  nba_for_anova <- filter(
    final_dataset_master,
    !is.na(.data[[pos_col]]),
    !is.na(pts)
  )

  # ANOVA model: pts explained by position
  anova_formula <- as.formula(paste0("pts ~ ", pos_col))
  aov_model <- aov(anova_formula, data = nba_for_anova)
  print(summary(aov_model))

  # --- Visualization 1: boxplot of points per position ---
  p1 <- ggplot(
    nba_for_anova,
    aes(x = .data[[pos_col]], y = pts, fill = .data[[pos_col]])
  ) +
    geom_boxplot(alpha = 0.7) +
    labs(
      title = "ANOVA: Points Across Positions",
      x = "Player Position",
      y = "Points Per Game"
    ) +
    theme_minimal()
  print(p1)

  # --- Visualization 2: mean with standard-error bars ---
  # se is the standard error of the mean: sd / sqrt(group size).
  mean_se_table <- nba_for_anova %>%
    group_by(.data[[pos_col]]) %>%
    summarise(
      mean_pts = mean(pts),
      se = sd(pts) / sqrt(n())
    )

  p2 <- ggplot(
    mean_se_table,
    aes(x = .data[[pos_col]], y = mean_pts, fill = .data[[pos_col]])
  ) +
    geom_col(alpha = 0.8) +
    geom_errorbar(
      aes(ymin = mean_pts - se, ymax = mean_pts + se),
      width = 0.2
    ) +
    labs(
      title = "Mean Points ± SE by Position",
      x = "Position",
      y = "Mean PTS"
    ) +
    theme_minimal()
  print(p2)

  # --- Visualization 3: overlaid density curves per position ---
  p3 <- ggplot(nba_for_anova, aes(x = pts, fill = .data[[pos_col]])) +
    geom_density(alpha = 0.5) +
    labs(
      title = "Density Plot of Points by Position",
      x = "Points",
      y = "Density"
    ) +
    theme_minimal()
  print(p3)

}

##                Df Sum Sq Mean Sq F value Pr(>F)    
## Pos.x           4   6616  1653.9   45.74 <2e-16 ***
## Residuals   13386 484022    36.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3)Regression: Impact of Variables on Points

# Candidate predictors of points, restricted to the columns that exist.
predictors <- c("ast", "reb", "MP.x", "X3P.", "X2P.", "FT.", "USG.", "PER")
predictors_available <- intersect(predictors, names(final_dataset_master))

if (length(predictors_available) >= 2 && has_col("pts")) {
  # Build "pts ~ a + b + ..." from whichever predictors are available and fit
  # an ordinary least-squares model on the full dataset.
  formula_text <- paste("pts ~", paste(predictors_available, collapse = " + "))
  lm_model <- lm(as.formula(formula_text), data = final_dataset_master)
  print(summary(lm_model))

  # Draw the standard lm diagnostic plots on a 2x2 grid, then restore the
  # graphics layout that was in effect before rather than forcing 1x1.
  old_par <- par(mfrow = c(2, 2))
  plot(lm_model)
  par(old_par)
} else {
  cat("Not enough predictors available. Available:", paste(predictors_available, collapse = ", "), "\n")
}

## 
## Call:
## lm(formula = as.formula(formula_text), data = final_dataset_master)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5396  -0.9444  -0.1399   0.7236  10.5421 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.646906   0.095594 -90.455  < 2e-16 ***
## ast          0.074307   0.013318   5.579 2.46e-08 ***
## reb          0.036242   0.010946   3.311 0.000932 ***
## MP.x         0.415495   0.003350 124.028  < 2e-16 ***
## X3P.         0.988811   0.102826   9.616  < 2e-16 ***
## X2P.         0.374843   0.164344   2.281 0.022573 *  
## FT.         -0.408880   0.088738  -4.608 4.11e-06 ***
## USG.         0.351234   0.003185 110.263  < 2e-16 ***
## PER          0.120612   0.003617  33.345  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.779 on 13382 degrees of freedom
## Multiple R-squared:  0.9136, Adjusted R-squared:  0.9136 
## F-statistic: 1.77e+04 on 8 and 13382 DF,  p-value: < 2.2e-16

# Visual checks of the full-data regression (`lm_model`): actual vs predicted,
# residuals vs predicted, and the residual distribution.
if (exists("lm_model")) {

  # Add predictions and residuals. Residuals are derived as actual minus
  # predicted so the column always has one value per row; `lm_model$residuals`
  # is shorter than the data whenever lm() dropped rows with missing values.
  reg_df <- final_dataset_master %>%
    mutate(
      pred_pts = predict(lm_model, newdata = final_dataset_master),
      residuals = pts - pred_pts
    ) %>%
    filter(!is.na(pred_pts), !is.na(pts))

  # Each plot is printed explicitly: inside an `if` block only the value of
  # the last expression would be auto-printed, hiding the first two plots.

  # Actual vs Predicted Plot
  actual_vs_pred_plot <- ggplot(reg_df, aes(x = pts, y = pred_pts)) +
    geom_point(alpha = 0.6, color = "blue") +
    geom_smooth(method = "lm") +
    theme_minimal() +
    labs(
      title = "Actual vs Predicted Points",
      x = "Actual PTS", y = "Predicted PTS"
    )
  print(actual_vs_pred_plot)

  # Residual Plot
  residual_plot <- ggplot(reg_df, aes(x = pred_pts, y = residuals)) +
    geom_point(alpha = 0.6, color = "red") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    theme_minimal() +
    labs(title = "Residual Plot", x = "Predicted PTS", y = "Residuals")
  print(residual_plot)

  # Residual Distribution
  residual_hist <- ggplot(reg_df, aes(x = residuals)) +
    geom_histogram(bins = 40, alpha = 0.6, fill = "purple") +
    theme_minimal() +
    labs(
      title = "Distribution of Regression Residuals",
      x = "Residuals", y = "Count"
    )
  print(residual_hist)

} else {
  cat("Regression model not found; skipping regression visualization.\n")
}

4)K-Means Clustering

# Select numeric columns only and replace NA values with the column median.
# across() replaces the superseded mutate_all(); replace() fills only the
# missing entries.
num_cols <- final_dataset_master %>%
  select(where(is.numeric)) %>%
  mutate(across(
    everything(),
    ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))
  ))

# Drop columns that cannot be standardized: a constant column has sd 0 and an
# all-NA column has sd NA, and either one would leave NaN/NA values after
# scale() that kmeans() rejects.
is_usable <- vapply(
  num_cols,
  function(col) {
    col_sd <- sd(col)
    !is.na(col_sd) && col_sd > 0
  },
  logical(1)
)
if (!all(is_usable)) {
  message(
    "Dropping columns with no usable variance: ",
    paste(names(num_cols)[!is_usable], collapse = ", ")
  )
}
num_cols <- num_cols[, is_usable, drop = FALSE]

# Normalize data (z-scores: mean 0, sd 1 per column)
num_norm <- as.data.frame(scale(num_cols))

# K-Means with 3 clusters; the seed fixes the 25 random starts
set.seed(123)
kmeans_model <- kmeans(num_norm, centers = 3, nstart = 25)

# Add cluster labels
kmeans_df <- num_norm %>% mutate(Cluster = as.factor(kmeans_model$cluster))

# PCA for visualization: project onto the first two principal components
pca_km <- prcomp(num_norm, scale. = TRUE)
pca_data <- data.frame(
  PC1 = pca_km$x[, 1],
  PC2 = pca_km$x[, 2],
  Cluster = kmeans_df$Cluster
)

# Visualization
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_minimal() +
  labs(
    title = "K-Means Clustering (k = 3)",
    x = "Principal Component 1",
    y = "Principal Component 2"
  )

PREDICTIVE ANALYSIS: 1) Predict Points (Linear Regression with a train/test split)

# Hold-out evaluation of the points regression: fit on 80% of the complete
# rows and report RMSE on the remaining 20%.
can_model <- has_col("pts") && length(predictors_available) >= 2

if (!can_model) {
  cat("Insufficient data to train points prediction model.\n")
} else {
  # Keep the response plus predictors and drop any row with a missing value.
  model_vars <- c("pts", predictors_available)
  model_df <- na.omit(select(final_dataset_master, all_of(model_vars)))

  # Reproducible 80/20 split by row index.
  set.seed(123)
  n_rows <- nrow(model_df)
  idx <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
  train_df <- model_df[idx, ]
  test_df <- model_df[-idx, ]

  # `pts ~ .` regresses points on every other column of train_df.
  lm_fit <- lm(pts ~ ., data = train_df)
  preds <- predict(lm_fit, newdata = test_df)

  # Root mean squared error on the held-out rows.
  test_errors <- test_df$pts - preds
  rmse <- sqrt(mean(test_errors^2))
  cat("Linear regression RMSE on test set:", round(rmse, 3), "\n")
  print(summary(lm_fit))
}

## Linear regression RMSE on test set: 1.81 
## 
## Call:
## lm(formula = pts ~ ., data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5060  -0.9443  -0.1436   0.7196  10.4911 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.628526   0.107832 -80.018  < 2e-16 ***
## ast          0.077700   0.014751   5.267 1.41e-07 ***
## reb          0.045818   0.012223   3.748 0.000179 ***
## MP.x         0.412386   0.003714 111.047  < 2e-16 ***
## X3P.         1.133206   0.114419   9.904  < 2e-16 ***
## X2P.         0.307272   0.185433   1.657 0.097538 .  
## FT.         -0.475194   0.098132  -4.842 1.30e-06 ***
## USG.         0.354561   0.003656  96.968  < 2e-16 ***
## PER          0.119451   0.004276  27.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.772 on 10703 degrees of freedom
## Multiple R-squared:  0.9138, Adjusted R-squared:  0.9138 
## F-statistic: 1.419e+04 on 8 and 10703 DF,  p-value: < 2.2e-16

2)KNN

library(class)
library(dplyr)
library(ggplot2)

# 1. Prepare dataset

# Position label plus the three numeric features used by the classifier.
required_cols <- c("Pos.x", "pts", "ast", "TRB.")

# Stop early, naming every absent column, if the data lacks any of them.
missing_cols <- required_cols[!required_cols %in% names(final_dataset_master)]
if (length(missing_cols) > 0) {
  stop("Missing required columns: ", toString(missing_cols))
}

# Keep only rows where the label and all features are present.
final_dataset_master_knn <- final_dataset_master %>%
  filter(if_all(all_of(required_cols), ~ !is.na(.x)))

# Select numeric features (everything required except the label)
features <- final_dataset_master_knn %>%
  select(all_of(setdiff(required_cols, "Pos.x")))

# Manual normalization function: min-max rescale a numeric vector to [0, 1].
#
# x  Numeric vector; may contain NA.
#
# Returns a numeric vector the same length as `x`. Missing values stay
# missing and are ignored when finding the range. When every non-missing
# value is identical the range is zero, so those entries are set to 0.5
# instead of dividing by zero.
normalize <- function(x) {
  observed <- x[!is.na(x)]

  # Nothing to scale: empty input or all values missing.
  if (length(observed) == 0L) {
    return(rep(NA_real_, length(x)))
  }

  lo <- min(observed)
  hi <- max(observed)

  if (lo == hi) {
    flat <- rep(0.5, length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }

  (x - lo) / (hi - lo)
}

# Normalize each feature column to [0, 1]
# NOTE(review): scaling uses the min/max of the full dataset before the
# train/test split below, so test rows influence the scaling. Consider
# deriving the range from the training rows only.
features_norm <- as.data.frame(lapply(features, normalize))

# Target variable
labels <- final_dataset_master_knn$Pos.x

# 2. Train-test split

# Reproducible 80/20 split. seq_len() stays correct for zero rows (1:0 would
# count down) and floor() makes the sample size an explicit whole number.
set.seed(123)
n_obs <- nrow(features_norm)
index <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))

train_X <- features_norm[index, ]
test_X  <- features_norm[-index, ]
train_y <- labels[index]
test_y  <- labels[-index]

# 3. Run KNN Model

# Classify each test row by majority vote among its 5 nearest training rows.
knn_pred <- knn(
  train = train_X,
  test  = test_X,
  cl    = train_y,
  k = 5
)

# Accuracy: share of test rows whose predicted position matches the actual one
accuracy <- mean(knn_pred == test_y)
print(paste("KNN Accuracy:", round(accuracy * 100, 2), "%"))

## [1] "KNN Accuracy: 51.06 %"

# Confusion matrix: predicted position (rows) against actual position (columns)
confusion <- table(Predicted = knn_pred, Actual = test_y)
print(confusion)

##          Actual
## Predicted   C  PF  PG  SF  SG
##        C  310 216   2  58  13
##        PF 179 204   6  95  42
##        PG   3   6 397  28 121
##        SF  26  97  17 187 132
##        SG  17  36  82 135 270

# 4. Visualization 1: Predicted Classes

# Test-set rows with their normalized features and actual/predicted positions.
plot_df <- data.frame(
  pts = test_X[["pts"]],
  ast = test_X[["ast"]],
  pos_actual = test_y,
  pos_pred = knn_pred
)

# Scatter of the two plotted features, coloured by the predicted position.
predicted_labels <- labs(
  title = "KNN Classification (Predicted Player Positions)",
  x = "Normalized Points (PTS)",
  y = "Normalized Assists (AST)",
  color = "Predicted Position"
)

p1 <- ggplot(plot_df, aes(x = pts, y = ast, color = pos_pred)) +
  geom_point(size = 3, alpha = 0.7) +
  predicted_labels +
  theme_minimal()
print(p1)

# 5. Visualization 2: Correct vs Incorrect

# Flag each test row by whether the predicted position equals the actual one.
is_match <- plot_df$pos_pred == plot_df$pos_actual
plot_df$correct <- ifelse(is_match, "Correct", "Incorrect")

# Fixed colours so hits are always green and misses always red.
outcome_colours <- c("Correct" = "green", "Incorrect" = "red")

p2 <- ggplot(plot_df, aes(x = pts, y = ast, color = correct)) +
  geom_point(size = 3, alpha = 0.9) +
  scale_color_manual(values = outcome_colours) +
  labs(
    title = "KNN Accuracy: Correct vs Incorrect Predictions",
    x = "Normalized Points (PTS)",
    y = "Normalized Assists (AST)"
  ) +
  theme_minimal()
print(p2)

PRESCRIPTIVE ANALYSIS: 1) What-If Analysis (simulate +10% minutes or +5% three-point percentage)

# What-if analysis on the train/test regression (`lm_fit`): predict points for
# one baseline row, then again after raising minutes by 10% and three-point
# percentage by 5% (both relative changes).
if (exists("lm_fit")) {
  # Use the first row that has no missing values. Incomplete rows are dropped
  # before slicing so an NA in the very first row cannot leave nothing to
  # predict on.
  sim_row <- final_dataset_master %>%
    select(all_of(c("pts", predictors_available))) %>%
    na.omit() %>%
    slice(1) %>%
    as.data.frame()

  if (nrow(sim_row) == 0) {
    cat("No complete row available for what-if analysis.\n")
  } else {
    cat("Baseline prediction:\n")
    print(predict(lm_fit, newdata = sim_row))

    if ("MP.x" %in% predictors_available) {
      sim_inc <- sim_row
      sim_inc$MP.x <- sim_inc$MP.x * 1.10
      cat("\n+10% Minutes prediction:\n")
      print(predict(lm_fit, newdata = sim_inc))
    }

    if ("X3P." %in% predictors_available) {
      sim_inc2 <- sim_row
      # The raised value is capped at 1.
      sim_inc2$X3P. <- pmin(sim_inc2$X3P. * 1.05, 1)
      cat("\n+5% 3P% prediction:\n")
      print(predict(lm_fit, newdata = sim_inc2))
    }
  }
} else {
  cat("No regression model available for what-if analysis.\n")
}

## Baseline prediction:
##        1 
## 24.93009 
## 
## +10% Minutes prediction:
##        1 
## 26.66211 
## 
## +5% 3P% prediction:
##        1 
## 24.94822

2)Efficiency Ranking (custom score)

# Custom efficiency score: the sum of z-scores over whichever of the listed
# stat columns exist in the data. No fallback list is needed: any shorter list
# drawn from the same names would match nothing whenever this one matches
# nothing.
eff_cols <- intersect(
  c("pts", "reb", "ast", "stl", "blk", "TOV.", "PER", "USG."),
  names(final_dataset_master)
)

if (length(eff_cols) > 0) {

  # Standardize each stat to mean 0 / sd 1 so they contribute on one scale.
  scaled <- as.data.frame(scale(
    final_dataset_master[, eff_cols],
    center = TRUE,
    scale = TRUE
  ))

  # Row-wise sum of the z-scores; missing values contribute nothing.
  # NOTE(review): every column is added with a positive sign, including
  # `TOV.`. If that column is a turnover rate where lower is better, its sign
  # should probably be flipped -- confirm against the data dictionary.
  eff_score <- rowSums(scaled, na.rm = TRUE)
  final_dataset_master$Efficiency_Score <- eff_score

  # Top 20 rows by score; any_of() tolerates a missing 'team' column.
  knitr::kable(
    head(
      final_dataset_master %>%
        arrange(desc(Efficiency_Score)) %>%
        select(any_of(c("normalized_name", "team", "Efficiency_Score"))),
      20
    )
  )

} else {
  cat("No stats available to compute efficiency score.\n")
}

normalized_name	Efficiency_Score
Russell Westbrook	18.77880
DeAndre Liggins	18.21036
Naz Mitrou-Long	18.20062
James Harden	16.47019
Nikola Jokic	16.42779
Joel Embiid	16.38515
Luka Doncic	16.31151
Giannis Antetokounmpo	15.96270
James Harden	15.82265
Jackie Butler	15.69764
Nikola Jokic	15.41107
Luka Doncic	15.37704
Giannis Antetokounmpo	15.33708
Russell Westbrook	15.23959
Nikola Jokic	15.17979
Luka Doncic	15.06546
Luka Doncic	14.93069
Russell Westbrook	14.78594
Russell Westbrook	14.74759
Giannis Antetokounmpo	14.73592

NBA Player Stats

Ashish

2025-11-13