library(readxl)
final_dataset_master <- read_excel("C:/Users/ashis/Downloads/final_dataset_master.xlsx")
View(final_dataset_master)

Load Libraries

# Libraries

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(cluster)
library(tidyr)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
library(stats)
library(proxy)
## Warning: package 'proxy' was built under R version 4.4.3
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

Show how many rows/cols and column names

cat("Rows:", nrow(final_dataset_master), "Cols:", ncol(final_dataset_master), "\n\n")
## Rows: 13391 Cols: 36
colnames(final_dataset_master)
##  [1] "normalized_name" "age"             "player_height"   "player_weight"  
##  [5] "college"         "country"         "draft_year"      "draft_round"    
##  [9] "draft_number"    "pts"             "reb"             "ast"            
## [13] "season"          "Pos.x"           "MP.x"            "G.x"            
## [17] "eFG."            "X3P"             "X3PA"            "X3P."           
## [21] "X3PAr"           "X2P"             "X2PA"            "X2P."           
## [25] "FT"              "FTA"             "FT."             "PER"            
## [29] "TS."             "TRB."            "AST."            "TOV."           
## [33] "USG."            "WS"              "VORP"            "BPM"

Data Cleaning

# Standardize column names to be safe

names(final_dataset_master) <- make.names(names(final_dataset_master))

# Helper: check if required columns exist

has_col <- function(x) x %in% names(final_dataset_master)

# Show a small preview

knitr::kable(head(final_dataset_master, 8))
normalized_name age player_height player_weight college country draft_year draft_round draft_number pts reb ast season Pos.x MP.x G.x eFG. X3P X3PA X3P. X3PAr X2P X2PA X2P. FT FTA FT. PER TS. TRB. AST. TOV. USG. WS VORP BPM
Allen Iverson 26 183.58 74.61 Georgetown USA 1996 1 1 31.1 3.8 4.6 2000-01 SG 42.0 71 0.447 1.4 4.3 0.320 0.169 9.4 21.2 0.441 8.2 10.1 0.814 24.0 0.518 5.2 23.0 10.0 35.9 11.8 6.1 6.1
Jerry Stackhouse 26 198.15 99.23 North Carolina USA 1995 1 3 29.8 3.9 5.1 2000-01 SG 40.2 80 0.445 2.1 5.9 0.351 0.245 7.6 18.2 0.418 8.3 10.1 0.822 21.8 0.521 5.2 25.8 12.5 35.2 9.2 5.8 5.1
Shaquille O’Neal 29 216.31 142.56 Louisiana State USA 1992 1 1 28.7 12.7 3.7 2000-01 C 39.5 74 0.572 0.0 0.0 0.000 0.001 11.0 19.2 0.573 6.7 13.1 0.513 30.2 0.574 18.1 18.8 10.5 31.6 14.9 7.1 7.7
Kobe Bryant 22 201.60 95.20 None USA 1996 1 13 28.5 5.9 5.0 2000-01 SG 40.9 68 0.484 0.9 2.9 0.305 0.132 9.4 19.3 0.489 7.0 8.2 0.853 24.5 0.552 8.1 23.0 11.1 31.8 11.3 4.7 4.8
Vince Carter 24 198.64 102.34 North Carolina USA 1998 1 5 27.6 5.5 3.9 2000-01 SF 39.7 75 0.509 2.2 5.3 0.408 0.240 8.0 16.8 0.477 5.1 6.7 0.765 25.0 0.551 7.9 19.2 8.2 30.7 12.9 7.2 7.6
Chris Webber 28 208.72 111.14 Michigan USA 1993 1 1 27.1 11.1 4.2 2000-01 PF 40.5 70 0.481 0.0 0.4 0.071 0.017 11.2 23.0 0.488 4.6 6.6 0.703 24.7 0.516 14.8 20.6 9.6 31.6 11.0 5.3 5.5
Tracy McGrady 22 204.02 95.02 None USA 1997 1 9 26.8 7.5 4.6 2000-01 SG 40.1 77 0.474 0.8 2.2 0.355 0.096 9.5 20.2 0.468 5.6 7.6 0.733 24.9 0.521 10.4 22.8 9.1 31.2 12.2 7.0 7.0
Paul Pierce 23 198.30 104.59 Kansas USA 1998 1 10 25.3 6.4 3.1 2000-01 SF 38.0 82 0.503 1.8 4.7 0.383 0.254 6.6 13.8 0.478 6.7 9.0 0.745 22.3 0.563 9.8 16.9 12.5 30.6 10.4 5.0 4.4

DESCRIPTIVE ANALYSIS:- 1)Position-wise Average Stats

# Use 'Pos.x' or try 'Pos' if exists

pos_col <- ifelse(has_col("Pos.x"), "Pos.x",
ifelse(has_col("Pos"), "Pos", NA))

num_stats <- intersect(c("pts","reb","ast","MP.x","PER","USG.","TS.","WS","VORP","BPM"),
names(final_dataset_master))

if (!is.na(pos_col)) {
desc_by_pos <- final_dataset_master %>%
filter(!is.na(.data[[pos_col]])) %>%
group_by(Position = .data[[pos_col]]) %>%
summarise(across(all_of(num_stats), ~ round(mean(.x, na.rm = TRUE), 2)))
knitr::kable(desc_by_pos)
} else {
cat("Position column not found (looked for 'Pos.x' or 'Pos'). Skipping position-wise summary.\n")
}
Position pts reb ast MP.x PER USG. TS. WS VORP BPM
C 6.97 5.02 0.94 18.05 13.93 17.36 0.53 2.77 0.51 -2.01
PF 8.02 4.55 1.22 19.70 13.29 18.31 0.52 2.70 0.58 -1.70
PG 8.72 2.30 3.66 21.24 12.90 19.80 0.50 2.51 0.76 -1.30
SF 8.53 3.36 1.51 21.14 11.94 18.30 0.51 2.46 0.62 -1.56
SG 8.92 2.49 1.91 20.84 11.73 19.51 0.51 2.24 0.56 -1.79

2)Top 10 Players per Season (scorers, rebounders, assisters)

season_col <- ifelse(has_col("season"), "season",
                     ifelse(has_col("Season"), "Season", NA))

if (!is.na(season_col) && has_col("normalized_name") && has_col("pts") &&
    has_col("reb") && has_col("ast")) {
  
  seasons <- sort(unique(final_dataset_master[[season_col]]))
  seasons_to_show <- tail(seasons, 3)
  
  for (s in seasons_to_show) {
    cat("\n### Season:", s, "\n")
    
    df_s <- final_dataset_master %>% filter(.data[[season_col]] == s)
    
    cat("\nTop 10 Scorers:\n")
    print(df_s %>% 
            arrange(desc(pts)) %>% 
            select(any_of(c("normalized_name", "team", "pts"))) %>% 
            head(10))
    
    cat("\nTop 10 Rebounders:\n")
    print(df_s %>% 
            arrange(desc(reb)) %>% 
            select(any_of(c("normalized_name", "team", "reb"))) %>% 
            head(10))
    
    cat("\nTop 10 Assisters:\n")
    print(df_s %>% 
            arrange(desc(ast)) %>% 
            select(any_of(c("normalized_name", "team", "ast"))) %>% 
            head(10))
  }
  
} else {
  cat("Missing required columns (season, normalized_name, pts, reb, ast).\n")
}
## 
## ### Season: 2021-22 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name         pts
##    <chr>                 <dbl>
##  1 Joel Embiid            30.6
##  2 LeBron James           30.3
##  3 Kevin Durant           29.9
##  4 Giannis Antetokounmpo  29.9
##  5 Trae Young             28.4
##  6 Luka Doncic            28.4
##  7 DeMar DeRozan          27.9
##  8 Kyrie Irving           27.4
##  9 Ja Morant              27.4
## 10 Nikola Jokic           27.1
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Rudy Gobert            14.7
##  2 Nikola Jokic           13.8
##  3 Domantas Sabonis       12.1
##  4 Jaylen Hoard           12  
##  5 Clint Capela           11.9
##  6 Joel Embiid            11.7
##  7 Giannis Antetokounmpo  11.6
##  8 Jonas Valanciunas      11.4
##  9 Jusuf Nurkic           11.1
## 10 Nikola Vucevic         11  
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 Chris Paul         10.8
##  2 James Harden       10.3
##  3 Trae Young          9.7
##  4 Dejounte Murray     9.2
##  5 Luka Doncic         8.7
##  6 Darius Garland      8.6
##  7 Tyrese Haliburton   8.2
##  8 Nikola Jokic        7.9
##  9 LaMelo Ball         7.6
## 10 Kyle Lowry          7.5
## 
## ### Season: 2022-23 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name           pts
##    <chr>                   <dbl>
##  1 Joel Embiid              33.1
##  2 Luka Doncic              32.4
##  3 Damian Lillard           32.2
##  4 Shai Gilgeous-Alexander  31.4
##  5 Giannis Antetokounmpo    31.1
##  6 Jayson Tatum             30.1
##  7 Stephen Curry            29.4
##  8 Kevin Durant             29.1
##  9 LeBron James             28.9
## 10 Donovan Mitchell         28.3
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Anthony Davis          12.5
##  2 Domantas Sabonis       12.3
##  3 Giannis Antetokounmpo  11.8
##  4 Nikola Jokic           11.8
##  5 Rudy Gobert            11.6
##  6 Steven Adams           11.5
##  7 Nikola Vucevic         11  
##  8 Clint Capela           11  
##  9 Joel Embiid            10.2
## 10 Jonas Valanciunas      10.2
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 James Harden       10.7
##  2 Tyrese Haliburton  10.4
##  3 Trae Young         10.2
##  4 Nikola Jokic        9.8
##  5 Chris Paul          8.9
##  6 LaMelo Ball         8.4
##  7 Skylar Mays         8.3
##  8 Ja Morant           8.1
##  9 Luka Doncic         8  
## 10 Darius Garland      7.8
## 
## ### Season: 2023-24 
## 
## Top 10 Scorers:
## # A tibble: 10 × 2
##    normalized_name           pts
##    <chr>                   <dbl>
##  1 Joel Embiid              34.7
##  2 Luka Doncic              33.9
##  3 Giannis Antetokounmpo    30.4
##  4 Shai Gilgeous-Alexander  30.1
##  5 Jalen Brunson            28.7
##  6 Devin Booker             27.1
##  7 Kevin Durant             27.1
##  8 Jayson Tatum             26.9
##  9 De'Aaron Fox             26.6
## 10 Donovan Mitchell         26.6
## 
## Top 10 Rebounders:
## # A tibble: 10 × 2
##    normalized_name         reb
##    <chr>                 <dbl>
##  1 Domantas Sabonis       13.7
##  2 Rudy Gobert            12.9
##  3 Anthony Davis          12.6
##  4 Nikola Jokic           12.4
##  5 Jalen Duren            11.6
##  6 Giannis Antetokounmpo  11.5
##  7 Deandre Ayton          11.1
##  8 Joel Embiid            11  
##  9 Jusuf Nurkic           11  
## 10 Victor Wembanyama      10.6
## 
## Top 10 Assisters:
## # A tibble: 10 × 2
##    normalized_name     ast
##    <chr>             <dbl>
##  1 Tyrese Haliburton  10.9
##  2 Trae Young         10.8
##  3 Luka Doncic         9.8
##  4 Nikola Jokic        9  
##  5 James Harden        8.5
##  6 LeBron James        8.3
##  7 Domantas Sabonis    8.2
##  8 Ja Morant           8.1
##  9 Fred VanVleet       8.1
## 10 LaMelo Ball         8

3)Trend over Time (Average PTS per season)

if (!is.na(season_col) && has_col("pts")) {
trend <- final_dataset_master %>%
group_by(Season = .data[[season_col]]) %>%
summarise(Avg_PTS = mean(pts, na.rm = TRUE), N = n()) %>%
arrange(Season)
ggplot(trend, aes(x = as.character(Season), y = Avg_PTS, group = 1)) +
geom_line() + geom_point() +
theme_minimal() +
labs(title = "Average Points per Game by Season", x = "Season", y = "Average PTS") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
} else {
cat("Season or pts column missing; cannot compute trend.\n")
}

4)Descriptive Table (Combined summary table for each team or position.)

if (has_col("team")) {
desc_by_team <- final_dataset_master %>%
group_by(team) %>%
summarise(across(all_of(num_stats), ~ round(mean(.x, na.rm = TRUE), 2)), Count = n()) %>%
arrange(desc(Count))
knitr::kable(head(desc_by_team, 25))
} else if (!is.na(pos_col)) {
knitr::kable(head(desc_by_pos, 25))
} else {
cat("Neither 'team' nor position column found.\n")
}
Position pts reb ast MP.x PER USG. TS. WS VORP BPM
C 6.97 5.02 0.94 18.05 13.93 17.36 0.53 2.77 0.51 -2.01
PF 8.02 4.55 1.22 19.70 13.29 18.31 0.52 2.70 0.58 -1.70
PG 8.72 2.30 3.66 21.24 12.90 19.80 0.50 2.51 0.76 -1.30
SF 8.53 3.36 1.51 21.14 11.94 18.30 0.51 2.46 0.62 -1.56
SG 8.92 2.49 1.91 20.84 11.73 19.51 0.51 2.24 0.56 -1.79

DIAGNOSTIC ANALYSIS:- 1)Correlation Between Minutes & Performance

min_col <- ifelse(has_col("MP.x"), "MP.x", ifelse(has_col("MP"), "MP", NA))
if (!is.na(min_col) && has_col("pts") && has_col("ast")) {
p1 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = pts)) +
geom_point(alpha = 0.4) + geom_smooth(method = "lm", se = TRUE) +
labs(x = min_col, y = "Points", title = paste("Minutes vs Points (corr =",
round(cor(final_dataset_master[[min_col]], final_dataset_master$pts, use = "complete.obs"), 2), ")")) +
theme_minimal()
print(p1)
p2 <- ggplot(final_dataset_master, aes(x = .data[[min_col]], y = ast)) +
geom_point(alpha = 0.4) + geom_smooth(method = "lm", se = TRUE) +
labs(x = min_col, y = "Assists", title = paste("Minutes vs Assists (corr =",
round(cor(final_dataset_master[[min_col]], final_dataset_master$ast, use = "complete.obs"), 2), ")")) +
theme_minimal()
print(p2)
cat("Correlation (Minutes, Points):\n"); print(cor.test(final_dataset_master[[min_col]], final_dataset_master$pts, use = "complete.obs"))
cat("\nCorrelation (Minutes, Assists):\n"); print(cor.test(final_dataset_master[[min_col]], final_dataset_master$ast, use = "complete.obs"))
} else {
cat("Missing minutes or points/assists column.\n")
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## Correlation (Minutes, Points):
## 
##  Pearson's product-moment correlation
## 
## data:  final_dataset_master[[min_col]] and final_dataset_master$pts
## t = 223.43, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8843501 0.8915156
## sample estimates:
##       cor 
## 0.8879867 
## 
## 
## Correlation (Minutes, Assists):
## 
##  Pearson's product-moment correlation
## 
## data:  final_dataset_master[[min_col]] and final_dataset_master$ast
## t = 107.18, df = 13389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6703204 0.6885551
## sample estimates:
##       cor 
## 0.6795427

2)ANOVA Across Positions

pos_col <- pos_col  # your position column (example: "Position")

if (!is.na(pos_col) && pos_col %in% colnames(final_dataset_master) && "pts" %in% colnames(final_dataset_master)) {
  
  nba_for_anova <- final_dataset_master %>% 
    filter(!is.na(.data[[pos_col]]), !is.na(pts))
  
  # ANOVA Model
  aov_model <- aov(as.formula(paste("pts ~", pos_col)), data = nba_for_anova)
  print(summary(aov_model))
  
  # --- Visualization 1: Boxplot ---
  p1 <- ggplot(nba_for_anova, aes(x = .data[[pos_col]], y = pts, fill = .data[[pos_col]])) +
    geom_boxplot(alpha = 0.7) +
    theme_minimal() +
    labs(title = "ANOVA: Points Across Positions",
         x = "Player Position", y = "Points Per Game")
  print(p1)
  
  # --- Visualization 2: Mean ± SE ---
  mean_se_table <- nba_for_anova %>%
    group_by(.data[[pos_col]]) %>%
    summarise(mean_pts = mean(pts),
              se = sd(pts)/sqrt(n()))
  
  p2 <- ggplot(mean_se_table, 
               aes(x = .data[[pos_col]], y = mean_pts, fill = .data[[pos_col]])) +
    geom_col(alpha = 0.8) +
    geom_errorbar(aes(ymin = mean_pts - se, ymax = mean_pts + se), width = 0.2) +
    theme_minimal() +
    labs(title = "Mean Points ± SE by Position",
         x = "Position", y = "Mean PTS")
  print(p2)
  
  # --- Visualization 3: Density Plot ---
  p3 <- ggplot(nba_for_anova, aes(x = pts, fill = .data[[pos_col]])) +
    geom_density(alpha = 0.5) +
    theme_minimal() +
    labs(title = "Density Plot of Points by Position",
         x = "Points", y = "Density")
  print(p3)
  
} else {
  cat("Position or pts missing; cannot run ANOVA.\n")
}
##                Df Sum Sq Mean Sq F value Pr(>F)    
## Pos.x           4   6616  1653.9   45.74 <2e-16 ***
## Residuals   13386 484022    36.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3)Regression: Impact of Variables on Points

predictors <- c("ast","reb","MP.x","X3P.","X2P.","FT.","USG.","PER")
predictors_available <- intersect(predictors, names(final_dataset_master))
if (length(predictors_available) >= 2 && has_col("pts")) {
formula_text <- paste("pts ~", paste(predictors_available, collapse = " + "))
lm_model <- lm(as.formula(formula_text), data = final_dataset_master)
print(summary(lm_model))
par(mfrow = c(2,2)); plot(lm_model); par(mfrow = c(1,1))
} else {
cat("Not enough predictors available. Available:", paste(predictors_available, collapse = ", "), "\n")
}
## 
## Call:
## lm(formula = as.formula(formula_text), data = final_dataset_master)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5396  -0.9444  -0.1399   0.7236  10.5421 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.646906   0.095594 -90.455  < 2e-16 ***
## ast          0.074307   0.013318   5.579 2.46e-08 ***
## reb          0.036242   0.010946   3.311 0.000932 ***
## MP.x         0.415495   0.003350 124.028  < 2e-16 ***
## X3P.         0.988811   0.102826   9.616  < 2e-16 ***
## X2P.         0.374843   0.164344   2.281 0.022573 *  
## FT.         -0.408880   0.088738  -4.608 4.11e-06 ***
## USG.         0.351234   0.003185 110.263  < 2e-16 ***
## PER          0.120612   0.003617  33.345  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.779 on 13382 degrees of freedom
## Multiple R-squared:  0.9136, Adjusted R-squared:  0.9136 
## F-statistic: 1.77e+04 on 8 and 13382 DF,  p-value: < 2.2e-16

if (exists("lm_model")) {

  # Add predictions
  reg_df <- final_dataset_master %>% 
    mutate(pred_pts = predict(lm_model, final_dataset_master),
           residuals = lm_model$residuals) %>%
    filter(!is.na(pred_pts), !is.na(pts))

  # Actual vs Predicted Plot
  ggplot(reg_df, aes(x = pts, y = pred_pts)) +
    geom_point(alpha = 0.6, color = "blue") +
    geom_smooth(method = "lm") +
    theme_minimal() +
    labs(title = "Actual vs Predicted Points",
         x = "Actual PTS", y = "Predicted PTS")

  # Residual Plot
  ggplot(reg_df, aes(x = pred_pts, y = residuals)) +
    geom_point(alpha = 0.6, color = "red") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    theme_minimal() +
    labs(title = "Residual Plot", x = "Predicted PTS", y = "Residuals")

  # Residual Distribution
  ggplot(reg_df, aes(x = residuals)) +
    geom_histogram(bins = 40, alpha = 0.6, fill = "purple") +
    theme_minimal() +
    labs(title = "Distribution of Regression Residuals", x = "Residuals", y = "Count")

} else {
  cat("Regression model not found; skipping regression visualization.\n")
}

4)K-Means Clustering

# Select numeric columns only
num_cols <- final_dataset_master %>% select(where(is.numeric))

# Replace NA values with median
num_cols <- num_cols %>% mutate_all(~ ifelse(is.na(.), median(., na.rm = TRUE), .))

# Normalize data
num_norm <- as.data.frame(scale(num_cols))

# K-Means with 3 clusters
set.seed(123)
kmeans_model <- kmeans(num_norm, centers = 3, nstart = 25)

# Add cluster labels
kmeans_df <- num_norm %>% mutate(Cluster = as.factor(kmeans_model$cluster))

# PCA for visualization
pca_km <- prcomp(num_norm, scale. = TRUE)
pca_data <- data.frame(PC1 = pca_km$x[,1],
                       PC2 = pca_km$x[,2],
                       Cluster = kmeans_df$Cluster)

# Visualization
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_minimal() +
  labs(title = "K-Means Clustering (k = 3)",
       x = "Principal Component 1",
       y = "Principal Component 2")

PREDICTIVE ANALYSIS:- 1)Predict Points (Linear Regression with train/test)

if (has_col("pts") && length(predictors_available) >= 2) {
model_df <- final_dataset_master %>% select(all_of(c("pts", predictors_available))) %>% na.omit()
set.seed(123)
idx <- sample(seq_len(nrow(model_df)), size = floor(0.8 * nrow(model_df)))
train_df <- model_df[idx, ]
test_df  <- model_df[-idx, ]
lm_fit <- lm(pts ~ ., data = train_df)
preds <- predict(lm_fit, newdata = test_df)
rmse <- sqrt(mean((test_df$pts - preds)^2))
cat("Linear regression RMSE on test set:", round(rmse, 3), "\n")
print(summary(lm_fit))
} else {
cat("Insufficient data to train points prediction model.\n")
}
## Linear regression RMSE on test set: 1.81 
## 
## Call:
## lm(formula = pts ~ ., data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.5060  -0.9443  -0.1436   0.7196  10.4911 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.628526   0.107832 -80.018  < 2e-16 ***
## ast          0.077700   0.014751   5.267 1.41e-07 ***
## reb          0.045818   0.012223   3.748 0.000179 ***
## MP.x         0.412386   0.003714 111.047  < 2e-16 ***
## X3P.         1.133206   0.114419   9.904  < 2e-16 ***
## X2P.         0.307272   0.185433   1.657 0.097538 .  
## FT.         -0.475194   0.098132  -4.842 1.30e-06 ***
## USG.         0.354561   0.003656  96.968  < 2e-16 ***
## PER          0.119451   0.004276  27.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.772 on 10703 degrees of freedom
## Multiple R-squared:  0.9138, Adjusted R-squared:  0.9138 
## F-statistic: 1.419e+04 on 8 and 10703 DF,  p-value: < 2.2e-16

2)KNN

library(class)
library(dplyr)
library(ggplot2)

# 1. Prepare dataset

required_cols <- c("Pos.x", "pts", "ast", "TRB.")

missing_cols <- setdiff(required_cols, colnames(final_dataset_master))
if (length(missing_cols) > 0) {
  stop(paste("Missing required columns:", paste(missing_cols, collapse = ", ")))
}

final_dataset_master_knn <- final_dataset_master %>%
  filter(!is.na(Pos.x), !is.na(pts), !is.na(ast), !is.na(TRB.))

# Select numeric features
features <- final_dataset_master_knn %>% select(pts, ast, TRB.)

# Manual normalization function
normalize <- function(x) {
  if (min(x) == max(x)) return(rep(0.5, length(x))) 
  (x - min(x)) / (max(x) - min(x))
}

# Normalize
features_norm <- as.data.frame(lapply(features, normalize))

# Target variable
labels <- final_dataset_master_knn$Pos.x

# 2. Train-test split

set.seed(123)
index <- sample(1:nrow(features_norm), 0.8 * nrow(features_norm))

train_X <- features_norm[index, ]
test_X  <- features_norm[-index, ]
train_y <- labels[index]
test_y  <- labels[-index]

# 3. Run KNN Model

knn_pred <- knn(
  train = train_X,
  test  = test_X,
  cl    = train_y,
  k = 5
)

# Accuracy
accuracy <- mean(knn_pred == test_y)
print(paste("KNN Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "KNN Accuracy: 51.06 %"
# Confusion Matrix
print(table(Predicted = knn_pred, Actual = test_y))
##          Actual
## Predicted   C  PF  PG  SF  SG
##        C  310 216   2  58  13
##        PF 179 204   6  95  42
##        PG   3   6 397  28 121
##        SF  26  97  17 187 132
##        SG  17  36  82 135 270
# 4. Visualization 1: Predicted Classes

plot_df <- data.frame(
  pts = test_X$pts,
  ast = test_X$ast,
  pos_actual = test_y,
  pos_pred = knn_pred
)

p1 <- ggplot(plot_df, aes(x = pts, y = ast, color = pos_pred)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "KNN Classification (Predicted Player Positions)",
    x = "Normalized Points (PTS)",
    y = "Normalized Assists (AST)",
    color = "Predicted Position"
  )
print(p1)

# 5. Visualization 2: Correct vs Incorrect

plot_df$correct <- ifelse(plot_df$pos_pred == plot_df$pos_actual,
                          "Correct", "Incorrect")

p2 <- ggplot(plot_df, aes(x = pts, y = ast, color = correct)) +
  geom_point(size = 3, alpha = 0.9) +
  scale_color_manual(values = c("Correct" = "green", "Incorrect" = "red")) +
  theme_minimal() +
  labs(
    title = "KNN Accuracy: Correct vs Incorrect Predictions",
    x = "Normalized Points (PTS)",
    y = "Normalized Assists (AST)"
  )
print(p2)

PRESCRIPTIVE ANALYSIS:- 1)What-If Analysis (simulate +10% minutes or +5% shooting)

if (exists("lm_fit")) {
sim_row <- na.omit(final_dataset_master %>% select(all_of(c("pts", predictors_available))) %>% slice(1))
sim_row <- as.data.frame(sim_row)
cat("Baseline prediction:\n"); print(predict(lm_fit, newdata = sim_row))
if ("MP.x" %in% predictors_available) {
sim_inc <- sim_row; sim_inc$MP.x <- sim_inc$MP.x * 1.10
cat("\n+10% Minutes prediction:\n"); print(predict(lm_fit, newdata = sim_inc))
}
if ("X3P." %in% predictors_available) {
sim_inc2 <- sim_row; sim_inc2$X3P. <- pmin(sim_inc2$X3P. * 1.05, 1)
cat("\n+5% 3P% prediction:\n"); print(predict(lm_fit, newdata = sim_inc2))
}
} else {
cat("No regression model available for what-if analysis.\n")
}
## Baseline prediction:
##        1 
## 24.93009 
## 
## +10% Minutes prediction:
##        1 
## 26.66211 
## 
## +5% 3P% prediction:
##        1 
## 24.94822

2)Efficiency Ranking (custom score)

eff_cols <- intersect(c("pts","reb","ast","stl","blk","TOV.","PER","USG."), 
                      names(final_dataset_master))

if (length(eff_cols) == 0) {
  eff_cols <- intersect(c("pts","reb","ast","PER","USG."), 
                        names(final_dataset_master))
}

if (length(eff_cols) > 0) {

  # scale numeric columns manually (z-score)
  scaled <- as.data.frame(scale(final_dataset_master[, eff_cols], 
                                center = TRUE, scale = TRUE))

  eff_score <- rowSums(replace(scaled, is.na(scaled), 0))
  final_dataset_master$Efficiency_Score <- eff_score

  knitr::kable(
    head(
      final_dataset_master %>% 
        arrange(desc(Efficiency_Score)) %>% 
        select(any_of(c("normalized_name", "team", "Efficiency_Score")))
    , 20)
  )

} else {
  cat("No stats available to compute efficiency score.\n")
}
normalized_name Efficiency_Score
Russell Westbrook 18.77880
DeAndre Liggins 18.21036
Naz Mitrou-Long 18.20062
James Harden 16.47019
Nikola Jokic 16.42779
Joel Embiid 16.38515
Luka Doncic 16.31151
Giannis Antetokounmpo 15.96270
James Harden 15.82265
Jackie Butler 15.69764
Nikola Jokic 15.41107
Luka Doncic 15.37704
Giannis Antetokounmpo 15.33708
Russell Westbrook 15.23959
Nikola Jokic 15.17979
Luka Doncic 15.06546
Luka Doncic 14.93069
Russell Westbrook 14.78594
Russell Westbrook 14.74759
Giannis Antetokounmpo 14.73592