# Load dataset and required libraries
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.3
## Loading required package: MASS
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(rattle)
## Warning: package 'rattle' was built under R version 3.5.3
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.5.3
nba_raw <- read.csv("C:/Users/shibo/OneDrive/Desktop/520 HW/2012-18_officialBoxScore.csv")
# Data Cleaning and Pre-processing
# Remove duplicate columns (keep the first 61)
nba_reduced_col <- nba_raw[,1:61]
# Remove duplicate rows: each team-game is listed three times (once per game official), so keep every third row
select <- seq(1,nrow(nba_reduced_col),3)
nba_reduced <- nba_reduced_col[select,]
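# Quick sanity check (a minimal sketch; assumes each team-game row really was
# repeated exactly three times): the kept rows should be a third of the originals
stopifnot(nrow(nba_reduced) == ceiling(nrow(nba_reduced_col) / 3))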
# Remove irrelevant columns
# (Note: read.csv() converts "%" in column names to ".", so the "teamTS%" entry
# below matches nothing and teamTS. survives the drop, as str() confirms)
drop_col <- c("gmDate", "teamAbbr", "seasTyp", "gmTime","offLNm", "offFNm", "teamConf", "teamDiv", "teamMin", "teamDayOff", "teamPTS5", "teamPTS6", "teamPTS7", "teamPTS8", "teamTS%")
nba <- nba_reduced[ ,!(names(nba_reduced) %in% drop_col)]
str(nba)
## 'data.frame': 14762 obs. of 47 variables:
## $ teamLoc : Factor w/ 2 levels "Away","Home": 1 2 1 2 1 2 1 2 1 2 ...
## $ teamRslt : Factor w/ 2 levels "Loss","Win": 1 2 1 2 2 1 1 2 2 1 ...
## $ teamPTS : int 84 94 107 120 99 91 75 84 90 88 ...
## $ teamAST : int 26 22 24 25 22 24 19 18 22 18 ...
## $ teamTO : int 13 21 16 8 12 14 22 16 19 10 ...
## $ teamSTL : int 11 7 4 8 9 6 9 13 3 12 ...
## $ teamBLK : int 10 5 2 5 5 5 5 11 10 8 ...
## $ teamPF : int 19 21 23 20 25 21 22 14 16 18 ...
## $ teamFGA : int 90 79 75 79 85 77 88 85 78 91 ...
## $ teamFGM : int 32 36 39 43 40 38 33 30 37 33 ...
## $ teamFG. : num 0.356 0.456 0.52 0.544 0.471 ...
## $ team2PA : int 58 59 62 63 70 64 70 60 67 74 ...
## $ team2PM : int 24 29 33 35 35 35 29 23 32 27 ...
## $ team2P. : num 0.414 0.491 0.532 0.556 0.5 ...
## $ team3PA : int 32 20 13 16 15 13 18 25 11 17 ...
## $ team3PM : int 8 7 6 8 5 3 4 7 5 6 ...
## $ team3P. : num 0.25 0.35 0.462 0.5 0.333 ...
## $ teamFTA : int 20 22 28 32 18 31 11 21 16 19 ...
## $ teamFTM : int 12 15 23 26 14 12 5 17 11 16 ...
## $ teamFT. : num 0.6 0.682 0.821 0.812 0.778 ...
## $ teamORB : int 18 18 7 5 9 15 16 14 9 15 ...
## $ teamDRB : int 21 36 34 31 31 31 38 33 37 27 ...
## $ teamTRB : int 39 54 41 36 40 46 54 47 46 42 ...
## $ teamPTS1 : int 24 31 25 31 25 29 22 25 27 25 ...
## $ teamPTS2 : int 15 19 29 31 23 17 16 21 21 23 ...
## $ teamPTS3 : int 23 24 22 31 26 20 14 19 15 23 ...
## $ teamPTS4 : int 22 20 31 27 25 25 23 19 27 17 ...
## $ teamTREB. : num 41.9 58.1 53.2 46.8 46.5 ...
## $ teamASST. : num 81.2 61.1 61.5 58.1 55 ...
## $ teamTS. : num 0.425 0.53 0.613 0.645 0.533 ...
## $ teamEFG. : num 0.4 0.5 0.56 0.595 0.5 ...
## $ teamOREB. : num 33.3 46.2 18.4 12.8 22.5 ...
## $ teamDREB. : num 53.8 66.7 87.2 81.6 67.4 ...
## $ teamTO. : num 11.63 19.15 15.49 7.91 11.44 ...
## $ teamSTL. : num 12.37 7.87 4.21 8.42 9.83 ...
## $ teamBLK. : num 11.24 5.62 2.11 5.26 5.46 ...
## $ teamBLKR : num 17.24 8.47 3.23 7.94 7.14 ...
## $ teamPPS : num 0.933 1.19 1.427 1.519 1.165 ...
## $ teamFIC : num 67.2 74 75.2 97 72.2 ...
## $ teamFIC40 : num 56 61.7 62.7 80.8 60.2 ...
## $ teamOrtg : num 94.4 105.7 112.7 126.3 108.1 ...
## $ teamDrtg : num 105.7 94.4 126.3 112.7 99.4 ...
## $ teamEDiff : num -11.24 11.24 -13.69 13.69 8.74 ...
## $ teamPlay. : num 0.376 0.439 0.464 0.524 0.455 ...
## $ teamAR : num 18.9 16.7 18.9 19.8 17.3 ...
## $ teamAST.TO: num 2 1.05 1.5 3.12 1.83 ...
## $ teamSTL.TO: num 84.6 33.3 25 100 75 ...
# Convert teamLoc to numeric (Away = 1, Home = 2)
nba$teamLoc <- as.numeric(nba$teamLoc)
# Correlation matrix (column 2, the factor teamRslt, is excluded)
cor_matrix <- cor(nba[,-2])
corrplot(cor_matrix)
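# The plot shows heavy multicollinearity; caret's findCorrelation() can list the
# predictors above a cutoff (a quick sketch; the 0.90 cutoff and the high_cor
# name are arbitrary choices):
high_cor <- findCorrelation(cor_matrix, cutoff = 0.90, names = TRUE)
high_cor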

# Split into training and test sets
nba_glm <- as.data.frame(cbind(Result = as.factor(nba$teamRslt), nba[-2]))
# Make sure Result is a factor (cbind() can coerce it)
nba_glm$Result <- as.factor(nba_glm$Result)
index <- createDataPartition(nba_glm$Result, p = 0.8, list = FALSE)
train_glm <- nba_glm[index,]
test_glm <- nba_glm[-index,]
# Logistic Regression Model: fit an unsaturated model.
# teamEDiff = teamOrtg - teamDrtg, and the other excluded terms (team3PA, team3PM,
# teamTRB, teamFIC) are likewise linear combinations of retained columns, so they
# add no information.
model_glm <- glm(Result ~. -teamDrtg - teamEDiff - team3PA - team3PM - teamTRB - teamFIC, data=train_glm, family = binomial)
summary(model_glm)
##
## Call:
## glm(formula = Result ~ . - teamDrtg - teamEDiff - team3PA - team3PM -
## teamTRB - teamFIC, family = binomial, data = train_glm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8029 -0.4341 -0.0039 0.4172 3.7931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.142e+01 6.802e+00 -4.619 3.86e-06 ***
## teamLoc 4.016e-01 5.915e-02 6.791 1.12e-11 ***
## teamPTS 3.368e-01 9.702e-01 0.347 0.728479
## teamAST -4.771e-01 1.003e-01 -4.757 1.97e-06 ***
## teamTO 3.195e-01 1.249e-01 2.557 0.010546 *
## teamSTL -2.739e-01 1.817e-01 -1.507 0.131711
## teamBLK -3.621e-01 1.960e-01 -1.847 0.064682 .
## teamPF 1.314e-01 2.302e-02 5.711 1.12e-08 ***
## teamFGA 1.786e-01 7.982e-02 2.238 0.025252 *
## teamFGM -1.237e+00 2.909e+00 -0.425 0.670649
## teamFG. 1.916e+01 2.262e+01 0.847 0.396988
## team2PA 3.452e-02 3.462e-02 0.997 0.318684
## team2PM 4.088e-01 9.759e-01 0.419 0.675297
## team2P. -1.472e+00 3.595e+00 -0.409 0.682259
## team3P. 6.728e-02 9.888e-01 0.068 0.945748
## teamFTA 1.923e-01 5.444e-02 3.532 0.000413 ***
## teamFTM -5.772e-01 9.663e-01 -0.597 0.550286
## teamFT. 3.452e-01 8.303e-01 0.416 0.677547
## teamORB -2.969e-01 9.004e-02 -3.298 0.000974 ***
## teamDRB 9.777e-02 3.759e-02 2.601 0.009295 **
## teamPTS1 -3.561e-01 3.346e-02 -10.641 < 2e-16 ***
## teamPTS2 -3.595e-01 3.331e-02 -10.790 < 2e-16 ***
## teamPTS3 -3.528e-01 3.340e-02 -10.562 < 2e-16 ***
## teamPTS4 -3.531e-01 3.330e-02 -10.603 < 2e-16 ***
## teamTREB. 6.983e-02 4.263e-02 1.638 0.101405
## teamASST. 1.322e-04 3.405e-02 0.004 0.996902
## teamTS. -7.815e+00 1.790e+01 -0.437 0.662447
## teamEFG. -1.625e+01 2.150e+01 -0.756 0.449793
## teamOREB. -2.695e-02 3.670e-02 -0.734 0.462762
## teamDREB. -4.022e-02 1.508e-02 -2.668 0.007638 **
## teamTO. -3.051e-02 1.588e-01 -0.192 0.847611
## teamSTL. 3.005e-01 1.725e-01 1.742 0.081501 .
## teamBLK. 1.607e-01 1.834e-01 0.876 0.380830
## teamBLKR -3.737e-02 4.306e-02 -0.868 0.385402
## teamPPS 5.900e+00 4.346e+00 1.358 0.174608
## teamFIC40 4.837e-01 5.269e-02 9.180 < 2e-16 ***
## teamOrtg 3.173e-01 2.911e-02 10.901 < 2e-16 ***
## teamPlay. -2.207e+01 1.545e+01 -1.429 0.153084
## teamAR 1.857e-01 1.983e-01 0.937 0.348927
## teamAST.TO -6.464e-02 1.583e-01 -0.408 0.682955
## teamSTL.TO 2.403e-03 4.187e-03 0.574 0.566011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16373.5 on 11810 degrees of freedom
## Residual deviance: 7493.7 on 11770 degrees of freedom
## AIC: 7575.7
##
## Number of Fisher Scoring iterations: 6
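# The linear dependence noted above is easy to verify (a quick check; should
# return TRUE if teamEDiff really is teamOrtg - teamDrtg):
all.equal(nba$teamEDiff, nba$teamOrtg - nba$teamDrtg)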
predict_glm <- ifelse(predict(model_glm, test_glm, type="response") > 0.5, "Win", "Loss")
confusionMatrix(factor(predict_glm), factor(test_glm$Result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Loss Win
## Loss 1295 223
## Win 181 1252
##
## Accuracy : 0.8631
## 95% CI : (0.8502, 0.8753)
## No Information Rate : 0.5002
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7262
##
## Mcnemar's Test P-Value : 0.04137
##
## Sensitivity : 0.8774
## Specificity : 0.8488
## Pos Pred Value : 0.8531
## Neg Pred Value : 0.8737
## Prevalence : 0.5002
## Detection Rate : 0.4388
## Detection Prevalence : 0.5144
## Balanced Accuracy : 0.8631
##
## 'Positive' Class : Loss
##
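# The 0.5 cutoff is a choice; accuracy at other cutoffs can be checked the same
# way (a quick sketch; the cutoff values and the probs_glm name are illustrative):
probs_glm <- predict(model_glm, test_glm, type = "response")
sapply(c(0.4, 0.5, 0.6),
       function(cut) mean(ifelse(probs_glm > cut, "Win", "Loss") == test_glm$Result))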
# Tune the model by removing the non-significant predictors. Accuracy is about the same, so we will use the reduced model.
model_glm_tune <- glm(Result ~ teamLoc + teamAST + teamTO + teamBLK + teamPF + teamFGA + teamFTA + teamORB + teamPTS1 + teamPTS2 +teamPTS3 + teamPTS4 + teamTREB. + teamDREB. + teamFIC40 + teamOrtg, data = train_glm, family = binomial)
summary(model_glm_tune)
##
## Call:
## glm(formula = Result ~ teamLoc + teamAST + teamTO + teamBLK +
## teamPF + teamFGA + teamFTA + teamORB + teamPTS1 + teamPTS2 +
## teamPTS3 + teamPTS4 + teamTREB. + teamDREB. + teamFIC40 +
## teamOrtg, family = binomial, data = train_glm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1415 -0.4443 -0.0051 0.4266 3.6169
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.592014 0.991764 -16.730 < 2e-16 ***
## teamLoc 0.406028 0.058336 6.960 3.4e-12 ***
## teamAST -0.401388 0.013588 -29.540 < 2e-16 ***
## teamTO 0.209206 0.015141 13.817 < 2e-16 ***
## teamBLK -0.303973 0.016139 -18.834 < 2e-16 ***
## teamPF 0.146494 0.009047 16.193 < 2e-16 ***
## teamFGA 0.145220 0.009045 16.055 < 2e-16 ***
## teamFTA 0.115892 0.005793 20.005 < 2e-16 ***
## teamORB -0.384321 0.016411 -23.419 < 2e-16 ***
## teamPTS1 -0.416650 0.014966 -27.840 < 2e-16 ***
## teamPTS2 -0.417937 0.014838 -28.167 < 2e-16 ***
## teamPTS3 -0.413121 0.014857 -27.807 < 2e-16 ***
## teamPTS4 -0.410994 0.014454 -28.435 < 2e-16 ***
## teamTREB. 0.166131 0.014971 11.097 < 2e-16 ***
## teamDREB. -0.074022 0.007517 -9.848 < 2e-16 ***
## teamFIC40 0.526851 0.013339 39.497 < 2e-16 ***
## teamOrtg 0.148477 0.010452 14.205 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16373.5 on 11810 degrees of freedom
## Residual deviance: 7652.4 on 11794 degrees of freedom
## AIC: 7686.4
##
## Number of Fisher Scoring iterations: 6
predict_glm_tune <- ifelse(predict(model_glm_tune, test_glm, type="response") > 0.5, "Win", "Loss")
confusionMatrix(factor(predict_glm_tune), factor(test_glm$Result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Loss Win
## Loss 1290 222
## Win 186 1253
##
## Accuracy : 0.8617
## 95% CI : (0.8488, 0.874)
## No Information Rate : 0.5002
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7235
##
## Mcnemar's Test P-Value : 0.08314
##
## Sensitivity : 0.8740
## Specificity : 0.8495
## Pos Pred Value : 0.8532
## Neg Pred Value : 0.8707
## Prevalence : 0.5002
## Detection Rate : 0.4371
## Detection Prevalence : 0.5124
## Balanced Accuracy : 0.8617
##
## 'Positive' Class : Loss
##
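# Since the tuned model is nested in the full one, a likelihood-ratio test can
# check whether the dropped terms carried signal (a sketch using base R's anova):
anova(model_glm_tune, model_glm, test = "Chisq")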
# Principal Component Analysis
nba_pca <- prcomp(nba[,-2], scale = TRUE, center = TRUE)
# Variance explained by each principal component
VarExp <- nba_pca$sdev^2/sum(nba_pca$sdev^2)
VarExpcum <- cbind(1:46, cumsum(VarExp))
plot(cumsum(VarExp), xlab = "Principal Component", ylab = "Variance Explained", type = "o")
abline(h=1, col = "red")

# The first ~20 components explain about 99% of the variance; PC1-PC19 (columns 2:20) are kept for modeling
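# That claim can be checked directly from the quantities computed above
# (a quick sketch):
which(cumsum(VarExp) >= 0.99)[1]  # index of the first component reaching 99%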
nba_final <- as.data.frame(cbind(Result = as.factor(nba$teamRslt), nba_pca$x))
# cbind() with a matrix coerces Result to its integer codes (1 = Loss, 2 = Win),
# which is why the confusion matrices below label the classes 1 and 2
nba_final$Result <- as.factor(nba_final$Result)
# Create training and validation datasets (unseeded; see the sketch below)
split <- sample(1:nrow(nba_final), floor(nrow(nba_final) * 0.8))
train <- nba_final[split,]
test <- nba_final[-split,]
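# Note: the sample() split above is unseeded and unstratified. A reproducible,
# class-stratified alternative would look like this (a sketch, not what was run
# here; the seed value and idx_pca name are illustrative):
# set.seed(520)
# idx_pca <- createDataPartition(nba_final$Result, p = 0.8, list = FALSE)
# train <- nba_final[idx_pca,]; test <- nba_final[-idx_pca,]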
# Naive Bayes Method (suppressWarnings() hides klaR's zero-probability warnings;
# rows 1:2953 span the whole test set)
model_nb <- suppressWarnings(train(x = train[,2:20], y = train$Result, method = "nb"))
predict_nb <- suppressWarnings(predict(model_nb, test[1:2953,2:20]))
confusionMatrix(predict_nb, test$Result[1:2953])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1410 85
## 2 72 1386
##
## Accuracy : 0.9468
## 95% CI : (0.9381, 0.9546)
## No Information Rate : 0.5019
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8937
##
## Mcnemar's Test P-Value : 0.3382
##
## Sensitivity : 0.9514
## Specificity : 0.9422
## Pos Pred Value : 0.9431
## Neg Pred Value : 0.9506
## Prevalence : 0.5019
## Detection Rate : 0.4775
## Detection Prevalence : 0.5063
## Balanced Accuracy : 0.9468
##
## 'Positive' Class : 1
##
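# caret's "nb" (klaR) tunes fL, usekernel and adjust; an explicit grid can be
# searched instead of the defaults (a sketch; the grid values and the nb_grid /
# model_nb2 names are illustrative):
nb_grid <- expand.grid(fL = c(0, 1), usekernel = c(TRUE, FALSE), adjust = 1)
model_nb2 <- suppressWarnings(train(x = train[,2:20], y = train$Result,
                                    method = "nb", tuneGrid = nb_grid))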
# K-Nearest Neighbor Method
model_knn <- train(x = train[,2:20], y = train$Result, method = "knn")
predict_knn <- predict(model_knn, test[1:2953,2:20])
confusionMatrix(predict_knn, test$Result[1:2953])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1345 154
## 2 137 1317
##
## Accuracy : 0.9015
## 95% CI : (0.8901, 0.912)
## No Information Rate : 0.5019
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8029
##
## Mcnemar's Test P-Value : 0.3483
##
## Sensitivity : 0.9076
## Specificity : 0.8953
## Pos Pred Value : 0.8973
## Neg Pred Value : 0.9058
## Prevalence : 0.5019
## Detection Rate : 0.4555
## Detection Prevalence : 0.5076
## Balanced Accuracy : 0.9014
##
## 'Positive' Class : 1
##
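# caret's "knn" tunes only k, and the default search tries just a few values;
# a wider grid is easy to sweep (a sketch; the k range and the model_knn2 name
# are illustrative):
model_knn2 <- train(x = train[,2:20], y = train$Result, method = "knn",
                    tuneGrid = expand.grid(k = seq(3, 25, by = 2)))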
# Random Forest
model_rf <- train(x = train[,2:20], y = train$Result, method = "rf")
predict_rf <- predict(model_rf, test[1:2953,2:20])
confusionMatrix(predict_rf, test$Result[1:2953])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1366 100
## 2 116 1371
##
## Accuracy : 0.9269
## 95% CI : (0.9169, 0.936)
## No Information Rate : 0.5019
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8537
##
## Mcnemar's Test P-Value : 0.3074
##
## Sensitivity : 0.9217
## Specificity : 0.9320
## Pos Pred Value : 0.9318
## Neg Pred Value : 0.9220
## Prevalence : 0.5019
## Detection Rate : 0.4626
## Detection Prevalence : 0.4964
## Balanced Accuracy : 0.9269
##
## 'Positive' Class : 1
##
varimp_rf <- varImp(model_rf)
varimp_rf
## rf variable importance
##
## Overall
## PC1 100.0000
## PC8 23.4437
## PC3 20.0113
## PC10 16.2926
## PC17 10.1021
## PC5 7.5665
## PC19 6.3170
## PC7 5.3425
## PC2 4.1875
## PC18 3.5946
## PC4 3.4537
## PC6 1.8930
## PC16 1.7730
## PC11 1.0554
## PC12 0.5777
## PC15 0.4822
## PC9 0.2724
## PC13 0.1616
## PC14 0.0000
plot(varimp_rf, main = "Variable Importance with Random Forest")
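# The tuning value chosen by resampling and the fitted forest itself live on the
# caret train object (a quick look):
model_rf$bestTune    # the selected mtry
model_rf$finalModel  # the underlying randomForest fit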

# Decision Tree
model_DT <- train(x = train[,2:20], y = train$Result, metric = "Accuracy", method = "rpart")
predict_DT <- predict(model_DT, test[1:2953,2:20])
confusionMatrix(predict_DT, test$Result[1:2953])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1237 417
## 2 245 1054
##
## Accuracy : 0.7758
## 95% CI : (0.7603, 0.7908)
## No Information Rate : 0.5019
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5514
##
## Mcnemar's Test P-Value : 3.01e-11
##
## Sensitivity : 0.8347
## Specificity : 0.7165
## Pos Pred Value : 0.7479
## Neg Pred Value : 0.8114
## Prevalence : 0.5019
## Detection Rate : 0.4189
## Detection Prevalence : 0.5601
## Balanced Accuracy : 0.7756
##
## 'Positive' Class : 1
##
plot(model_DT$finalModel)
text(model_DT$finalModel)

fancyRpartPlot(model_DT$finalModel)
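# caret tunes rpart only over the complexity parameter cp; a finer grid usually
# grows a deeper, more accurate tree (a sketch; the cp range and the model_DT2
# name are illustrative):
model_DT2 <- train(x = train[,2:20], y = train$Result, metric = "Accuracy",
                   method = "rpart",
                   tuneGrid = expand.grid(cp = seq(0.0005, 0.02, by = 0.0005)))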

# Treebag
model_bag <- train(x = train[,2:20], y = train$Result, method = "treebag")
predict_bag <- predict(model_bag, test[1:2953,2:20])
confusionMatrix(predict_bag, test$Result[1:2953])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1336 149
## 2 146 1322
##
## Accuracy : 0.9001
## 95% CI : (0.8887, 0.9107)
## No Information Rate : 0.5019
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8002
##
## Mcnemar's Test P-Value : 0.9073
##
## Sensitivity : 0.9015
## Specificity : 0.8987
## Pos Pred Value : 0.8997
## Neg Pred Value : 0.9005
## Prevalence : 0.5019
## Detection Rate : 0.4524
## Detection Prevalence : 0.5029
## Balanced Accuracy : 0.9001
##
## 'Positive' Class : 1
##
# Algorithm Performance Comparison
model_comparison <- resamples(list(BAG = model_bag, DT = model_DT, NB = model_nb, KNN = model_knn, RF = model_rf))
summary(model_comparison)
##
## Call:
## summary.resamples(object = model_comparison)
##
## Models: BAG, DT, NB, KNN, RF
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## BAG 0.8832534 0.8898383 0.8933333 0.8930203 0.8949930 0.9020060 0
## DT 0.7653722 0.7777264 0.7832344 0.7817263 0.7860120 0.7944306 0
## NB 0.9417051 0.9471275 0.9520642 0.9509892 0.9548699 0.9598431 0
## KNN 0.8689173 0.8754301 0.8772210 0.8776569 0.8798244 0.8869965 0
## RF 0.9179534 0.9252040 0.9282759 0.9275006 0.9297682 0.9350139 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## BAG 0.7665069 0.7796622 0.7866591 0.7860152 0.7899650 0.8039408 0
## DT 0.5309771 0.5551563 0.5661716 0.5634459 0.5719414 0.5889245 0
## NB 0.8834122 0.8942525 0.9041307 0.9019666 0.9097403 0.9196696 0
## KNN 0.7378252 0.7508278 0.7544055 0.7552852 0.7595857 0.7739898 0
## RF 0.8359089 0.8504095 0.8565440 0.8549822 0.8595250 0.8700045 0
# Graphically Compare Performance
scales <- list(x = list(relation = "free"),
y = list(relation = "free"))
bwplot(model_comparison, scales = scales)
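# Pairwise differences between models on the shared resamples can also be tested
# formally (a sketch using caret's diff method for resamples objects; model_diffs
# is an illustrative name):
model_diffs <- diff(model_comparison)
summary(model_diffs)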
