0.1 Load Libraries

 # Clear the workspace
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525805 28.1    1167658 62.4         NA   669282 35.8
## Vcells 967946  7.4    8388608 64.0      32768  1840416 14.1
  cat("\f")       # Clear the console
  if(!is.null(dev.list())) dev.off() # Clear all plots
## null device 
##           1
# Load the packages used throughout this analysis
library(e1071)
library(readxl)
library(visdat)
library(stargazer)
library(tidyverse)
library(nnet)
library(psych)
library(tidymodels) # for modeling
library(randomForest)
library(cluster)
library(factoextra)

0.2 Load Data

data_Pitchers <- read_excel("Data_Pitchers.xlsx")

df <- as.data.frame(data_Pitchers) %>% select(Role, `Stuff+`, xERA, `Location+`)

vis_dat(df)

glimpse(df)
## Rows: 552
## Columns: 4
## $ Role        <chr> "SP", "CL", "RP", "SP", "RP", "SP", "RP", "SP", "SP", "CL"…
## $ `Stuff+`    <dbl> 74.58374, 154.09527, 97.40479, 97.00934, 114.61519, 121.07…
## $ xERA        <dbl> 7.21, 3.06, 3.38, 4.58, 3.71, 2.88, 3.06, 3.40, 4.33, 3.25…
## $ `Location+` <dbl> 102.33173, 104.22018, 99.53709, 96.30023, 101.93888, 109.3…
stargazer(df, type='text')
## 
## =============================================
## Statistic  N   Mean   St. Dev.  Min     Max  
## ---------------------------------------------
## Stuff+    550 100.351  14.590  56.505 161.319
## xERA      550  4.366   1.056   1.890   8.960 
## Location+ 550 99.682   3.952   87.601 114.128
## ---------------------------------------------

0.3 Drop Missing, Scale Data

# Names of the numeric columns to scale
numeric_columns <- c('Stuff+', 'xERA', 'Location+')

# Check data types
sapply(df, class)
##        Role      Stuff+        xERA   Location+ 
## "character"   "numeric"   "numeric"   "numeric"
# Missing values are left in place here; rows with NAs are dropped later
# with na.omit() wherever a model requires complete cases
# df2 <- na.omit(df)

# Scale numeric columns
df2 <- as.data.frame(scale(df[, numeric_columns]))
head(df2)
##       Stuff+       xERA  Location+
## 1 -1.7660185  2.6921888  0.6703023
## 2  3.6835482 -1.2362362  1.1480922
## 3 -0.2019079 -0.9333215 -0.0367588
## 4 -0.2290110  0.2026086 -0.8557046
## 5  0.9776601 -0.6209407  0.5709105
## 6  1.4203643 -1.4066257  2.4436433
df2$Role <- factor(df$Role)

0.4 Confirm Data Is Scaled

stargazer(df2, type='text')
## 
## ==========================================
## Statistic  N   Mean  St. Dev.  Min    Max 
## ------------------------------------------
## Stuff+    550 -0.000  1.000   -3.005 4.179
## xERA      550 0.000   1.000   -2.344 4.349
## Location+ 550 -0.000  1.000   -3.057 3.655
## ------------------------------------------

0.5 Training/Testing

An 80-20 train/test split.

set.seed(007)  # Set seed for reproducibility
train_indices <- sample(x = 1:nrow(df2),
                        size = 0.8 * nrow(df2)
                        )

train_data <- df2[train_indices, ]
test_data  <- df2[-train_indices, ]
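
One caveat: df2 was scaled on the full dataset before this split, so the test rows influenced the centering and scaling constants. A minimal split-then-scale sketch, reusing df, numeric_columns, and train_indices from above (shown for reference; the rest of the analysis keeps the pre-scaled df2):

# Scale with training-set statistics only, so no test-set information
# leaks into the preprocessing
train_raw <- df[train_indices, numeric_columns]
test_raw  <- df[-train_indices, numeric_columns]

centers <- colMeans(train_raw, na.rm = TRUE)
sds     <- apply(train_raw, 2, sd, na.rm = TRUE)

train_scaled <- scale(train_raw, center = centers, scale = sds)
test_scaled  <- scale(test_raw,  center = centers, scale = sds)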

1 PCA

https://www.statology.org/principal-components-analysis-in-r/

Better Graphs - https://www.datacamp.com/tutorial/pca-analysis-r

train_data <- na.omit(train_data) # lose 2 observations with missing values

results <- prcomp(train_data[,-4], scale = TRUE)

#reverse the signs (eigenvector signs are arbitrary; flipped for readability)
results$rotation <- -1*results$rotation

#display principal components
results$rotation
##                  PC1         PC2        PC3
## Stuff+    -0.6539154  0.41534872 -0.6323607
## xERA       0.7232642  0.09793482 -0.6835917
## Location+ -0.2219988 -0.90437504 -0.3644480
#reverse the signs of the scores
results$x <- -1*results$x

#display the first six scores
head(results$x)
##             PC1        PC2        PC3
## 298  1.35067528 -2.4187071  1.3342399
## 467 -0.65121922  0.5432051  0.5286058
## 415  0.29183771  1.6769691 -0.3606453
## 476 -0.08168967 -1.2039689  0.1430709
## 218 -0.75364647  2.1815418 -0.1238471
## 392 -1.87944371  2.4579819 -0.4965785
biplot(results, scale = 0.0007)
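
To quantify how much of the data the biplot's two axes capture, the proportion of variance explained comes directly from the prcomp standard deviations; fviz_eig (from factoextra, loaded above) draws the matching scree plot:

# Proportion of total variance explained by each principal component
results$sdev^2 / sum(results$sdev^2)

# Scree plot of the same quantities
fviz_eig(results, addlabels = TRUE)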

2 Clustering - k-means

numeric_data <- train_data[, sapply(X = train_data, 
                                    FUN = is.numeric)]

numeric_data <- na.omit(numeric_data) # lose 2 observations with missing values

# Perform k-means clustering (k-means uses random starts; reproducibility
# here relies on the RNG state carried over from the seed set above)
kmeans_result <- kmeans(numeric_data, centers = 3)

train_data <- na.omit(train_data)
numeric_data$Role <- train_data$Role

data_with_labels <- data.frame(numeric_data, Original_Labels = train_data$Role)
head(data_with_labels)
##         Stuff.       xERA  Location. Role Original_Labels
## 298 -2.7843064 -0.1760347  1.4243437   SP              SP
## 467  0.3359799 -0.7913302 -0.5168731   RP              RP
## 415  0.7623097  0.6285824 -1.4277589   RP              RP
## 476 -0.5383801 -0.2801617  1.0777660   RP              RP
## 218  1.5232456 -0.2517634 -1.7383769   RP              RP
## 392  2.6354409 -0.7913302 -1.6025759   RP              RP
# Visualize the clusters in PCA space, using only the variables that were
# actually clustered (an extra label column would distort the projection)
fviz_cluster(object = kmeans_result, 
             data = numeric_data[, c("Stuff+", "xERA", "Location+")],
             geom = "point", 
             ellipse.type = "norm",
             ellipse.level = 0.95,
             main = "K-means Clustering of Pitchers (k = 3)", 
             palette = "jco",             # color palette
             ggtheme = theme_minimal()    # optional ggplot2 theme
             )
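
Because the true roles are known here, a quick cross-tabulation shows how well the unsupervised clusters recover SP/RP/CL (cluster numbering is arbitrary):

# Compare k-means assignments with the known roles
table(Cluster = kmeans_result$cluster, Role = train_data$Role)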

numeric_data <- train_data[, sapply(X = train_data, 
                                    FUN = is.numeric)]

numeric_data <- na.omit(numeric_data) # lose 2 observations with missing values

# Function to calculate total within-cluster sum of squares (wss)
wss <- function(k) {
  kmeans_result <- kmeans(numeric_data, centers = k)
  return(sum(kmeans_result$withinss))
}

# Determine the optimal number of clusters using the Elbow Method
k_values <- 1:10  # You can adjust the range of k values
wss_values <- sapply(k_values, wss)

# Plot the Elbow Method
plot(k_values, wss_values, type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of Clusters (k)", ylab = "Total Within-cluster Sum of Squares (WSS)",
     main = "Elbow Method to Determine Optimal k")

# Mark a candidate elbow. Note: WSS decreases monotonically in k, so
# which.min(wss_values) would always pick the largest k; the elbow must
# be judged visually (about k = 3 here).
abline(v = 3, col = "red", lty = 2)
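
As a cross-check on the elbow, factoextra's fviz_nbclust computes the average silhouette width at each k; the k with the highest average silhouette suggests the best-separated clusters:

# Silhouette-based choice of k (complements the elbow plot above)
fviz_nbclust(numeric_data, kmeans, method = "silhouette")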

# Generate some random data for illustration
set.seed(123)
data <- matrix(rnorm(200), ncol = 2)

# Perform K-means clustering with, for example, 3 clusters
kmeans_result <- kmeans(data, centers = 3)

# Assign cluster labels to each data point
cluster_labels <- kmeans_result$cluster

# Visualize the clusters
plot(data, col = cluster_labels, pch = 16, main = "K-means Clustering")
points(kmeans_result$centers, col = 1:3, pch = 8, cex = 2)

3 SVM

Support Vector Machines (SVM) are a type of supervised machine learning algorithm that performs classification or regression tasks. The primary goal of SVM is to find a decision boundary (or hyperplane) that best separates data points belonging to different classes in a feature space.

  • Features: SVM starts with a dataset where each data point is represented by a set of features. These features could be anything relevant to the problem at hand, such as measurements, characteristics, or attributes.

  • Binary Classification: SVM is commonly used for binary classification, where the task is to categorize data points into one of two classes (e.g., spam or not spam, positive or negative). Multiclass problems, like the three pitcher roles here, are handled by combining several binary classifiers.

  • Finding the Best Decision Boundary: SVM's main job is to find the best decision boundary, often referred to as a hyperplane, that separates the data points of different classes in the feature space. For a 2D space, this boundary is a line. In higher dimensions, it becomes a hyperplane.

  • Maximizing Margin: SVM doesn't just find any decision boundary; it looks for the one that maximizes the margin between the two classes. The margin is the distance between the decision boundary and the nearest data points of each class. SVM aims to make this margin as wide as possible.

  • Support Vectors: Support vectors are the data points closest to the decision boundary. They play a critical role in defining the optimal boundary; SVM pays special attention to them because they guide the positioning and orientation of the decision boundary.

  • Handling Non-Linearity (Kernel Trick): When the data is not linearly separable in its original form, SVM can map it into a higher-dimensional space where a hyperplane can effectively separate the classes.

  • Optimization and Balancing Act: SVM involves an optimization process to find the best hyperplane. It seeks to balance the trade-off between a smooth decision boundary and accurately classifying data points. Parameters such as C (the regularization parameter) control this balancing act (a tuning sketch follows this list).

  • Prediction and Classification: Once the optimal hyperplane is found during training, SVM predicts the class of new, unseen data points by placing them on one side or the other of the decision boundary.
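
In practice, C and gamma are usually chosen by cross-validation rather than left at defaults. A minimal sketch using e1071's tune.svm; the grid values are illustrative, not choices validated in this analysis:

# 10-fold CV over a small cost/gamma grid (radial kernel by default);
# na.omit() because df2 still contains two incomplete rows
set.seed(007)
tuned <- tune.svm(Role ~ ., data = na.omit(df2),
                  gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)         # cross-validated error across the grid
tuned$best.parameters  # the (gamma, cost) pair with the lowest CV error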

# Fit a radial-kernel SVM with Role (SP/RP/CL) as the target.
# Note: the model is fit on df2 (all rows), so the "test" rows used below
# are not strictly held out, which can flatter the evaluation.
svm_model <- svm(Role ~ ., data = df2, kernel = "radial")

summary(svm_model)
## 
## Call:
## svm(formula = Role ~ ., data = df2, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  439
## 
##  ( 185 51 203 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  CL RP SP
table(svm_model$fitted)
## 
##  CL  RP  SP 
##  16 243 291

3.1 SVM Evaluation

predictions_svm <- predict(svm_model, newdata = test_data)

# Confusion matrix for the three-class problem (rows = predictions, columns = truth)
conf_matrix_svm <- table(predictions_svm, test_data$Role)
conf_matrix_svm 
##                
## predictions_svm CL RP SP
##              CL  1  0  0
##              RP  8 34 13
##              SP  1 22 32
accuracy <- sum(diag(conf_matrix_svm)) / sum(conf_matrix_svm)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy:  0.603603603603604"

4 Multinomial Logistic Model (not binary logistic) and Evaluation

Code from - https://quantifyinghealth.com/multinomial-logistic-regression-in-r/

It does worse than the SVM on held-out accuracy (0.56 vs. 0.60), though it is more explanatory: since the predictors are standardized, each exponentiated coefficient below is the multiplicative change in the odds of being an RP (or SP) rather than a CL per one-standard-deviation increase in that predictor.


logistic_model_fit <- 
multinom_reg() |> 
  fit(Role ~ .,
      data = df2)

# Exponentiated coefficients (relative risk ratios) with confidence intervals
tidy(logistic_model_fit, exponentiate = TRUE, conf.int = TRUE) |> 
  mutate_if(is.numeric, round, 4) |> 
  select(-std.error, -statistic)
## # A tibble: 8 × 6
##   y.level term        estimate p.value conf.low conf.high
##   <chr>   <chr>          <dbl>   <dbl>    <dbl>     <dbl>
## 1 RP      (Intercept)   12.7    0         7.41     21.8  
## 2 RP      `Stuff+`       0.517  0.0003    0.363     0.737
## 3 RP      xERA           3.00   0.0001    1.72      5.22 
## 4 RP      `Location+`    1.00   0.983     0.710     1.42 
## 5 SP      (Intercept)   11.1    0         6.45     19.0  
## 6 SP      `Stuff+`       0.377  0         0.256     0.556
## 7 SP      xERA           4.78   0         2.69      8.48 
## 8 SP      `Location+`    1.83   0.0013    1.26      2.64
glance(logistic_model_fit)
## # A tibble: 1 × 4
##     edf deviance   AIC  nobs
##   <dbl>    <dbl> <dbl> <int>
## 1     8     878.  894.   550
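
To make these coefficients concrete, the fitted model can score a hypothetical pitcher. The profile below (one SD above average on Stuff+ and Location+, one SD below on xERA, in standardized units) is purely illustrative:

# Predicted role probabilities for a hypothetical pitcher
new_pitcher <- tibble(`Stuff+` = 1, xERA = -1, `Location+` = 1)
predict(logistic_model_fit, new_data = new_pitcher, type = "prob")
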
Role_preds_logistic <- logistic_model_fit |> 
  augment(new_data = test_data)

conf_mat(Role_preds_logistic, truth = Role, estimate = .pred_class)
##           Truth
## Prediction CL RP SP
##         CL  1  0  0
##         RP  7 36 20
##         SP  2 20 25
accuracy(Role_preds_logistic, truth = Role, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.559
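
Overall accuracy hides class-level behavior; the model almost never predicts CL. Calling summary() on the yardstick confusion matrix reports per-class-averaged diagnostics (sensitivity, specificity, balanced accuracy, and so on):

# Macro-averaged diagnostics beyond raw accuracy
conf_mat(Role_preds_logistic, truth = Role, estimate = .pred_class) |> 
  summary()
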
roc_auc(Role_preds_logistic, 
        truth = Role,
        .pred_CL, .pred_RP, .pred_SP)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.725
roc_curve(Role_preds_logistic, 
        truth = Role,
        .pred_CL, .pred_RP, .pred_SP) |> 
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = .level)) +
  geom_line(linewidth = 1, alpha = 0.7) +
  geom_abline(slope = 1, linetype = "dotted") +
  coord_fixed() +
  labs(color = NULL) +
  theme_light()

5 Random Forest

Perfect prediction??? Not really: the forest below is fit on df3, which contains every row of the data, including the 111 "test" observations, so it has effectively memorized them. A leakage-free refit at the end of this section shows more realistic performance.

# Remove rows with missing values
df3 <- na.omit(df2)

colnames(df3)
## [1] "Stuff+"    "xERA"      "Location+" "Role"
# Rename to syntactically valid names; randomForest's formula interface
# does not cope well with names like `Stuff+`
colnames(df3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")

# Verify the new variable names
colnames(df3)
## [1] "StuffPlus"    "xERA"         "LocationPlus" "Role"
# Fit Random Forest model (on df3, i.e., all rows -- see the note above)
rf_model <- randomForest(Role ~ .,
                         data = df3, 
                         ntree = 50
                         )



# Make predictions on the test set
test_data3 <- test_data 
colnames(test_data3)
## [1] "Stuff+"    "xERA"      "Location+" "Role"
colnames(test_data3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")

test_data3$rf_predictions <- predict(object = rf_model, 
                          newdata = test_data3
                          )

# Display confusion matrix
table(test_data3$rf_predictions, test_data3$Role)
##     
##      CL RP SP
##   CL 10  0  0
##   RP  0 56  0
##   SP  0  0 45
10+56+45 # all 111 test rows classified "correctly" -- the leakage at work
## [1] 111
varImpPlot(rf_model)
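
The perfect confusion matrix above is the leakage in action: every "test" row was also a training row, and a forest can memorize them. A minimal leakage-free sketch, refitting on the training split only; expect accuracy to drop back toward the SVM's:

# Refit the forest on training rows only, with the same renamed columns
train3 <- na.omit(train_data)
colnames(train3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")

set.seed(007)
rf_honest <- randomForest(Role ~ ., data = train3, ntree = 50)

# Evaluate on rows the model never saw during training
honest_preds <- predict(rf_honest, newdata = test_data3)
table(honest_preds, test_data3$Role)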