# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525805 28.1 1167658 62.4 NA 669282 35.8
## Vcells 967946 7.4 8388608 64.0 32768 1840416 14.1
cat("\f") # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device
## 1
# Install and load necessary packages
library(e1071)
library(readxl)
library(visdat)
library(stargazer)
library(tidyverse)
library(nnet)
library(psych)
library(tidymodels) # for modeling
library(randomForest)
library(cluster)
library(factoextra)
data_Pitchers <- read_excel("Data_Pitchers.xlsx")
df <- as.data.frame(data_Pitchers) %>% select(Role,`Stuff+`, xERA, `Location+` )
vis_dat(df)
glimpse(df)
## Rows: 552
## Columns: 4
## $ Role <chr> "SP", "CL", "RP", "SP", "RP", "SP", "RP", "SP", "SP", "CL"…
## $ `Stuff+` <dbl> 74.58374, 154.09527, 97.40479, 97.00934, 114.61519, 121.07…
## $ xERA <dbl> 7.21, 3.06, 3.38, 4.58, 3.71, 2.88, 3.06, 3.40, 4.33, 3.25…
## $ `Location+` <dbl> 102.33173, 104.22018, 99.53709, 96.30023, 101.93888, 109.3…
stargazer(df, type='text')
##
## =============================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------
## Stuff+ 550 100.351 14.590 56.505 161.319
## xERA 550 4.366 1.056 1.890 8.960
## Location+ 550 99.682 3.952 87.601 114.128
## ---------------------------------------------
# Assuming numeric_columns contains the names of numeric columns
numeric_columns <- c('Stuff+', 'xERA', 'Location+')
# Check data types
sapply(df, class)
## Role Stuff+ xERA Location+
## "character" "numeric" "numeric" "numeric"
# Remove missing values
# df2 <- na.omit(df)
# Scale numeric columns
df2 <- as.data.frame(scale(df[, numeric_columns]))
head(df2)
## Stuff+ xERA Location+
## 1 -1.7660185 2.6921888 0.6703023
## 2 3.6835482 -1.2362362 1.1480922
## 3 -0.2019079 -0.9333215 -0.0367588
## 4 -0.2290110 0.2026086 -0.8557046
## 5 0.9776601 -0.6209407 0.5709105
## 6 1.4203643 -1.4066257 2.4436433
df2$Role <- factor(df$Role)
stargazer(df2, type='text')
##
## ==========================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------
## Stuff+ 550 -0.000 1.000 -3.005 4.179
## xERA 550 0.000 1.000 -2.344 4.349
## Location+ 550 -0.000 1.000 -3.057 3.655
## ------------------------------------------
80-20 split.
set.seed(007) # Set seed for reproducibility
train_indices <- sample(x = 1:nrow(df2),
size = 0.8 * nrow(df2)
)
train_data <- df2[train_indices, ]
test_data <- df2[-train_indices, ]
https://www.statology.org/principal-components-analysis-in-r/
Better Graphs - https://www.datacamp.com/tutorial/pca-analysis-r
train_data <- na.omit(train_data) # loose 2 obs
results <- prcomp(train_data[,-4], scale = TRUE)
#reverse the signs
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3
## Stuff+ -0.6539154 0.41534872 -0.6323607
## xERA 0.7232642 0.09793482 -0.6835917
## Location+ -0.2219988 -0.90437504 -0.3644480
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3
## 298 1.35067528 -2.4187071 1.3342399
## 467 -0.65121922 0.5432051 0.5286058
## 415 0.29183771 1.6769691 -0.3606453
## 476 -0.08168967 -1.2039689 0.1430709
## 218 -0.75364647 2.1815418 -0.1238471
## 392 -1.87944371 2.4579819 -0.4965785
biplot(results, scale = 0.0007)
numeric_data <- train_data[, sapply(X = train_data,
FUN = is.numeric)]
numeric_data <- na.omit(numeric_data) # loose 2 obs
# Perform k-means clustering
kmeans_result <- kmeans(numeric_data, centers = 3)
train_data <- na.omit(train_data)
numeric_data$Role <- train_data$Role
data_with_labels <- data.frame(numeric_data, Original_Labels = na.omit(train_data$Role))
head(data_with_labels)
## Stuff. xERA Location. Role Original_Labels
## 298 -2.7843064 -0.1760347 1.4243437 SP SP
## 467 0.3359799 -0.7913302 -0.5168731 RP RP
## 415 0.7623097 0.6285824 -1.4277589 RP RP
## 476 -0.5383801 -0.2801617 1.0777660 RP RP
## 218 1.5232456 -0.2517634 -1.7383769 RP RP
## 392 2.6354409 -0.7913302 -1.6025759 RP RP
data_with_labels$Original_Labels <- as.numeric(data_with_labels$Original_Labels)
data_with_labels$Role <- NULL
# Visualize the results with original labels
?fviz_cluster
fviz_cluster(object = kmeans_result,
data = data_with_labels,
geom = "point",
ellipse.type = "norm", # norm
ellipse.level = 0.95,
main = "K-means Clustering with Original Labels",
xlab = "Variable 1",
ylab = "Variable 2",
palette = "jco", # Set a color palette
ggtheme = theme_minimal()
) # Optional: Set a theme for ggplot2
numeric_data <- train_data[, sapply(X = train_data,
FUN = is.numeric)]
numeric_data <- na.omit(numeric_data) # loose 2 obs
# Function to calculate total within-cluster sum of squares (wss)
wss <- function(k) {
kmeans_result <- kmeans(numeric_data, centers = k)
return(sum(kmeans_result$withinss))
}
# Determine the optimal number of clusters using the Elbow Method
k_values <- 1:10 # You can adjust the range of k values
wss_values <- sapply(k_values, wss)
# Plot the Elbow Method
plot(k_values, wss_values, type = "b", pch = 19, frame = FALSE,
xlab = "Number of Clusters (k)", ylab = "Total Within-cluster Sum of Squares (WSS)",
main = "Elbow Method to Determine Optimal k")
# Add a line to help identify the elbow point
abline(v = which.min(wss_values), col = "red", lty = 2)
# Generate some random data for illustration
set.seed(123)
data <- matrix(rnorm(200), ncol = 2)
# Perform K-means clustering with, for example, 3 clusters
kmeans_result <- kmeans(data, centers = 3)
# Assign cluster labels to each data point
cluster_labels <- kmeans_result$cluster
# Visualize the clusters
plot(data, col = cluster_labels, pch = 16, main = "K-means Clustering")
points(kmeans_result$centers, col = 1:3, pch = 8, cex = 2)
Support Vector Machines (SVM) are a type of supervised machine learning algorithm that performs classification or regression tasks. The primary goal of SVM is to find a decision boundary (or hyperplane) that best separates data points belonging to different classes in a feature space.
SVM starts with a dataset where each data point is represented by a set of features. These features could be anything relevant to the problem at hand, such as measurements, characteristics, or attributes. Binary Classification:
SVM is commonly used for binary classification, where the task is to categorize data points into one of two classes (e.g., spam or not spam, positive or negative). Finding the Best Decision Boundary:
SVM’s main job is to find the best decision boundary, often referred to as a hyperplane, that separates the data points of different classes in the feature space. For a 2D space, this boundary is a line. In higher dimensions, it becomes a hyperplane. Maximizing Margin:
SVM doesn’t just find any decision boundary; it looks for the one that maximizes the margin between the two classes. The margin is the distance between the decision boundary and the nearest data points of each class. SVM aims to make this margin as wide as possible. Support Vectors:
Support Vectors are the data points that are closest to the decision boundary. These points play a critical role in defining the optimal boundary. SVM pays special attention to these support vectors because they guide the positioning and orientation of the decision boundary. Handling Non-Linearity (Kernel Trick):
SVM has a trick up its sleeve called the kernel trick. When the data is not linearly separable in its original form, SVM can map it into a higher-dimensional space where a hyperplane can effectively separate the classes. Optimization and Balancing Act:
SVM involves an optimization process to find the best hyperplane. It seeks to balance the trade-off between having a smooth decision boundary and accurately classifying data points. The parameters, such as C (regularization parameter), influence this balancing act. Prediction and Classification:
Once the optimal hyperplane is found during training, SVM can predict the class of new, unseen data points by placing them on one side or the other of the decision boundar
# Assuming "response_variable" is the target variable/column
svm_model <- svm( Role ~ ., data = df2, kernel = "radial")
summary(svm_model)
##
## Call:
## svm(formula = Role ~ ., data = df2, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 439
##
## ( 185 51 203 )
##
##
## Number of Classes: 3
##
## Levels:
## CL RP SP
table(svm_model$fitted)
##
## CL RP SP
## 16 243 291
predictions_svm <- predict(svm_model, newdata = test_data)
# Example: If you are dealing with a binary classification problem
conf_matrix_svm <- table(predictions_svm, test_data$Role)
conf_matrix_svm
##
## predictions_svm CL RP SP
## CL 1 0 0
## RP 8 34 13
## SP 1 22 32
accuracy <- sum(diag(conf_matrix_svm)) / sum(conf_matrix_svm)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy: 0.603603603603604"
Code from - https://quantifyinghealth.com/multinomial-logistic-regression-in-r/
Does worse (though more explanatory).
?glm
logistic_model_fit <-
multinom_reg() |>
fit(Role ~ .,
data = df2)
tidy(logistic_model_fit, exponentiate = TRUE, conf.int = TRUE) |>
mutate_if(is.numeric, round, 4) |>
select(-std.error, -statistic)
## # A tibble: 8 × 6
## y.level term estimate p.value conf.low conf.high
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 RP (Intercept) 12.7 0 7.41 21.8
## 2 RP `Stuff+` 0.517 0.0003 0.363 0.737
## 3 RP xERA 3.00 0.0001 1.72 5.22
## 4 RP `Location+` 1.00 0.983 0.710 1.42
## 5 SP (Intercept) 11.1 0 6.45 19.0
## 6 SP `Stuff+` 0.377 0 0.256 0.556
## 7 SP xERA 4.78 0 2.69 8.48
## 8 SP `Location+` 1.83 0.0013 1.26 2.64
# explain the relationship between predictors and outcome
tidy(logistic_model_fit, exponentiate = TRUE, conf.int = TRUE) |>
mutate_if(is.numeric, round, 4) |>
select(-std.error, -statistic)
## # A tibble: 8 × 6
## y.level term estimate p.value conf.low conf.high
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 RP (Intercept) 12.7 0 7.41 21.8
## 2 RP `Stuff+` 0.517 0.0003 0.363 0.737
## 3 RP xERA 3.00 0.0001 1.72 5.22
## 4 RP `Location+` 1.00 0.983 0.710 1.42
## 5 SP (Intercept) 11.1 0 6.45 19.0
## 6 SP `Stuff+` 0.377 0 0.256 0.556
## 7 SP xERA 4.78 0 2.69 8.48
## 8 SP `Location+` 1.83 0.0013 1.26 2.64
glance(logistic_model_fit)
## # A tibble: 1 × 4
## edf deviance AIC nobs
## <dbl> <dbl> <dbl> <int>
## 1 8 878. 894. 550
Role_preds_logistic <- logistic_model_fit |>
augment(new_data = test_data)
conf_mat(Role_preds_logistic, truth = Role, estimate = .pred_class)
## Truth
## Prediction CL RP SP
## CL 1 0 0
## RP 7 36 20
## SP 2 20 25
accuracy(Role_preds_logistic, truth = Role, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.559
roc_auc(Role_preds_logistic,
truth = Role,
.pred_CL, .pred_RP, .pred_SP)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc hand_till 0.725
roc_curve(Role_preds_logistic,
truth = Role,
.pred_CL, .pred_RP, .pred_SP) |>
ggplot(aes(x = 1 - specificity, y = sensitivity, color = .level)) +
geom_line(size = 1, alpha = 0.7) +
geom_abline(slope = 1, linetype = "dotted") +
coord_fixed() +
labs(color = NULL) +
theme_light()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the yardstick package.
## Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Perfect prediction???
# Remove rows with missing values
df3 <- na.omit(df2)
colnames(df3)
## [1] "Stuff+" "xERA" "Location+" "Role"
# Change variable names
colnames(df3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")
colnames(df3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")
test_data3 <- test_data
colnames(test_data3)
## [1] "Stuff+" "xERA" "Location+" "Role"
# Verify the new variable names
colnames(df3)
## [1] "StuffPlus" "xERA" "LocationPlus" "Role"
# Fit Random Forest model
rf_model <- randomForest(Role ~ . ,
data = df3,
ntree = 50
)
# Make predictions on the test set
test_data3 <- test_data
colnames(test_data3)
## [1] "Stuff+" "xERA" "Location+" "Role"
colnames(test_data3) <- c("StuffPlus", "xERA", "LocationPlus", "Role")
test_data3$rf_predictions <- predict(object = rf_model,
newdata = test_data3
)
# Display confusion matrix
table(test_data3$rf_predictions, test_data3$Role)
##
## CL RP SP
## CL 10 0 0
## RP 0 56 0
## SP 0 0 45
10+56+45
## [1] 111
varImpPlot(rf_model)