library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
# Read the salary data set from the working directory and print its
# structure (column names, types and first values) as a quick sanity check.
data <- read.csv(file = "baseball_salary.csv")
str(data)
## 'data.frame': 150 obs. of 13 variables:
## $ Years : int 7 15 11 8 7 19 11 11 4 8 ...
## $ Hits : int 108 167 145 162 111 101 61 88 179 180 ...
## $ HomeRuns : int 28 25 34 24 23 12 6 35 19 0 ...
## $ Runs : int 43 42 81 56 31 74 32 42 49 36 ...
## $ RBIs : int 39 80 24 70 44 19 47 79 32 63 ...
## $ Walks : int 53 34 26 22 34 77 19 76 27 43 ...
## $ PutOuts : int 104 222 217 393 258 327 315 283 114 256 ...
## $ Assists : int 196 161 174 196 85 78 131 51 178 96 ...
## $ Errors : int 17 19 6 2 17 9 6 16 13 2 ...
## $ League : chr "AL" "AL" "NL" "NL" ...
## $ Division : chr "E" "E" "E" "W" ...
## $ NewLeague: chr "NL" "AL" "NL" "NL" ...
## $ Salary : num 347 402 312 441 301 ...
# Convert the categorical text columns to factors in a single step so that
# model.matrix() can expand them into indicator columns later on.
factor_cols <- c("League", "Division", "NewLeague")
data[factor_cols] <- lapply(data[factor_cols], as.factor)
# One-hot encoding
# Build the model frame once and take BOTH the predictors and the response
# from it. model.matrix() works on a model frame that has already had rows
# with missing values removed, so reading the response straight from
# data$Salary would leave x and y with different lengths (and misaligned
# rows) as soon as any column contains an NA.
salary_frame <- model.frame(Salary ~ ., data = data)
dropped_rows <- nrow(data) - nrow(salary_frame)
if (dropped_rows > 0) {
  message(
    dropped_rows,
    " row(s) with missing values were excluded from x and y"
  )
}

# Expand factors into indicator columns; the first column of the design
# matrix is the intercept and is dropped.
x <- model.matrix(attr(salary_frame, "terms"), data = salary_frame)[, -1]
# unname() keeps y a plain numeric vector, as data$Salary was.
y <- unname(model.response(salary_frame))

# Seed the RNG before the split so the partition (and the cross-validation
# folds drawn afterwards) can be reproduced.
set.seed(123)

# 70 % of the rows go to the training set, the remainder to the test set.
trainIndex <- createDataPartition(y, p = 0.7, list = FALSE)
x_train <- x[trainIndex, ]
y_train <- y[trainIndex]
x_test <- x[-trainIndex, ]
y_test <- y[-trainIndex]
# Ridge regression (alpha = 0): cv.glmnet() picks lambda by cross-validation
# on the training rows; the held-out rows are then scored at lambda.min.
ridge <- cv.glmnet(x = x_train, y = y_train, alpha = 0)
ridge_pred <- predict(ridge, newx = x_test, s = ridge$lambda.min)
ridge_rmse <- caret::RMSE(ridge_pred, y_test)
ridge_mae <- caret::MAE(ridge_pred, y_test)

# Lasso (alpha = 1): same procedure, different penalty.
lasso <- cv.glmnet(x = x_train, y = y_train, alpha = 1)
lasso_pred <- predict(lasso, newx = x_test, s = lasso$lambda.min)
lasso_rmse <- caret::RMSE(lasso_pred, y_test)
lasso_mae <- caret::MAE(lasso_pred, y_test)
# Elastic net through caret: train() tunes the glmnet model with 10-fold
# cross-validation, using tuneLength = 10 to size the candidate grid.
cv_control <- trainControl(method = "cv", number = 10)
elastic_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  tuneLength = 10,
  trControl = cv_control
)

# Score the tuned model on the held-out rows.
elastic_pred <- predict(elastic_model, newdata = x_test)
elastic_rmse <- caret::RMSE(elastic_pred, y_test)
elastic_mae <- caret::MAE(elastic_pred, y_test)
# Collect the test-set error of the three models in one table, one row per
# model, and print it for a side-by-side comparison.
model_names <- c("Ridge", "Lasso", "Elastic Net")
rmse_values <- c(ridge_rmse, lasso_rmse, elastic_rmse)
mae_values <- c(ridge_mae, lasso_mae, elastic_mae)
results <- data.frame(
  Model = model_names,
  RMSE = rmse_values,
  MAE = mae_values
)
print(results)
## Model RMSE MAE
## 1 Ridge 46.01834 38.43865
## 2 Lasso 46.90780 38.98438
## 3 Elastic Net 46.48282 38.66420
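Berdasarkan hasil di atas, Ridge memiliki RMSE (46,02) dan MAE (38,44) terendah pada data uji, sehingga Ridge dipilih sebagai model terbaik untuk memprediksi gaji pemain secara efisien dan akurat. Selisihnya dengan Elastic Net dan Lasso kecil (kurang dari 1 poin RMSE).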