1. Load Library dan Data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
# Read the salary data set from the working directory and print a compact
# overview of its columns and types.
data <- read.csv(file = "baseball_salary.csv")
str(object = data)
## 'data.frame':    150 obs. of  13 variables:
##  $ Years    : int  7 15 11 8 7 19 11 11 4 8 ...
##  $ Hits     : int  108 167 145 162 111 101 61 88 179 180 ...
##  $ HomeRuns : int  28 25 34 24 23 12 6 35 19 0 ...
##  $ Runs     : int  43 42 81 56 31 74 32 42 49 36 ...
##  $ RBIs     : int  39 80 24 70 44 19 47 79 32 63 ...
##  $ Walks    : int  53 34 26 22 34 77 19 76 27 43 ...
##  $ PutOuts  : int  104 222 217 393 258 327 315 283 114 256 ...
##  $ Assists  : int  196 161 174 196 85 78 131 51 178 96 ...
##  $ Errors   : int  17 19 6 2 17 9 6 16 13 2 ...
##  $ League   : chr  "AL" "AL" "NL" "NL" ...
##  $ Division : chr  "E" "E" "E" "W" ...
##  $ NewLeague: chr  "NL" "AL" "NL" "NL" ...
##  $ Salary   : num  347 402 312 441 301 ...

2. Preprocessing

# Convert the categorical columns to factors so that model.matrix() expands
# them into indicator columns.
data$League <- as.factor(data$League)
data$Division <- as.factor(data$Division)
data$NewLeague <- as.factor(data$NewLeague)

# Build the model frame once and derive both x and y from it.
# model.matrix() drops rows containing NA (default na.action), so taking y
# straight from data$Salary could leave x and y with different row counts.
model_data <- model.frame(Salary ~ ., data = data)

# Dummy-encode the factors. With R's default treatment contrasts each factor
# yields one 0/1 column per non-reference level (not a full one-hot encoding).
# The intercept column is removed because the glmnet fits below add their own
# intercept by default; drop = FALSE keeps x a matrix in every case.
x <- model.matrix(terms(model_data), data = model_data)[, -1, drop = FALSE]

# Response vector aligned row-for-row with x.
y <- unname(model.response(model_data))

3. Split Data

# Fix the RNG state so the partition below is reproducible.
set.seed(123)

# Sample 70% of the rows for training; createDataPartition() returns the
# selected row numbers as a one-column matrix when list = FALSE.
trainIndex <- createDataPartition(y, p = 0.7, list = FALSE)
train_rows <- as.vector(trainIndex)

# Training portion.
x_train <- x[train_rows, ]
y_train <- y[train_rows]

# Hold-out portion: every row that was not sampled for training.
x_test <- x[-train_rows, ]
y_test <- y[-train_rows]

4. Ridge Regression

# Ridge regression (alpha = 0), with lambda chosen by cross-validation.
ridge <- cv.glmnet(x = x_train, y = y_train, alpha = 0)

# Predict the hold-out set at the lambda with the lowest CV error.
ridge_pred <- predict(ridge, newx = x_test, s = ridge$lambda.min)

# Hold-out error metrics, computed from a single residual vector.
ridge_resid <- ridge_pred - y_test
ridge_rmse <- sqrt(mean(ridge_resid^2))
ridge_mae <- mean(abs(ridge_resid))

5. Lasso Regression

# Lasso regression (alpha = 1), with lambda chosen by cross-validation.
lasso <- cv.glmnet(x = x_train, y = y_train, alpha = 1)

# Predict the hold-out set at the lambda with the lowest CV error.
lasso_pred <- predict(lasso, newx = x_test, s = lasso$lambda.min)

# Hold-out error metrics, computed from a single residual vector.
lasso_resid <- lasso_pred - y_test
lasso_rmse <- sqrt(mean(lasso_resid^2))
lasso_mae <- mean(abs(lasso_resid))

6. Elastic Net Regression

# Elastic net fitted through caret, which tunes glmnet's alpha and lambda
# with 10-fold cross-validation.
elastic_ctrl <- trainControl(method = "cv", number = 10)
elastic_model <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  trControl = elastic_ctrl,
  tuneLength = 10
)

# Predict the hold-out set with the tuned model and compute its error metrics.
elastic_pred <- predict(elastic_model, newdata = x_test)
elastic_resid <- elastic_pred - y_test
elastic_rmse <- sqrt(mean(elastic_resid^2))
elastic_mae <- mean(abs(elastic_resid))

7. Perbandingan Model

# Collect the hold-out metrics of the three models in one comparison table.
model_names <- c("Ridge", "Lasso", "Elastic Net")
rmse_values <- c(ridge_rmse, lasso_rmse, elastic_rmse)
mae_values <- c(ridge_mae, lasso_mae, elastic_mae)

results <- data.frame(Model = model_names, RMSE = rmse_values, MAE = mae_values)
print(results)
##         Model     RMSE      MAE
## 1       Ridge 46.01834 38.43865
## 2       Lasso 46.90780 38.98438
## 3 Elastic Net 46.48282 38.66420

8. Kesimpulan

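Berdasarkan tabel perbandingan di atas, Ridge Regression memiliki RMSE (46,02) dan MAE (38,44) terendah pada data uji, disusul Elastic Net dan Lasso. Selisih antarmodel kecil (kurang dari 1 satuan RMSE), sehingga ketiganya berkinerja hampir setara; Ridge sebagai model dengan performa terbaik digunakan untuk prediksi gaji pemain yang efisien dan akurat.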