library (readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library (GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library (corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library (stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library (lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
data <- read_excel("C:/Users/mhdha/Downloads/Data Anregg.xlsx")
(data)
## # A tibble: 34 × 11
## Provinsi `PP (Y)` `LP (X1)` `PU (X2)` `PN (X3)` `CH (X4)` `HH (X5)` `KU (X6)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ACEH 1404235 254287 79277 68537 2410. 193. 81.6
## 2 SUMATER… 2087474 406109 163161 116836 238. 215. 83.9
## 3 SUMATER… 1482469 300565 89090 65779 4064. 230. 69.0
## 4 RIAU 205973 51914 4259 4499 2975. 248 69.4
## 5 JAMBI 275941 61237 14920 18667 2003. 254 69.6
## 6 SUMATER… 2832774 504143 120067 118131 1716. 260. 68.8
## 7 BENGKULU 286684 57877 20089 30014 2010. 242. 68.0
## 8 LAMPUNG 2757898 530108 273513 192302 1700. 231. 83.7
## 9 KEP. BA… 66469 15285 1521 2816 2291. 222 82.4
## 10 KEP. RI… 324 115 82 405 3085. 210. 80.0
## # ℹ 24 more rows
## # ℹ 3 more variables: `LPM (X7)` <dbl>, `TK (X8)` <dbl>, `S (X9)` <dbl>
str(data)
## tibble [34 × 11] (S3: tbl_df/tbl/data.frame)
## $ Provinsi: chr [1:34] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
## $ PP (Y) : num [1:34] 1404235 2087474 1482469 205973 275941 ...
## $ LP (X1) : num [1:34] 254287 406109 300565 51914 61237 ...
## $ PU (X2) : num [1:34] 79277 163161 89090 4259 14920 ...
## $ PN (X3) : num [1:34] 68537 116836 65779 4499 18667 ...
## $ CH (X4) : num [1:34] 2410 238 4064 2975 2003 ...
## $ HH (X5) : num [1:34] 193 215 230 248 254 ...
## $ KU (X6) : num [1:34] 81.6 83.9 69 69.4 69.6 ...
## $ LPM (X7): num [1:34] 5.72 4.53 4.55 4.6 4.77 ...
## $ TK (X8) : num [1:34] 353074 772895 348172 64584 74256 ...
## $ S (X9) : num [1:34] 27.7 27 26.4 27.7 26.1 ...
summary (data)
## Provinsi PP (Y) LP (X1) PU (X2)
## Length:34 Min. : 324 Min. : 115 Min. : 50
## Class :character 1st Qu.: 211223 1st Qu.: 50186 1st Qu.: 7795
## Mode :character Median : 506760 Median : 107104 Median : 28312
## Mean :1581930 Mean : 299006 Mean :107504
## 3rd Qu.:1524520 3rd Qu.: 297302 3rd Qu.: 86637
## Max. :9710661 Max. :1698083 Max. :926017
## PN (X3) CH (X4) HH (X5) KU (X6)
## Min. : 40 Min. : 238.5 Min. :142.4 Min. :67.98
## 1st Qu.: 8383 1st Qu.:1557.6 1st Qu.:180.0 1st Qu.:78.51
## Median : 25679 Median :2006.5 Median :215.0 Median :80.56
## Mean : 71445 Mean :2078.5 Mean :206.2 Mean :79.27
## 3rd Qu.: 67848 3rd Qu.:2566.8 3rd Qu.:230.1 3rd Qu.:82.58
## Max. :561805 Max. :4064.4 Max. :259.8 Max. :84.89
## LPM (X7) TK (X8) S (X9)
## Min. : 4.525 Min. : 2412 Min. :26.11
## 1st Qu.: 5.150 1st Qu.: 75293 1st Qu.:27.20
## Median : 5.529 Median : 147564 Median :27.62
## Mean : 6.116 Mean : 441899 Mean :27.65
## 3rd Qu.: 6.298 3rd Qu.: 414186 3rd Qu.:27.88
## Max. :17.812 Max. :2625313 Max. :29.33
library(dplyr)
data <- data[, !(names(data) %in% "Provinsi")]
colnames(data)
## [1] "PP (Y)" "LP (X1)" "PU (X2)" "PN (X3)" "CH (X4)" "HH (X5)"
## [7] "KU (X6)" "LPM (X7)" "TK (X8)" "S (X9)"
hist(data$`PP (Y)`, main = "Distribution of Rice Production")
boxplot(data$`PP (Y)`, main = "Distribution of Rice Production")
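Both plots show the strong right skew already visible in the summary (mean 1,581,930 vs median 506,760). As a quick complementary view, a minimal sketch on a log scale (the title is an illustrative choice):
# Sketch: the response is heavily right-skewed; a log10 view compresses the tail
hist(log10(data$`PP (Y)`), main = "Rice Production (log10 scale)") # all values are positive, so log10 is defined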
### Explanatory Variables vs the Response
plot(data$`LP (X1)`, data$`PP (Y)`)
plot(data$`PU (X2)`,data$`PP (Y)`)
plot(data$`PN (X3)`, data$`PP (Y)`)
plot(data$`CH (X4)`, data$`PP (Y)`)
plot(data$`HH (X5)`, data$`PP (Y)`)
plot(data$`KU (X6)`, data$`PP (Y)`)
plot(data$`LPM (X7)`, data$`PP (Y)`)
plot(data$`TK (X8)`, data$`PP (Y)`)
plot(data$`S (X9)`, data$`PP (Y)`)
### Correlation
ggpairs(data,
upper = list(continuous = wrap('cor', size = 3)),
title = "Matriks Scatterplot Data")
cor_matrix <- cor(data, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "lower",
col = colorRampPalette(c("red", "white", "blue"))(200),
addCoef.col = "black", tl.col = "black", tl.srt = 35)
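To make the later discussion of coefficient signs easier to follow, a small sketch pulling the raw correlation of each predictor with the response out of the cor_matrix computed above:
# Sketch: raw correlation of each predictor with PP (Y), sorted for readability
round(sort(cor_matrix["PP (Y)", colnames(cor_matrix) != "PP (Y)"], decreasing = TRUE), 3)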
## Modeling
model1 = lm(formula = `PP (Y)`~ ., data = data)
summary(model1)
##
## Call:
## lm(formula = `PP (Y)` ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -360712 -91525 -1515 77426 346592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.253e+05 1.294e+06 -0.251 0.8037
## `LP (X1)` 5.256e+00 2.207e-01 23.814 <2e-16 ***
## `PU (X2)` 2.411e+00 1.679e+00 1.436 0.1639
## `PN (X3)` -3.388e+00 2.595e+00 -1.305 0.2041
## `CH (X4)` -3.625e+01 4.418e+01 -0.820 0.4200
## `HH (X5)` 4.181e+02 9.861e+02 0.424 0.6754
## `KU (X6)` -7.965e+03 7.164e+03 -1.112 0.2772
## `LPM (X7)` -2.743e+04 1.293e+04 -2.122 0.0444 *
## `TK (X8)` 1.791e-01 1.165e-01 1.538 0.1372
## `S (X9)` 3.717e+04 4.183e+04 0.889 0.3830
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 151600 on 24 degrees of freedom
## Multiple R-squared: 0.9976, Adjusted R-squared: 0.9967
## F-statistic: 1121 on 9 and 24 DF, p-value: < 2.2e-16
t.test(model1$residuals,mu = 0,conf.level = 0.95)
##
## One Sample t-test
##
## data: model1$residuals
## t = -3.0167e-16, df = 33, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -45109.9 45109.9
## sample estimates:
## mean of x
## -6.688715e-12
The t-test p-value equals 1, far above the 5% significance level, so the assumption that the residuals have zero expectation is satisfied.
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 24.104, df = 9, p-value = 0.00414
dwtest(model1)
##
## Durbin-Watson test
##
## data: model1
## DW = 1.9385, p-value = 0.252
## alternative hypothesis: true autocorrelation is greater than 0
shapiro.test(model1$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1$residuals
## W = 0.9722, p-value = 0.5247
Indicated problems:
1. Sign disagreement between the raw correlations and the fitted equation.
2. Possible multicollinearity, since many variables are not significant even though R^2 is very high.
3. Possible overfitting.
4. Interpretation of the results may be distorted.
5. Presence of outliers.
New note: one assumption check fails, namely the residual variance is not homogeneous (Breusch-Pagan p = 0.00414).
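The non-homogeneous variance flagged by the Breusch-Pagan test can also be inspected visually; a minimal base-R sketch:
# Sketch: residuals vs fitted values for model1; a funnel pattern indicates heteroskedasticity
plot(fitted(model1), resid(model1),
xlab = "Fitted values", ylab = "Residuals",
main = "Residuals vs Fitted (model1)")
abline(h = 0, lty = 2, col = "red")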
x <- data[-1] # explanatory variables only (drop the response column)
r <- cor(x)
(ev <- eigen(r)$values) # eigenvalues of the predictor correlation matrix
## [1] 4.025605012 1.655422563 1.125216622 1.047601313 0.594909928 0.387323218
## [7] 0.106420958 0.054127943 0.003372442
# Simultaneous check: condition number (largest / smallest eigenvalue)
if ((max(ev) / min(ev)) <= 100) {
cat("No multicollinearity\n")
} else if ((max(ev) / min(ev)) > 100 & (max(ev) / min(ev)) <= 1000) {
cat("Weak multicollinearity\n")
} else {
cat("Strong multicollinearity\n")
}
## Strong multicollinearity
# Partial check: condition indices (largest eigenvalue / each eigenvalue)
if (all((max(ev) / ev) <= 1000)) {
cat("No multicollinearity\n")
} else {
cat("Multicollinearity indicated at eigenvalue index:", which((max(ev) / ev) > 1000), "\n")
}
## Multicollinearity indicated at eigenvalue index: 9
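As a cross-check on the manual eigenvalue computation, olsrr (loaded earlier) ships condition-index and tolerance diagnostics; a sketch, assuming the installed olsrr version exports these helpers:
# Sketch: condition indices and VIF/tolerance via olsrr
ols_eigen_cindex(model1)
ols_vif_tol(model1)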
mod1 <- lm(`PP (Y)` ~ ., data[1:2]) # Model with one explanatory variable
mod2 <- lm(`PP (Y)` ~ ., data[1:3]) # Model with two explanatory variables
mod3 <- lm(`PP (Y)` ~ ., data[1:4]) # Model with three explanatory variables
mod4 <- lm(`PP (Y)` ~ ., data[1:5]) # Model with four explanatory variables
mod5 <- lm(`PP (Y)` ~ ., data[1:6]) # Model with five explanatory variables
mod6 <- lm(`PP (Y)` ~ ., data[1:7]) # Model with six explanatory variables
mod7 <- lm(`PP (Y)` ~ ., data[1:8]) # Model with seven explanatory variables
mod8 <- lm(`PP (Y)` ~ ., data[1:9]) # Model with eight explanatory variables
mod9 <- lm(`PP (Y)` ~ ., data) # Model with all nine explanatory variables
# Use stargazer to display the results in text format
stargazer(mod1, mod2, mod3, mod4, mod5,mod6, mod7, mod8, mod9,
type = "text", single.row = TRUE)
##
## =====================================================================================================================================================================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## `PP (Y)`
## (1) (2) (3) (4) (5) (6) (7) (8) (9)
## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## `LP (X1)` 5.651*** (0.061) 5.412*** (0.202) 5.374*** (0.205) 5.399*** (0.208) 5.375*** (0.217) 5.374*** (0.210) 5.410*** (0.199) 5.301*** (0.214) 5.256*** (0.221)
## `PU (X2)` 0.583 (0.469) 2.090 (1.595) 2.231 (1.606) 2.542 (1.754) 3.132* (1.736) 2.450 (1.676) 2.248 (1.662) 2.411 (1.679)
## `PN (X3)` -2.436 (2.465) -2.811 (2.503) -3.287 (2.726) -4.196 (2.698) -3.177 (2.600) -3.124 (2.567) -3.388 (2.595)
## `CH (X4)` -37.141 (39.917) -37.264 (40.459) -67.593 (43.214) -46.087 (42.207) -48.708 (41.720) -36.248 (44.180)
## `HH (X5)` 498.000 (1,042.451) 83.710 (1,040.508) 245.070 (987.336) 394.532 (981.607) 418.058 (986.119)
## `KU (X6)` -11,699.690 (6,998.442) -6,263.711 (7,133.650) -7,164.643 (7,077.310) -7,965.248 (7,164.162)
## `LPM (X7)` -26,393.810* (12,910.690) -25,846.820* (12,753.580) -27,434.860** (12,931.680)
## `TK (X8)` 0.137 (0.106) 0.179 (0.116)
## `S (X9)` 37,166.870 (41,826.780)
## Constant -107,806.300*** (33,809.380) -98,999.330*** (34,265.320) -75,589.990* (41,662.380) 5,693.443 (96,825.410) -88,884.390 (220,967.200) 988,860.100 (679,334.100) 631,270.700 (665,938.000) 663,754.800 (657,951.900) -325,282.400 (1,294,386.000)
## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Observations 34 34 34 34 34 34 34 34 34
## R2 0.996 0.996 0.997 0.997 0.997 0.997 0.997 0.998 0.998
## Adjusted R2 0.996 0.996 0.996 0.996 0.996 0.996 0.997 0.997 0.997
## Residual Std. Error 165,487.300 (df = 32) 164,095.000 (df = 31) 164,155.400 (df = 30) 164,523.900 (df = 29) 166,757.900 (df = 28) 161,657.300 (df = 27) 152,905.300 (df = 26) 150,961.600 (df = 25) 151,600.900 (df = 24)
## F Statistic 8,457.325*** (df = 1; 32) 4,301.497*** (df = 2; 31) 2,865.882*** (df = 3; 30) 2,140.009*** (df = 4; 29) 1,666.491*** (df = 5; 28) 1,478.225*** (df = 6; 27) 1,416.845*** (df = 7; 26) 1,272.079*** (df = 8; 25) 1,121.308*** (df = 9; 24)
## =====================================================================================================================================================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
There is a sign change in the HH (X5) variable.
vif(model1)
## `LP (X1)` `PU (X2)` `PN (X3)` `CH (X4)` `HH (X5)` `KU (X6)` `LPM (X7)`
## 15.373108 164.630800 135.514762 1.561444 1.386356 1.671933 1.215353
## `TK (X8)` `S (X9)`
## 8.480441 1.516754
The second problem is multicollinearity, involving X1, X2, and X3 (VIFs of 15.4, 164.6, and 135.5, all above the usual cutoff of 10).
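The square-root transform applied next can be motivated with a Box-Cox analysis; a sketch using car::powerTransform (car is already loaded), where an estimated lambda near 0.5 would support the square root:
# Sketch: estimate the Box-Cox lambda for the response under model1
summary(powerTransform(model1))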
data$sqrt_PP <- sqrt(data$`PP (Y)`)
model_sqrt <-lm(sqrt_PP ~ `LP (X1)` + `PU (X2)` + `PN (X3)`+`CH (X4)` + `HH (X5)` + `KU (X6)` + `LPM (X7)` + `TK (X8)` + `S (X9)`, data = data)
summary(model_sqrt)
##
## Call:
## lm(formula = sqrt_PP ~ `LP (X1)` + `PU (X2)` + `PN (X3)` + `CH (X4)` +
## `HH (X5)` + `KU (X6)` + `LPM (X7)` + `TK (X8)` + `S (X9)`,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -272.42 -86.28 -28.18 85.82 330.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.047e+03 1.486e+03 0.705 0.487703
## `LP (X1)` 2.480e-03 2.534e-04 9.788 7.45e-10 ***
## `PU (X2)` -7.639e-03 1.927e-03 -3.964 0.000578 ***
## `PN (X3)` 1.131e-02 2.979e-03 3.796 0.000882 ***
## `CH (X4)` -7.786e-02 5.072e-02 -1.535 0.137831
## `HH (X5)` 1.009e+00 1.132e+00 0.892 0.381367
## `KU (X6)` 1.023e+01 8.224e+00 1.243 0.225703
## `LPM (X7)` 2.103e+00 1.484e+01 0.142 0.888535
## `TK (X8)` -2.947e-04 1.337e-04 -2.204 0.037350 *
## `S (X9)` -5.680e+01 4.801e+01 -1.183 0.248437
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 174 on 24 degrees of freedom
## Multiple R-squared: 0.969, Adjusted R-squared: 0.9574
## F-statistic: 83.31 on 9 and 24 DF, p-value: 6.673e-16
bptest(model_sqrt)
##
## studentized Breusch-Pagan test
##
## data: model_sqrt
## BP = 7.9952, df = 9, p-value = 0.5346
shapiro.test(resid(model_sqrt))
##
## Shapiro-Wilk normality test
##
## data: resid(model_sqrt)
## W = 0.95381, p-value = 0.1596
t.test(model_sqrt$residuals,mu = 0,conf.level = 0.95)
##
## One Sample t-test
##
## data: model_sqrt$residuals
## t = -1.5604e-16, df = 33, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -51.7826 51.7826
## sample estimates:
## mean of x
## -3.971496e-15
residuals_sqrt <- residuals(model_sqrt)
t.test(residuals_sqrt)
##
## One Sample t-test
##
## data: residuals_sqrt
## t = -1.5604e-16, df = 33, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -51.7826 51.7826
## sample estimates:
## mean of x
## -3.971496e-15
vif(model_sqrt) # unchanged from model1: VIF depends only on the predictors
## `LP (X1)` `PU (X2)` `PN (X3)` `CH (X4)` `HH (X5)` `KU (X6)` `LPM (X7)`
## 15.373108 164.630800 135.514762 1.561444 1.386356 1.671933 1.215353
## `TK (X8)` `S (X9)`
## 8.480441 1.516754
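Since VIF depends only on the predictors, the transform leaves it untouched. One remedy would be to drop the collinear predictors; a sketch of that option with a hypothetical reduced model (the analysis below instead keeps all predictors and handles the collinearity with penalized regression):
# Sketch (hypothetical reduced model): drop the collinear PU (X2) and PN (X3), then re-check VIF
model_reduced <- lm(sqrt_PP ~ `LP (X1)` + `CH (X4)` + `HH (X5)` + `KU (X6)` +
`LPM (X7)` + `TK (X8)` + `S (X9)`, data = data)
vif(model_reduced)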
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Design matrix: drop the raw response (and any leftover transform columns)
x <- model.matrix(sqrt_PP ~ ., data = data[, !names(data) %in% c("PP (Y)", "PP_BoxCox", "log_PP")])[, -1]
y <- data$sqrt_PP
set.seed(123)
# Alpha grid from 0 to 1 in steps of 0.1
alphas <- seq(0, 1, by = 0.1)
cv_errors <- numeric(length(alphas))
for (i in seq_along(alphas)) {
cvfit <- cv.glmnet(x, y, alpha = alphas[i])
cv_errors[i] <- cvfit$cvm[cvfit$lambda == cvfit$lambda.min]
}
# Best alpha based on the smallest CV error
best_alpha <- alphas[which.min(cv_errors)]
cat("Best alpha:", best_alpha, "\n")
## Best alpha: 0.6
# Fit the model with the best alpha
cvfit_best <- cv.glmnet(x, y, alpha = best_alpha)
best_lambda <- cvfit_best$lambda.min
model_best_enet <- glmnet(x, y, alpha = best_alpha, lambda = best_lambda)
# Inspect the model coefficients
coef(model_best_enet)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 734.062160181
## `LP (X1)` 0.002188849
## `PU (X2)` -0.003498849
## `PN (X3)` 0.005108282
## `CH (X4)` -0.089336267
## `HH (X5)` 1.693281337
## `KU (X6)` 4.984969744
## `LPM (X7)` 6.825887124
## `TK (X8)` -0.000246921
## `S (X9)` -33.415060035
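One caveat on the tuning loop above: each cv.glmnet call redraws its folds, so the CV errors compared across alpha values come from different data partitions. A common remedy is to fix the fold assignment once via foldid; a minimal sketch (new variable names, to avoid clobbering the results above):
# Sketch: a shared foldid makes CV errors comparable across alpha values
set.seed(123)
foldid <- sample(rep(1:10, length.out = nrow(x)))
cv_errors_fixed <- sapply(alphas, function(a)
min(cv.glmnet(x, y, alpha = a, foldid = foldid)$cvm))
alphas[which.min(cv_errors_fixed)] # alpha selected on a common fold assignment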
### Comparing Lasso, Ridge, and Elastic Net
library(glmnet)
# Design matrix and response
x <- model.matrix(sqrt_PP ~ ., data = data[, !names(data) %in% c("PP (Y)", "PP_BoxCox", "log_PP")])[, -1]
y <- data$sqrt_PP
set.seed(123)
# 1. Lasso (alpha=1)
cv_lasso <- cv.glmnet(x, y, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min
model_lasso <- glmnet(x, y, alpha = 1, lambda = best_lambda_lasso)
pred_lasso <- predict(model_lasso, s = best_lambda_lasso, newx = x)
mse_lasso <- mean((y - pred_lasso)^2)
nonzero_lasso <- sum(coef(model_lasso) != 0) - 1 # minus intercept
set.seed(123)
# 2. Ridge (alpha=0)
cv_ridge <- cv.glmnet(x, y, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min
model_ridge <- glmnet(x, y, alpha = 0, lambda = best_lambda_ridge)
pred_ridge <- predict(model_ridge, s = best_lambda_ridge, newx = x)
mse_ridge <- mean((y - pred_ridge)^2)
nonzero_ridge <- sum(coef(model_ridge) != 0) - 1
# 3. Elastic Net (tuning alpha)
alphas <- seq(0, 1, by = 0.1)
cv_errors <- numeric(length(alphas))
models_enet <- list()
lambdas_enet <- numeric(length(alphas))
set.seed(123)
for (i in seq_along(alphas)) {
cv_enet <- cv.glmnet(x, y, alpha = alphas[i])
cv_errors[i] <- cv_enet$cvm[cv_enet$lambda == cv_enet$lambda.min]
models_enet[[i]] <- glmnet(x, y, alpha = alphas[i], lambda = cv_enet$lambda.min)
lambdas_enet[i] <- cv_enet$lambda.min
}
best_alpha <- alphas[which.min(cv_errors)]
best_model_enet <- models_enet[[which.min(cv_errors)]]
best_lambda_enet <- lambdas_enet[which.min(cv_errors)]
pred_enet <- predict(best_model_enet, s = best_lambda_enet, newx = x)
mse_enet <- mean((y - pred_enet)^2)
nonzero_enet <- sum(coef(best_model_enet) != 0) - 1
# Compare the results
result <- data.frame(
Model = c("Lasso", "Ridge", "Elastic Net"),
MSE = c(mse_lasso, mse_ridge, mse_enet),
NonZero_Coefficients = c(nonzero_lasso, nonzero_ridge, nonzero_enet)
)
print(result)
## Model MSE NonZero_Coefficients
## 1 Lasso 24626.87 9
## 2 Ridge 54506.01 9
## 3 Elastic Net 23253.32 9
cat("Best alpha for Elastic Net:", best_alpha, "\n")
## Best alpha for Elastic Net: 0.6
### Coefficients of the Best Model
# Extract the coefficients of the best model
coef_enet <- coef(best_model_enet)
# Convert to a data frame
coef_df <- as.data.frame(as.matrix(coef_enet))
coef_df <- cbind(Variable = rownames(coef_df), coef_df)
colnames(coef_df)[2] <- "Coefficient"
# Show only variables with nonzero coefficients
coef_df_filtered <- subset(coef_df, Coefficient != 0)
print(coef_df_filtered)
## Variable Coefficient
## (Intercept) (Intercept) 8.636908e+02
## `LP (X1)` `LP (X1)` 2.294276e-03
## `PU (X2)` `PU (X2)` -4.893911e-03
## `PN (X3)` `PN (X3)` 7.188855e-03
## `CH (X4)` `CH (X4)` -8.608249e-02
## `HH (X5)` `HH (X5)` 1.462795e+00
## `KU (X6)` `KU (X6)` 6.801556e+00
## `LPM (X7)` `LPM (X7)` 5.320561e+00
## `TK (X8)` `TK (X8)` -2.672126e-04
## `S (X9)` `S (X9)` -4.227271e+01
### Residual Checks
# Residuals vs fitted plot for the Elastic Net model
residuals_enet <- as.numeric(y - pred_enet)
df_resid <- data.frame(Fitted = as.numeric(pred_enet), Residuals = residuals_enet)
ggplot(df_resid, aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ggtitle("Residuals vs Fitted Values (Elastic Net)") +
theme_minimal()
# Predicted vs actual plot (Elastic Net)
df_pred <- data.frame(Actual = y, Predicted = as.numeric(pred_enet))
ggplot(df_pred, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "dashed") +
ggtitle("Predicted vs Actual Values (Elastic Net)") +
theme_minimal()
# Residual histogram (Elastic Net)
ggplot(df_resid, aes(x = Residuals)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
ggtitle("Histogram of Residuals (Elastic Net)") +
theme_minimal()
The residuals vs fitted values plot shows that the Elastic Net model performs reasonably well, with random-looking residuals and relatively constant variance, although a few outliers deserve attention.
The predicted vs actual values plot shows that the Elastic Net model predicts very accurately, with the points clustered tightly around the identity line (y = x).
The residual histogram is close to normal and symmetric around zero, supporting the residual-normality assumption for the Elastic Net model.
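The visual impression from the histogram can be complemented with a formal test on the Elastic Net residuals; a one-line sketch:
# Sketch: formal normality check on the Elastic Net residuals
shapiro.test(residuals_enet)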
### Outliers, Influential Observations, and Leverage
# Standardized residuals
std_resid <- scale(residuals_enet)
# Identify outliers
outliers <- which(abs(std_resid) > 2) # |z| > 2 is flagged as a suspected outlier
print(outliers)
## [1] 3 11
# Approximate leverage via the hat matrix (on standardized X)
X <- model.matrix(sqrt_PP ~ ., data = data[, !names(data) %in% c("PP (Y)", "PP_BoxCox", "log_PP")])[, -1]
y <- data$sqrt_PP
X_scaled <- scale(X) # standardize X the same way it enters glmnet
H_diag <- hatvalues(lm(rep(1, nrow(X_scaled)) ~ X_scaled)) # diagonal H = leverage
# Leverage cutoff for influential points
p <- ncol(X_scaled) # number of predictors
n <- nrow(X_scaled)
cutoff_leverage <- 2 * p / n
influential_points <- which(H_diag > cutoff_leverage)
print(influential_points)
## 6 12 15 20 34
## 6 12 15 20 34
plot(H_diag, std_resid^2,
xlab = "Leverage", ylab = "Standardized Residual^2",
main = "Influence Plot (Approximate)",
pch = 20, col = "steelblue")
abline(h = 4, col = "red", lty = 2)
abline(v = cutoff_leverage, col = "red", lty = 2)
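To combine both axes of the influence plot into a single summary, an approximate Cook's distance can be assembled from the standardized residuals and leverages; a sketch using the usual lm formula as an approximation, since glmnet has no exact analogue:
# Sketch: approximate Cook's distance; p + 1 counts the intercept
cooks_approx <- (std_resid^2 / (p + 1)) * (H_diag / (1 - H_diag))
which(cooks_approx > 4 / n) # common rule-of-thumb cutoff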
### Case Deletion
library(glmnet)
library(dplyr)
# Wrapper function to refit the Elastic Net model
run_elastic_net <- function(data) {
x <- model.matrix(sqrt_PP ~ ., data = data[, !names(data) %in% c("PP (Y)", "PP_BoxCox", "log_PP")])[, -1]
y <- data$sqrt_PP
alphas <- seq(0, 1, by = 0.1)
cv_errors <- numeric(length(alphas))
models_enet <- list()
lambdas_enet <- numeric(length(alphas))
set.seed(123)
for (i in seq_along(alphas)) {
cv_enet <- cv.glmnet(x, y, alpha = alphas[i])
cv_errors[i] <- cv_enet$cvm[cv_enet$lambda == cv_enet$lambda.min]
models_enet[[i]] <- glmnet(x, y, alpha = alphas[i], lambda = cv_enet$lambda.min)
lambdas_enet[i] <- cv_enet$lambda.min
}
best_index <- which.min(cv_errors)
best_alpha <- alphas[best_index]
best_model <- models_enet[[best_index]]
best_lambda <- lambdas_enet[best_index]
pred <- predict(best_model, s = best_lambda, newx = x)
mse <- mean((y - pred)^2)
nonzero <- sum(coef(best_model) != 0) - 1
list(alpha = best_alpha, lambda = best_lambda, mse = mse, nonzero = nonzero, model = best_model)
}
# Original data
data_ori <- data
# 1. Original data (no deletions)
result_ori <- run_elastic_net(data_ori)
# 2. Drop observation 3
data_no3 <- data_ori[-3, ]
result_no3 <- run_elastic_net(data_no3)
# 3. Drop observation 11
data_no11 <- data_ori[-11, ]
result_no11 <- run_elastic_net(data_no11)
# 4. Drop observations 3 and 11
data_no3_11 <- data_ori[-c(3, 11), ]
result_no3_11 <- run_elastic_net(data_no3_11)
# Compare the results
results <- data.frame(
Model = c("Original", "Without 3", "Without 11", "Without 3 & 11"),
Alpha = c(result_ori$alpha, result_no3$alpha, result_no11$alpha, result_no3_11$alpha),
Lambda = c(result_ori$lambda, result_no3$lambda, result_no11$lambda, result_no3_11$lambda),
MSE = c(result_ori$mse, result_no3$mse, result_no11$mse, result_no3_11$mse),
Nonzero = c(result_ori$nonzero, result_no3$nonzero, result_no11$nonzero, result_no3_11$nonzero)
)
print(results)
## Model Alpha Lambda MSE Nonzero
## 1 Original 0.6 1.4940077 23253.32 9
## 2 Without 3 0.4 0.6183793 17445.02 9
## 3 Without 11 0.7 1.6913821 21275.72 8
## 4 Without 3 & 11 0.8 1.0360591 14348.64 9
### Final Model
# Drop rows 3 and 11 from the data
data_new <- data[-c(3, 11), ]
# Build new x and y from the reduced data
x_new <- model.matrix(sqrt_PP ~ ., data = data_new[, !names(data_new) %in% c("PP (Y)", "PP_BoxCox", "log_PP")])[, -1]
y_new <- data_new$sqrt_PP
# Continue the Elastic Net tuning as before
alphas <- seq(0, 1, by = 0.1)
cv_errors <- numeric(length(alphas))
models_enet <- list()
lambdas_enet <- numeric(length(alphas))
set.seed(123)
for (i in seq_along(alphas)) {
cv_enet <- cv.glmnet(x_new, y_new, alpha = alphas[i])
cv_errors[i] <- min(cv_enet$cvm)
models_enet[[i]] <- glmnet(x_new, y_new, alpha = alphas[i], lambda = cv_enet$lambda.min)
lambdas_enet[i] <- cv_enet$lambda.min
}
best_idx <- which.min(cv_errors)
best_alpha <- alphas[best_idx]
best_lambda <- lambdas_enet[best_idx]
best_model <- models_enet[[best_idx]]
cat("Best alpha:", best_alpha, "\n")
## Best alpha: 0.8
cat("Best lambda:", best_lambda, "\n")
## Best lambda: 1.036059
# Coefficients of the best model
coef_enet <- coef(best_model)
print(coef_enet)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -8.297802e+02
## `LP (X1)` 2.227277e-03
## `PU (X2)` -3.974789e-03
## `PN (X3)` 5.510753e-03
## `CH (X4)` -1.430060e-01
## `HH (X5)` 1.548460e+00
## `KU (X6)` 9.495999e+00
## `LPM (X7)` 7.186879e+00
## `TK (X8)` -2.126800e-04
## `S (X9)` 1.497744e+01
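Because the final model was fit on sqrt_PP, predictions must be squared to return to the original production scale; a closing sketch with an in-sample R^2 on that scale:
# Sketch: back-transform predictions to the original PP scale and assess fit
pred_sqrt <- predict(best_model, newx = x_new, s = best_lambda)
pred_pp <- as.numeric(pred_sqrt)^2 # undo the square-root transform
actual_pp <- data_new$`PP (Y)`
1 - sum((actual_pp - pred_pp)^2) / sum((actual_pp - mean(actual_pp))^2) # in-sample R-squared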