Cari data apapun boleh dari R atau source dr internet, buat analisis regresi klasik, ridge, sama lasso. Bandingin performa model, peubah yang digunakan, terus interpretasiin koef model terbaiknya
library(readxl)
library(corrplot)
## corrplot 0.92 loaded
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
lapply(c("tidyverse","rvest","kableExtra"),library,character.only=T)[[1]]
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.3 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'rvest'
##
##
## The following object is masked from 'package:readr':
##
## guess_encoding
##
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [1] "forcats" "stringr" "purrr" "readr" "tidyr" "tibble"
## [7] "ggplot2" "tidyverse" "dplyr" "olsrr" "corrplot" "readxl"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
## [1] "glmnet" "Matrix" "kableExtra" "rvest" "forcats"
## [6] "stringr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "dplyr" "olsrr" "corrplot"
## [16] "readxl" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
datapsd <- read_excel("/Users/radhityaharma/Documents/SEMESTER 5/PSD/datapsd/datapsd.xlsx", sheet = 2)
datapsd
## # A tibble: 34 × 4
## y x1 x2 x3
## <dbl> <dbl> <dbl> <dbl>
## 1 72.8 9.79 99.6 14.8
## 2 72.7 9.99 99.6 8.33
## 3 73.3 9.51 99.9 6.04
## 4 73.5 9.54 99.9 6.84
## 5 72.1 9.07 99.5 7.7
## 6 70.9 8.82 99.5 12.0
## 7 72.2 9.28 99.5 14.3
## 8 70.4 8.61 99.3 11.4
## 9 72.2 8.57 99.2 4.61
## 10 76.5 10.5 99.7 6.03
## # … with 24 more rows
daftarpeubah <- data.frame("Peubah" = c("Y", "X1", "X2", "X3"),
"Keterangan" = c("Indeks Pembangunan Manusia menurut Provinsi(Indeks)", "Rata-Rata Lama Sekolah Penduduk Umur ≥ 15 Tahun Menurut Provinsi(tahun)","Angka Melek Huruf Penduduk Umur 15-59 Tahun Menurut Provinsi (Indeks)","Persentase Penduduk Miskin (P0) Menurut Provinsi 2022 (Persentase)"))
print(daftarpeubah)
## Peubah
## 1 Y
## 2 X1
## 3 X2
## 4 X3
## Keterangan
## 1 Indeks Pembangunan Manusia menurut Provinsi(Indeks)
## 2 Rata-Rata Lama Sekolah Penduduk Umur ≥ 15 Tahun Menurut Provinsi(tahun)
## 3 Angka Melek Huruf Penduduk Umur 15-59 Tahun Menurut Provinsi (Indeks)
## 4 Persentase Penduduk Miskin (P0) Menurut Provinsi 2022 (Persentase)
plot(datapsd$x1,datapsd$y)
plot(datapsd$x2,datapsd$y)
plot(datapsd$x3,datapsd$y)
library(corrplot)
korelasi <- cor(datapsd)
corrplot(korelasi,
method = 'number',
bg = "white",
number.cex = 0.9,
tl.col = "black",
tl.cex = 0.6,
cl.cex = 0.6)
Terlihat bahwa peubah X1,X2, dan X3 memiliki kolerasi yang cukup tinggi
terhadap peubah y.
modelklasik <- lm(y~x1+x2+x3, data = datapsd)
summary(modelklasik)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = datapsd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7832 -1.1798 -0.1568 1.0952 7.0664
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.97863 16.29402 3.436 0.001752 **
## x1 2.51906 0.60589 4.158 0.000247 ***
## x2 -0.03575 0.17972 -0.199 0.843652
## x3 -0.36789 0.09639 -3.817 0.000630 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.3 on 30 degrees of freedom
## Multiple R-squared: 0.684, Adjusted R-squared: 0.6524
## F-statistic: 21.65 on 3 and 30 DF, p-value: 1.174e-07
Terdapat 2 Peubah yang signifikan yakni X1 dan X3
rseklasik = sigma(modelklasik);rseklasik
## [1] 2.299743
installed.packages("car")
## Package LibPath Version Priority Depends Imports LinkingTo Suggests
## Enhances License License_is_FOSS License_restricts_use OS_type Archs
## MD5sum NeedsCompilation Built
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:lmridge':
##
## vif
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
car::vif(modelklasik)
## x1 x2 x3
## 1.530249 2.144769 1.619460
Tidak ada multikolinieritas antar peubah satu dengan yang lain karena nilai VIF<10
subsetmodel <- ols_step_best_subset(modelklasik)
subsetmodel <- as.data.frame(subsetmodel)
subsetmodel
## mindex n predictors rsquare adjr predrsq cp aic sbic
## 1 1 1 x1 0.4803557 0.4641168 0.3857130 19.335286 171.7755 73.85113
## 5 2 2 x1 x3 0.6835957 0.6631825 0.5971649 2.039577 156.9073 61.17579
## 7 3 3 x1 x2 x3 0.6840126 0.6524138 0.5819402 4.000000 158.8625 63.40580
## sbc msep fpe apc hsp
## 1 176.3546 277.2642 8.633562 0.5845999 0.2630297
## 5 163.0128 174.4498 5.577167 0.3776438 0.1708321
## 7 166.4943 180.2275 5.911033 0.4002507 0.1823731
Peubah yang paling baik dimasukkan pada metode OLS ialah X1 dan X3 Karena memiliki nilai AIC yang lebih kecil dan ADJR yang lebih besar.
olsmodel <- lm(y~x1 + x3 , data = datapsd)
olsmodel
##
## Call:
## lm(formula = y ~ x1 + x3, data = datapsd)
##
## Coefficients:
## (Intercept) x1 x3
## 52.9099 2.4594 -0.3576
summary(olsmodel)
##
## Call:
## lm(formula = y ~ x1 + x3, data = datapsd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7721 -1.1837 -0.0818 1.0793 7.0730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.90990 5.16710 10.240 1.81e-11 ***
## x1 2.45941 0.51826 4.746 4.45e-05 ***
## x3 -0.35762 0.08014 -4.462 9.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.264 on 31 degrees of freedom
## Multiple R-squared: 0.6836, Adjusted R-squared: 0.6632
## F-statistic: 33.49 on 2 and 31 DF, p-value: 1.794e-08
rse_ols <- sigma(olsmodel);rse_ols
## [1] 2.263838
Pada seleksi peubah menggunakan OLS didapatkan bahwa peubah X1 dan X3 berpengaruh signifikan terhadap y dengan tingkat signifikansi 0. Nilai R-Square dari model OLS yang didapatkan adalah 68.36% dan RSE sebesar 2.26
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
weights <- 1 / lm(abs(olsmodel$residuals)~olsmodel$fitted.values)$fitted.values^2
wlsmodel <- lm(y~x1 +x3, data = datapsd, weights = weights)
summary(wlsmodel)
##
## Call:
## lm(formula = y ~ x1 + x3, data = datapsd, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.7600 -0.8001 -0.1165 0.6811 4.3652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.37743 4.99269 11.092 2.55e-12 ***
## x1 2.21173 0.49766 4.444 0.000105 ***
## x3 -0.37536 0.06721 -5.585 4.01e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.423 on 31 degrees of freedom
## Multiple R-squared: 0.7886, Adjusted R-squared: 0.7749
## F-statistic: 57.81 on 2 and 31 DF, p-value: 3.465e-11
rse_wls <- sigma(wlsmodel);rse_wls
## [1] 1.423257
Pada model WLS didapatkan bahwa peubah X1 dan X3 berpengaruh signifikan terhadap y dengan tingkat signifikansi 0. Didapatkan nilai R-Square dari model WLS sebesar 78.86% dan RSE sebesar 1.42
# Mendefinisikan peubah
x <- model.matrix(y~x1 +x3, data = datapsd)[,-1]
y <- datapsd$y
# Melakukan cross validation
cv.r <- cv.glmnet(x, y, alpha = 0)
plot(cv.r)
lambdaridge <- cv.r$lambda.min
ridgeterbaik <- glmnet(x, y, alpha = 0, lambda = lambdaridge)
coef(ridgeterbaik)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 53.8695974
## x1 2.3370310
## x3 -0.3409358
# Mendefinisikan fungsi R-Square
rsq <- function(model, lambda, x, y){
# Prediksi y
y_pred <- predict(model, s = lambda, newx = x)
# Menghitung JKG dan JKT
jkt <- sum((y - mean(y))^2)
jkg <- sum((y_pred - y)^2)
# Menghitung R-Squared
rsq <- 1 - jkg/jkt
return(rsq)
}
r2_ridge <- rsq(ridgeterbaik, lambdaridge, x, y)
r2_ridge
## [1] 0.6820014
rse_ridge <- sigma(ridgeterbaik);rse_ridge
## [1] 2.269535
cv.lasso<-cv.glmnet(x,y,alpha=1);plot(cv.lasso)
best.ll<-cv.lasso$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll);coef(bestlasso)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 53.031719
## x1 2.443489
## x3 -0.355162
r2_lasso <- rsq(bestlasso, best.ll, x, y)
r2_lasso
## [1] 0.6835653
rse_lasso <- sigma(bestlasso);rse_lasso
## [1] 2.263947
rse_lasso
## [1] 2.263947
r2_ols <- summary(olsmodel)$r.squared
r2_wls <- summary(wlsmodel)$r.squared
r2_ridge <- r2_ridge
r2_lasso <- r2_lasso
perbandinganmodel <- data.frame(
Method = c("OLS", "WLS", "Ridge", "Lasso"),
R2 = c(r2_ols, r2_wls, r2_ridge, r2_lasso), RSE = c(rse_ols, rse_ols, rse_ridge, rse_lasso)
) %>%
arrange(desc(R2))
print(perbandinganmodel)
## Method R2 RSE
## 1 WLS 0.7885808 2.263838
## 2 OLS 0.6835957 2.263838
## 3 Lasso 0.6835653 2.263947
## 4 Ridge 0.6820014 2.269535
Pada perbandingan model yang dinilai berdasrkan nilai R-Square yang terbesar dan RSE terkecil dapat terlihat bahwa performa model WLS merupakan terbaik diikuti dengan model OLS, Lasso dan terakhir Ridge. Sehingga dapat disimpulkan bahwa model WLS merupakan model terbaik pada data yang digunakan pada analisis ini