Load Libraries

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-10
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
## 
##     R2
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack

Load and Prepare Data

# Import data
data <- read_excel("Data Penelitian.xlsx", skip = 1)

# Make the column names syntactically valid
colnames(data) <- make.names(colnames(data))

# Data structure
str(data)
## tibble [38 × 14] (S3: tbl_df/tbl/data.frame)
##  $ Provinsi: chr [1:38] "ACEH" "SUMATERA UTARA" "SUMATERA BARAT" "RIAU" ...
##  $ Y       : num [1:38] 12.22 7.24 5.31 6.3 6.89 ...
##  $ X1      : num [1:38] 5.64 5.32 5.62 4.16 4.26 3.69 3.41 4.21 4.45 6.45 ...
##  $ X2      : num [1:38] 741980 765842 821552 844678 775092 ...
##  $ X3      : num [1:38] 15.4 15.4 18.6 17.4 13.5 ...
##  $ X4      : num [1:38] 10.42 10.29 10.03 9.8 9.32 ...
##  $ X5      : num [1:38] 99.9 99.9 99.9 100 99.9 ...
##  $ X6      : num [1:38] 85.5 80.6 87.5 64.8 74.7 ...
##  $ X7      : num [1:38] 99.5 99.6 99.9 99.9 99.9 ...
##  $ X8      : num [1:38] 85.2 73.2 73 78.7 89.4 ...
##  $ X9      : num [1:38] 92.1 93.2 87.3 92.2 82.5 ...
##  $ X10     : num [1:38] 82.2 87.5 74.6 91.2 85.9 ...
##  $ X11     : num [1:38] 547511 551093 592911 549211 542466 ...
##  $ X12     : num [1:38] 0.274 0.283 0.28 0.304 0.291 0.298 0.339 0.287 0.214 0.385 ...
# Define the variables
Y <- data[[2]]          # dependent variable (poverty rate)
X <- data[, -c(1, 2)]   # independent variables X1..X12

# Matrix form for Ridge (glmnet requires a numeric matrix)
X_matrix <- as.matrix(X)
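
Before modeling, it is worth confirming there are no missing values and checking how strongly the predictors are correlated, since that collinearity is what motivates PCR, Ridge, and PLS below. A minimal sketch, not part of the original analysis; the helper names cor_X and high_cor are introduced here, and the |r| > 0.8 cutoff is an arbitrary choice:

# Count missing values per column
colSums(is.na(data))

# Flag strongly correlated predictor pairs
cor_X <- cor(X)
high_cor <- which(abs(cor_X) > 0.8 & upper.tri(cor_X), arr.ind = TRUE)
data.frame(var1 = rownames(cor_X)[high_cor[, 1]],
           var2 = colnames(cor_X)[high_cor[, 2]],
           r    = round(cor_X[high_cor], 2))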

DESCRIPTIVE ANALYSIS

# Descriptive statistics
describe(data)
##           vars  n      mean        sd    median   trimmed       mad       min
## Provinsi*    1 38     19.50     11.11     19.50     19.50     14.08      1.00
## Y            2 38     10.27      6.27      9.35      9.46      5.29      3.42
## X1           3 38      4.47      1.38      4.21      4.47      1.26      1.49
## X2           4 38 796647.84 142821.18 777366.31 781698.21 112498.65 559895.02
## X3           5 38     14.70      5.85     13.94     14.40      7.43      6.65
## X4           6 38      9.46      1.19      9.57      9.59      0.92      4.76
## X5           7 38     99.33      2.13     99.87     99.86      0.09     90.48
## X6           8 38     84.10     11.32     88.31     85.46      7.52     41.78
## X7           9 38     98.60      3.43     99.66     99.33      0.46     80.02
## X8          10 38     84.93      7.81     85.76     85.72      7.29     54.54
## X9          11 38     87.61     11.61     89.94     89.33      8.67     32.89
## X10         12 38     82.96     14.96     85.99     85.62      7.18     16.34
## X11         13 38 528730.45 112156.80 513739.50 517416.66 109439.60 386850.00
## X12         14 38      0.33      0.05      0.33      0.33      0.06      0.21
##                  max     range  skew kurtosis       se
## Provinsi*      38.00     37.00  0.00    -1.30     1.80
## Y              29.45     26.03  1.34     1.32     1.02
## X1              6.96      5.47  0.10    -0.62     0.22
## X2        1256746.67 696851.64  1.13     1.77 23168.65
## X3             27.20     20.55  0.34    -1.16     0.95
## X4             11.58      6.82 -1.84     5.20     0.19
## X5             99.97      9.49 -3.75    12.61     0.35
## X6             99.14     57.36 -1.59     3.07     1.84
## X7            100.00     19.98 -4.31    19.96     0.56
## X8             96.61     42.07 -1.55     3.78     1.27
## X9             99.98     67.09 -2.78    10.35     1.88
## X10            98.20     81.86 -2.89     9.34     2.43
## X11        942805.00 555955.00  1.38     2.72 18194.24
## X12             0.43      0.21  0.11    -0.76     0.01
summary(data)
##    Provinsi               Y                X1              X2         
##  Length:38          Min.   : 3.420   Min.   :1.490   Min.   : 559895  
##  Class :character   1st Qu.: 5.530   1st Qu.:3.500   1st Qu.: 697439  
##  Mode  :character   Median : 9.345   Median :4.210   Median : 777366  
##                     Mean   :10.269   Mean   :4.468   Mean   : 796648  
##                     3rd Qu.:12.135   3rd Qu.:5.545   3rd Qu.: 848151  
##                     Max.   :29.450   Max.   :6.960   Max.   :1256747  
##        X3               X4               X5              X6       
##  Min.   : 6.652   Min.   : 4.760   Min.   :90.48   Min.   :41.78  
##  1st Qu.: 9.649   1st Qu.: 8.932   1st Qu.:99.80   1st Qu.:80.49  
##  Median :13.940   Median : 9.575   Median :99.87   Median :88.31  
##  Mean   :14.701   Mean   : 9.457   Mean   :99.33   Mean   :84.10  
##  3rd Qu.:20.468   3rd Qu.:10.092   3rd Qu.:99.93   3rd Qu.:91.54  
##  Max.   :27.198   Max.   :11.580   Max.   :99.97   Max.   :99.14  
##        X7               X8              X9             X10       
##  Min.   : 80.02   Min.   :54.54   Min.   :32.89   Min.   :16.34  
##  1st Qu.: 99.07   1st Qu.:81.82   1st Qu.:83.29   1st Qu.:81.15  
##  Median : 99.67   Median :85.76   Median :89.94   Median :85.99  
##  Mean   : 98.60   Mean   :84.93   Mean   :87.61   Mean   :82.96  
##  3rd Qu.: 99.91   3rd Qu.:90.77   3rd Qu.:94.90   3rd Qu.:90.12  
##  Max.   :100.00   Max.   :96.61   Max.   :99.98   Max.   :98.20  
##       X11              X12        
##  Min.   :386850   Min.   :0.2140  
##  1st Qu.:445752   1st Qu.:0.2848  
##  Median :513740   Median :0.3275  
##  Mean   :528730   Mean   :0.3284  
##  3rd Qu.:590740   3rd Qu.:0.3605  
##  Max.   :942805   Max.   :0.4260
stat_desc <- data.frame(
  Variabel = names(data)[-1],
  Mean = sapply(data[,-1], mean, na.rm = TRUE),
  Min  = sapply(data[,-1], min, na.rm = TRUE),
  Max  = sapply(data[,-1], max, na.rm = TRUE),
  SD   = sapply(data[,-1], sd, na.rm = TRUE)
)

stat_desc
##     Variabel         Mean          Min          Max           SD
## Y          Y 1.026947e+01 3.420000e+00 2.945000e+01 6.268886e+00
## X1        X1 4.468158e+00 1.490000e+00 6.960000e+00 1.381286e+00
## X2        X2 7.966478e+05 5.598950e+05 1.256747e+06 1.428212e+05
## X3        X3 1.470112e+01 6.651936e+00 2.719814e+01 5.850485e+00
## X4        X4 9.457368e+00 4.760000e+00 1.158000e+01 1.189980e+00
## X5        X5 9.932789e+01 9.048000e+01 9.997000e+01 2.127473e+00
## X6        X6 8.410237e+01 4.178000e+01 9.914000e+01 1.132476e+01
## X7        X7 9.860474e+01 8.002000e+01 1.000000e+02 3.426218e+00
## X8        X8 8.493342e+01 5.454000e+01 9.661000e+01 7.807837e+00
## X9        X9 8.760789e+01 3.289000e+01 9.998000e+01 1.161487e+01
## X10      X10 8.295816e+01 1.634000e+01 9.820000e+01 1.496021e+01
## X11      X11 5.287304e+05 3.868500e+05 9.428050e+05 1.121568e+05
## X12      X12 3.284474e-01 2.140000e-01 4.260000e-01 5.080578e-02

OLS ANALYSIS + CLASSICAL ASSUMPTION TESTS

# OLS model (Y is found in the workspace; "." expands to all columns of X)
model_ols <- lm(Y ~ ., data = X)
summary(model_ols)
## 
## Call:
## lm(formula = Y ~ ., data = X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2086 -1.4459  0.3424  1.1685  3.5009 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.918e+01  6.544e+01   1.516 0.142181    
## X1           1.490e-01  4.246e-01   0.351 0.728654    
## X2          -3.831e-05  8.134e-06  -4.709 7.92e-05 ***
## X3           7.068e-02  9.231e-02   0.766 0.450994    
## X4           1.109e+00  1.115e+00   0.994 0.329570    
## X5          -1.351e+00  5.960e-01  -2.267 0.032323 *  
## X6          -1.988e-02  6.977e-02  -0.285 0.778017    
## X7           2.247e-01  2.243e-01   1.002 0.325953    
## X8           1.461e-01  1.279e-01   1.142 0.264195    
## X9          -3.555e-02  7.261e-02  -0.490 0.628692    
## X10         -1.709e-01  1.145e-01  -1.494 0.147794    
## X11          4.270e-05  9.770e-06   4.370 0.000191 ***
## X12          7.739e+01  1.260e+01   6.142 2.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.258 on 25 degrees of freedom
## Multiple R-squared:  0.9123, Adjusted R-squared:  0.8702 
## F-statistic: 21.68 on 12 and 25 DF,  p-value: 2.922e-10
# Normality test (Kolmogorov-Smirnov)
residuals_ols <- residuals(model_ols)

ks.test(residuals_ols, "pnorm",
        mean(residuals_ols),
        sd(residuals_ols))
## 
##  Exact one-sample Kolmogorov-Smirnov test
## 
## data:  residuals_ols
## D = 0.097594, p-value = 0.8279
## alternative hypothesis: two-sided
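
Since the KS test above plugs in the estimated mean and SD, its p-value is only approximate (strictly, the Lilliefors correction would apply). For n = 38, the Shapiro-Wilk test is a common cross-check; a minimal sketch, not part of the original analysis:

# Shapiro-Wilk normality test on the OLS residuals
shapiro.test(residuals_ols)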
# Multicollinearity test (VIF)
vif(model_ols)
##        X1        X2        X3        X4        X5        X6        X7        X8 
##  2.496155  9.792465  2.116038 12.776215 11.663974  4.529448  4.283562  7.235075 
##        X9       X10       X11       X12 
##  5.160012 21.271451  8.712569  2.974135
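
Several predictors exceed the conventional VIF > 10 rule of thumb (X4, X5, and X10), confirming the multicollinearity that motivates PCR, Ridge, and PLS. A small sketch to flag them programmatically; the name vif_vals is introduced here, and the cutoff of 10 is a convention rather than a hard rule:

# Predictors above the conventional VIF cutoff
vif_vals <- vif(model_ols)
vif_vals[vif_vals > 10]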
# Heteroskedasticity test (Breusch-Pagan)
bptest(model_ols)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 11.321, df = 12, p-value = 0.5016
# Autocorrelation test (Durbin-Watson)
dwtest(model_ols)
## 
##  Durbin-Watson test
## 
## data:  model_ols
## DW = 1.809, p-value = 0.14
## alternative hypothesis: true autocorrelation is greater than 0

PRINCIPAL COMPONENT REGRESSION (PCR)

# PCR with standardized predictors and 10-fold cross-validation
# (CV folds are random, so results vary slightly without set.seed())
model_pcr <- pcr(Y ~ ., data = X, scale = TRUE, validation = "CV")

summary(model_pcr)
## Data:    X dimension: 38 12 
##  Y dimension: 38 1
## Fit method: svdpc
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           6.353    4.677    5.909    5.519    4.884    4.771    4.906
## adjCV        6.353    4.650    5.809    5.378    4.735    4.643    4.764
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV       5.032    6.040    6.421     5.817     4.058     4.125
## adjCV    4.932    5.828    6.190     5.541     3.906     3.976
## 
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X    42.25    64.66    75.34    84.84    89.77    92.83    95.50    97.55
## Y    48.09    51.09    63.00    77.06    77.06    77.87    77.91    79.79
##    9 comps  10 comps  11 comps  12 comps
## X    98.83     99.35     99.78    100.00
## Y    81.11     88.92     91.15     91.23
validationplot(model_pcr, val.type = "MSEP")

# Provisional fit with 2 components (the CV-optimal number is selected below)
model_pcr_final <- pcr(Y ~ ., data = X, scale = TRUE, ncomp = 2)

pred_pcr <- predict(model_pcr_final, X, ncomp = 2)

mse_pcr <- mean((Y - pred_pcr)^2)
r2_pcr  <- cor(Y, pred_pcr)^2
# Extract the cross-validated MSEP
msep_pcr <- MSEP(model_pcr)

# Inspect the CV error for each number of components
msep_pcr$val
## , , model = (Intercept)
## 
##         response
## estimate        Y
##    CV    40.36107
##    adjCV 40.36107
## 
## , , model = 1 comps
## 
##         response
## estimate        Y
##    CV    21.87850
##    adjCV 21.62385
## 
## , , model = 2 comps
## 
##         response
## estimate        Y
##    CV    34.91638
##    adjCV 33.74772
## 
## , , model = 3 comps
## 
##         response
## estimate        Y
##    CV    30.45491
##    adjCV 28.91762
## 
## , , model = 4 comps
## 
##         response
## estimate        Y
##    CV    23.85227
##    adjCV 22.42437
## 
## , , model = 5 comps
## 
##         response
## estimate        Y
##    CV    22.75909
##    adjCV 21.55931
## 
## , , model = 6 comps
## 
##         response
## estimate        Y
##    CV    24.06637
##    adjCV 22.69909
## 
## , , model = 7 comps
## 
##         response
## estimate        Y
##    CV    25.32271
##    adjCV 24.32584
## 
## , , model = 8 comps
## 
##         response
## estimate        Y
##    CV    36.47779
##    adjCV 33.96761
## 
## , , model = 9 comps
## 
##         response
## estimate        Y
##    CV    41.23083
##    adjCV 38.31006
## 
## , , model = 10 comps
## 
##         response
## estimate        Y
##    CV    33.84084
##    adjCV 30.70468
## 
## , , model = 11 comps
## 
##         response
## estimate        Y
##    CV    16.46974
##    adjCV 15.25645
## 
## , , model = 12 comps
## 
##         response
## estimate        Y
##    CV    17.01316
##    adjCV 15.80978
# Select the number of components with the smallest CV MSE.
# The first entry of the val array is the 0-component (intercept-only) model,
# so drop it; otherwise which.min() returns an index one larger than the
# number of components (here 12 instead of 11)
opt_comp_pcr <- which.min(msep_pcr$val[1, 1, -1])

opt_comp_pcr
## 11 comps 
##       11
model_pcr_final <- pcr(Y ~ ., data = X,
                       scale = TRUE,
                       ncomp = opt_comp_pcr)
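
To interpret the selected PCR model, the regression coefficients implied by the retained components can be extracted from the final fit; with scale = TRUE these should be read against the standardized predictors. A minimal sketch, not part of the original analysis:

# Coefficients implied by the retained principal components
coef(model_pcr_final, ncomp = opt_comp_pcr, intercept = TRUE)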

RIDGE REGRESSION

# 10-fold CV over a lambda grid (alpha = 0 selects the Ridge penalty);
# folds are random, so set.seed() beforehand makes this reproducible
ridge_cv <- cv.glmnet(X_matrix, Y, alpha = 0)

# Lambda with the smallest cross-validated error
best_lambda <- ridge_cv$lambda.min

ridge_model <- glmnet(X_matrix, Y,
                      alpha = 0,
                      lambda = best_lambda)

pred_ridge <- predict(ridge_model,
                      s = best_lambda,
                      newx = X_matrix)

mse_ridge <- mean((Y - pred_ridge)^2)
r2_ridge  <- cor(Y, pred_ridge)^2
# Ridge visualization: CV curve and coefficient paths
plot(ridge_cv)

plot(ridge_model, xvar = "lambda", label = TRUE)
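
The shrunken coefficients at the selected lambda can also be inspected directly; a minimal sketch:

# Ridge coefficients at the CV-selected lambda
coef(ridge_model, s = best_lambda)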

PARTIAL LEAST SQUARES (PLS)

# PLS with standardized predictors and 10-fold cross-validation
model_pls <- plsr(Y ~ ., data = X,
                  scale = TRUE, validation = "CV")

summary(model_pls)
## Data:    X dimension: 38 12 
##  Y dimension: 38 1
## Fit method: kernelpls
## Number of components considered: 12
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           6.353    4.937    4.967    4.382    5.132    4.960    4.635
## adjCV        6.353    4.866    4.812    4.283    4.928    4.777    4.437
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps
## CV       4.066    3.650    3.515     3.562     3.524     3.524
## adjCV    3.912    3.531    3.408     3.452     3.417     3.417
## 
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X    40.97    54.40    73.87    77.77    85.93    88.22    90.25    92.07
## Y    62.56    78.91    81.01    86.51    88.24    90.65    91.14    91.21
##    9 comps  10 comps  11 comps  12 comps
## X    96.42     97.42     99.70    100.00
## Y    91.22     91.23     91.23     91.23
validationplot(model_pls, val.type = "MSEP")

# Provisional fit with 2 components (the CV-optimal number is selected below)
model_pls_final <- plsr(Y ~ ., data = X,
                        scale = TRUE, ncomp = 2)

pred_pls <- predict(model_pls_final, X, ncomp = 2)

mse_pls <- mean((Y - pred_pls)^2)
r2_pls  <- cor(Y, pred_pls)^2
msep_pls <- MSEP(model_pls)

msep_pls$val
## , , model = (Intercept)
## 
##         response
## estimate        Y
##    CV    40.36107
##    adjCV 40.36107
## 
## , , model = 1 comps
## 
##         response
## estimate        Y
##    CV    24.37860
##    adjCV 23.68137
## 
## , , model = 2 comps
## 
##         response
## estimate        Y
##    CV    24.66777
##    adjCV 23.15085
## 
## , , model = 3 comps
## 
##         response
## estimate        Y
##    CV    19.19937
##    adjCV 18.34565
## 
## , , model = 4 comps
## 
##         response
## estimate        Y
##    CV    26.33805
##    adjCV 24.28509
## 
## , , model = 5 comps
## 
##         response
## estimate       Y
##    CV    24.6063
##    adjCV 22.8155
## 
## , , model = 6 comps
## 
##         response
## estimate        Y
##    CV    21.48456
##    adjCV 19.69090
## 
## , , model = 7 comps
## 
##         response
## estimate        Y
##    CV    16.53139
##    adjCV 15.30136
## 
## , , model = 8 comps
## 
##         response
## estimate        Y
##    CV    13.32373
##    adjCV 12.46593
## 
## , , model = 9 comps
## 
##         response
## estimate        Y
##    CV    12.35223
##    adjCV 11.61237
## 
## , , model = 10 comps
## 
##         response
## estimate        Y
##    CV    12.68984
##    adjCV 11.91622
## 
## , , model = 11 comps
## 
##         response
## estimate        Y
##    CV    12.41863
##    adjCV 11.67566
## 
## , , model = 12 comps
## 
##         response
## estimate        Y
##    CV    12.41782
##    adjCV 11.67476
# Select the number of components with the smallest CV MSE
# (again dropping the 0-component intercept-only entry)
opt_comp_pls <- which.min(msep_pls$val[1, 1, -1])

opt_comp_pls
## 9 comps 
##       9
model_pls_final <- plsr(Y ~ ., data = X,
                        scale = TRUE,
                        ncomp = opt_comp_pls)
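
As with PCR, the regression coefficients implied by the selected PLS model can be extracted; a minimal sketch, not part of the original analysis:

# Coefficients implied by the retained PLS components
coef(model_pls_final, ncomp = opt_comp_pls, intercept = TRUE)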

MODEL PERFORMANCE COMPARISON

# In-sample MSE and R2; the PCR and PLS values come from the 2-component fits above
hasil <- data.frame(
  Model = c("PCR", "Ridge", "PLS"),
  MSE   = c(mse_pcr, mse_ridge, mse_pls),
  R2    = c(r2_pcr, r2_ridge, r2_pls)
)

hasil
##   Model       MSE        R2
## 1   PCR 18.715274 0.5109004
## 2 Ridge 12.913703 0.7492665
## 3   PLS  8.070855 0.7890786
# Rank the models (highest R2 first, ties broken by lowest MSE)
hasil[order(-hasil$R2, hasil$MSE), ]
##   Model       MSE        R2
## 3   PLS  8.070855 0.7890786
## 2 Ridge 12.913703 0.7492665
## 1   PCR 18.715274 0.5109004
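
Note that the PCR and PLS rows reflect the provisional 2-component fits, not the CV-optimal models selected earlier (11 and 9 components). A sketch of the same in-sample comparison at the optimal component counts, reusing the final models; the names pred_pcr_opt and pred_pls_opt are introduced here, and the resulting values will differ from the table above:

# In-sample fit of the CV-optimal PCR and PLS models
pred_pcr_opt <- predict(model_pcr_final, X, ncomp = opt_comp_pcr)
pred_pls_opt <- predict(model_pls_final, X, ncomp = opt_comp_pls)

data.frame(Model = c("PCR (opt)", "PLS (opt)"),
           MSE   = c(mean((Y - pred_pcr_opt)^2), mean((Y - pred_pls_opt)^2)),
           R2    = c(cor(Y, as.numeric(pred_pcr_opt))^2,
                     cor(Y, as.numeric(pred_pls_opt))^2))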

MODEL INTERPRETATION (OLS AS A BENCHMARK)

summary(model_ols)
## 
## Call:
## lm(formula = Y ~ ., data = X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2086 -1.4459  0.3424  1.1685  3.5009 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.918e+01  6.544e+01   1.516 0.142181    
## X1           1.490e-01  4.246e-01   0.351 0.728654    
## X2          -3.831e-05  8.134e-06  -4.709 7.92e-05 ***
## X3           7.068e-02  9.231e-02   0.766 0.450994    
## X4           1.109e+00  1.115e+00   0.994 0.329570    
## X5          -1.351e+00  5.960e-01  -2.267 0.032323 *  
## X6          -1.988e-02  6.977e-02  -0.285 0.778017    
## X7           2.247e-01  2.243e-01   1.002 0.325953    
## X8           1.461e-01  1.279e-01   1.142 0.264195    
## X9          -3.555e-02  7.261e-02  -0.490 0.628692    
## X10         -1.709e-01  1.145e-01  -1.494 0.147794    
## X11          4.270e-05  9.770e-06   4.370 0.000191 ***
## X12          7.739e+01  1.260e+01   6.142 2.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.258 on 25 degrees of freedom
## Multiple R-squared:  0.9123, Adjusted R-squared:  0.8702 
## F-statistic: 21.68 on 12 and 25 DF,  p-value: 2.922e-10
# Significant variables (p < 0.05)
coef_table <- summary(model_ols)$coefficients
signif_var <- coef_table[coef_table[,4] < 0.05, ]

signif_var
##          Estimate   Std. Error   t value     Pr(>|t|)
## X2  -3.830589e-05 8.134105e-06 -4.709293 7.915247e-05
## X5  -1.350795e+00 5.959579e-01 -2.266594 3.232313e-02
## X11  4.269621e-05 9.770210e-06  4.370041 1.907040e-04
## X12  7.739346e+01 1.260154e+01  6.141587 2.021553e-06
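
Because the predictors sit on very different scales (X2 and X11 are in the hundreds of thousands while X12 is below one), the raw OLS estimates above are not directly comparable in magnitude. A minimal sketch of one way to compare effect sizes, refitting on z-scored variables; this is not part of the original analysis, and data_std and model_std are names introduced here:

# Refit OLS on standardized variables so coefficients are in SD units
data_std  <- as.data.frame(scale(cbind(Y = Y, X)))
model_std <- lm(Y ~ ., data = data_std)
summary(model_std)$coefficients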