Regresi Klasik, Subset, dan Shrinkage

Data

Data yang digunakan merupakan hasil scrapping dari website airasia.com pada tanggal 1 September 2024, dan di cleaning di software microsoft excel. Penjelasan terkait data :

library(knitr)

# Membuat data untuk tabel
No <- c(1,2,3,4,5,6,7,8)
Peubah <- c("Harga", "Jarak dari Pusat Kota", "Bintang", "Rating", "Fasilitas Spa", "Fasilitas Concierge.services", "Fasilitas Business.center", "Hari")
Satuan <- c("MYR" , "Km", "1, 2, 3, 4, 5", "Skala 1-5", "Ya/Tidak", "Ya/Tidak", "Ya/Tidak", "Weekdays, Weekend, Hari Besar")

# Membuat data frame
data_tabel <- data.frame(No = No, Peubah = Peubah, Satuan =Satuan)

# Mencetak tabel menggunakan knitr
kable(data_tabel, format = "markdown")
No Peubah Satuan
1 Harga MYR
2 Jarak dari Pusat Kota Km
3 Bintang 1, 2, 3, 4, 5
4 Rating Skala 1-5
5 Fasilitas Spa Ya/Tidak
6 Fasilitas Concierge.services Ya/Tidak
7 Fasilitas Business.center Ya/Tidak
8 Hari Weekdays, Weekend, Hari Besar
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
hotel <- read_xlsx("C:/Users/acer/OneDrive - apps.ipb.ac.id/Semester 5/PSD/Data Cleaning.xlsx")
hotel
## # A tibble: 1,549 × 9
##    Nama      Bintang Jarak Rating Spa   Concierge.services Business.center Harga
##    <chr>       <dbl> <dbl>  <dbl> <chr> <chr>              <chr>           <dbl>
##  1 ARTOTEL …       3  0.65    4   No    Yes                No                266
##  2 Four Poi…       4  0.65    4   No    No                 Yes               365
##  3 YELLO Ho…       3  3.36    4.4 No    Yes                No                152
##  4 MaxOneHo…       3  2.76    4   No    Yes                No                 78
##  5 Goodrich…       5  7.95    4.3 No    No                 Yes               235
##  6 Kemang I…       5  6.83    2.8 No    No                 No                303
##  7 Veranda …       4  7.44    4   No    Yes                No                157
##  8 Kyriad H…       3  7.83    1   No    Yes                No                140
##  9 Harper M…       4  7.84    3.6 Yes   No                 Yes               162
## 10 Grand Da…       4  7.27    3.2 Yes   No                 No                177
## # ℹ 1,539 more rows
## # ℹ 1 more variable: Hari <chr>

Eksplorasi

Plot Jarak VS Harga

plot(hotel$Jarak,hotel$Harga)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
ggplot(data = hotel, aes(x = Hari, y = Harga, fill = Hari , color=Hari)) +
  geom_boxplot(color="black") +
  theme()+
  scale_fill_manual(values = c("#711DB0", "#C21292", "#EF4040")) +
  labs(x = NULL, y = "Harga")

library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(hotel[, c("Harga", "Jarak", "Bintang", "Rating")], 
        upper = list(continuous = wrap('cor', size = 2)),
        title = "Matriks Scatterplot Data")

Konversi data kategorik menjadi faktor

hotel$Spa <- as.factor(hotel$Spa)
hotel$Concierge.services <- as.factor(hotel$Concierge.services)
hotel$Hari <- as.factor(hotel$Hari)
hotel$Business.center <- as.factor(hotel$Business.center) 

Klasik

Regresi Berganda

model <- lm(Harga ~ Jarak+Bintang+Rating+Spa+Concierge.services+Business.center+Hari, data = hotel)
summary(model)
## 
## Call:
## lm(formula = Harga ~ Jarak + Bintang + Rating + Spa + Concierge.services + 
##     Business.center + Hari, data = hotel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -456.52  -81.58  -22.32   55.55  646.22 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -95.7393    17.8620  -5.360 9.59e-08 ***
## Jarak                  -3.7869     0.6634  -5.708 1.37e-08 ***
## Bintang               136.8072     5.1359  26.638  < 2e-16 ***
## Rating                -29.8230     3.8966  -7.654 3.43e-14 ***
## SpaYes                  2.5193     9.8043   0.257   0.7972    
## Concierge.servicesYes -16.6905     9.7459  -1.713   0.0870 .  
## Business.centerYes    -19.9492     9.6422  -2.069   0.0387 *  
## HariWeekdays          -52.4994    10.0633  -5.217 2.07e-07 ***
## HariWeekend           -60.2131     9.7792  -6.157 9.42e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 148.7 on 1540 degrees of freedom
## Multiple R-squared:  0.3638, Adjusted R-squared:  0.3605 
## F-statistic: 110.1 on 8 and 1540 DF,  p-value: < 2.2e-16

Multikolinearitas

library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
vif(model)
##                        GVIF Df GVIF^(1/(2*Df))
## Jarak              1.023892  1        1.011875
## Bintang            1.274269  1        1.128835
## Rating             1.352623  1        1.163023
## Spa                1.197329  1        1.094225
## Concierge.services 1.154962  1        1.074691
## Business.center    1.130528  1        1.063263
## Hari               1.312888  2        1.070427

Karena VIF<10 menunjukan bahwa tidak ada multikolinearitas antar peubah bebas.

Subset

Best Subset

library(olsrr)
## Warning: package 'olsrr' was built under R version 4.3.3
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
bestsub <- ols_step_best_subset(model)
bestsub
##                             Best Subsets Regression                            
## -------------------------------------------------------------------------------
## Model Index    Predictors
## -------------------------------------------------------------------------------
##      1         Bintang                                                          
##      2         Bintang Rating                                                   
##      3         Bintang Rating Hari                                              
##      4         Jarak Bintang Rating Hari                                        
##      5         Jarak Bintang Rating Business.center Hari                        
##      6         Jarak Bintang Rating Concierge.services Business.center Hari     
##      7         Jarak Bintang Rating Spa Concierge.services Business.center Hari 
## -------------------------------------------------------------------------------
## 
##                                                             Subsets Regression Summary                                                             
## ---------------------------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                                             
## Model    R-Square    R-Square    R-Square      C(p)         AIC           SBIC          SBC            MSEP            FPE          HSP       APC  
## ---------------------------------------------------------------------------------------------------------------------------------------------------
##   1        0.2667      0.2663       0.264    230.0211    20107.9163    15711.5030    20123.9524    39276247.8279    25388.6117    16.4010    0.7352 
##   2        0.3272      0.3263      0.3232     85.7269    19976.6812    15580.5111    19998.0627    36062493.6452    23326.2246    15.0687    0.6754 
##   3        0.3487      0.3470      0.3434     37.7078    19930.4011    15532.3883    19962.4733    34933320.5607    22625.0378    14.6158    0.6547 
##   4        0.3611      0.3591      0.3546      9.5236    19902.4598    15504.6043    19939.8774    34286770.5318    22220.5899    14.3546    0.6430 
##   5        0.3625      0.3600      0.3552      8.2411    19901.1687    15503.3418    19943.9316    34236186.3898    22202.0851    14.3427    0.6425 
##   6        0.3638      0.3609      0.3559      7.0660    19899.9784    15502.1880    19948.0867    34187918.8789    22185.0415    14.3318    0.6420 
##   7        0.3638      0.3605       0.355      9.0000    19901.9120    15504.1339    19955.3657    34208652.1121    22212.7620    14.3498    0.6428 
## ---------------------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria

Pada pemilihan peubah dengan teknik best subset terlihat nilai AIC paling rendah serta Adj. R-Squared paling tinggi dimiliki oleh model ke-6. Pada model ini, peubah Fasilitas Spa tidak dimasukkan ke dalam model.

Forward

forward <- olsrr::ols_step_forward_p(model)
forward
## 
## 
##                                     Stepwise Summary                                     
## ---------------------------------------------------------------------------------------
## Step    Variable                 AIC          SBC         SBIC         R2       Adj. R2 
## ---------------------------------------------------------------------------------------
##  0      Base Model            20586.486    20597.177    16189.635    0.00000    0.00000 
##  1      Bintang               20107.916    20123.952    15711.503    0.26673    0.26626 
##  2      Rating                19976.681    19998.063    15580.511    0.32717    0.32630 
##  3      Jarak                 19950.206    19976.933    15554.082    0.33942    0.33814 
##  4      Hari                  19902.460    19939.877    15504.604    0.36113    0.35906 
##  5      Business.center       19901.169    19943.932    15503.342    0.36248    0.36000 
##  6      Concierge.services    19899.978    19948.087    15502.188    0.36379    0.36090 
## ---------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                            Model Summary                            
## -------------------------------------------------------------------
## R                         0.603       RMSE                 148.227 
## R-Squared                 0.364       MSE                22085.237 
## Adj. R-Squared            0.361       Coef. Var             73.543 
## Pred R-Squared            0.356       AIC                19899.978 
## MAE                     102.877       SBC                19948.087 
## -------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                    ANOVA                                    
## ---------------------------------------------------------------------------
##                     Sum of                                                 
##                    Squares          DF    Mean Square       F         Sig. 
## ---------------------------------------------------------------------------
## Regression    19460853.479           7    2780121.926    125.881    0.0000 
## Residual      34033350.733        1541      22085.237                      
## Total         53494204.212        1548                                     
## ---------------------------------------------------------------------------
## 
##                                          Parameter Estimates                                           
## ------------------------------------------------------------------------------------------------------
##                 model       Beta    Std. Error    Std. Beta      t        Sig        lower      upper 
## ------------------------------------------------------------------------------------------------------
##           (Intercept)    -96.717        17.446                 -5.544    0.000    -130.938    -62.496 
##               Bintang    137.168         4.939        0.613    27.775    0.000     127.481    146.855 
##                Rating    -29.727         3.878       -0.180    -7.666    0.000     -37.333    -22.121 
##                 Jarak     -3.787         0.663       -0.117    -5.711    0.000      -5.088     -2.486 
##          HariWeekdays    -52.366        10.047       -0.130    -5.212    0.000     -72.073    -32.659 
##           HariWeekend    -60.186         9.776       -0.146    -6.157    0.000     -79.361    -41.011 
##    Business.centerYes    -19.680         9.582       -0.044    -2.054    0.040     -38.475     -0.884 
## Concierge.servicesYes    -17.115         9.602       -0.038    -1.782    0.075     -35.949      1.720 
## ------------------------------------------------------------------------------------------------------

Hasil metode Stepwise Forward juga menunjukkan hal yang sama. Langkah paling optimal berada pada langkah ke-6, yang mana belum memasukkan peubah Fasilitas Spake dalam model.

Backward

backward <- olsrr::ols_step_backward_p(model)
backward
## 
## 
##                                 Stepwise Summary                                 
## -------------------------------------------------------------------------------
## Step    Variable         AIC          SBC         SBIC         R2       Adj. R2 
## -------------------------------------------------------------------------------
##  0      Full Model    19901.912    19955.366    15504.134    0.36382    0.36052 
##  1      Spa           19899.978    19948.087    15502.188    0.36379    0.36090 
## -------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                            Model Summary                            
## -------------------------------------------------------------------
## R                         0.603       RMSE                 148.227 
## R-Squared                 0.364       MSE                22085.237 
## Adj. R-Squared            0.361       Coef. Var             73.543 
## Pred R-Squared            0.356       AIC                19899.978 
## MAE                     102.877       SBC                19948.087 
## -------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                    ANOVA                                    
## ---------------------------------------------------------------------------
##                     Sum of                                                 
##                    Squares          DF    Mean Square       F         Sig. 
## ---------------------------------------------------------------------------
## Regression    19460853.479           7    2780121.926    125.881    0.0000 
## Residual      34033350.733        1541      22085.237                      
## Total         53494204.212        1548                                     
## ---------------------------------------------------------------------------
## 
##                                          Parameter Estimates                                           
## ------------------------------------------------------------------------------------------------------
##                 model       Beta    Std. Error    Std. Beta      t        Sig        lower      upper 
## ------------------------------------------------------------------------------------------------------
##           (Intercept)    -96.717        17.446                 -5.544    0.000    -130.938    -62.496 
##                 Jarak     -3.787         0.663       -0.117    -5.711    0.000      -5.088     -2.486 
##               Bintang    137.168         4.939        0.613    27.775    0.000     127.481    146.855 
##                Rating    -29.727         3.878       -0.180    -7.666    0.000     -37.333    -22.121 
## Concierge.servicesYes    -17.115         9.602       -0.038    -1.782    0.075     -35.949      1.720 
##    Business.centerYes    -19.680         9.582       -0.044    -2.054    0.040     -38.475     -0.884 
##          HariWeekdays    -52.366        10.047       -0.130    -5.212    0.000     -72.073    -32.659 
##           HariWeekend    -60.186         9.776       -0.146    -6.157    0.000     -79.361    -41.011 
## ------------------------------------------------------------------------------------------------------

Pada metode Stepwise Backward, peubah Fasilitas Spa dikeluarkan dari model. Menunjukkan bahwa metode Best Subset, Stepwise Forward, dan Stepwise Backward menghasilkan kesimpulan yang sama yaitu: Model terbaik diperoleh dengan tidak memasukkan peubahpeubah Fasilitas Spa 

Shrinkage

Regresi Rigde

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.3.2
## Loaded glmnet 4.1-8
# konversi ke model matrix
y <- hotel$Harga
x <- model.matrix(~ Jarak + Bintang + Rating + Spa + Concierge.services + Business.center + Hari, data = hotel)

#regresi ridge
mod2 <- cv.glmnet(x[,-1],y,alpha=0) #pemilihan lambda dgn cv untuk ridge
mod2
## 
## Call:  cv.glmnet(x = x[, -1], y = y, alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure   SE Nonzero
## min   9.60   100   22294 1501       8
## 1se  81.56    77   23786 1597       8
coef(mod2,s="lambda.min")
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                               s1
## (Intercept)           -78.677154
## Jarak                  -3.597802
## Bintang               127.823419
## Rating                -27.309673
## SpaYes                  5.902962
## Concierge.servicesYes -15.748129
## Business.centerYes    -17.079866
## HariWeekdays          -49.611885
## HariWeekend           -56.492624

Berdasarkan hasil diatas, semua peubah tetap dipertahankan pada model Ridge karena tidak terdapat multikolinearitas.

Predict

# Prediksi nilai y berdasarkan model dengan lambda optimal
y_pred <- predict(mod2, newx = x[,-1], s = "lambda.min")

Mencari Nilai AIC dan Adjusted R-squared

# Residual sum of squares (RSS)
RSS <- sum((y - y_pred)^2)

# Jumlah observasi
n <- length(y)

# Menghitung jumlah parameter (koefisien non-zero)
coef_ridge <- coef(mod2, s = "lambda.min")
k <- sum(coef_ridge != 0)  # menghitung jumlah koefisien non-zero

# Menghitung AIC
AIC_ridge <- n * log(RSS/n) + 2 * k
print(paste("AIC:", AIC_ridge))
## [1] "AIC: 15507.6097814649"
# Menghitung R-squared
TSS <- sum((y - mean(y))^2)  # Total sum of squares
R2 <- 1 - (RSS / TSS)

# Menghitung Adjusted R-squared
p <- k - 1  # Jumlah prediktor (tidak termasuk intercept)
R2_adj <- 1 - ((1 - R2) * (n - 1)) / (n - p - 1)
print(paste("Adjusted R-squared:", R2_adj))
## [1] "Adjusted R-squared: 0.359040874446631"

Regresi Lasso

#lasso
mod3 <- cv.glmnet(x[,-1],y,alpha=1) #pemilihan lambda dgn cv untuk lambda
mod3
## 
## Call:  cv.glmnet(x = x[, -1], y = y, alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure   SE Nonzero
## min  1.211    48   22179 1046       7
## 1se 12.396    23   23126 1123       5
coef(mod3,s="lambda.min")
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                               s1
## (Intercept)           -94.999014
## Jarak                  -3.565342
## Bintang               134.620984
## Rating                -29.079167
## SpaYes                  .       
## Concierge.servicesYes -14.381044
## Business.centerYes    -16.091373
## HariWeekdays          -49.437762
## HariWeekend           -56.155460

Pada Model Lasso mempertahan kan semua peubah kecuali  Fasilitas Spa  yang terlihat cenderung lebih dekat dengan 0.

Predict

# Prediksi nilai y berdasarkan model dengan lambda optimal
y_pred <- predict(mod3, newx = x[,-1], s = "lambda.min")

Mencari Nilai AIC dan Adjusted R-squared

# Residual sum of squares (RSS)
RSS <- sum((y - y_pred)^2)

# Jumlah observasi
n <- length(y)

# Menghitung jumlah parameter (koefisien non-zero)
coef_lasso <- coef(mod3, s = "lambda.min")
k <- sum(coef_lasso != 0)  # menghitung jumlah koefisien non-zero

# Menghitung AIC
AIC_lasso <- n * log(RSS/n) + 2 * k
print(paste("AIC:", AIC_lasso))
## [1] "AIC: 15502.9536602587"
# Menghitung R-squared
TSS <- sum((y - mean(y))^2)  # Total sum of squares
R2 <- 1 - (RSS / TSS)

# Menghitung Adjusted R-squared
p <- k - 1  # Jumlah prediktor (tidak termasuk intercept)
R2_adj <- 1 - ((1 - R2) * (n - 1)) / (n - p - 1)
print(paste("Adjusted R-squared:", R2_adj))
## [1] "Adjusted R-squared: 0.360554230878531"

Penentuan Model Terbaik

library(knitr)

# Membuat data untuk tabel
metode <- c("Klasik","Seleksi Peubah", "Ridge", "Lasso")
AIC <- c(AIC(model), 19899.97, 15507.61, 15502.81)
Adj.R2 <- c(0.3605 , 0.3609 ,0.3590, 0.3606)

# Membuat data frame
data_tabel <- data.frame(Metode = metode, AIC = AIC, `Adj R^2` = Adj.R2)

# Mencetak tabel menggunakan knitr
kable(data_tabel, format = "markdown")
Metode AIC Adj.R.2
Klasik 19901.91 0.3605
Seleksi Peubah 19899.97 0.3609
Ridge 15507.61 0.3590
Lasso 15502.81 0.3606

Jika dilihat dari nilai R-Adj.Square, model regresi hasil seleksi peubah adalah yang terbaik. Namun, jika nilai AIC yang dijadikan acuan maka model Lasso adalah yang terbaik. Selisih nilai AIC model Lasso dengan kedua model lainnya, sangat besar sedangkan perbedaan nilai R-Adj.square diantara keempat model tidak jauh berbeda. Oleh karena itu, model Lasso dipilih sebagai model yang paling baik untuk data yang digunakan.