Memanggil data

library(readxl)
df <- read_excel("D:/FIRA/Local Disk/HALO/SEMESTA 4/Analisis Regresi/Mayor_K4/Syntax dan Excel/Data - Kelompok 4.xlsx")
head(df)
## # A tibble: 6 × 9
##   `no amatan`     Y      X1    X2    X3    X4    X5    X6    X7
##         <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1           1 131.  2800293  7.7   13.5  64.8  7.11  1.64 12.9 
## 2           2 135.  2751314  7.86  12.0  67.3  6.41  2.03 14.1 
## 3           3 272.  4230793  9.06  12.8  69.9  8.61  1.97 32.9 
## 4           4  83.1 4215181 10.6   12.6  64.8  7.51  2.07 16.5 
## 5           5 134.  4262015  9.07  13.9  71.6 10.8   1.16 19.1 
## 6           6  18.9 4309773 10.1   13.2  66.7 10.1   2.11  4.42

Membuat model regrersi klasik

model <- lm(Y~X1+X2+X3+X4+X5+X6+X7, data=df)
summary(model)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.572 -22.647   1.698  18.008  84.625 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.784e+02  1.136e+02   2.449   0.0159 *  
## X1          -1.484e-05  5.706e-06  -2.601   0.0106 *  
## X2           1.730e+00  1.537e+00   1.126   0.2628    
## X3          -1.049e+00  4.025e+00  -0.261   0.7949    
## X4          -7.042e-01  1.525e+00  -0.462   0.6451    
## X5          -1.931e+01  3.948e+00  -4.893 3.39e-06 ***
## X6           1.982e-01  1.571e-01   1.261   0.2098    
## X7           7.830e+00  4.622e-01  16.939  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.36 on 111 degrees of freedom
## Multiple R-squared:  0.8457, Adjusted R-squared:  0.836 
## F-statistic: 86.91 on 7 and 111 DF,  p-value: < 2.2e-16

#Eksplorasi

# multiple scatterplots
pairs(df[,])
#visualisasi korelasi
library("ggpubr")
## Loading required package: ggplot2

Terlihat Y memiliki korelasi negatif pada X5 dan korelasi positif pada X7

#Uji Asumsi Model Awal

sisaanawal <- model$residuals

#harapan sisaan
t.test(sisaanawal,
       mu = 0,
       conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  sisaanawal
## t = 1.7659e-16, df = 118, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -5.698178  5.698178
## sample estimates:
##    mean of x 
## 5.081355e-16

kenormalan

ks.test(sisaanawal, "pnorm", mean=mean(sisaanawal), sd=sd(sisaanawal))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  sisaanawal
## D = 0.050874, p-value = 0.9177
## alternative hypothesis: two-sided

sisaan saling bebas/

library(randtests)
runs.test(sisaanawal)
## 
##  Runs Test
## 
## data:  sisaanawal
## statistic = -0.92453, runs = 55, n1 = 59, n2 = 59, n = 118, p-value =
## 0.3552
## alternative hypothesis: nonrandomness

Model awal memenuhi seluruh asumsi

#pendeteksian multikolinieritas

library(car)
## Loading required package: carData
vif(model)    #ini yg vif nya aman
##       X1       X2       X3       X4       X5       X6       X7 
## 3.302100 1.896440 1.918748 1.767741 4.384620 1.032161 1.756636

Tidak terdapat multikolinieritas. Berdasarkan nilai VIF tidak ada yang melebihi 10

#Mencari model terbaik dengan backward dan forward

library(MASS)
full_model <- lm(Y~X1+X2+X3+X4+X5+X6+X7, data = df)
backward_model <- stepAIC(full_model, direction = "backward")
## Start:  AIC=835.26
## Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X3    1        71 116337 833.33
## - X4    1       223 116489 833.48
## - X2    1      1327 117593 834.61
## - X6    1      1667 117932 834.95
## <none>              116266 835.26
## - X1    1      7088 123353 840.30
## - X5    1     25073 141338 856.49
## - X7    1    300533 416799 985.19
## 
## Step:  AIC=833.33
## Y ~ X1 + X2 + X4 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X4    1       198 116535 831.53
## - X2    1      1391 117728 832.74
## - X6    1      1665 118002 833.02
## <none>              116337 833.33
## - X1    1      7336 123673 838.61
## - X5    1     41405 157741 867.56
## - X7    1    301420 417757 983.46
## 
## Step:  AIC=831.53
## Y ~ X1 + X2 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X2    1      1651 118186 831.21
## - X6    1      1695 118230 831.25
## <none>              116535 831.53
## - X1    1      7955 124489 837.39
## - X5    1     76199 192734 889.40
## - X7    1    328226 444760 988.91
## 
## Step:  AIC=831.21
## Y ~ X1 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X6    1      1411 119597 830.62
## <none>              118186 831.21
## - X1    1      6375 124561 835.46
## - X5    1     75471 193656 887.97
## - X7    1    350588 468773 993.17
## 
## Step:  AIC=830.62
## Y ~ X1 + X5 + X7
## 
##        Df Sum of Sq    RSS    AIC
## <none>              119597 830.62
## - X1    1      7048 126644 835.43
## - X5    1     74092 193689 885.99
## - X7    1    350145 469742 991.42

AIC terbaik ada pada AIC=830.62 dengan model Y ~ X1 + X5 + X7

#Mencari model terbaik dengan stepwise

step.model <- stepAIC(full_model, direction = "both")
## Start:  AIC=835.26
## Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X3    1        71 116337 833.33
## - X4    1       223 116489 833.48
## - X2    1      1327 117593 834.61
## - X6    1      1667 117932 834.95
## <none>              116266 835.26
## - X1    1      7088 123353 840.30
## - X5    1     25073 141338 856.49
## - X7    1    300533 416799 985.19
## 
## Step:  AIC=833.33
## Y ~ X1 + X2 + X4 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X4    1       198 116535 831.53
## - X2    1      1391 117728 832.74
## - X6    1      1665 118002 833.02
## <none>              116337 833.33
## + X3    1        71 116266 835.26
## - X1    1      7336 123673 838.61
## - X5    1     41405 157741 867.56
## - X7    1    301420 417757 983.46
## 
## Step:  AIC=831.53
## Y ~ X1 + X2 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X2    1      1651 118186 831.21
## - X6    1      1695 118230 831.25
## <none>              116535 831.53
## + X4    1       198 116337 833.33
## + X3    1        46 116489 833.48
## - X1    1      7955 124489 837.39
## - X5    1     76199 192734 889.40
## - X7    1    328226 444760 988.91
## 
## Step:  AIC=831.21
## Y ~ X1 + X5 + X6 + X7
## 
##        Df Sum of Sq    RSS    AIC
## - X6    1      1411 119597 830.62
## <none>              118186 831.21
## + X2    1      1651 116535 831.53
## + X4    1       458 117728 832.74
## + X3    1        89 118097 833.12
## - X1    1      6375 124561 835.46
## - X5    1     75471 193656 887.97
## - X7    1    350588 468773 993.17
## 
## Step:  AIC=830.62
## Y ~ X1 + X5 + X7
## 
##        Df Sum of Sq    RSS    AIC
## <none>              119597 830.62
## + X6    1      1411 118186 831.21
## + X2    1      1367 118230 831.25
## + X4    1       470 119127 832.15
## + X3    1        81 119516 832.54
## - X1    1      7048 126644 835.43
## - X5    1     74092 193689 885.99
## - X7    1    350145 469742 991.42

AIC terbaik ada pada AIC=830.62 dengan model Y ~ X1 + X5 + X7

#Model Terbaik

summary(step.model)
## 
## Call:
## lm(formula = Y ~ X1 + X5 + X7, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.477 -24.535   2.321  19.859  81.540 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.199e+02  1.718e+01  12.803  < 2e-16 ***
## X1          -1.159e-05  4.452e-06  -2.603   0.0105 *  
## X5          -1.971e+01  2.336e+00  -8.441 1.08e-13 ***
## X7           7.860e+00  4.284e-01  18.349  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.25 on 115 degrees of freedom
## Multiple R-squared:  0.8413, Adjusted R-squared:  0.8371 
## F-statistic: 203.2 on 3 and 115 DF,  p-value: < 2.2e-16

RSE sebesar 32.25 dan Rsq sebesar 0.8413

#Uji Asumsi Model Terbaik

sisaan <- step.model$residuals

#harapan sisaan
t.test(sisaan,
       mu = 0,
       conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  sisaan
## t = -4.2573e-17, df = 118, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -5.779233  5.779233
## sample estimates:
##     mean of x 
## -1.242441e-16

kenormalan

ks.test(sisaan, "pnorm", mean=mean(sisaanawal), sd=sd(sisaanawal))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  sisaan
## D = 0.055151, p-value = 0.8621
## alternative hypothesis: two-sided

sisaan saling bebas/

library(randtests)
runs.test(sisaan)
## 
##  Runs Test
## 
## data:  sisaan
## statistic = -1.2943, runs = 53, n1 = 59, n2 = 59, n = 118, p-value =
## 0.1955
## alternative hypothesis: nonrandomness

Model terbaik memenuhi seluruh asumsi

rseb <-32.25
rsqb <-0.8413

#Package Shrinkage

lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
## Loaded glmnet 4.1-8
## 
## Attaching package: 'lmridge'
## The following object is masked from 'package:car':
## 
##     vif
##  [1] "glmnet"    "Matrix"    "MASS"      "car"       "carData"   "randtests"
##  [7] "ggpubr"    "ggplot2"   "readxl"    "stats"     "graphics"  "grDevices"
## [13] "utils"     "datasets"  "methods"   "base"

#Fungsi RSQ

rsq<-function(bestmodel,bestlambda,x,y){
 #y duga
 y.duga <- predict(bestmodel, s = bestlambda, newx = x)

 #JKG dan JKT
 jkt <- sum((y - mean(y))^2)
 jkg <- sum((y.duga- y)^2)

#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq) 
}

#Fungsi RSE

rse<-function(bestmodel,x,y){
  train_predictions <- predict(bestmodel,newx = x)
  residuals <- y - train_predictions
  df <- length(y) - length(bestmodel$beta)
  residual_variance <- sum(residuals^2) / df
  rse <- sqrt(residual_variance)
  return(rse)
}
x<-data.matrix(df[, c('X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7')])
y<-df$Y

#Model Ridge

ridge<-cv.glmnet(x,y,alpha=0);plot(ridge)

bestlamdaridge<-ridge$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=bestlamdaridge);coef(bestridge)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  2.955616e+02
## X1          -1.097225e-05
## X2           1.439961e+00
## X3          -3.188839e+00
## X4          -6.649525e-01
## X5          -1.812640e+01
## X6           1.683293e-01
## X7           7.055906e+00
rsqr<-rsq(bestridge,bestlamdaridge,x,y)
rsqr
## [1] 0.8396872
rser<-rse(bestridge,x,y)
rser
## [1] 32.84154

#Model Lasso

lasso<-cv.glmnet(x,y,alpha=1);plot(lasso)

best.ll<-lasso$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll);coef(bestlasso)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  2.152835e+02
## X1          -6.770669e-06
## X2           .           
## X3           .           
## X4           .           
## X5          -1.991824e+01
## X6           6.219234e-02
## X7           7.351541e+00
rsql<-rsq(bestlasso,best.ll,x,y)
rsql
## [1] 0.8393702
rsel<-rse(bestlasso,x,y)
rsel
## [1] 32.87399
akurasi <- matrix(c(rsqr, rser, rser, rsel, rsqb, rseb), 2,3)
row.names(akurasi)<- c("RSQ", "RSE")
colnames(akurasi) <- c("ridge","lasso","biasa")
akurasi
##          ridge    lasso   biasa
## RSQ  0.8396872 32.84154  0.8413
## RSE 32.8415360 32.87399 32.2500

Dapat disimpulkan bawah model terbaik yang dipilih adalah model regresi setelah stepwise selection (biasa). Terlihat RSQ terbesar 84% dan MSE terendah 32.25 pada model regresi biasa apabila dibandingkan dengan model ridge maupun lasso