Memanggil data
library(readxl)
df <- read_excel("D:/FIRA/Local Disk/HALO/SEMESTA 4/Analisis Regresi/Mayor_K4/Syntax dan Excel/Data - Kelompok 4.xlsx")
head(df)
## # A tibble: 6 × 9
## `no amatan` Y X1 X2 X3 X4 X5 X6 X7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 131. 2800293 7.7 13.5 64.8 7.11 1.64 12.9
## 2 2 135. 2751314 7.86 12.0 67.3 6.41 2.03 14.1
## 3 3 272. 4230793 9.06 12.8 69.9 8.61 1.97 32.9
## 4 4 83.1 4215181 10.6 12.6 64.8 7.51 2.07 16.5
## 5 5 134. 4262015 9.07 13.9 71.6 10.8 1.16 19.1
## 6 6 18.9 4309773 10.1 13.2 66.7 10.1 2.11 4.42
Membuat model regrersi klasik
model <- lm(Y~X1+X2+X3+X4+X5+X6+X7, data=df)
summary(model)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.572 -22.647 1.698 18.008 84.625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.784e+02 1.136e+02 2.449 0.0159 *
## X1 -1.484e-05 5.706e-06 -2.601 0.0106 *
## X2 1.730e+00 1.537e+00 1.126 0.2628
## X3 -1.049e+00 4.025e+00 -0.261 0.7949
## X4 -7.042e-01 1.525e+00 -0.462 0.6451
## X5 -1.931e+01 3.948e+00 -4.893 3.39e-06 ***
## X6 1.982e-01 1.571e-01 1.261 0.2098
## X7 7.830e+00 4.622e-01 16.939 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.36 on 111 degrees of freedom
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.836
## F-statistic: 86.91 on 7 and 111 DF, p-value: < 2.2e-16
#Eksplorasi
# multiple scatterplots
pairs(df[,])
#visualisasi korelasi
library("ggpubr")
## Loading required package: ggplot2
Terlihat Y memiliki korelasi negatif pada X5 dan korelasi positif pada X7
#Uji Asumsi Model Awal
sisaanawal <- model$residuals
#harapan sisaan
t.test(sisaanawal,
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: sisaanawal
## t = 1.7659e-16, df = 118, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -5.698178 5.698178
## sample estimates:
## mean of x
## 5.081355e-16
kenormalan
ks.test(sisaanawal, "pnorm", mean=mean(sisaanawal), sd=sd(sisaanawal))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: sisaanawal
## D = 0.050874, p-value = 0.9177
## alternative hypothesis: two-sided
sisaan saling bebas/
library(randtests)
runs.test(sisaanawal)
##
## Runs Test
##
## data: sisaanawal
## statistic = -0.92453, runs = 55, n1 = 59, n2 = 59, n = 118, p-value =
## 0.3552
## alternative hypothesis: nonrandomness
Model awal memenuhi seluruh asumsi
#pendeteksian multikolinieritas
library(car)
## Loading required package: carData
vif(model) #ini yg vif nya aman
## X1 X2 X3 X4 X5 X6 X7
## 3.302100 1.896440 1.918748 1.767741 4.384620 1.032161 1.756636
Tidak terdapat multikolinieritas. Berdasarkan nilai VIF tidak ada yang melebihi 10
#Mencari model terbaik dengan backward dan forward
library(MASS)
full_model <- lm(Y~X1+X2+X3+X4+X5+X6+X7, data = df)
backward_model <- stepAIC(full_model, direction = "backward")
## Start: AIC=835.26
## Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X3 1 71 116337 833.33
## - X4 1 223 116489 833.48
## - X2 1 1327 117593 834.61
## - X6 1 1667 117932 834.95
## <none> 116266 835.26
## - X1 1 7088 123353 840.30
## - X5 1 25073 141338 856.49
## - X7 1 300533 416799 985.19
##
## Step: AIC=833.33
## Y ~ X1 + X2 + X4 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X4 1 198 116535 831.53
## - X2 1 1391 117728 832.74
## - X6 1 1665 118002 833.02
## <none> 116337 833.33
## - X1 1 7336 123673 838.61
## - X5 1 41405 157741 867.56
## - X7 1 301420 417757 983.46
##
## Step: AIC=831.53
## Y ~ X1 + X2 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X2 1 1651 118186 831.21
## - X6 1 1695 118230 831.25
## <none> 116535 831.53
## - X1 1 7955 124489 837.39
## - X5 1 76199 192734 889.40
## - X7 1 328226 444760 988.91
##
## Step: AIC=831.21
## Y ~ X1 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X6 1 1411 119597 830.62
## <none> 118186 831.21
## - X1 1 6375 124561 835.46
## - X5 1 75471 193656 887.97
## - X7 1 350588 468773 993.17
##
## Step: AIC=830.62
## Y ~ X1 + X5 + X7
##
## Df Sum of Sq RSS AIC
## <none> 119597 830.62
## - X1 1 7048 126644 835.43
## - X5 1 74092 193689 885.99
## - X7 1 350145 469742 991.42
AIC terbaik ada pada AIC=830.62 dengan model Y ~ X1 + X5 + X7
#Mencari model terbaik dengan stepwise
step.model <- stepAIC(full_model, direction = "both")
## Start: AIC=835.26
## Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X3 1 71 116337 833.33
## - X4 1 223 116489 833.48
## - X2 1 1327 117593 834.61
## - X6 1 1667 117932 834.95
## <none> 116266 835.26
## - X1 1 7088 123353 840.30
## - X5 1 25073 141338 856.49
## - X7 1 300533 416799 985.19
##
## Step: AIC=833.33
## Y ~ X1 + X2 + X4 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X4 1 198 116535 831.53
## - X2 1 1391 117728 832.74
## - X6 1 1665 118002 833.02
## <none> 116337 833.33
## + X3 1 71 116266 835.26
## - X1 1 7336 123673 838.61
## - X5 1 41405 157741 867.56
## - X7 1 301420 417757 983.46
##
## Step: AIC=831.53
## Y ~ X1 + X2 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X2 1 1651 118186 831.21
## - X6 1 1695 118230 831.25
## <none> 116535 831.53
## + X4 1 198 116337 833.33
## + X3 1 46 116489 833.48
## - X1 1 7955 124489 837.39
## - X5 1 76199 192734 889.40
## - X7 1 328226 444760 988.91
##
## Step: AIC=831.21
## Y ~ X1 + X5 + X6 + X7
##
## Df Sum of Sq RSS AIC
## - X6 1 1411 119597 830.62
## <none> 118186 831.21
## + X2 1 1651 116535 831.53
## + X4 1 458 117728 832.74
## + X3 1 89 118097 833.12
## - X1 1 6375 124561 835.46
## - X5 1 75471 193656 887.97
## - X7 1 350588 468773 993.17
##
## Step: AIC=830.62
## Y ~ X1 + X5 + X7
##
## Df Sum of Sq RSS AIC
## <none> 119597 830.62
## + X6 1 1411 118186 831.21
## + X2 1 1367 118230 831.25
## + X4 1 470 119127 832.15
## + X3 1 81 119516 832.54
## - X1 1 7048 126644 835.43
## - X5 1 74092 193689 885.99
## - X7 1 350145 469742 991.42
AIC terbaik ada pada AIC=830.62 dengan model Y ~ X1 + X5 + X7
#Model Terbaik
summary(step.model)
##
## Call:
## lm(formula = Y ~ X1 + X5 + X7, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.477 -24.535 2.321 19.859 81.540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.199e+02 1.718e+01 12.803 < 2e-16 ***
## X1 -1.159e-05 4.452e-06 -2.603 0.0105 *
## X5 -1.971e+01 2.336e+00 -8.441 1.08e-13 ***
## X7 7.860e+00 4.284e-01 18.349 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.25 on 115 degrees of freedom
## Multiple R-squared: 0.8413, Adjusted R-squared: 0.8371
## F-statistic: 203.2 on 3 and 115 DF, p-value: < 2.2e-16
RSE sebesar 32.25 dan Rsq sebesar 0.8413
#Uji Asumsi Model Terbaik
sisaan <- step.model$residuals
#harapan sisaan
t.test(sisaan,
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: sisaan
## t = -4.2573e-17, df = 118, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -5.779233 5.779233
## sample estimates:
## mean of x
## -1.242441e-16
kenormalan
ks.test(sisaan, "pnorm", mean=mean(sisaanawal), sd=sd(sisaanawal))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: sisaan
## D = 0.055151, p-value = 0.8621
## alternative hypothesis: two-sided
sisaan saling bebas/
library(randtests)
runs.test(sisaan)
##
## Runs Test
##
## data: sisaan
## statistic = -1.2943, runs = 53, n1 = 59, n2 = 59, n = 118, p-value =
## 0.1955
## alternative hypothesis: nonrandomness
Model terbaik memenuhi seluruh asumsi
rseb <-32.25
rsqb <-0.8413
#Package Shrinkage
lapply(c("glmnet","lmridge"),library,character.only=T)[[1]]
## Loading required package: Matrix
## Loaded glmnet 4.1-8
##
## Attaching package: 'lmridge'
## The following object is masked from 'package:car':
##
## vif
## [1] "glmnet" "Matrix" "MASS" "car" "carData" "randtests"
## [7] "ggpubr" "ggplot2" "readxl" "stats" "graphics" "grDevices"
## [13] "utils" "datasets" "methods" "base"
#Fungsi RSQ
rsq<-function(bestmodel,bestlambda,x,y){
#y duga
y.duga <- predict(bestmodel, s = bestlambda, newx = x)
#JKG dan JKT
jkt <- sum((y - mean(y))^2)
jkg <- sum((y.duga- y)^2)
#find R-Squared
rsq <- 1 - jkg/jkt
return(rsq)
}
#Fungsi RSE
rse<-function(bestmodel,x,y){
train_predictions <- predict(bestmodel,newx = x)
residuals <- y - train_predictions
df <- length(y) - length(bestmodel$beta)
residual_variance <- sum(residuals^2) / df
rse <- sqrt(residual_variance)
return(rse)
}
x<-data.matrix(df[, c('X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7')])
y<-df$Y
#Model Ridge
ridge<-cv.glmnet(x,y,alpha=0);plot(ridge)
bestlamdaridge<-ridge$lambda.min
bestridge<-glmnet(x,y,alpha=0,lambda=bestlamdaridge);coef(bestridge)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.955616e+02
## X1 -1.097225e-05
## X2 1.439961e+00
## X3 -3.188839e+00
## X4 -6.649525e-01
## X5 -1.812640e+01
## X6 1.683293e-01
## X7 7.055906e+00
rsqr<-rsq(bestridge,bestlamdaridge,x,y)
rsqr
## [1] 0.8396872
rser<-rse(bestridge,x,y)
rser
## [1] 32.84154
#Model Lasso
lasso<-cv.glmnet(x,y,alpha=1);plot(lasso)
best.ll<-lasso$lambda.min
bestlasso<-glmnet(x,y,alpha=1,lambda=best.ll);coef(bestlasso)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.152835e+02
## X1 -6.770669e-06
## X2 .
## X3 .
## X4 .
## X5 -1.991824e+01
## X6 6.219234e-02
## X7 7.351541e+00
rsql<-rsq(bestlasso,best.ll,x,y)
rsql
## [1] 0.8393702
rsel<-rse(bestlasso,x,y)
rsel
## [1] 32.87399
akurasi <- matrix(c(rsqr, rser, rser, rsel, rsqb, rseb), 2,3)
row.names(akurasi)<- c("RSQ", "RSE")
colnames(akurasi) <- c("ridge","lasso","biasa")
akurasi
## ridge lasso biasa
## RSQ 0.8396872 32.84154 0.8413
## RSE 32.8415360 32.87399 32.2500
Dapat disimpulkan bawah model terbaik yang dipilih adalah model regresi setelah stepwise selection (biasa). Terlihat RSQ terbesar 84% dan MSE terendah 32.25 pada model regresi biasa apabila dibandingkan dengan model ridge maupun lasso