Tugas Akhir Analisis Regresi
Packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:datasets':
##
## rivers
## corrplot 0.92 loaded
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:olsrr':
##
## cement
##
## The following object is masked from 'package:plotly':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
Data
## New names:
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
Y<-data$indeks_kesehatan
X0<-rep(1,27)
X1<-data$minum
X2<-data$sanitasi
X3<-data$kurus
X4<-data$stunting
X5<-data$imunisasi
X6<-data$hipertensi
X7<-data$diare
X8<-data$nafas
X9<-data$rumah_sakit
X10<-data$dokter
X11<-data$posyandu
X12<-data$bersih
X13<-data$jaminan
X14<-data$bidan
data<-data.frame(cbind(Y,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14))
data[] <- lapply(data, as.numeric)
head(data)## Y X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## 1 79.46 92.28 68.60 16991 18666 103642 12185 166464 146427 30 596 3009 552261
## 2 79.29 85.70 100.00 5998 10231 36605 556178 68565 47222 9 173 3233 382327
## 3 77.82 80.79 92.89 3146 6871 46859 58089 63213 71337 5 90 2838 534689
## 4 83.09 93.79 100.00 10024 21018 61780 397464 102889 77467 15 247 4010 388891
## 5 79.77 78.15 93.47 8675 30266 41393 160510 71766 22451 7 98 3546 366250
## 6 76.85 89.35 89.74 5104 14122 28534 5211 48901 40549 2 32 2023 226236
## X13 X14
## 1 63.99 1950
## 2 55.99 1516
## 3 48.16 1476
## 4 58.02 876
## 5 43.32 1460
## 6 41.21 1140
## 'data.frame': 27 obs. of 15 variables:
## $ Y : num 79.5 79.3 77.8 83.1 79.8 ...
## $ X1 : num 92.3 85.7 80.8 93.8 78.2 ...
## $ X2 : num 68.6 100 92.9 100 93.5 ...
## $ X3 : num 16991 5998 3146 10024 8675 ...
## $ X4 : num 18666 10231 6871 21018 30266 ...
## $ X5 : num 103642 36605 46859 61780 41393 ...
## $ X6 : num 12185 556178 58089 397464 160510 ...
## $ X7 : num 166464 68565 63213 102889 71766 ...
## $ X8 : num 146427 47222 71337 77467 22451 ...
## $ X9 : num 30 9 5 15 7 2 6 12 12 5 ...
## $ X10: num 596 173 90 247 98 32 63 125 321 63 ...
## $ X11: num 3009 3233 2838 4010 3546 ...
## $ X12: num 552261 382327 534689 388891 366250 ...
## $ X13: num 64 56 48.2 58 43.3 ...
## $ X14: num 1950 1516 1476 876 1460 ...
Persamaan Awal Regresi
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 +
## X10 + X11 + X12 + X13 + X14, data = data)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4 X5
## 6.751e+01 1.310e-01 7.944e-03 2.929e-04 -2.873e-06 7.583e-05
## X6 X7 X8 X9 X10 X11
## 1.834e-06 -8.960e-05 -2.063e-05 1.545e-01 9.005e-04 9.614e-04
## X12 X13 X14
## 3.809e-06 2.342e-03 -2.941e-03
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 +
## X10 + X11 + X12 + X13 + X14, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4303 -0.4708 -0.1421 0.6426 2.0986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.751e+01 1.013e+01 6.665 2.31e-05 ***
## X1 1.310e-01 8.994e-02 1.456 0.1710
## X2 7.944e-03 2.756e-02 0.288 0.7781
## X3 2.929e-04 3.510e-04 0.834 0.4203
## X4 -2.873e-06 1.195e-04 -0.024 0.9812
## X5 7.583e-05 1.649e-04 0.460 0.6538
## X6 1.834e-06 4.292e-06 0.427 0.6767
## X7 -8.960e-05 9.133e-05 -0.981 0.3460
## X8 -2.063e-05 4.727e-05 -0.436 0.6703
## X9 1.545e-01 8.501e-02 1.817 0.0942 .
## X10 9.005e-04 1.539e-03 0.585 0.5693
## X11 9.614e-04 1.139e-03 0.844 0.4150
## X12 3.809e-06 4.111e-06 0.926 0.3725
## X13 2.342e-03 3.684e-02 0.064 0.9503
## X14 -2.941e-03 1.364e-03 -2.157 0.0520 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.475 on 12 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.5284
## F-statistic: 3.081 on 14 and 12 DF, p-value: 0.02895
Eksplorasi Data
Hubungan tiap peubah x terhadap peubah y
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X1,y = Y),color="coral",shape=8, size=1) +
geom_smooth(aes(x = X1, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Akses Air Minum Layak") +
ylab("Indeks Kesehatan") +
xlab("Akses Air Minum Layak (persen)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X2,y = Y),color="chocolate",shape=8, size=1) +
geom_smooth(aes(x = X2, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Safety & Akses Sanitasi Layak") +
ylab("Indeks Kesehatan") +
xlab("Akses Sanitasi Layak (persen") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X3,y = Y),color="darkgoldenrod3",shape=8, size=1) +
geom_smooth(aes(x = X3, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Balita Kurus") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Balita Kurus (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X4,y = Y),color="deepskyblue4",shape=8, size=1) +
geom_smooth(aes(x = X4, y =Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Balita Kurang Gizi/Stunting") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Balita Kurang Gizi/Stunting (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X5,y = Y),color="blueviolet",shape=8, size=1) +
geom_smooth(aes(x = X5, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Bayi yang Mendapat Imunisasi") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Bayi yang Mendapat Imunisasi (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X6,y = Y),color="chartreuse4",shape=8, size=1) +
geom_smooth(aes(x = X6, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Kasus Hipertensi") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Kasus Hipertensi (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X7,y = Y),color="#B2182B",shape=8, size=1) +
geom_smooth(aes(x = X7, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Kasus Diare") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Kasus Diare (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X8,y = Y),color="#D6604D",shape=8, size=1) +
geom_smooth(aes(x = X8, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Kasus Bayi Sukar Bernafas") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Kasus Bayi Sukar Bernafas (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X9,y = Y),color="#F4A582",shape=8, size=1) +
geom_smooth(aes(x = X9, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Rumah Sakit") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Rumah Sakit (unit)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X10,y = Y),color="#99CC00",shape=8, size=1) +
geom_smooth(aes(x = X10, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Dokter Umum") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Dokter Umum (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X11,y = Y),color="#92C5DE",shape=8, size=1) +
geom_smooth(aes(x = X11, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Posyandu") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Posyandu (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X12,y = Y),color="#92C5DE",shape=8, size=1) +
geom_smooth(aes(x = X12, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Perilaku Hidup Sehat dan Bersih") +
ylab("Indeks Kesehatan") +
xlab("Perilaku Hidup Sehat dan Bersih (rumah tangga)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X13,y = Y),color="#92C5DE",shape=8, size=1) +
geom_smooth(aes(x = X13, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Kepemilikan Jaminan Kesehatan") +
ylab("Indeks Kesehatan") +
xlab("Kepemilikan Jaminan Kesehatan (persen)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
## Warning in mean.default(Y): argument is not numeric or logical: returning NA
interactive.plot <- ggplot(data) +
geom_point(aes(x = X14,y = Y),color="#92C5DE",shape=8, size=1) +
geom_smooth(aes(x = X14, y = Y), method = "lm", se = FALSE, color = "cornsilk3") +
ggtitle("Indeks Kesehatan vs Jumlah Bidan") +
ylab("Indeks Kesehatan") +
xlab("Jumlah Bidan (orang)") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(interactive.plot)## `geom_smooth()` using formula = 'y ~ x'
Matriks Korelasi
## Matriks Scatterplot
Bar Chart
ggplot(data, aes(x = Y)) +
geom_histogram(binwidth = 0.05, color = "black", fill = "#69b3a2", alpha = 0.8,
size = 0.1, position = "identity", show.legend = FALSE) +
geom_vline(aes(xintercept = mean(Y)), color = "red", linetype = "dashed", size = 1) +
geom_density(alpha = 0.2, fill = "#FF9999") +
ggtitle("Sebaran Nilai Human Freedom Index") +
xlab("Indeks Kesehatan") + ylab("Nilai") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Eksplorasi Kondisi Gauss-Markov
# Plot Sisaan vs Urutan: sisaan saling bebas
plot(x = 1:dim(data)[1],
y = model$residuals,
type = 'b',
ylab = "Residuals",
xlab = "Observation")Uji Formal Kondisi Gauss-Markov
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(model$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: model$residuals
## t = -1.706e-16, df = 26, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.396341 0.396341
## sample estimates:
## mean of x
## -3.28955e-17
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.130579, Df = 1, p = 0.71783
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 13.346, df = 14, p-value = 0.4995
##
## Runs Test
##
## data: model$residuals
## statistic = 1.6013, runs = 18, n1 = 13, n2 = 13, n = 26, p-value =
## 0.1093
## alternative hypothesis: nonrandomness
##
## Durbin-Watson test
##
## data: model
## DW = 1.8069, p-value = 0.2377
## alternative hypothesis: true autocorrelation is greater than 0
##
## Shapiro-Wilk normality test
##
## data: model$residuals
## W = 0.97726, p-value = 0.796
semua asumsi terpenuhi
Pemeriksaan Multikolinieritas
## X1 X2 X3 X4 X5 X6 X7
## 3.302520 1.384595 19.570860 8.544961 174.068025 3.591355 132.137241
## X8 X9 X10 X11 X12 X13 X14
## 23.805753 15.924641 5.570065 16.191167 5.352401 4.257106 4.868067
terdapat multikol, hapus yg vif > 10
data_sub1 <- subset(data, select = -c(X5))
model_sub1 <- lm(Y ~ X1 + X2 + X3 + X4 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 , data = data)
vif(model_sub1)## X1 X2 X3 X4 X6 X7 X8 X9
## 2.964318 1.368724 15.856794 7.459559 3.085236 30.572404 11.631770 12.883843
## X10 X11 X12 X13 X14
## 3.384001 15.603791 5.174832 3.638962 4.853296
data_sub2 <- subset(data, select = -c(X5,X7))
model_sub2 <- lm(Y ~ X1 + X2 + X3 + X4 + X6 + X8 + X9 + X10 + X11 + X12 + X13 + X14 , data = data_sub1)
vif(model_sub2)## X1 X2 X3 X4 X6 X8 X9 X10
## 2.807860 1.357820 14.042467 7.323159 2.926798 8.671966 4.619223 3.272025
## X11 X12 X13 X14
## 15.529109 4.132278 3.571066 4.765114
data_sub3 <- subset(data, select = -c(X5,X7,X11))
model_sub3 <- lm(Y ~ X1 + X2 + X3 + X4 + X6 + X8 + X9 + X10 + X12 + X13 + X14 , data = data_sub2)
vif(model_sub3)## X1 X2 X3 X4 X6 X8 X9 X10
## 2.568227 1.327547 13.832819 6.279583 1.365726 8.585269 4.561951 3.076567
## X12 X13 X14
## 3.528399 2.276848 4.752695
data_sub4 <- subset(data, select = -c(X3,X5,X7,X11))
model_sub4 <- lm(Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X10 + X12 + X13 + X14 , data = data_sub3)
vif(model_sub4)## X1 X2 X4 X6 X8 X9 X10 X12
## 2.530233 1.318418 2.412155 1.363352 2.782469 4.321527 2.779123 3.165828
## X13 X14
## 2.276212 4.681330
Pemeriksaan Amatan Berpengaruh
Tabel hii dan ri
s <- sqrt(anova(model_sub4)["Residuals", "Mean Sq"])
n = dim(data_sub4)[1]
p = length(model_sub4$coefficients)
hii=hatvalues(model_sub4)
Obs = c(1:n)
ei = model_sub4$residuals
ri = ei/(s*sqrt(1-hii))
Di = (ri^2/p)*(hii/(1-hii))
summ <- cbind.data.frame(Obs, data_sub4, hii, ri, Di)
head(summ)## Obs Y X1 X2 X4 X6 X8 X9 X10 X12 X13 X14 hii
## 1 1 79.46 92.28 68.60 18666 12185 146427 30 596 552261 63.99 1950 0.7150874
## 2 2 79.29 85.70 100.00 10231 556178 47222 9 173 382327 55.99 1516 0.7406968
## 3 3 77.82 80.79 92.89 6871 58089 71337 5 90 534689 48.16 1476 0.5117364
## 4 4 83.09 93.79 100.00 21018 397464 77467 15 247 388891 58.02 876 0.6048098
## 5 5 79.77 78.15 93.47 30266 160510 22451 7 98 366250 43.32 1460 0.8045037
## 6 6 76.85 89.35 89.74 14122 5211 40549 2 32 226236 41.21 1140 0.1986898
## ri Di
## 1 -1.09577941 0.2739687805
## 2 -0.68092109 0.1204019151
## 3 0.43762837 0.0182477882
## 4 -0.05681326 0.0004490754
## 5 0.63123614 0.1490666559
## 6 -1.62083443 0.0592187928
Mendeteksi Pencilan
## [1] 27
summ_pencilan <- subset(summ, Obs %in% pencilan)
summ_pencilan <- subset(summ, Obs %in% pencilan, select = c("Obs", "hii", "ri", "Di"))
summ_pencilan## Obs hii ri Di
## 27 27 0.3633226 -2.075939 0.2235679
## Obs hii ri Di
## 1 27 0.3633226 -2.075939 0.2235679
Mendeteksi Amatan berpengaruh
Cook’s D
for (i in 1:dim(summ)[1]){
fcrit = qf(p=0.95, df1=p, df2=n-p)
amatan_berpengaruh <- which(Di > fcrit)
}
amatan_berpengaruh## 21
## 21
summ <- cbind.data.frame(Obs, data_sub4, hii, ri, Di)
summ_sorted <- summ %>%
arrange(desc(Di))
head(summ_sorted)## Obs Y X1 X2 X4 X6 X8 X9 X10 X12 X13 X14 hii
## 1 21 84.23 99.03 100.00 6614 65357 54760 39 2286 110473 78.34 1261 0.9749494
## 2 23 85.35 98.28 93.03 4575 43665 42755 47 819 25860 78.42 1178 0.7229282
## 3 16 83.14 94.86 92.59 3899 32140 46815 53 556 583003 92.32 1632 0.6642859
## 4 1 79.46 92.28 68.60 18666 12185 146427 30 596 552261 63.99 1950 0.7150874
## 5 9 80.72 94.99 91.26 10635 29798 87575 12 321 206354 66.95 1560 0.4634679
## 6 27 79.22 97.67 98.98 846 10841 7042 4 39 47023 96.88 190 0.3633226
## ri Di
## 1 -1.900880 12.7844106
## 2 1.652256 0.6475376
## 3 -1.392284 0.3486973
## 4 -1.095779 0.2739688
## 5 1.767605 0.2453588
## 6 -2.075939 0.2235679
plot amatan berpengaruh
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Menghapus Titik Leverage dan Pencilan
# Menggabungkan indeks titik leverage dan pencilan
observasi_normal <- setdiff(1:n, c(leverage, pencilan))
# Membuat data baru dengan menghapus titik leverage dan pencilan
data_clean <- data_sub4[observasi_normal,]
# Membuat model
model2 <- lm(Y ~ X1+X2+X4+X6+X8+X9+X10+X12+X13+X14 ,data= data_clean)
model2##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X10 + X12 + X13 +
## X14, data = data_clean)
##
## Coefficients:
## (Intercept) X1 X2 X4 X6 X8
## 6.432e+01 1.497e-01 1.957e-02 7.938e-05 1.192e-06 -3.135e-05
## X9 X10 X12 X13 X14
## -1.809e-02 8.557e-03 5.013e-06 1.540e-02 -2.278e-03
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X10 + X12 + X13 +
## X14, data = data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8675 -0.5187 0.3636 0.6103 1.2872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.432e+01 7.109e+00 9.048 3.18e-07 ***
## X1 1.497e-01 6.249e-02 2.395 0.0311 *
## X2 1.957e-02 2.206e-02 0.887 0.3900
## X4 7.938e-05 5.151e-05 1.541 0.1456
## X6 1.192e-06 2.124e-06 0.561 0.5836
## X8 -3.134e-05 1.624e-05 -1.930 0.0741 .
## X9 -1.809e-02 6.843e-02 -0.264 0.7953
## X10 8.557e-03 4.523e-03 1.892 0.0794 .
## X12 5.013e-06 2.897e-06 1.731 0.1055
## X13 1.540e-02 2.468e-02 0.624 0.5426
## X14 -2.278e-03 1.058e-03 -2.154 0.0492 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.16 on 14 degrees of freedom
## Multiple R-squared: 0.8232, Adjusted R-squared: 0.6969
## F-statistic: 6.517 on 10 and 14 DF, p-value: 0.0009137
Pemeriksaan Multikolinieritas
## X1 X2 X4 X6 X8 X9 X10 X12
## 2.439064 1.340845 2.498109 1.407700 4.294924 14.236990 14.486095 3.933931
## X13 X14
## 2.586411 4.142047
Terlihat ada beberapa multikol maka dilakukan penghapusan
dataclean_sub1 <- subset(data_clean, select = -c(X10))
modelclean_sub1 <- lm(Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14 , data = data_clean)
vif(modelclean_sub1)## X1 X2 X4 X6 X8 X9 X12 X13
## 2.430352 1.329379 2.430080 1.385045 2.508107 3.004751 2.603956 2.434627
## X14
## 4.137906
sudah tidak ada multikol
Eksplorasi asumsi
plot(x = 1:dim(dataclean_sub1)[1],
y = modelclean_sub1$residuals,
type = 'b',
ylab = "Residuals",
xlab = "Observation") # plot sisaan vs urutanUji Formal Kondisi Gauss-Markov
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(modelclean_sub1$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: modelclean_sub1$residuals
## t = -1.1183e-17, df = 24, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4098126 0.4098126
## sample estimates:
## mean of x
## -2.220446e-18
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.004054268, Df = 1, p = 0.94923
##
## Runs Test
##
## data: modelclean_sub1$residuals
## statistic = 2.5045, runs = 19, n1 = 12, n2 = 12, n = 24, p-value =
## 0.01226
## alternative hypothesis: nonrandomness
##
## Shapiro-Wilk normality test
##
## data: modelclean_sub1$residuals
## W = 0.95321, p-value = 0.2958
model tidak memenuhi asumsi sisaan saling bebas, dilakukan transformasi dengan WLS
Weighted Least Square (WLS)
ads.weights=1/lm(abs(modelclean_sub1$residuals) ~ model2$fitted.values)$fitted.values^2
model_wls <- lm(Y~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14 ,data = dataclean_sub1,weights = ads.weights)
model_wls##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14,
## data = dataclean_sub1, weights = ads.weights)
##
## Coefficients:
## (Intercept) X1 X2 X4 X6 X8
## 6.411e+01 1.418e-01 2.345e-02 9.459e-05 1.694e-06 -1.143e-05
## X9 X12 X13 X14
## 9.654e-02 1.916e-06 2.717e-02 -2.345e-03
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14,
## data = dataclean_sub1, weights = ads.weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.5406 -0.7276 0.2297 0.8535 1.9823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.411e+01 7.659e+00 8.370 4.91e-07 ***
## X1 1.418e-01 6.723e-02 2.109 0.0522 .
## X2 2.345e-02 2.385e-02 0.983 0.3410
## X4 9.459e-05 5.453e-05 1.735 0.1033
## X6 1.694e-06 2.272e-06 0.746 0.4674
## X8 -1.143e-05 1.338e-05 -0.854 0.4065
## X9 9.654e-02 3.429e-02 2.815 0.0131 *
## X12 1.916e-06 2.580e-06 0.742 0.4693
## X13 2.717e-02 2.582e-02 1.053 0.3092
## X14 -2.345e-03 1.142e-03 -2.054 0.0578 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.579 on 15 degrees of freedom
## Multiple R-squared: 0.7763, Adjusted R-squared: 0.642
## F-statistic: 5.782 on 9 and 15 DF, p-value: 0.00151
## X1 X2 X4 X6 X8 X9 X12 X13
## 2.441806 1.332315 2.405286 1.378909 2.507796 2.980975 2.649303 2.414206
## X14
## 4.117256
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(model_wls$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: model_wls$residuals
## t = 0.013361, df = 24, p-value = 0.9895
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4072019 0.4125082
## sample estimates:
## mean of x
## 0.002653187
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.01094565, Df = 1, p = 0.91668
##
## Runs Test
##
## data: model_wls$residuals
## statistic = 2.5045, runs = 19, n1 = 12, n2 = 12, n = 24, p-value =
## 0.01226
## alternative hypothesis: nonrandomness
##
## Shapiro-Wilk normality test
##
## data: model_wls$residuals
## W = 0.95091, p-value = 0.2628
model belum memenuhi asumsi, gunakan MKT
Metode Kuadrat Terkecil (MKT)
Y_cl <- as.matrix(dataclean_sub1$Y)
X_cl <- as.matrix(cbind(rep(1,25),dataclean_sub1$X1,dataclean_sub1$X2,
dataclean_sub1$X4,dataclean_sub1$X6,dataclean_sub1$X8,
dataclean_sub1$X9,dataclean_sub1$X12,dataclean_sub1$X13,
dataclean_sub1$X14))
b_cl <- solve(t(X_cl)%*%X_cl)%*%t(X_cl)%*%Y_cl;round(b_cl,5)## [,1]
## [1,] 64.07752
## [2,] 0.14262
## [3,] 0.02343
## [4,] 0.00010
## [5,] 0.00000
## [6,] -0.00001
## [7,] 0.09689
## [8,] 0.00000
## [9,] 0.02672
## [10,] -0.00234
## X1 X2 X4 X6 X8 X9 X12 X13
## 2.430352 1.329379 2.430080 1.385045 2.508107 3.004751 2.603956 2.434627
## X14
## 4.137906
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(modelclean_sub1$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: modelclean_sub1$residuals
## t = -1.1183e-17, df = 24, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4098126 0.4098126
## sample estimates:
## mean of x
## -2.220446e-18
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.004054268, Df = 1, p = 0.94923
##
## Runs Test
##
## data: modelclean_sub1$residuals
## statistic = 2.5045, runs = 19, n1 = 12, n2 = 12, n = 24, p-value =
## 0.01226
## alternative hypothesis: nonrandomness
##
## Shapiro-Wilk normality test
##
## data: modelclean_sub1$residuals
## W = 0.95321, p-value = 0.2958
Tranformasi Box-Cox
bc_model <- boxcox(Y~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14,data = dataclean_sub1,
lambda = seq(-2, 2, by = 0.1))## [1] 2
# Mengubah nilai Y di dataclean_sub1 dengan hasil transformasi
dataclean_sub1$Y <- (((dataclean_sub1$Y)^optimal_lambda)-1)/optimal_lambda
model3 <- lm(Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14 ,data= dataclean_sub1)
model3##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14,
## data = dataclean_sub1)
##
## Coefficients:
## (Intercept) X1 X2 X4 X6 X8
## 1.916e+03 1.145e+01 1.967e+00 7.715e-03 1.365e-04 -9.286e-04
## X9 X12 X13 X14
## 7.969e+00 1.431e-04 2.132e+00 -1.899e-01
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(model3$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: model3$residuals
## t = 1.5936e-16, df = 24, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -33.12779 33.12779
## sample estimates:
## mean of x
## 2.557954e-15
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.0050767, Df = 1, p = 0.9432
##
## Runs Test
##
## data: model3$residuals
## statistic = 2.5045, runs = 19, n1 = 12, n2 = 12, n = 24, p-value =
## 0.01226
## alternative hypothesis: nonrandomness
##
## Shapiro-Wilk normality test
##
## data: model3$residuals
## W = 0.95146, p-value = 0.2705
masih tidak memenuhi asumsi sisaan saling bebas, gunakan OLS (menentukan model terbaik)
Ordinary Least Square (OLS)
Alternatif pembuatan model terbaik
dataclean_sub1 <- subset(data_clean, select = -c(X10))
modelclean_sub1 <- lm(Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14 , data = data_clean)Forward
## Start: AIC=18.62
## Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14,
## data = dataclean_sub1)
##
## Coefficients:
## (Intercept) X1 X2 X4 X6 X8
## 6.408e+01 1.426e-01 2.343e-02 9.546e-05 1.702e-06 -1.153e-05
## X9 X12 X13 X14
## 9.689e-02 1.827e-06 2.672e-02 -2.342e-03
Backward
## Start: AIC=18.62
## Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X12 1 0.8085 24.465 17.459
## - X6 1 0.8778 24.534 17.530
## - X8 1 1.1613 24.818 17.817
## - X2 1 1.5311 25.187 18.187
## - X13 1 1.6748 25.331 18.329
## <none> 23.656 18.619
## - X4 1 4.7517 28.408 21.195
## - X14 1 6.6002 30.256 22.771
## - X1 1 7.0353 30.692 23.128
## - X9 1 12.7837 36.440 27.420
##
## Step: AIC=17.46
## Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X8 1 0.7743 25.239 16.238
## - X6 1 1.0831 25.548 16.542
## - X13 1 1.7231 26.188 17.160
## <none> 24.465 17.459
## - X2 1 2.1177 26.582 17.534
## - X4 1 4.4899 28.955 19.671
## - X14 1 5.7924 30.257 20.771
## - X1 1 6.2550 30.720 21.151
## - X9 1 14.1182 38.583 26.848
##
## Step: AIC=16.24
## Y ~ X1 + X2 + X4 + X6 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X6 1 1.1554 26.394 15.357
## - X13 1 1.4581 26.697 15.642
## - X2 1 1.8760 27.115 16.030
## <none> 25.239 16.238
## - X4 1 3.7244 28.963 17.679
## - X1 1 5.4836 30.723 19.153
## - X14 1 12.7233 37.962 24.443
## - X9 1 15.3401 40.579 26.110
##
## Step: AIC=15.36
## Y ~ X1 + X2 + X4 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X13 1 2.0578 28.452 15.234
## <none> 26.394 15.357
## - X2 1 3.4119 29.806 16.396
## - X1 1 5.1803 31.575 17.837
## - X4 1 5.4409 31.835 18.043
## - X14 1 12.1651 38.560 22.833
## - X9 1 14.7546 41.149 24.458
##
## Step: AIC=15.23
## Y ~ X1 + X2 + X4 + X9 + X14
##
## Df Sum of Sq RSS AIC
## <none> 28.452 15.234
## - X2 1 2.555 31.008 15.384
## - X4 1 3.920 32.372 16.460
## - X1 1 4.235 32.688 16.703
## - X14 1 21.828 50.281 27.469
## - X9 1 36.502 64.954 33.870
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X9 + X14, data = dataclean_sub1)
##
## Coefficients:
## (Intercept) X1 X2 X4 X9 X14
## 7.004e+01 9.854e-02 2.716e-02 7.289e-05 1.251e-01 -2.893e-03
Stepwise
## Start: AIC=18.62
## Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X12 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X12 1 0.8085 24.465 17.459
## - X6 1 0.8778 24.534 17.530
## - X8 1 1.1613 24.818 17.817
## - X2 1 1.5311 25.187 18.187
## - X13 1 1.6748 25.331 18.329
## <none> 23.656 18.619
## - X4 1 4.7517 28.408 21.195
## - X14 1 6.6002 30.256 22.771
## - X1 1 7.0353 30.692 23.128
## - X9 1 12.7837 36.440 27.420
##
## Step: AIC=17.46
## Y ~ X1 + X2 + X4 + X6 + X8 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X8 1 0.7743 25.239 16.238
## - X6 1 1.0831 25.548 16.542
## - X13 1 1.7231 26.188 17.160
## <none> 24.465 17.459
## - X2 1 2.1177 26.582 17.534
## + X12 1 0.8085 23.656 18.619
## - X4 1 4.4899 28.955 19.671
## - X14 1 5.7924 30.257 20.771
## - X1 1 6.2550 30.720 21.151
## - X9 1 14.1182 38.583 26.848
##
## Step: AIC=16.24
## Y ~ X1 + X2 + X4 + X6 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X6 1 1.1554 26.394 15.357
## - X13 1 1.4581 26.697 15.642
## - X2 1 1.8760 27.115 16.030
## <none> 25.239 16.238
## + X8 1 0.7743 24.465 17.459
## - X4 1 3.7244 28.963 17.679
## + X12 1 0.4214 24.818 17.817
## - X1 1 5.4836 30.723 19.153
## - X14 1 12.7233 37.962 24.443
## - X9 1 15.3401 40.579 26.110
##
## Step: AIC=15.36
## Y ~ X1 + X2 + X4 + X9 + X13 + X14
##
## Df Sum of Sq RSS AIC
## - X13 1 2.0578 28.452 15.234
## <none> 26.394 15.357
## + X6 1 1.1554 25.239 16.238
## - X2 1 3.4119 29.806 16.396
## + X8 1 0.8466 25.548 16.542
## + X12 1 0.5614 25.833 16.820
## - X1 1 5.1803 31.575 17.837
## - X4 1 5.4409 31.835 18.043
## - X14 1 12.1651 38.560 22.833
## - X9 1 14.7546 41.149 24.458
##
## Step: AIC=15.23
## Y ~ X1 + X2 + X4 + X9 + X14
##
## Df Sum of Sq RSS AIC
## <none> 28.452 15.234
## + X13 1 2.058 26.394 15.357
## - X2 1 2.555 31.008 15.384
## + X6 1 1.755 26.697 15.642
## - X4 1 3.920 32.372 16.460
## + X12 1 0.723 27.729 16.590
## - X1 1 4.235 32.688 16.703
## + X8 1 0.535 27.918 16.760
## - X14 1 21.828 50.281 27.469
## - X9 1 36.502 64.954 33.870
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4 + X9 + X14, data = dataclean_sub1)
##
## Coefficients:
## (Intercept) X1 X2 X4 X9 X14
## 7.004e+01 9.854e-02 2.716e-02 7.289e-05 1.251e-01 -2.893e-03
Best Subset Regression
## Best Subsets Regression
## --------------------------------------------
## Model Index Predictors
## --------------------------------------------
## 1 X13
## 2 X9 X14
## 3 X6 X9 X14
## 4 X1 X6 X9 X14
## 5 X1 X4 X6 X9 X14
## 6 X1 X2 X4 X9 X13 X14
## 7 X1 X2 X4 X6 X9 X13 X14
## 8 X1 X2 X4 X6 X8 X9 X13 X14
## 9 X1 X2 X4 X6 X8 X9 X12 X13 X14
## --------------------------------------------
##
## Subsets Regression Summary
## --------------------------------------------------------------------------------------------------------------------------------
## Adj. Pred
## Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
## --------------------------------------------------------------------------------------------------------------------------------
## 1 0.3494 0.3211 0.1752 22.9546 102.4433 29.3996 106.1000 75.3696 3.2550 0.1370 0.7638
## 2 0.6541 0.6227 0.5568 4.3677 88.6483 18.1107 93.5238 41.9771 1.8761 0.0798 0.4402
## 3 0.6879 0.6433 0.5507 4.0829 88.0760 18.5464 94.1703 39.7663 1.8366 0.0792 0.4310
## 4 0.7135 0.6562 0.5439 4.3564 87.9400 19.7387 95.2532 38.4314 1.8316 0.0803 0.4298
## 5 0.7379 0.6689 0.5838 4.7079 87.7146 21.3702 96.2467 37.1115 1.8226 0.0817 0.4277
## 6 0.7523 0.6697 0.5568 5.7363 88.3038 23.7820 98.0548 37.1385 1.8769 0.0863 0.4404
## 7 0.7631 0.6656 0.5468 7.0036 89.1848 26.6002 100.1547 37.7323 1.9597 0.0928 0.4598
## 8 0.7704 0.6556 0.491 8.5126 90.4058 29.7194 102.5945 39.0130 2.0795 0.1019 0.4879
## 9 0.7780 0.6447 0.1905 10.0000 91.5657 33.0632 104.9733 40.4183 2.2079 0.1126 0.5181
## --------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria
## SBIC: Sawa's Bayesian Information Criteria
## SBC: Schwarz Bayesian Criteria
## MSEP: Estimated error of prediction, assuming multivariate normality
## FPE: Final Prediction Error
## HSP: Hocking's Sp
## APC: Amemiya Prediction Criteria
didaptkan model terbaik yaitu Y ~ X4 + X8 + X10
data_best <- subset(dataclean_sub1, select = c(X1, X4, X6, X9, X14))
model_best <- lm(Y ~ X1+X4+X6+X9+X14, data = dataclean_sub1)
model_best##
## Call:
## lm(formula = Y ~ X1 + X4 + X6 + X9 + X14, data = dataclean_sub1)
##
## Coefficients:
## (Intercept) X1 X4 X6 X9 X14
## 7.210e+01 1.015e-01 6.112e-05 2.957e-06 1.218e-01 -2.940e-03
Uji Formal Akhir Guass-Markov
# Asumsi Gauss-Markov: Nilai harapan sisaan sama dengan nol
t.test(model_best$residuals,
mu = 0,
conf.level = 0.95)##
## One Sample t-test
##
## data: model_best$residuals
## t = 8.2337e-17, df = 24, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4452691 0.4452691
## sample estimates:
## mean of x
## 1.776357e-17
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.06012, Df = 1, p = 0.30319
##
## Runs Test
##
## data: model_best$residuals
## statistic = 0.83485, runs = 15, n1 = 12, n2 = 12, n = 24, p-value =
## 0.4038
## alternative hypothesis: nonrandomness
##
## Shapiro-Wilk normality test
##
## data: model_best$residuals
## W = 0.98815, p-value = 0.9884
Kesimpulan (Model Terbaik)
Persamaan Terbaik Regresi Bergandanya adalah:
\[y=b0-b1X1+b4X4+b6X6+b9X9+b14X14\] \[y= 72.1-0.1015X1+0.00006112X4+0.000002957X6+0.1218X9-0.002940X14\]