LIBRARY
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(GGally)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
## The following object is masked from 'package:dplyr':
##
## select
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
DATA
data <- read_xlsx("C:\\Users\\Hafizh Fadhlah\\OneDrive\\Documents\\Indikator Utama Pendidikan Provinsi Jawa Tengah Tahun 2024 - Copy.xlsx")
head(data)
## # A tibble: 6 × 7
## AMH HLS APM_SLTA APK_SLTA APS_7_12 APS_13_15 APS_16_18
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 95.3 12.7 61.2 83.8 99.9 95.4 74.5
## 2 95.9 13.3 68.7 97.4 100. 99.6 78.3
## 3 93.9 12.0 57.1 77.4 100. 94.8 75.9
## 4 92.9 11.8 54 70.6 99.7 94.6 65.4
## 5 94.9 13.4 73.2 98.6 99.9 99.5 77.3
## 6 95.8 13.6 72.9 92.1 99.6 99.1 83.6
str(data)
## tibble [36 × 7] (S3: tbl_df/tbl/data.frame)
## $ AMH : num [1:36] 95.3 95.9 93.9 92.9 94.9 ...
## $ HLS : num [1:36] 12.7 13.3 12 11.8 13.4 ...
## $ APM_SLTA : num [1:36] 61.1 68.7 57.1 54 73.2 ...
## $ APK_SLTA : num [1:36] 83.8 97.4 77.4 70.7 98.6 ...
## $ APS_7_12 : num [1:36] 99.9 100 100 99.7 99.9 ...
## $ APS_13_15: num [1:36] 95.4 99.6 94.8 94.6 99.5 ...
## $ APS_16_18: num [1:36] 74.5 78.3 75.9 65.4 77.3 ...
summary(data)
## AMH HLS APM_SLTA APK_SLTA
## Min. :88.56 Min. :11.81 Min. :47.45 Min. : 66.74
## 1st Qu.:93.36 1st Qu.:12.57 1st Qu.:59.38 1st Qu.: 83.73
## Median :94.57 Median :12.91 Median :64.13 Median : 90.19
## Mean :94.70 Mean :13.08 Mean :63.59 Mean : 88.82
## 3rd Qu.:95.91 3rd Qu.:13.37 3rd Qu.:68.02 3rd Qu.: 96.17
## Max. :99.15 Max. :15.57 Max. :76.93 Max. :102.71
## APS_7_12 APS_13_15 APS_16_18
## Min. :97.99 Min. :88.85 Min. :55.32
## 1st Qu.:99.25 1st Qu.:96.14 1st Qu.:67.52
## Median :99.64 Median :97.28 Median :73.95
## Mean :99.54 Mean :97.14 Mean :72.45
## 3rd Qu.:99.94 3rd Qu.:99.25 3rd Qu.:77.20
## Max. :99.99 Max. :99.99 Max. :85.94
EKSPLORASI DATA
hist(data$AMH, main = "ANGKA MELEK HURUF")

boxplot(data$AMH, main = "ANGKA MELEK HURUF")

ggpairs(data,
upper = list(continuous = wrap('cor', size = 3)),
title = "Matriks Scatterplot Data")

cor_matrix <- cor(data, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "lower",
col = colorRampPalette(c("red", "white", "blue"))(200),
addCoef.col = "black", tl.col = "black", tl.srt = 35)

PEMODELAN
TANPA FUNGSI
n <- nrow(data)
x0 <- rep(1,n)
x <- data.frame(x0, data$HLS, data$APM_SLTA, data$APK_SLTA,
data$APS_7_12, data$APS_13_15, data$APS_16_18)
x <- as.matrix(x)
y <- data$AMH
beta_duga <- solve(t(x)%*%x)%*%t(x)%*%y
beta_duga
## [,1]
## x0 41.60032808
## data.HLS 2.46838302
## data.APM_SLTA 0.05739631
## data.APK_SLTA -0.21609860
## data.APS_7_12 -0.04961141
## data.APS_13_15 0.47142596
## data.APS_16_18 -0.06202815
DENGAN FUNGSI
model1 = lm(formula = AMH ~ ., data = data)
summary(model1)
##
## Call:
## lm(formula = AMH ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8973 -0.9734 0.2477 0.9393 2.3396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.60033 56.60544 0.735 0.46829
## HLS 2.46838 0.44210 5.583 5.02e-06 ***
## APM_SLTA 0.05740 0.08784 0.653 0.51864
## APK_SLTA -0.21610 0.06578 -3.285 0.00267 **
## APS_7_12 -0.04961 0.56333 -0.088 0.93043
## APS_13_15 0.47143 0.16117 2.925 0.00662 **
## APS_16_18 -0.06203 0.06985 -0.888 0.38185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.508 on 29 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.5971
## F-statistic: 9.646 on 6 and 29 DF, p-value: 7.462e-06
PEMERIKSAAN MULTIKOLINEARITAS
vif(model1)
## HLS APM_SLTA APK_SLTA APS_7_12 APS_13_15 APS_16_18
## 2.520416 5.411143 5.410893 1.095676 2.967597 4.013443
PENGUJIAN ASUMSI
plot(model1,1)

plot(x = 1:dim(data)[1],
y = model1$residuals,
type = 'b',
ylab = "Residuals",
xlab = "Observation")

plot(model1,2)

PEMERIKSAAN AMATAN TIDAK BIASA
ri_stud <- rstudent(model1)
ri_stan <- rstandard(model1)
hii_fungsi <- hatvalues(model1)
nilai <- data.frame(ri_stud, ri_stan, hii_fungsi)
nilai
## ri_stud ri_stan hii_fungsi
## 1 1.11519990 1.11054402 0.10677209
## 2 0.94290652 0.94471505 0.10323361
## 3 0.78942162 0.79460085 0.28571659
## 4 -0.97089102 -0.97185280 0.17911519
## 5 0.12972873 0.13198534 0.14948871
## 6 -0.07126202 -0.07251682 0.14939608
## 7 -0.28255656 -0.28714886 0.32670759
## 8 0.35974013 0.36526457 0.13637986
## 9 0.40158678 0.40752314 0.18441096
## 10 -0.29915857 -0.30396843 0.45796399
## 11 0.04302874 0.04378892 0.10297839
## 12 0.73360257 0.73951461 0.27280171
## 13 -1.77422600 -1.71196037 0.12693421
## 14 -1.39952653 -1.37695211 0.32397004
## 15 -0.11776285 -0.11981764 0.30574717
## 16 -3.24227817 -2.81351452 0.15665874
## 17 0.42340580 0.42952744 0.32011304
## 18 -0.09223574 -0.09385410 0.19700324
## 19 0.59699960 0.60373649 0.18481150
## 20 0.66327971 0.66977876 0.10717949
## 21 0.21792134 0.22159082 0.12989928
## 22 0.30133719 0.30617495 0.07613060
## 23 1.22798962 1.21737426 0.06134822
## 24 0.63860462 0.64522641 0.07643499
## 25 -0.15370325 -0.15635793 0.17800840
## 26 -1.02296441 -1.02214603 0.09413390
## 27 0.68471044 0.69106858 0.43310918
## 28 -0.89764318 -0.90066449 0.10598712
## 29 -1.69436554 -1.64222116 0.18915295
## 30 1.06474921 1.06230340 0.27216443
## 31 0.65263765 0.65919475 0.23924686
## 32 -0.72934168 -0.73529969 0.34956823
## 33 -1.06181758 -1.05949187 0.42682062
## 34 1.65794826 1.61011146 0.07201544
## 35 1.45827629 1.43075071 0.08613932
## 36 -1.14864425 -1.14237088 0.03245829
PENCILAN
for (i in 1:dim(nilai)[1]){
absri <- abs(nilai[,2])
pencilan <- which(absri > 2)
}
pencilan
## [1] 16
n <- dim(data)[1]
p <- length(model1$coefficients)
for (i in 1:dim(nilai)[1]){
cutoff <- 2*p/n
titik_leverage <- which(hii_fungsi > cutoff)
}
titik_leverage
## 10 27 33
## 10 27 33
ols_plot_resid_lev(model1)

AMATAN BERPENGARUH
Jarak Cook
di <- cooks.distance(model1)
f <- qf(0.05,p,n-p, lower.tail = F)
data.frame(di, di>f)
## di di...f
## 1 2.106051e-02 FALSE
## 2 1.467728e-02 FALSE
## 3 3.607986e-02 FALSE
## 4 2.944104e-02 FALSE
## 5 4.374029e-04 FALSE
## 6 1.319445e-04 FALSE
## 7 5.715729e-03 FALSE
## 8 3.009848e-03 FALSE
## 9 5.364408e-03 FALSE
## 10 1.115224e-02 FALSE
## 11 3.144659e-05 FALSE
## 12 2.930824e-02 FALSE
## 13 6.087249e-02 FALSE
## 14 1.298012e-01 FALSE
## 15 9.032090e-04 FALSE
## 16 2.100640e-01 FALSE
## 17 1.240939e-02 FALSE
## 18 3.087223e-04 FALSE
## 19 1.180505e-02 FALSE
## 20 7.693292e-03 FALSE
## 21 1.047231e-03 FALSE
## 22 1.103545e-03 FALSE
## 23 1.383718e-02 FALSE
## 24 4.922106e-03 FALSE
## 25 7.563362e-04 FALSE
## 26 1.550993e-02 FALSE
## 27 5.212454e-02 FALSE
## 28 1.373844e-02 FALSE
## 29 8.987511e-02 FALSE
## 30 6.028333e-02 FALSE
## 31 1.952234e-02 FALSE
## 32 4.151078e-02 FALSE
## 33 1.194131e-01 FALSE
## 34 2.874079e-02 FALSE
## 35 2.756458e-02 FALSE
## 36 6.254205e-03 FALSE
cooks_crit = f
model_cooks <- cooks.distance(model1)
df <- data.frame(obs = names(model_cooks),
cooks = model_cooks)
ggplot(df, aes(y = cooks, x = obs)) +
geom_point() +
geom_hline(yintercept = cooks_crit, linetype="dashed") +
labs(title = "Cook's Distance",
subtitle = "Influential Observation ",
x = "Observation Number",
y = "Cook's")

DFBETAS
ols_plot_dfbetas(model1)


DFFITS
ols_plot_dffits(model1)

DFFITSi <- dffits(model1)
amatan_berpengaruh <- vector("list", dim(nilai)[1])
for (i in 1:dim(nilai)[1]) {
cutoff <- 2 * sqrt((p / n))
amatan_berpengaruh[[i]] <- which(abs(DFFITSi) > cutoff)
}
berpengaruh <- unlist(amatan_berpengaruh)
amatan_berpengaruh <- sort(unique(berpengaruh))
amatan_berpengaruh
## [1] 14 16 33
PEMODELAN PENYISIHAN AMATAN
dt1 <- data %>% slice(-12)
model3 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt1)
dt2 <- data %>% slice(-21)
model4 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt2)
dt3 <- data %>% slice(-22)
model5 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt3)
dt4 <- data %>% slice(-c(12, 21))
model6 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt4)
dt5 <- data %>% slice(-c(12, 22))
model7 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt5)
dt6 <- data%>% slice(-c(21, 22))
model8 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt6)
dt7 <- data %>% slice(-c(12, 21, 22))
model9 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, dt7)
get_metrics <- function(model) {
adj_r2 <- summary(model)$adj.r.squared
sse <- sum(model$residuals^2)
return(c(adj_r2, sse))
}
model_metrics <- data.frame(
Model = c("Model 3", "Model 4", "Model 5", "Model 6",
"Model 7", "Model 8", "Model 9"),
Adjusted_R2 = sapply(list(model3, model4, model5,
model6, model7, model8, model9),
function(m) get_metrics(m)[1]),
SSE = sapply(list(model3, model4, model5,
model6, model7, model8, model9),
function(m) get_metrics(m)[2])
)
print(model_metrics)
## Model Adjusted_R2 SSE
## 1 Model 3 0.5912891 64.73760
## 2 Model 4 0.5922023 65.87016
## 3 Model 5 0.5939918 65.76859
## 4 Model 6 0.5865412 64.60361
## 5 Model 7 0.5877161 64.59805
## 6 Model 8 0.5887497 65.65012
## 7 Model 9 0.5826189 64.45872
stargazer(model4, model9, type = "text", font.size = "small",
report = "vc*p")
##
## ===============================================================
## Dependent variable:
## -------------------------------------------
## AMH
## (1) (2)
## ---------------------------------------------------------------
## HLS 2.459*** 2.560***
## p = 0.00001 p = 0.00003
##
## APM_SLTA 0.057 0.075
## p = 0.529 p = 0.457
##
## APK_SLTA -0.217*** -0.239***
## p = 0.004 p = 0.006
##
## APS_7_12 -0.068 -0.105
## p = 0.907 p = 0.863
##
## APS_13_15 0.470*** 0.500***
## p = 0.008 p = 0.009
##
## APS_16_18 -0.059 -0.071
## p = 0.424 p = 0.375
##
## Constant 43.615 44.589
## p = 0.461 p = 0.464
##
## ---------------------------------------------------------------
## Observations 35 33
## R2 0.664 0.661
## Adjusted R2 0.592 0.583
## Residual Std. Error 1.534 (df = 28) 1.575 (df = 26)
## F Statistic 9.229*** (df = 6; 28) 8.445*** (df = 6; 26)
## ===============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
PENYELEKSIAN PEUBAH
BEST SUBSET
bs <- ols_step_best_subset(model1)
bs
## Best Subsets Regression
## -----------------------------------------------------------------
## Model Index Predictors
## -----------------------------------------------------------------
## 1 HLS
## 2 HLS APK_SLTA
## 3 HLS APK_SLTA APS_13_15
## 4 HLS APK_SLTA APS_13_15 APS_16_18
## 5 HLS APM_SLTA APK_SLTA APS_13_15 APS_16_18
## 6 HLS APM_SLTA APK_SLTA APS_7_12 APS_13_15 APS_16_18
## -----------------------------------------------------------------
##
## Subsets Regression Summary
## ---------------------------------------------------------------------------------------------------------------------------------
## Adj. Pred
## Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
## ---------------------------------------------------------------------------------------------------------------------------------
## 1 0.4751 0.4596 0.4337 13.6030 146.2710 43.1764 151.0215 109.8713 3.2212 0.0925 0.5867
## 2 0.5646 0.5382 0.4778 7.8276 141.5414 39.0832 147.8754 93.9862 2.8254 0.0815 0.5146
## 3 0.6570 0.6249 0.5701 1.7970 134.9506 34.3658 142.8682 76.4216 2.3540 0.0683 0.4287
## 4 0.6610 0.6172 0.5487 3.4512 136.5303 36.4915 146.0314 78.0524 2.4618 0.0721 0.4484
## 5 0.6661 0.6104 0.5312 5.0078 137.9842 38.5970 149.0688 79.5282 2.5666 0.0759 0.4675
## 6 0.6662 0.5971 0.5121 7.0000 139.9746 41.0738 152.6427 82.3464 2.7176 0.0813 0.4950
## ---------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria
## SBIC: Sawa's Bayesian Information Criteria
## SBC: Schwarz Bayesian Criteria
## MSEP: Estimated error of prediction, assuming multivariate normality
## FPE: Final Prediction Error
## HSP: Hocking's Sp
## APC: Amemiya Prediction Criteria
Backward, Forward, Stepwise
null<-lm(AMH ~ 1, data=data) # 1 here means the intercept
full<-lm(AMH ~ ., data=data)
step(full, scope=list(lower=null, upper=full),data=data, direction='backward', trace=0)
##
## Call:
## lm(formula = AMH ~ HLS + APK_SLTA + APS_13_15, data = data)
##
## Coefficients:
## (Intercept) HLS APK_SLTA APS_13_15
## 39.2276 2.3702 -0.1992 0.4341
step(null, scope=list(lower=null, upper=full),data=data, direction='forward', trace=0)
##
## Call:
## lm(formula = AMH ~ HLS + APK_SLTA + APS_13_15, data = data)
##
## Coefficients:
## (Intercept) HLS APK_SLTA APS_13_15
## 39.2276 2.3702 -0.1992 0.4341
step(null, scope=list(lower=null, upper=full),data=data, direction='both', trace=0)
##
## Call:
## lm(formula = AMH ~ HLS + APK_SLTA + APS_13_15, data = data)
##
## Coefficients:
## (Intercept) HLS APK_SLTA APS_13_15
## 39.2276 2.3702 -0.1992 0.4341
model1 <- lm(AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 + APS_16_18, data=data)
summary(model1)
##
## Call:
## lm(formula = AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 +
## APS_16_18, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8973 -0.9734 0.2477 0.9393 2.3396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.60033 56.60544 0.735 0.46829
## HLS 2.46838 0.44210 5.583 5.02e-06 ***
## APM_SLTA 0.05740 0.08784 0.653 0.51864
## APK_SLTA -0.21610 0.06578 -3.285 0.00267 **
## APS_7_12 -0.04961 0.56333 -0.088 0.93043
## APS_13_15 0.47143 0.16117 2.925 0.00662 **
## APS_16_18 -0.06203 0.06985 -0.888 0.38185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.508 on 29 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.5971
## F-statistic: 9.646 on 6 and 29 DF, p-value: 7.462e-06
AIC(model1); BIC(model1)
## [1] 139.9746
## [1] 152.6427
SHRINKAGE METHODS
Ridge
x <- as.matrix(data[, -1])
y <- data$AMH
rid <- cv.glmnet(x, y, alpha = 0, nfolds = 5)
rid
##
## Call: cv.glmnet(x = x, y = y, nfolds = 5, alpha = 0)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.2135 97 2.549 0.5271 6
## 1se 0.8619 82 3.020 0.6731 6
coef(rid, s = "lambda.min")
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 57.95766167
## HLS 1.91283540
## APM_SLTA -0.01454526
## APK_SLTA -0.11433811
## APS_7_12 -0.08079900
## APS_13_15 0.32912194
## APS_16_18 -0.01546010
Lasso
lass <- cv.glmnet(x, y, alpha=1, nfolds = 5)
lass
##
## Call: cv.glmnet(x = x, y = y, nfolds = 5, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.02038 48 2.949 0.6391 6
## 1se 0.22893 22 3.568 1.2502 3
coef(lass,s="lambda.min")
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 42.23790488
## HLS 2.35745416
## APM_SLTA 0.01574838
## APK_SLTA -0.18475888
## APS_7_12 -0.01331516
## APS_13_15 0.41898899
## APS_16_18 -0.03222835
summary(model3)
##
## Call:
## lm(formula = AMH ~ HLS + APM_SLTA + APK_SLTA + APS_7_12 + APS_13_15 +
## APS_16_18, data = dt1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7592 -1.0963 0.2280 0.9297 2.2871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.74609 57.08299 0.749 0.46020
## HLS 2.58348 0.47248 5.468 7.73e-06 ***
## APM_SLTA 0.08035 0.09391 0.856 0.39951
## APK_SLTA -0.24207 0.07517 -3.220 0.00323 **
## APS_7_12 -0.08973 0.57049 -0.157 0.87615
## APS_13_15 0.50479 0.16871 2.992 0.00573 **
## APS_16_18 -0.07706 0.07333 -1.051 0.30236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.521 on 28 degrees of freedom
## Multiple R-squared: 0.6634, Adjusted R-squared: 0.5913
## F-statistic: 9.198 on 6 and 28 DF, p-value: 1.353e-05