library(readxl)
library(leaps)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:zoo':
##
## yearmon, yearqtr
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
rumah = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\Semester 5\\PSD\\praktikum\\rumah3.xlsx")
str(rumah)
## tibble [1,047 × 8] (S3: tbl_df/tbl/data.frame)
## $ jenis_properti: chr [1:1047] "Rumah" "Rumah" "Rumah" "Rumah" ...
## $ harga_rp : chr [1:1047] "2.000.000.000" "799.000.000" "2.300.000.000" "6.250.000.000" ...
## $ lokasi : num [1:1047] 1 3 1 1 3 1 6 1 11 1 ...
## $ Jml_KM(Buah) : chr [1:1047] "4" "2" "11" "5" ...
## $ Jml_KT(Buah) : chr [1:1047] "3" "2" "4" "5" ...
## $ Jml_gar(Buah) : chr [1:1047] "2" "1" "1" "2" ...
## $ LT(m²) : chr [1:1047] "90" "102" "90" "294" ...
## $ LB(m²) : chr [1:1047] "130" "90" "150" "350" ...
# jumlah baris yang punya NA di kolom manapun
sum(!complete.cases(rumah))
## [1] 10
rumah[!complete.cases(rumah$lokasi), ]
## # A tibble: 4 × 8
## jenis_properti harga_rp lokasi `Jml_KM(Buah)` `Jml_KT(Buah)` `Jml_gar(Buah)`
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Rumah 6.500.000… NA 4 3 3
## 2 Rumah 589.000.0… NA 3 2 1
## 3 Rumah 1.300.000… NA 3 2 1
## 4 Rumah 563.000.0… NA 2 2 1
## # ℹ 2 more variables: `LT(m²)` <chr>, `LB(m²)` <chr>
nrow(rumah)
## [1] 1047
1047 baris masih mengandung missing value dan outlier
# Hapus mising value
rumah_clean <- rumah %>% drop_na()
sum(!complete.cases(rumah_clean))
## [1] 0
nrow(rumah_clean) ; ncol(rumah_clean)
## [1] 1037
## [1] 8
hasil penghapusan outlier baris menjadi berjumlah 1037
# Menghapus peubah jenis properti dan mengubah peubah menjadi numerik
rumah2 = dplyr::select(rumah_clean, -jenis_properti)
str(rumah2)
## tibble [1,037 × 7] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : chr [1:1037] "2.000.000.000" "799.000.000" "2.300.000.000" "6.250.000.000" ...
## $ lokasi : num [1:1037] 1 3 1 1 3 1 6 1 11 1 ...
## $ Jml_KM(Buah) : chr [1:1037] "4" "2" "11" "5" ...
## $ Jml_KT(Buah) : chr [1:1037] "3" "2" "4" "5" ...
## $ Jml_gar(Buah): chr [1:1037] "2" "1" "1" "2" ...
## $ LT(m²) : chr [1:1037] "90" "102" "90" "294" ...
## $ LB(m²) : chr [1:1037] "130" "90" "150" "350" ...
library(dplyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
rumah2 = rumah_clean %>%
clean_names() %>% # biar nama kolom jadi aman snake_case
mutate(
harga_rp = as.numeric(gsub("\\.", "", harga_rp)),
jml_km_buah = as.numeric(jml_km_buah),
jml_kt_buah = as.numeric(jml_kt_buah),
jml_gar_buah= as.numeric(jml_gar_buah),
lt_m2 = as.numeric(lt_m2),
lb_m2 = as.numeric(lb_m2)
)
str(rumah2)
## tibble [1,037 × 8] (S3: tbl_df/tbl/data.frame)
## $ jenis_properti: chr [1:1037] "Rumah" "Rumah" "Rumah" "Rumah" ...
## $ harga_rp : num [1:1037] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ lokasi : num [1:1037] 1 3 1 1 3 1 6 1 11 1 ...
## $ jml_km_buah : num [1:1037] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1037] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah : num [1:1037] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1037] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1037] 130 90 150 350 50 80 112 90 400 200 ...
# Cek hasilnya
rumah2
## # A tibble: 1,037 × 8
## jenis_properti harga_rp lokasi jml_km_buah jml_kt_buah jml_gar_buah lt_m2
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rumah 2000000000 1 4 3 2 90
## 2 Rumah 799000000 3 2 2 1 102
## 3 Rumah 2300000000 1 11 4 1 90
## 4 Rumah 6250000000 1 5 5 2 294
## 5 Rumah 600000000 3 2 1 1 66
## 6 Rumah 1100000000 1 3 2 1 60
## 7 Rumah 550000000 6 3 3 0 60
## 8 Rumah 2600000000 1 3 3 2 98
## 9 Rumah 4500000000 11 4 4 2 288
## 10 Rumah 3900000000 1 4 4 0 80
## # ℹ 1,027 more rows
## # ℹ 1 more variable: lb_m2 <dbl>
str(rumah2)
## tibble [1,037 × 8] (S3: tbl_df/tbl/data.frame)
## $ jenis_properti: chr [1:1037] "Rumah" "Rumah" "Rumah" "Rumah" ...
## $ harga_rp : num [1:1037] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ lokasi : num [1:1037] 1 3 1 1 3 1 6 1 11 1 ...
## $ jml_km_buah : num [1:1037] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1037] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah : num [1:1037] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1037] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1037] 130 90 150 350 50 80 112 90 400 200 ...
head(rumah2)
## # A tibble: 6 × 8
## jenis_properti harga_rp lokasi jml_km_buah jml_kt_buah jml_gar_buah lt_m2
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rumah 2000000000 1 4 3 2 90
## 2 Rumah 799000000 3 2 2 1 102
## 3 Rumah 2300000000 1 11 4 1 90
## 4 Rumah 6250000000 1 5 5 2 294
## 5 Rumah 600000000 3 2 1 1 66
## 6 Rumah 1100000000 1 3 2 1 60
## # ℹ 1 more variable: lb_m2 <dbl>
# Cek missing value kembali
colSums(is.na(rumah2))
## jenis_properti harga_rp lokasi jml_km_buah jml_kt_buah
## 0 0 0 0 0
## jml_gar_buah lt_m2 lb_m2
## 0 0 0
rumah2 %>% filter(is.na(lokasi))
## # A tibble: 0 × 8
## # ℹ 8 variables: jenis_properti <chr>, harga_rp <dbl>, lokasi <dbl>,
## # jml_km_buah <dbl>, jml_kt_buah <dbl>, jml_gar_buah <dbl>, lt_m2 <dbl>,
## # lb_m2 <dbl>
rumah_dummy = fastDummies::dummy_cols(
rumah2,
select_columns = "lokasi",
remove_first_dummy = TRUE # otomatis jadi k-1
)
str(rumah_dummy)
## tibble [1,037 × 18] (S3: tbl_df/tbl/data.frame)
## $ jenis_properti: chr [1:1037] "Rumah" "Rumah" "Rumah" "Rumah" ...
## $ harga_rp : num [1:1037] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ lokasi : num [1:1037] 1 3 1 1 3 1 6 1 11 1 ...
## $ jml_km_buah : num [1:1037] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1037] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah : num [1:1037] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1037] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1037] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : int [1:1037] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : int [1:1037] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_11 : int [1:1037] 0 0 0 0 0 0 0 0 1 0 ...
View(rumah_dummy)
# Hapus variable dummy "lokasi_11"
rumah2_dummy = dplyr::select(rumah_dummy,-jenis_properti, -lokasi, -lokasi_11)
str(rumah2_dummy)
## tibble [1,037 × 15] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : num [1:1037] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ jml_km_buah : num [1:1037] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1037] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah: num [1:1037] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1037] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1037] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : int [1:1037] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : int [1:1037] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : int [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
# ubah kolom lokasi_2 sampai lokasi_10 menjadi numeric
rumah2_dummy <- rumah2_dummy %>%
mutate(across(lokasi_2:lokasi_10, as.numeric))
# cek hasilnya
str(rumah2_dummy)
## tibble [1,037 × 15] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : num [1:1037] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ jml_km_buah : num [1:1037] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1037] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah: num [1:1037] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1037] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1037] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : num [1:1037] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : num [1:1037] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : num [1:1037] 0 0 0 0 0 0 0 0 0 0 ...
# Model regresi
reg = lm(harga_rp ~., data = rumah2_dummy)
summary(reg)
##
## Call:
## lm(formula = harga_rp ~ ., data = rumah2_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.330e+10 -8.023e+08 2.417e+08 8.084e+08 1.630e+11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.713e+09 6.239e+08 -2.745 0.00616 **
## jml_km_buah 6.311e+08 2.136e+08 2.955 0.00320 **
## jml_kt_buah -5.440e+08 2.471e+08 -2.202 0.02792 *
## jml_gar_buah 3.177e+08 1.900e+08 1.672 0.09480 .
## lt_m2 1.146e+07 9.042e+05 12.679 < 2e-16 ***
## lb_m2 1.911e+07 1.618e+06 11.811 < 2e-16 ***
## lokasi_2 -5.694e+08 5.490e+08 -1.037 0.29993
## lokasi_3 -1.126e+09 5.683e+08 -1.981 0.04783 *
## lokasi_4 -3.006e+09 2.146e+09 -1.401 0.16156
## lokasi_5 -1.577e+09 2.058e+09 -0.766 0.44377
## lokasi_6 -2.500e+09 8.548e+08 -2.924 0.00353 **
## lokasi_7 -3.932e+09 1.326e+09 -2.965 0.00309 **
## lokasi_8 -1.855e+09 3.360e+09 -0.552 0.58113
## lokasi_9 -1.140e+10 6.697e+09 -1.703 0.08896 .
## lokasi_10 -2.274e+09 6.683e+09 -0.340 0.73372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.665e+09 on 1022 degrees of freedom
## Multiple R-squared: 0.5577, Adjusted R-squared: 0.5517
## F-statistic: 92.06 on 14 and 1022 DF, p-value: < 2.2e-16
# Uji formal
shapiro = shapiro.test(residuals(reg)) # H0: residual ~ normal
# Hasil
cat("UJI NORMALITAS (Shapiro-Wilk):\n")
## UJI NORMALITAS (Shapiro-Wilk):
cat("W =", shapiro$statistic, " | p-value =", shapiro$p.value, "\n")
## W = 0.3130196 | p-value = 4.439824e-52
if(shapiro$p.value > 0.05){
cat(">> Residual berdistribusi normal (p > 0.05)\n\n")
} else {
cat(">> Residual TIDAK berdistribusi normal (p <= 0.05)\n\n")
}
## >> Residual TIDAK berdistribusi normal (p <= 0.05)
# Plot QQ-plot
residuals_reg = residuals(reg) # residual dari model
fitted_reg = fitted(reg) # nilai prediksi
qqnorm(residuals_reg, main = "QQ Plot Residual")
qqline(residuals_reg, col = "red", lwd = 2)
hist(residuals_reg,
breaks = 30,
main = "Histogram Residual",
xlab = "Residuals",
col = " blue",
border = "white")
bp = bptest(reg)
# Hasil
cat("UJI HOMOSKEDASTISITAS (Breusch-Pagan):\n")
## UJI HOMOSKEDASTISITAS (Breusch-Pagan):
cat("BP =", bp$statistic, " | p-value =", bp$p.value, "\n")
## BP = 262.4065 | p-value = 7.756303e-48
if(bp$p.value > 0.05){
cat(">> Tidak ada masalah heteroskedastisitas (varian residual konstan)\n\n")
} else {
cat(">> Ada indikasi heteroskedastisitas (varian residual tidak konstan)\n\n")
}
## >> Ada indikasi heteroskedastisitas (varian residual tidak konstan)
plot(fitted_reg, residuals_reg,
main = "Fitted vs Residuals",
xlab = "Fitted Values",
ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lwd = 2)
vif = vif(reg)
# Hasil
cat("UJI MULTIKOLINEARITAS (VIF):\n")
## UJI MULTIKOLINEARITAS (VIF):
print(vif)
## jml_km_buah jml_kt_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 2.736018 3.256056 1.770220 2.087820 2.608053 1.487995
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_8
## 1.472665 1.026294 1.037738 1.172608 1.078105 1.012837
## lokasi_9 lokasi_10
## 1.008730 1.004563
if(all(vif < 10)){
cat(">> Tidak ada multikolinearitas serius (semua VIF < 10)\n")
} else {
cat(">> Ada multikolinearitas serius (VIF >= 10)\n")
}
## >> Tidak ada multikolinearitas serius (semua VIF < 10)
# Durbin-Watson test
dw = dwtest(reg)
# Hasil
cat("UJI AUTOKORELASI (Durbin-Watson):\n")
## UJI AUTOKORELASI (Durbin-Watson):
cat("DW =", dw$statistic, " | p-value =", dw$p.value, "\n")
## DW = 1.902158 | p-value = 0.05069347
if(dw$p.value > 0.05){
cat(">> Tidak ada autokorelasi pada residual\n\n")
} else {
cat(">> Ada indikasi autokorelasi pada residual\n\n")
}
## >> Tidak ada autokorelasi pada residual
# Bonferroni test untuk outlier signifikan
outlier = outlierTest(reg)
# Hasil
if(is.null(outlier)){
cat("✅ Tidak ada outlier signifikan berdasarkan uji Bonferroni\n")
} else {
print(outlier)
cat("⚠️ Ada outlier signifikan, periksa kembali data di atas\n")
}
## rstudent unadjusted p-value Bonferroni p
## 528 48.719832 1.2985e-268 1.3440e-265
## 345 -11.885797 1.3180e-30 1.3642e-27
## 588 -7.982211 3.8474e-15 3.9820e-12
## 538 5.049411 5.2463e-07 5.4299e-04
## 323 -5.017572 6.1697e-07 6.3856e-04
## 346 -4.814501 1.6985e-06 1.7580e-03
## ⚠️ Ada outlier signifikan, periksa kembali data di atas
# Plot residual vs fitted
plot(reg, which = 1)
# Tambahkan titik outlier (jika ada)
if(!is.null(outlier)){
points(reg$fitted.values[as.numeric(names(outlier$rstudent))],
outlier$rstudent,
col = "red", pch = 19)
text(reg$fitted.values[as.numeric(names(outlier$rstudent))],
outlier$rstudent,
labels = names(outlier$rstudent),
pos = 3, col = "red")
}
std_res = rstudent(reg)
outlier_idx = which(abs(std_res)>3)
# Jumlah outlier
cat("Jumlah outlier =", length(outlier_idx), "\n")
## Jumlah outlier = 12
# Kalau mau lihat data keberapa yang outlier
outlier_idx
## 164 171 294 323 345 346 458 460 528 538 555 588
## 164 171 294 323 345 346 458 460 528 538 555 588
# Hapus baris yang outlier
rumah.clean = rumah2_dummy[-outlier_idx, ]
str(rumah.clean)
## tibble [1,025 × 15] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : num [1:1025] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ jml_km_buah : num [1:1025] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1025] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah: num [1:1025] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1025] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1025] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : num [1:1025] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
# Cek jumlah baris sebelum dan sesudah
cat("Sebelum:", nrow(rumah2_dummy), "baris\n")
## Sebelum: 1037 baris
cat("Sesudah:", nrow(rumah.clean), "baris\n")
## Sesudah: 1025 baris
Terlihat bahwa setelah baris berkurang setelah dilakukan penghapusan outlier, dari baris yang berjumlah 1037 baris menjadi 1025 baris.
# Konversi semua kolom lokasi jadi numeric di rumah.clean
rumah.clean[, grep("^lokasi", names(rumah.clean))] <-
lapply(rumah.clean[, grep("^lokasi", names(rumah.clean))], as.numeric)
# Cek struktur data setelah perubahan
str(rumah.clean)
## tibble [1,025 × 15] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : num [1:1025] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ jml_km_buah : num [1:1025] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1025] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah: num [1:1025] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1025] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1025] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : num [1:1025] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
# Regresi modedl tanpa NA dan outlier
reg2 = lm(harga_rp ~ ., data = rumah.clean)
summary(reg2)
##
## Call:
## lm(formula = harga_rp ~ ., data = rumah.clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.346e+10 -6.335e+08 1.769e+07 5.172e+08 2.118e+10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.192e+08 2.720e+08 0.806 0.42047
## jml_km_buah -2.733e+08 1.130e+08 -2.418 0.01578 *
## jml_kt_buah -1.090e+08 1.156e+08 -0.943 0.34585
## jml_gar_buah 5.792e+08 9.022e+07 6.420 2.10e-10 ***
## lt_m2 8.439e+06 8.093e+05 10.427 < 2e-16 ***
## lb_m2 1.819e+07 8.977e+05 20.268 < 2e-16 ***
## lokasi_2 -5.345e+08 2.207e+08 -2.422 0.01559 *
## lokasi_3 -9.860e+08 2.291e+08 -4.304 1.84e-05 ***
## lokasi_4 -2.173e+09 8.594e+08 -2.528 0.01161 *
## lokasi_5 -1.584e+09 8.245e+08 -1.921 0.05500 .
## lokasi_6 -1.576e+09 3.443e+08 -4.578 5.29e-06 ***
## lokasi_7 -1.629e+09 5.362e+08 -3.038 0.00244 **
## lokasi_8 -1.208e+09 1.344e+09 -0.899 0.36890
## lokasi_9 -8.493e+09 2.686e+09 -3.162 0.00161 **
## lokasi_10 -2.360e+09 2.672e+09 -0.883 0.37744
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.665e+09 on 1010 degrees of freedom
## Multiple R-squared: 0.7684, Adjusted R-squared: 0.7652
## F-statistic: 239.3 on 14 and 1010 DF, p-value: < 2.2e-16
best.subset <- regsubsets(
harga_rp ~ .,
data = rumah.clean,
nvmax = ncol(rumah.clean) - 1
)
best.subset
## Subset selection object
## Call: regsubsets.formula(harga_rp ~ ., data = rumah.clean, nvmax = ncol(rumah.clean) -
## 1)
## 14 Variables (and intercept)
## Forced in Forced out
## jml_km_buah FALSE FALSE
## jml_kt_buah FALSE FALSE
## jml_gar_buah FALSE FALSE
## lt_m2 FALSE FALSE
## lb_m2 FALSE FALSE
## lokasi_2 FALSE FALSE
## lokasi_3 FALSE FALSE
## lokasi_4 FALSE FALSE
## lokasi_5 FALSE FALSE
## lokasi_6 FALSE FALSE
## lokasi_7 FALSE FALSE
## lokasi_8 FALSE FALSE
## lokasi_9 FALSE FALSE
## lokasi_10 FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: exhaustive
# Ringkasan hasil Best Subset Selection
summary.best = summary(best.subset)
summary.best
## Subset selection object
## Call: regsubsets.formula(harga_rp ~ ., data = rumah.clean, nvmax = ncol(rumah.clean) -
## 1)
## 14 Variables (and intercept)
## Forced in Forced out
## jml_km_buah FALSE FALSE
## jml_kt_buah FALSE FALSE
## jml_gar_buah FALSE FALSE
## lt_m2 FALSE FALSE
## lb_m2 FALSE FALSE
## lokasi_2 FALSE FALSE
## lokasi_3 FALSE FALSE
## lokasi_4 FALSE FALSE
## lokasi_5 FALSE FALSE
## lokasi_6 FALSE FALSE
## lokasi_7 FALSE FALSE
## lokasi_8 FALSE FALSE
## lokasi_9 FALSE FALSE
## lokasi_10 FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: exhaustive
## jml_km_buah jml_kt_buah jml_gar_buah lt_m2 lb_m2 lokasi_2 lokasi_3
## 1 ( 1 ) " " " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " "
## 3 ( 1 ) " " " " "*" "*" "*" " " " "
## 4 ( 1 ) "*" " " "*" "*" "*" " " " "
## 5 ( 1 ) "*" " " "*" "*" "*" " " " "
## 6 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 7 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 8 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 9 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 10 ( 1 ) "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" " " "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_8 lokasi_9 lokasi_10
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " " " "
## 7 ( 1 ) " " " " "*" " " " " "*" " "
## 8 ( 1 ) " " " " "*" "*" " " "*" " "
## 9 ( 1 ) "*" " " "*" "*" " " "*" " "
## 10 ( 1 ) "*" " " "*" "*" " " "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" " "
## 12 ( 1 ) "*" "*" "*" "*" " " "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# Tampilkan ukuran model
summary.best$which # variabel apa saja yang masuk tiap model
## (Intercept) jml_km_buah jml_kt_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 1 TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## 3 TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## 4 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 5 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 7 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 8 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 9 TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 10 TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## 11 TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_8 lokasi_9 lokasi_10
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 6 TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 7 TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## 8 TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## 9 TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE
## 10 TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE
## 11 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## 12 TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
summary.best$adjr2 # adjusted R^2
## [1] 0.7053393 0.7445768 0.7517270 0.7561850 0.7584367 0.7601913 0.7619955
## [8] 0.7631836 0.7640200 0.7647518 0.7653086 0.7652624 0.7652145 0.7651634
summary.best$cp # Mallows' Cp
## [1] 262.60684 92.59183 62.41744 43.99706 35.18825 28.55370 21.71906
## [8] 17.56516 14.94163 12.77720 11.37350 12.57318 13.77971 15.00000
summary.best$bic # BIC
## [1] -1239.616 -1380.161 -1403.334 -1415.979 -1419.562 -1421.108 -1422.924
## [8] -1422.129 -1419.833 -1417.094 -1413.602 -1407.480 -1401.352 -1395.210
# Plot kriteria untuk memilih model terbaik
par(mfrow = c(1, 3))
plot(summary.best$adjr2, type = "b", xlab = "Jumlah Variabel", ylab = "Adjusted R^2")
plot(summary.best$cp, type = "b", xlab = "Jumlah Variabel", ylab = "Cp")
plot(summary.best$bic, type = "b", xlab = "Jumlah Variabel", ylab = "BIC")
Adj \(R^2\)
semakin banyak variabel, model menjelaskan variasi data lebih baik ini ditunjukkan dari garis yang menaik
kenaikannya mulai melandai di sekitar 6-8 variabel
Cp. Mallow’s
model yang baik adalah dengan nilai Cp yang paling kecil
pada plot terlihat nilai Cp turun tajam pada saat jumlah variabel sekita 6-7 variabel
BIC
Model terbaik adalah model dengan nilai BIC yang paling kecil
pada plot terlihat titik terendah nilai BIC ada saat jumlah variabel sekitar 2-3 variablel
# Cari model terbaik berdasarkan kriteria
best.adjr2 = which.max(summary.best$adjr2) # Adjusted R²
best.cp = which.min(summary.best$cp) # Mallows' Cp
best.bic = which.min(summary.best$bic)
cat("Model terbaik berdasarkan Adjusted R² adalah M", best.adjr2, "\n")
## Model terbaik berdasarkan Adjusted R² adalah M 11
cat("Model terbaik berdasarkan Mallows' Cp adalah M", best.cp, "\n")
## Model terbaik berdasarkan Mallows' Cp adalah M 11
cat("Model terbaik berdasarkan BIC adalah M", best.bic, "\n\n")
## Model terbaik berdasarkan BIC adalah M 7
fw.model <- regsubsets(
harga_rp ~ .,
data = rumah.clean,
nvmax = 15,
method = "forward"
)
summary.fw = summary(fw.model)
summary.fw
## Subset selection object
## Call: regsubsets.formula(harga_rp ~ ., data = rumah.clean, nvmax = 15,
## method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## jml_km_buah FALSE FALSE
## jml_kt_buah FALSE FALSE
## jml_gar_buah FALSE FALSE
## lt_m2 FALSE FALSE
## lb_m2 FALSE FALSE
## lokasi_2 FALSE FALSE
## lokasi_3 FALSE FALSE
## lokasi_4 FALSE FALSE
## lokasi_5 FALSE FALSE
## lokasi_6 FALSE FALSE
## lokasi_7 FALSE FALSE
## lokasi_8 FALSE FALSE
## lokasi_9 FALSE FALSE
## lokasi_10 FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: forward
## jml_km_buah jml_kt_buah jml_gar_buah lt_m2 lb_m2 lokasi_2 lokasi_3
## 1 ( 1 ) " " " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " "*" "*" " " " "
## 3 ( 1 ) " " " " "*" "*" "*" " " " "
## 4 ( 1 ) "*" " " "*" "*" "*" " " " "
## 5 ( 1 ) "*" " " "*" "*" "*" " " " "
## 6 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 7 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 8 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 9 ( 1 ) "*" " " "*" "*" "*" " " "*"
## 10 ( 1 ) "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" " " "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_8 lokasi_9 lokasi_10
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " " " "
## 7 ( 1 ) " " " " "*" " " " " "*" " "
## 8 ( 1 ) " " " " "*" "*" " " "*" " "
## 9 ( 1 ) "*" " " "*" "*" " " "*" " "
## 10 ( 1 ) "*" " " "*" "*" " " "*" " "
## 11 ( 1 ) "*" "*" "*" "*" " " "*" " "
## 12 ( 1 ) "*" "*" "*" "*" " " "*" " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
length(summary.fw$adjr2)
## [1] 14
length(summary.fw$cp)
## [1] 14
length(summary.fw$bic)
## [1] 14
summary.fw = summary(fw.model)
# Buat tabel berisi Adjusted R², Cp, BIC
results = data.frame(
Jumlah_Variabel = 1:14,
Adj_R2 = summary.fw$adjr2,
Cp = summary.fw$cp,
BIC = summary.fw$bic
)
results
## Jumlah_Variabel Adj_R2 Cp BIC
## 1 1 0.7053393 262.60684 -1239.616
## 2 2 0.7445768 92.59183 -1380.161
## 3 3 0.7517270 62.41744 -1403.334
## 4 4 0.7561850 43.99706 -1415.979
## 5 5 0.7584367 35.18825 -1419.562
## 6 6 0.7601913 28.55370 -1421.108
## 7 7 0.7619955 21.71906 -1422.924
## 8 8 0.7631836 17.56516 -1422.129
## 9 9 0.7640200 14.94163 -1419.833
## 10 10 0.7647518 12.77720 -1417.094
## 11 11 0.7653086 11.37350 -1413.602
## 12 12 0.7652624 12.57318 -1407.480
## 13 13 0.7652145 13.77971 -1401.352
## 14 14 0.7651634 15.00000 -1395.210
# Cari model terbaik berdasarkan masing-masing kriteria
best_adjr2 = which.max(results$Adj_R2) # indeks model dengan Adj R² terbesar
best_cp = which.min(results$Cp) # indeks model dengan Cp terkecil
best_bic = which.min(results$BIC) # indeks model dengan BIC terkecil
# Visualisasi hasil seleksi
plot(fw.model, scale = "adjr2") # plot berdasarkan Adjusted R²
plot(fw.model, scale = "bic") # plot berdasarkan BIC
plot(fw.model, scale = "Cp") # plot berdasarkan Cp
# Model terbaik menurut Adj R²
best_adjr2 = which.max(results$Adj_R2)
cat("Model terbaik (Adj R²) adalah model ke-", best_adjr2,
"dengan", best_adjr2, "variabel:\n")
## Model terbaik (Adj R²) adalah model ke- 11 dengan 11 variabel:
print(coef(fw.model, best_adjr2))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
# Model terbaik menurut Cp
best_cp = which.min(results$Cp)
cat("\nModel terbaik (Cp) adalah model ke-", best_cp,
"dengan", best_cp, "variabel:\n")
##
## Model terbaik (Cp) adalah model ke- 11 dengan 11 variabel:
print(coef(fw.model, best_cp))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
# Model terbaik menurut BIC
best_bic = which.min(results$BIC)
cat("\nModel terbaik (BIC) adalah model ke-", best_bic,
"dengan", best_bic, "variabel:\n")
##
## Model terbaik (BIC) adalah model ke- 7 dengan 7 variabel:
print(coef(fw.model, best_bic))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_3
## -215620510 -342125852 565853026 8437699 18251029 -576357539
## lokasi_6 lokasi_9
## -1218056665 -7949345698
# Ubah ke long format untuk ggplot (hasil forward stepwise)
results_long_fw <- reshape2::melt(results, id.vars = "Jumlah_Variabel")
# Data titik model terbaik (Adj R², Cp, BIC) pada forward stepwise
best_points_fw <- data.frame(
Jumlah_Variabel = c(best_adjr2, best_cp, best_bic),
value = c(results$Adj_R2[best_adjr2],
results$Cp[best_cp],
results$BIC[best_bic]),
variable = c("Adj_R2","Cp","BIC")
)
# Plot hasil forward stepwise dengan model terbaik ditandai
ggplot(results_long_fw, aes(x = Jumlah_Variabel, y = value, color = variable)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_point(data = best_points_fw,
aes(x = Jumlah_Variabel, y = value),
color = "red", size = 3, shape = 17) +
theme_minimal() +
labs(title = "Forward Stepwise Selection (Model Terbaik)",
y = "Nilai Kriteria",
x = "Jumlah Variabel",
color = "Kriteria")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Hasil model terbaik Forward Stepwise Selection
cat("Model terbaik (Adj R²):", best_adjr2, "variabel\n")
## Model terbaik (Adj R²): 11 variabel
cat("Model terbaik (Cp):", best_cp, "variabel\n")
## Model terbaik (Cp): 11 variabel
cat("Model terbaik (BIC):", best_bic, "variabel\n")
## Model terbaik (BIC): 7 variabel
Model terbaik berdasarkan \(R^2_{adj}\) adalah: model 11 dengan 11 variable
Model terbaik berdasarkan Cp. Mallow’s adalah : model 11 dengan 11 variable
Model terbaik berdasarkan BIC adalah : model 7 dengan 7 variable
# Backward Stepwise Selection
bw.model = regsubsets(harga_rp ~ .,
data = rumah.clean,
nbest = 1,
nvmax = NULL,
method = "backward")
# Ringkasan hasil backward stepwise
summary.bw = summary(bw.model)
# Buat tabel hasil
results.bw = data.frame(
Jumlah_Variabel = 1:length(summary.bw$adjr2),
Adj_R2 = summary.bw$adjr2,
Cp = summary.bw$cp,
BIC = summary.bw$bic
)
results.bw
## Jumlah_Variabel Adj_R2 Cp BIC
## 1 1 0.7053393 262.60684 -1239.616
## 2 2 0.7445768 92.59183 -1380.161
## 3 3 0.7517270 62.41744 -1403.334
## 4 4 0.7561850 43.99706 -1415.979
## 5 5 0.7584367 35.18825 -1419.562
## 6 6 0.7601913 28.55370 -1421.108
## 7 7 0.7619955 21.71906 -1422.924
## 8 8 0.7631836 17.56516 -1422.129
## 9 9 0.7640200 14.94163 -1419.833
## 10 10 0.7647518 12.77720 -1417.094
## 11 11 0.7653086 11.37350 -1413.602
## 12 12 0.7652624 12.57318 -1407.480
## 13 13 0.7652145 13.77971 -1401.352
## 14 14 0.7651634 15.00000 -1395.210
# Cari model terbaik berdasarkan masing-masing kriteria
best_adjr2_bw = which.max(results.bw$Adj_R2)
best_cp_bw = which.min(results.bw$Cp)
best_bic_bw = which.min(results.bw$BIC)
# Tampilkan variabel yang terpilih
cat("\nVariabel pada model terbaik (Adj R²):\n")
##
## Variabel pada model terbaik (Adj R²):
print(coef(bw.model, best_adjr2_bw))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
cat("\nVariabel pada model terbaik (Cp):\n")
##
## Variabel pada model terbaik (Cp):
print(coef(bw.model, best_cp_bw))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
cat("\nVariabel pada model terbaik (BIC):\n")
##
## Variabel pada model terbaik (BIC):
print(coef(bw.model, best_bic_bw))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_3
## -215620510 -342125852 565853026 8437699 18251029 -576357539
## lokasi_6 lokasi_9
## -1218056665 -7949345698
# Visualisasi hasil backward stepwise
plot(bw.model, scale = "adjr2") # pakai Adj R²
plot(bw.model, scale = "Cp") # pakai Mallows' Cp
plot(bw.model, scale = "bic") # pakai BIC
library(ggplot2)
library(reshape2)
results_bw_long <- reshape2::melt(results.bw, id.vars = "Jumlah_Variabel")
ggplot(results_bw_long, aes(x = Jumlah_Variabel, y = value, color = variable)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_point(data = data.frame(
Jumlah_Variabel = c(best_adjr2_bw, best_cp_bw, best_bic_bw),
value = c(results.bw$Adj_R2[best_adjr2_bw],
results.bw$Cp[best_cp_bw],
results.bw$BIC[best_bic_bw]),
variable = c("Adj_R2","Cp","BIC")
), aes(x = Jumlah_Variabel, y = value),
color = "red", size = 3, shape = 17) +
theme_minimal() +
labs(title = "Backward Stepwise Selection (Model Terbaik)",
y = "Nilai Kriteria",
x = "Jumlah Variabel",
color = "Kriteria")
# # Hasil model terbaik Backward Stepwise Selection
cat("Model terbaik (Backward - Adj R²):", best_adjr2_bw, "variabel\n")
## Model terbaik (Backward - Adj R²): 11 variabel
cat("Model terbaik (Backward - Cp):", best_cp_bw, "variabel\n")
## Model terbaik (Backward - Cp): 11 variabel
cat("Model terbaik (Backward - BIC):", best_bic_bw, "variabel\n")
## Model terbaik (Backward - BIC): 7 variabel
# Hybrid Stepwise Selection
hy.model = regsubsets(harga_rp ~ .,
data = rumah.clean,
nbest = 1,
nvmax = NULL,
method = "seqrep")
# Ringkasan hasil hybrid
summary.hy = summary(hy.model)
# Buat tabel hasil
results.hy = data.frame(
Jumlah_Variabel = 1:length(summary.hy$adjr2),
Adj_R2 = summary.hy$adjr2,
Cp = summary.hy$cp,
BIC = summary.hy$bic
)
results.hy
## Jumlah_Variabel Adj_R2 Cp BIC
## 1 1 0.7053393 262.60684 -1239.616
## 2 2 0.7445768 92.59183 -1380.161
## 3 3 0.7517270 62.41744 -1403.334
## 4 4 0.6579726 470.57726 -1069.036
## 5 5 0.7584367 35.18825 -1419.562
## 6 6 0.7601913 28.55370 -1421.108
## 7 7 0.7568478 44.01194 -1400.991
## 8 8 0.7631836 17.56516 -1422.129
## 9 9 0.7640200 14.94163 -1419.833
## 10 10 0.7647518 12.77720 -1417.094
## 11 11 0.7653086 11.37350 -1413.602
## 12 12 0.7631291 21.76664 -1398.207
## 13 13 0.7652145 13.77971 -1401.352
## 14 14 0.7651634 15.00000 -1395.210
# Hasil model terbaik Hybrid Selection
best_adjr2_hy = which.max(results.hy$Adj_R2)
best_cp_hy = which.min(results.hy$Cp)
best_bic_hy = which.min(results.hy$BIC)
cat("Model terbaik (Hybrid - Adj R²):", best_adjr2_hy, "variabel\n")
## Model terbaik (Hybrid - Adj R²): 11 variabel
cat("Model terbaik (Hybrid - Cp):", best_cp_hy, "variabel\n")
## Model terbaik (Hybrid - Cp): 11 variabel
cat("Model terbaik (Hybrid - BIC):", best_bic_hy, "variabel\n")
## Model terbaik (Hybrid - BIC): 8 variabel
# Variabel yang terpilih
cat("\nVariabel pada model terbaik (Adj R²):\n")
##
## Variabel pada model terbaik (Adj R²):
print(coef(hy.model, best_adjr2_hy))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
cat("\nVariabel pada model terbaik (Cp):\n")
##
## Variabel pada model terbaik (Cp):
print(coef(hy.model, best_cp_hy))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 200519694 -351134768 570897420 8567784 17904860 -499483974
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_9
## -945784741 -2094467790 -1518320066 -1551144689 -1598904604 -8266786821
cat("\nVariabel pada model terbaik (BIC):\n")
##
## Variabel pada model terbaik (BIC):
print(coef(hy.model, best_bic_hy))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_3
## -158227626 -346991607 573875680 8502713 18207817 -628036083
## lokasi_6 lokasi_7 lokasi_9
## -1273605236 -1301213609 -8013660110
# Ubah ke long format untuk ggplot (hasil hybrid stepwise)
results_hy_long <- reshape2::melt(results.hy, id.vars = "Jumlah_Variabel")
# Data titik model terbaik (Adj R², Cp, BIC) pada hybrid stepwise
best_points_hy <- data.frame(
Jumlah_Variabel = c(best_adjr2_hy, best_cp_hy, best_bic_hy),
value = c(results.hy$Adj_R2[best_adjr2_hy],
results.hy$Cp[best_cp_hy],
results.hy$BIC[best_bic_hy]),
variable = c("Adj_R2","Cp","BIC")
)
# Plot hasil hybrid stepwise dengan model terbaik ditandai
ggplot(results_hy_long, aes(x = Jumlah_Variabel, y = value, color = variable)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_point(data = best_points_hy,
aes(x = Jumlah_Variabel, y = value),
color = "red", size = 3, shape = 17) +
theme_minimal() +
labs(title = "Hybrid Stepwise Selection (Model Terbaik)",
y = "Nilai Kriteria",
x = "Jumlah Variabel",
color = "Kriteria")
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# model penuh
model_full <- lm(harga_rp ~ ., data = rumah.clean)
# stepwise selection (AIC-based)
model_best <- stepAIC(model_full, direction = "both")
## Start: AIC=44506.85
## harga_rp ~ jml_km_buah + jml_kt_buah + jml_gar_buah + lt_m2 +
## lb_m2 + lokasi_2 + lokasi_3 + lokasi_4 + lokasi_5 + lokasi_6 +
## lokasi_7 + lokasi_8 + lokasi_9 + lokasi_10
##
## Df Sum of Sq RSS AIC
## - lokasi_10 1 5.5368e+18 7.1777e+21 44506
## - lokasi_8 1 5.7385e+18 7.1779e+21 44506
## - jml_kt_buah 1 6.3162e+18 7.1785e+21 44506
## <none> 7.1722e+21 44507
## - lokasi_5 1 2.6207e+19 7.1984e+21 44509
## - jml_km_buah 1 4.1522e+19 7.2137e+21 44511
## - lokasi_2 1 4.1670e+19 7.2138e+21 44511
## - lokasi_4 1 4.5394e+19 7.2175e+21 44511
## - lokasi_7 1 6.5536e+19 7.2377e+21 44514
## - lokasi_9 1 7.1011e+19 7.2432e+21 44515
## - lokasi_3 1 1.3152e+20 7.3037e+21 44523
## - lokasi_6 1 1.4881e+20 7.3210e+21 44526
## - jml_gar_buah 1 2.9266e+20 7.4648e+21 44546
## - lt_m2 1 7.7210e+20 7.9443e+21 44610
## - lb_m2 1 2.9172e+21 1.0089e+22 44855
##
## Step: AIC=44505.64
## harga_rp ~ jml_km_buah + jml_kt_buah + jml_gar_buah + lt_m2 +
## lb_m2 + lokasi_2 + lokasi_3 + lokasi_4 + lokasi_5 + lokasi_6 +
## lokasi_7 + lokasi_8 + lokasi_9
##
## Df Sum of Sq RSS AIC
## - lokasi_8 1 5.6346e+18 7.1833e+21 44504
## - jml_kt_buah 1 6.0719e+18 7.1838e+21 44505
## <none> 7.1777e+21 44506
## + lokasi_10 1 5.5368e+18 7.1722e+21 44507
## - lokasi_5 1 2.5921e+19 7.2036e+21 44507
## - lokasi_2 1 4.0306e+19 7.2180e+21 44509
## - jml_km_buah 1 4.1467e+19 7.2192e+21 44510
## - lokasi_4 1 4.5051e+19 7.2227e+21 44510
## - lokasi_7 1 6.4886e+19 7.2426e+21 44513
## - lokasi_9 1 7.0919e+19 7.2486e+21 44514
## - lokasi_3 1 1.2929e+20 7.3070e+21 44522
## - lokasi_6 1 1.4730e+20 7.3250e+21 44524
## - jml_gar_buah 1 2.9435e+20 7.4720e+21 44545
## - lt_m2 1 7.7184e+20 7.9495e+21 44608
## - lb_m2 1 2.9150e+21 1.0093e+22 44853
##
## Step: AIC=44504.44
## harga_rp ~ jml_km_buah + jml_kt_buah + jml_gar_buah + lt_m2 +
## lb_m2 + lokasi_2 + lokasi_3 + lokasi_4 + lokasi_5 + lokasi_6 +
## lokasi_7 + lokasi_9
##
## Df Sum of Sq RSS AIC
## - jml_kt_buah 1 5.6832e+18 7.1890e+21 44503
## <none> 7.1833e+21 44504
## + lokasi_8 1 5.6346e+18 7.1777e+21 44506
## + lokasi_10 1 5.4329e+18 7.1779e+21 44506
## - lokasi_5 1 2.5433e+19 7.2088e+21 44506
## - lokasi_2 1 3.8045e+19 7.2214e+21 44508
## - jml_km_buah 1 4.1842e+19 7.2252e+21 44508
## - lokasi_4 1 4.4381e+19 7.2277e+21 44509
## - lokasi_7 1 6.3692e+19 7.2470e+21 44511
## - lokasi_9 1 7.0647e+19 7.2540e+21 44512
## - lokasi_3 1 1.2572e+20 7.3090e+21 44520
## - lokasi_6 1 1.4458e+20 7.3279e+21 44523
## - jml_gar_buah 1 2.9806e+20 7.4814e+21 44544
## - lt_m2 1 7.6960e+20 7.9529e+21 44607
## - lb_m2 1 2.9118e+21 1.0095e+22 44851
##
## Step: AIC=44503.25
## harga_rp ~ jml_km_buah + jml_gar_buah + lt_m2 + lb_m2 + lokasi_2 +
## lokasi_3 + lokasi_4 + lokasi_5 + lokasi_6 + lokasi_7 + lokasi_9
##
## Df Sum of Sq RSS AIC
## <none> 7.1890e+21 44503
## + jml_kt_buah 1 5.6832e+18 7.1833e+21 44504
## + lokasi_8 1 5.2458e+18 7.1838e+21 44505
## + lokasi_10 1 5.2020e+18 7.1838e+21 44505
## - lokasi_5 1 2.4170e+19 7.2132e+21 44505
## - lokasi_2 1 3.6815e+19 7.2258e+21 44506
## - lokasi_4 1 4.2408e+19 7.2314e+21 44507
## - lokasi_7 1 6.3247e+19 7.2523e+21 44510
## - lokasi_9 1 6.7776e+19 7.2568e+21 44511
## - lokasi_3 1 1.2269e+20 7.3117e+21 44519
## - lokasi_6 1 1.4471e+20 7.3337e+21 44522
## - jml_km_buah 1 1.6227e+20 7.3513e+21 44524
## - jml_gar_buah 1 2.9255e+20 7.4816e+21 44542
## - lt_m2 1 8.2892e+20 8.0179e+21 44613
## - lb_m2 1 3.1879e+21 1.0377e+22 44877
summary(model_best)
##
## Call:
## lm(formula = harga_rp ~ jml_km_buah + jml_gar_buah + lt_m2 +
## lb_m2 + lokasi_2 + lokasi_3 + lokasi_4 + lokasi_5 + lokasi_6 +
## lokasi_7 + lokasi_9, data = rumah.clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.340e+10 -6.287e+08 1.834e+07 5.223e+08 2.123e+10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.005e+08 2.698e+08 0.743 0.45759
## jml_km_buah -3.511e+08 7.343e+07 -4.782 2.00e-06 ***
## jml_gar_buah 5.709e+08 8.892e+07 6.421 2.08e-10 ***
## lt_m2 8.568e+06 7.928e+05 10.808 < 2e-16 ***
## lb_m2 1.790e+07 8.448e+05 21.194 < 2e-16 ***
## lokasi_2 -4.995e+08 2.193e+08 -2.278 0.02296 *
## lokasi_3 -9.458e+08 2.275e+08 -4.158 3.48e-05 ***
## lokasi_4 -2.094e+09 8.568e+08 -2.445 0.01467 *
## lokasi_5 -1.518e+09 8.227e+08 -1.845 0.06526 .
## lokasi_6 -1.551e+09 3.435e+08 -4.516 7.05e-06 ***
## lokasi_7 -1.599e+09 5.356e+08 -2.985 0.00290 **
## lokasi_9 -8.267e+09 2.675e+09 -3.090 0.00205 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.664e+09 on 1013 degrees of freedom
## Multiple R-squared: 0.7678, Adjusted R-squared: 0.7653
## F-statistic: 304.6 on 11 and 1013 DF, p-value: < 2.2e-16
library(tictoc)
##
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
##
## shift
library(leaps)
# Best subset
tic("Best Subset")
best.subset <- regsubsets(
harga_rp ~ .,
data = rumah.clean,
nvmax = ncol(rumah.clean) - 1
)
toc()
## Best Subset: 0 sec elapsed
# FW Selection
tic("Forward Selection")
fw.model <- regsubsets(
harga_rp ~ .,
data = rumah.clean,
nvmax = 15,
method = "forward"
)
toc()
## Forward Selection: 0 sec elapsed
# Backward Stepwise Selection
tic("Backward Selection")
bw.model = regsubsets(harga_rp ~ .,
data = rumah.clean,
nbest = 1,
nvmax = NULL,
method = "backward")
toc()
## Backward Selection: 0 sec elapsed
# Hybrid Selection
tic("Hybrid Selection")
hy.model = regsubsets(harga_rp ~ .,
data = rumah.clean,
nbest = 1,
nvmax = NULL,
method = "seqrep")
toc()
## Hybrid Selection: 0.02 sec elapsed