kosong
# Membaca data
data <- read.csv("insurance.csv")
# Melihat ringkasan jumlah nilai kosong di setiap kolom
colSums(is.na(data))
## age sex bmi children smoker charges
## 0 0 0 0 0 0
# Atau bisa juga dengan
summary(data) # untuk melihat apakah ada NA di ringkasan statistik
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker charges
## Length:1338 Min. : 1122
## Class :character 1st Qu.: 4740
## Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
out
# Boxplot untuk visualisasi outlier
boxplot(data$charges, main = "Boxplot of Charges", col = "lightblue")
# Deteksi outlier dengan metode IQR
Q1 <- quantile(data$charges, 0.25)
Q3 <- quantile(data$charges, 0.75)
IQR_value <- Q3 - Q1
# Nilai-nilai outlier
outliers <- data$charges < (Q1 - 1.5 * IQR_value) | data$charges > (Q3 + 1.5 * IQR_value)
# Menampilkan baris yang mengandung outlier pada charges
data[outliers, ]
## age sex bmi children smoker charges
## 15 27 male 42.130 0 yes 39611.76
## 20 30 male 35.300 0 yes 36837.47
## 24 34 female 31.920 1 yes 37701.88
## 30 31 male 36.300 2 yes 38711.00
## 31 22 male 35.600 0 yes 35585.58
## 35 28 male 36.400 1 yes 51194.56
## 39 35 male 36.670 1 yes 39774.28
## 40 60 male 39.900 0 yes 48173.36
## 50 36 male 35.200 1 yes 38709.18
## 54 36 male 34.430 0 yes 37742.58
## 56 58 male 36.955 2 yes 47496.49
## 83 22 male 37.620 1 yes 37165.16
## 85 37 female 34.800 2 yes 39836.52
## 87 57 female 31.160 0 yes 43578.94
## 95 64 female 31.300 2 yes 47291.06
## 110 63 male 35.090 0 yes 47055.53
## 124 44 male 31.350 1 yes 39556.49
## 147 46 male 30.495 3 yes 40720.55
## 159 30 male 35.530 0 yes 36950.26
## 162 18 female 36.850 0 yes 36149.48
## 176 63 female 37.700 0 yes 48824.45
## 186 36 male 41.895 3 yes 43753.34
## 204 27 female 36.080 0 yes 37133.90
## 224 19 male 34.800 0 yes 34779.61
## 241 23 female 36.670 2 yes 38511.63
## 243 55 female 26.800 1 no 35160.13
## 252 63 female 32.200 2 yes 47305.31
## 253 54 male 34.210 2 yes 44260.75
## 255 50 male 31.825 0 yes 41097.16
## 257 56 male 33.630 0 yes 43921.18
## 264 19 male 36.955 0 yes 36219.41
## 266 46 male 42.350 3 yes 46151.12
## 272 50 male 34.200 2 yes 42856.84
## 282 54 male 40.565 3 yes 48549.18
## 289 59 female 36.765 1 yes 47896.79
## 293 25 male 45.540 2 yes 42112.24
## 299 31 male 34.390 3 yes 38746.36
## 313 43 male 35.970 3 yes 42124.52
## 315 27 female 31.400 0 yes 34838.87
## 323 34 male 30.800 0 yes 35491.64
## 328 45 male 36.480 2 yes 42760.50
## 329 64 female 33.800 1 yes 47928.03
## 331 61 female 36.385 1 yes 48517.56
## 339 50 male 32.300 1 yes 41919.10
## 374 26 male 32.900 2 yes 36085.22
## 378 24 male 40.150 0 yes 38126.25
## 382 55 male 30.685 0 yes 42303.69
## 421 64 male 33.880 0 yes 46889.26
## 422 61 male 35.860 0 yes 46599.11
## 423 40 male 32.775 1 yes 39125.33
## 442 33 female 33.500 0 yes 37079.37
## 477 24 male 28.500 0 yes 35147.53
## 489 44 female 38.060 0 yes 48885.14
## 501 29 male 34.400 0 yes 36197.70
## 525 42 male 26.070 1 yes 38245.59
## 531 57 male 42.130 1 yes 48675.52
## 544 54 female 47.410 0 yes 63770.43
## 550 43 female 46.200 0 yes 45863.21
## 559 35 female 34.105 3 yes 39983.43
## 570 48 male 40.565 2 yes 45702.02
## 578 31 female 38.095 1 yes 58571.07
## 588 34 female 30.210 1 yes 43943.88
## 610 30 male 37.800 2 yes 39241.44
## 616 47 female 36.630 1 yes 42969.85
## 622 37 male 34.100 4 yes 40182.25
## 624 18 male 33.535 0 yes 34617.84
## 630 44 female 38.950 0 yes 42983.46
## 666 43 male 38.060 2 yes 42560.43
## 668 40 female 32.775 2 yes 40003.33
## 669 62 male 32.015 0 yes 45710.21
## 675 44 female 43.890 2 yes 46200.99
## 678 60 male 31.350 3 yes 46130.53
## 683 39 male 35.300 2 yes 40103.89
## 690 27 male 31.130 1 yes 34806.47
## 698 41 male 35.750 1 yes 40273.65
## 707 51 female 38.060 0 yes 44400.41
## 726 30 female 39.050 3 yes 40932.43
## 737 37 female 38.390 0 yes 40419.02
## 739 23 male 31.730 3 yes 36189.10
## 740 29 male 35.500 2 yes 44585.46
## 743 53 male 34.105 0 yes 43254.42
## 760 18 male 38.170 0 yes 36307.80
## 804 18 female 42.240 0 yes 38792.69
## 820 33 female 35.530 0 yes 55135.40
## 827 56 male 31.790 2 yes 43813.87
## 829 41 male 30.780 3 yes 39597.41
## 843 23 female 32.780 2 yes 36021.01
## 846 60 female 32.450 0 yes 45008.96
## 851 37 female 30.780 0 yes 37270.15
## 853 46 female 35.530 0 yes 42111.66
## 857 48 female 33.110 0 yes 40974.16
## 861 37 female 47.600 2 yes 46113.51
## 884 51 female 37.050 3 yes 46255.11
## 894 47 male 38.940 2 yes 44202.65
## 902 60 male 40.920 0 yes 48673.56
## 918 45 male 22.895 0 yes 35069.37
## 948 37 male 34.200 1 yes 39047.29
## 952 51 male 42.900 2 yes 47462.89
## 954 44 male 30.200 2 yes 38998.55
## 957 54 male 30.800 1 yes 41999.52
## 959 43 male 34.960 1 yes 41034.22
## 1013 61 female 33.330 4 no 36580.28
## 1022 22 female 31.020 3 yes 35595.59
## 1023 47 male 36.080 1 yes 42211.14
## 1032 55 female 35.200 0 yes 44423.80
## 1037 22 male 37.070 2 yes 37484.45
## 1038 45 female 30.495 1 yes 39725.52
## 1048 22 male 52.580 1 yes 44501.40
## 1050 49 male 30.900 0 yes 39727.61
## 1063 59 male 41.140 1 yes 48970.25
## 1071 37 male 37.070 1 yes 39871.70
## 1079 28 male 31.680 0 yes 34672.15
## 1091 47 male 36.190 0 yes 41676.08
## 1097 51 female 34.960 2 yes 44641.20
## 1112 38 male 38.390 3 yes 41949.24
## 1118 25 male 33.330 2 yes 36124.57
## 1119 33 male 35.750 1 yes 38282.75
## 1123 53 female 36.860 3 yes 46661.44
## 1125 23 female 42.750 1 yes 40904.20
## 1140 19 female 32.490 0 yes 36898.73
## 1147 60 male 32.800 0 yes 52590.83
## 1153 43 female 32.560 3 yes 40941.29
## 1157 19 male 44.880 0 yes 39722.75
## 1187 20 male 35.625 3 yes 37465.34
## 1207 59 female 34.800 2 no 36910.61
## 1208 36 male 33.400 2 yes 38415.47
## 1219 46 female 34.600 1 yes 41661.60
## 1231 52 male 34.485 3 yes 60021.40
## 1241 52 male 41.800 2 yes 47269.85
## 1242 64 male 36.960 2 yes 49577.66
## 1250 32 male 33.630 1 yes 37607.53
## 1285 61 male 36.300 1 yes 47403.88
## 1289 20 male 39.400 2 yes 38344.57
## 1292 19 male 34.900 0 yes 34828.65
## 1301 45 male 30.360 0 yes 62592.87
## 1302 62 male 30.875 3 yes 46718.16
## 1304 43 male 27.800 0 yes 37829.72
## 1314 19 female 34.700 2 yes 36397.58
## 1324 42 female 40.370 2 yes 43896.38
trans
# Transformasi log charges (tambah 1 jika ada nilai nol)
data$log_charges <- log(data$charges)
# Cek distribusinya
hist(data$log_charges, main = "Histogram Log Charges", col = "orange")
model
# Baca data dari file CSV
insurance <- read.csv("insurance.csv")
# Pastikan variabel kategorik dikonversi ke faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Buat model regresi linear berganda
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Lihat hasil ringkasan model
summary(model)
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11837.2 -2916.7 -994.2 1375.3 29565.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12052.46 951.26 -12.670 < 2e-16 ***
## age 257.73 11.90 21.651 < 2e-16 ***
## sexmale -128.64 333.36 -0.386 0.699641
## bmi 322.36 27.42 11.757 < 2e-16 ***
## children 474.41 137.86 3.441 0.000597 ***
## smokeryes 23823.39 412.52 57.750 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6070 on 1332 degrees of freedom
## Multiple R-squared: 0.7497, Adjusted R-squared: 0.7488
## F-statistic: 798 on 5 and 1332 DF, p-value: < 2.2e-16
VIF
# Install jika belum ada
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
# Pastikan faktor sudah benar
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Model regresi utama
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Hitung VIF
vif(model)
## age sex bmi children smoker
## 1.015129 1.008878 1.014578 1.002242 1.006457
Shapiro
# Pastikan data sudah dibaca dan variabel kategorik diubah jadi faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Buat model regresi linear
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Simpan residual
res <- residuals(model)
# Uji normalitas residual dengan Shapiro-Wilk
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.8994, p-value < 2.2e-16
# Plot distribusi residual: Histogram
hist(res, main = "Histogram of Residuals", col = "lightblue", xlab = "Residuals")
# Plot Q-Q untuk memeriksa normalitas
qqnorm(res)
qqline(res, col = "red", lwd = 2)
brush
# Pastikan package lmtest terpasang
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Pastikan data dan model
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Buat model regresi (pakai model log jika perlu)
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Atau model log jika ingin transformasi:
# model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance)
# Uji Breusch-Pagan
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 118.02, df = 5, p-value < 2.2e-16
dw
# Model regresi dengan log transformasi (opsional jika mau dibandingkan)
log.model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance)
# Uji Durbin-Watson untuk autokorelasi
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 2.0869, p-value = 0.944
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(log.model)
##
## Durbin-Watson test
##
## data: log.model
## DW = 2.0519, p-value = 0.8288
## alternative hypothesis: true autocorrelation is greater than 0
R-squared dan Adjusted R-squared
# Pastikan variabel kategorik sudah diubah ke faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Buat model regresi linear
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# R-squared
summary(model)$r.squared # R-squared
## [1] 0.7497225
# Adjusted R-squared
summary(model)$adj.r.squared # Adjusted R-squared
## [1] 0.748783
# Uji F
summary(model)$fstatistic # Nilai F, degrees of freedom numerator & denominator
## value numdf dendf
## 798.0185 5.0000 1332.0000
Uji F dan uji t
# Pastikan variabel kategorik difaktorkan
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Model regresi linear
model_terbaik <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Uji F dari summary model
summary(model_terbaik)$fstatistic
## value numdf dendf
## 798.0185 5.0000 1332.0000
# Jika ingin lihat p-value dari uji F:
f_stat <- summary(model_terbaik)$fstatistic
pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
## value
## 0
# Lihat semua hasil uji t (estimate, t value, p-value)
summary(model_terbaik)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12052.4620 951.26043 -12.6699919 8.099035e-35
## age 257.7350 11.90389 21.6513323 2.593679e-89
## sexmale -128.6399 333.36051 -0.3858881 6.996412e-01
## bmi 322.3642 27.41860 11.7571362 1.954711e-30
## children 474.4111 137.85580 3.4413577 5.967197e-04
## smokeryes 23823.3925 412.52338 57.7504052 0.000000e+00
Pemilihan variabel
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
# Pastikan faktor sudah diatur
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)
# Model awal (full model)
full_model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)
# Model kosong (intercept only)
null_model <- lm(charges ~ 1, data = insurance)
summary(full_model)
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11837.2 -2916.7 -994.2 1375.3 29565.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12052.46 951.26 -12.670 < 2e-16 ***
## age 257.73 11.90 21.651 < 2e-16 ***
## sexmale -128.64 333.36 -0.386 0.699641
## bmi 322.36 27.42 11.757 < 2e-16 ***
## children 474.41 137.86 3.441 0.000597 ***
## smokeryes 23823.39 412.52 57.750 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6070 on 1332 degrees of freedom
## Multiple R-squared: 0.7497, Adjusted R-squared: 0.7488
## F-statistic: 798 on 5 and 1332 DF, p-value: < 2.2e-16
AIC
# Stepwise (both direction)
step_model <- stepAIC(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "both",
trace = FALSE)
summary(step_model)
##
## Call:
## lm(formula = charges ~ smoker + age + bmi + children, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11897.9 -2920.8 -986.6 1392.2 29509.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12102.77 941.98 -12.848 < 2e-16 ***
## smokeryes 23811.40 411.22 57.904 < 2e-16 ***
## age 257.85 11.90 21.675 < 2e-16 ***
## bmi 321.85 27.38 11.756 < 2e-16 ***
## children 473.50 137.79 3.436 0.000608 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6068 on 1333 degrees of freedom
## Multiple R-squared: 0.7497, Adjusted R-squared: 0.7489
## F-statistic: 998.1 on 4 and 1333 DF, p-value: < 2.2e-16
cv k = 5 & M = 10
# Fungsi untuk menghitung MSE CV
cv_error <- function(model_formula, data, K = 10) {
glm_fit <- glm(model_formula, data = data)
cv_result <- cv.glm(data, glm_fit, K = K)
return(cv_result$delta[1]) # cross-validated error estimate
}
# Contoh model-model yang ingin diuji
model1_formula <- charges ~ age + bmi + smoker
model2_formula <- charges ~ age + bmi + children + smoker
model3_formula <- charges ~ age + sex + bmi + children + smoker
# Hitung AIC dan CV error
model1 <- lm(model1_formula, data = insurance)
model2 <- lm(model2_formula, data = insurance)
model3 <- lm(model3_formula, data = insurance)
data.frame(
Model = c("Model 1", "Model 2", "Model 3"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
Adj_R2 = c(summary(model1)$adj.r.squared,
summary(model2)$adj.r.squared,
summary(model3)$adj.r.squared),
CV = c(cv_error(model1_formula, insurance, K = 10),
cv_error(model2_formula, insurance, K = 10),
cv_error(model3_formula, insurance, K = 10))
)
## Model AIC Adj_R2 CV
## 1 Model 1 27123.84 0.7469093 37350093
## 2 Model 2 27114.04 0.7489434 36980430
## 3 Model 3 27115.89 0.7487830 37157561