kosong

# Membaca data
data <- read.csv("insurance.csv")

# Melihat ringkasan jumlah nilai kosong di setiap kolom
colSums(is.na(data))
##      age      sex      bmi children   smoker  charges 
##        0        0        0        0        0        0
# Atau bisa juga dengan
summary(data)  # untuk melihat apakah ada NA di ringkasan statistik
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             charges     
##  Length:1338        Min.   : 1122  
##  Class :character   1st Qu.: 4740  
##  Mode  :character   Median : 9382  
##                     Mean   :13270  
##                     3rd Qu.:16640  
##                     Max.   :63770

out

# Boxplot untuk visualisasi outlier
boxplot(data$charges, main = "Boxplot of Charges", col = "lightblue")

# Deteksi outlier dengan metode IQR
Q1 <- quantile(data$charges, 0.25)
Q3 <- quantile(data$charges, 0.75)
IQR_value <- Q3 - Q1

# Nilai-nilai outlier
outliers <- data$charges < (Q1 - 1.5 * IQR_value) | data$charges > (Q3 + 1.5 * IQR_value)

# Menampilkan baris yang mengandung outlier pada charges
data[outliers, ]
##      age    sex    bmi children smoker  charges
## 15    27   male 42.130        0    yes 39611.76
## 20    30   male 35.300        0    yes 36837.47
## 24    34 female 31.920        1    yes 37701.88
## 30    31   male 36.300        2    yes 38711.00
## 31    22   male 35.600        0    yes 35585.58
## 35    28   male 36.400        1    yes 51194.56
## 39    35   male 36.670        1    yes 39774.28
## 40    60   male 39.900        0    yes 48173.36
## 50    36   male 35.200        1    yes 38709.18
## 54    36   male 34.430        0    yes 37742.58
## 56    58   male 36.955        2    yes 47496.49
## 83    22   male 37.620        1    yes 37165.16
## 85    37 female 34.800        2    yes 39836.52
## 87    57 female 31.160        0    yes 43578.94
## 95    64 female 31.300        2    yes 47291.06
## 110   63   male 35.090        0    yes 47055.53
## 124   44   male 31.350        1    yes 39556.49
## 147   46   male 30.495        3    yes 40720.55
## 159   30   male 35.530        0    yes 36950.26
## 162   18 female 36.850        0    yes 36149.48
## 176   63 female 37.700        0    yes 48824.45
## 186   36   male 41.895        3    yes 43753.34
## 204   27 female 36.080        0    yes 37133.90
## 224   19   male 34.800        0    yes 34779.61
## 241   23 female 36.670        2    yes 38511.63
## 243   55 female 26.800        1     no 35160.13
## 252   63 female 32.200        2    yes 47305.31
## 253   54   male 34.210        2    yes 44260.75
## 255   50   male 31.825        0    yes 41097.16
## 257   56   male 33.630        0    yes 43921.18
## 264   19   male 36.955        0    yes 36219.41
## 266   46   male 42.350        3    yes 46151.12
## 272   50   male 34.200        2    yes 42856.84
## 282   54   male 40.565        3    yes 48549.18
## 289   59 female 36.765        1    yes 47896.79
## 293   25   male 45.540        2    yes 42112.24
## 299   31   male 34.390        3    yes 38746.36
## 313   43   male 35.970        3    yes 42124.52
## 315   27 female 31.400        0    yes 34838.87
## 323   34   male 30.800        0    yes 35491.64
## 328   45   male 36.480        2    yes 42760.50
## 329   64 female 33.800        1    yes 47928.03
## 331   61 female 36.385        1    yes 48517.56
## 339   50   male 32.300        1    yes 41919.10
## 374   26   male 32.900        2    yes 36085.22
## 378   24   male 40.150        0    yes 38126.25
## 382   55   male 30.685        0    yes 42303.69
## 421   64   male 33.880        0    yes 46889.26
## 422   61   male 35.860        0    yes 46599.11
## 423   40   male 32.775        1    yes 39125.33
## 442   33 female 33.500        0    yes 37079.37
## 477   24   male 28.500        0    yes 35147.53
## 489   44 female 38.060        0    yes 48885.14
## 501   29   male 34.400        0    yes 36197.70
## 525   42   male 26.070        1    yes 38245.59
## 531   57   male 42.130        1    yes 48675.52
## 544   54 female 47.410        0    yes 63770.43
## 550   43 female 46.200        0    yes 45863.21
## 559   35 female 34.105        3    yes 39983.43
## 570   48   male 40.565        2    yes 45702.02
## 578   31 female 38.095        1    yes 58571.07
## 588   34 female 30.210        1    yes 43943.88
## 610   30   male 37.800        2    yes 39241.44
## 616   47 female 36.630        1    yes 42969.85
## 622   37   male 34.100        4    yes 40182.25
## 624   18   male 33.535        0    yes 34617.84
## 630   44 female 38.950        0    yes 42983.46
## 666   43   male 38.060        2    yes 42560.43
## 668   40 female 32.775        2    yes 40003.33
## 669   62   male 32.015        0    yes 45710.21
## 675   44 female 43.890        2    yes 46200.99
## 678   60   male 31.350        3    yes 46130.53
## 683   39   male 35.300        2    yes 40103.89
## 690   27   male 31.130        1    yes 34806.47
## 698   41   male 35.750        1    yes 40273.65
## 707   51 female 38.060        0    yes 44400.41
## 726   30 female 39.050        3    yes 40932.43
## 737   37 female 38.390        0    yes 40419.02
## 739   23   male 31.730        3    yes 36189.10
## 740   29   male 35.500        2    yes 44585.46
## 743   53   male 34.105        0    yes 43254.42
## 760   18   male 38.170        0    yes 36307.80
## 804   18 female 42.240        0    yes 38792.69
## 820   33 female 35.530        0    yes 55135.40
## 827   56   male 31.790        2    yes 43813.87
## 829   41   male 30.780        3    yes 39597.41
## 843   23 female 32.780        2    yes 36021.01
## 846   60 female 32.450        0    yes 45008.96
## 851   37 female 30.780        0    yes 37270.15
## 853   46 female 35.530        0    yes 42111.66
## 857   48 female 33.110        0    yes 40974.16
## 861   37 female 47.600        2    yes 46113.51
## 884   51 female 37.050        3    yes 46255.11
## 894   47   male 38.940        2    yes 44202.65
## 902   60   male 40.920        0    yes 48673.56
## 918   45   male 22.895        0    yes 35069.37
## 948   37   male 34.200        1    yes 39047.29
## 952   51   male 42.900        2    yes 47462.89
## 954   44   male 30.200        2    yes 38998.55
## 957   54   male 30.800        1    yes 41999.52
## 959   43   male 34.960        1    yes 41034.22
## 1013  61 female 33.330        4     no 36580.28
## 1022  22 female 31.020        3    yes 35595.59
## 1023  47   male 36.080        1    yes 42211.14
## 1032  55 female 35.200        0    yes 44423.80
## 1037  22   male 37.070        2    yes 37484.45
## 1038  45 female 30.495        1    yes 39725.52
## 1048  22   male 52.580        1    yes 44501.40
## 1050  49   male 30.900        0    yes 39727.61
## 1063  59   male 41.140        1    yes 48970.25
## 1071  37   male 37.070        1    yes 39871.70
## 1079  28   male 31.680        0    yes 34672.15
## 1091  47   male 36.190        0    yes 41676.08
## 1097  51 female 34.960        2    yes 44641.20
## 1112  38   male 38.390        3    yes 41949.24
## 1118  25   male 33.330        2    yes 36124.57
## 1119  33   male 35.750        1    yes 38282.75
## 1123  53 female 36.860        3    yes 46661.44
## 1125  23 female 42.750        1    yes 40904.20
## 1140  19 female 32.490        0    yes 36898.73
## 1147  60   male 32.800        0    yes 52590.83
## 1153  43 female 32.560        3    yes 40941.29
## 1157  19   male 44.880        0    yes 39722.75
## 1187  20   male 35.625        3    yes 37465.34
## 1207  59 female 34.800        2     no 36910.61
## 1208  36   male 33.400        2    yes 38415.47
## 1219  46 female 34.600        1    yes 41661.60
## 1231  52   male 34.485        3    yes 60021.40
## 1241  52   male 41.800        2    yes 47269.85
## 1242  64   male 36.960        2    yes 49577.66
## 1250  32   male 33.630        1    yes 37607.53
## 1285  61   male 36.300        1    yes 47403.88
## 1289  20   male 39.400        2    yes 38344.57
## 1292  19   male 34.900        0    yes 34828.65
## 1301  45   male 30.360        0    yes 62592.87
## 1302  62   male 30.875        3    yes 46718.16
## 1304  43   male 27.800        0    yes 37829.72
## 1314  19 female 34.700        2    yes 36397.58
## 1324  42 female 40.370        2    yes 43896.38

trans

# Transformasi log charges (tambah 1 jika ada nilai nol)
data$log_charges <- log(data$charges)

# Cek distribusinya
hist(data$log_charges, main = "Histogram Log Charges", col = "orange")

model

# Baca data dari file CSV
insurance <- read.csv("insurance.csv")

# Pastikan variabel kategorik dikonversi ke faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Buat model regresi linear berganda
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Lihat hasil ringkasan model
summary(model)
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11837.2  -2916.7   -994.2   1375.3  29565.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12052.46     951.26 -12.670  < 2e-16 ***
## age            257.73      11.90  21.651  < 2e-16 ***
## sexmale       -128.64     333.36  -0.386 0.699641    
## bmi            322.36      27.42  11.757  < 2e-16 ***
## children       474.41     137.86   3.441 0.000597 ***
## smokeryes    23823.39     412.52  57.750  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6070 on 1332 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7488 
## F-statistic:   798 on 5 and 1332 DF,  p-value: < 2.2e-16

VIF

# Install jika belum ada
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
# Pastikan faktor sudah benar
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Model regresi utama
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Hitung VIF
vif(model)
##      age      sex      bmi children   smoker 
## 1.015129 1.008878 1.014578 1.002242 1.006457

Shapiro

# Pastikan data sudah dibaca dan variabel kategorik diubah jadi faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Buat model regresi linear
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Simpan residual
res <- residuals(model)

# Uji normalitas residual dengan Shapiro-Wilk
shapiro.test(res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.8994, p-value < 2.2e-16
# Plot distribusi residual: Histogram
hist(res, main = "Histogram of Residuals", col = "lightblue", xlab = "Residuals")

# Plot Q-Q untuk memeriksa normalitas
qqnorm(res)
qqline(res, col = "red", lwd = 2)

brush

# Pastikan package lmtest terpasang
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Pastikan data dan model
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Buat model regresi (pakai model log jika perlu)
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Atau model log jika ingin transformasi:
# model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance)

# Uji Breusch-Pagan
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 118.02, df = 5, p-value < 2.2e-16

dw

# Model regresi dengan log transformasi (opsional jika mau dibandingkan)
log.model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance)

# Uji Durbin-Watson untuk autokorelasi
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 2.0869, p-value = 0.944
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(log.model)
## 
##  Durbin-Watson test
## 
## data:  log.model
## DW = 2.0519, p-value = 0.8288
## alternative hypothesis: true autocorrelation is greater than 0

R-squared dan Adjusted R-squared

# Pastikan variabel kategorik sudah diubah ke faktor
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Buat model regresi linear
model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# R-squared
summary(model)$r.squared         # R-squared
## [1] 0.7497225
# Adjusted R-squared
summary(model)$adj.r.squared     # Adjusted R-squared
## [1] 0.748783
# Uji F
summary(model)$fstatistic        # Nilai F, degrees of freedom numerator & denominator
##     value     numdf     dendf 
##  798.0185    5.0000 1332.0000

Uji F dan uji t

# Pastikan variabel kategorik difaktorkan
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Model regresi linear
model_terbaik <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Uji F dari summary model
summary(model_terbaik)$fstatistic
##     value     numdf     dendf 
##  798.0185    5.0000 1332.0000
# Jika ingin lihat p-value dari uji F:
f_stat <- summary(model_terbaik)$fstatistic
pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
## value 
##     0
# Lihat semua hasil uji t (estimate, t value, p-value)
summary(model_terbaik)$coefficients
##                Estimate Std. Error     t value     Pr(>|t|)
## (Intercept) -12052.4620  951.26043 -12.6699919 8.099035e-35
## age            257.7350   11.90389  21.6513323 2.593679e-89
## sexmale       -128.6399  333.36051  -0.3858881 6.996412e-01
## bmi            322.3642   27.41860  11.7571362 1.954711e-30
## children       474.4111  137.85580   3.4413577 5.967197e-04
## smokeryes    23823.3925  412.52338  57.7504052 0.000000e+00

Pemilihan variabel

library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
# Pastikan faktor sudah diatur
insurance$sex <- as.factor(insurance$sex)
insurance$smoker <- as.factor(insurance$smoker)

# Model awal (full model)
full_model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance)

# Model kosong (intercept only)
null_model <- lm(charges ~ 1, data = insurance)
summary(full_model)
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11837.2  -2916.7   -994.2   1375.3  29565.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12052.46     951.26 -12.670  < 2e-16 ***
## age            257.73      11.90  21.651  < 2e-16 ***
## sexmale       -128.64     333.36  -0.386 0.699641    
## bmi            322.36      27.42  11.757  < 2e-16 ***
## children       474.41     137.86   3.441 0.000597 ***
## smokeryes    23823.39     412.52  57.750  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6070 on 1332 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7488 
## F-statistic:   798 on 5 and 1332 DF,  p-value: < 2.2e-16

AIC

# Stepwise (both direction)
step_model <- stepAIC(null_model, 
                      scope = list(lower = null_model, upper = full_model), 
                      direction = "both",
                      trace = FALSE)

summary(step_model)
## 
## Call:
## lm(formula = charges ~ smoker + age + bmi + children, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11897.9  -2920.8   -986.6   1392.2  29509.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12102.77     941.98 -12.848  < 2e-16 ***
## smokeryes    23811.40     411.22  57.904  < 2e-16 ***
## age            257.85      11.90  21.675  < 2e-16 ***
## bmi            321.85      27.38  11.756  < 2e-16 ***
## children       473.50     137.79   3.436 0.000608 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6068 on 1333 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7489 
## F-statistic: 998.1 on 4 and 1333 DF,  p-value: < 2.2e-16

cv k = 5 & M = 10

# Fungsi untuk menghitung MSE CV
cv_error <- function(model_formula, data, K = 10) {
  glm_fit <- glm(model_formula, data = data)
  cv_result <- cv.glm(data, glm_fit, K = K)
  return(cv_result$delta[1])  # cross-validated error estimate
}

# Contoh model-model yang ingin diuji
model1_formula <- charges ~ age + bmi + smoker
model2_formula <- charges ~ age + bmi + children + smoker
model3_formula <- charges ~ age + sex + bmi + children + smoker

# Hitung AIC dan CV error
model1 <- lm(model1_formula, data = insurance)
model2 <- lm(model2_formula, data = insurance)
model3 <- lm(model3_formula, data = insurance)

data.frame(
  Model = c("Model 1", "Model 2", "Model 3"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  Adj_R2 = c(summary(model1)$adj.r.squared,
             summary(model2)$adj.r.squared,
             summary(model3)$adj.r.squared),
  CV = c(cv_error(model1_formula, insurance, K = 10),
         cv_error(model2_formula, insurance, K = 10),
         cv_error(model3_formula, insurance, K = 10))
)
##     Model      AIC    Adj_R2       CV
## 1 Model 1 27123.84 0.7469093 37350093
## 2 Model 2 27114.04 0.7489434 36980430
## 3 Model 3 27115.89 0.7487830 37157561