#Input Data
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: package 'outliers' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
# Input data
df <- read.csv("C:/Users/User/Downloads/TravelInsurancePrediction.csv",
sep=";",
header=TRUE)
data <- df[c("Annual_Income", "Travel_Insurance")]#Missing Value
##
## === Missing Value ===
## [1] 0
#Outlier
# Outlier
detect_outliers <- function(data) {
cat("\n=== Deteksi Outlier ===\n\n")
outlier_report <- list()
for(var in names(data)[sapply(data, is.numeric)]) {
cat("Analisis outlier untuk variabel:", var, "\n")
# Statistik deskriptif
stats <- summary(data[[var]])
iqr_val <- IQR(data[[var]], na.rm = TRUE)
q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Deteksi outlier dengan metode IQR
outliers_iqr <- data[[var]][data[[var]] < lower_bound | data[[var]] > upper_bound]
# Deteksi outlier dengan metode Z-score
z_scores <- scale(data[[var]])
outliers_z <- data[[var]][abs(z_scores) > 3]
# Deteksi outlier dengan metode Grubbs (uji statistik)
if(length(na.omit(data[[var]])) > 6) {
tryCatch({
grubbs_test <- grubbs.test(na.omit(data[[var]]))
grubbs_outlier <- ifelse(grubbs_test$p.value < 0.05, "Terdeteksi", "Tidak terdeteksi")
}, error = function(e) {
grubbs_outlier <- "Tidak dapat dihitung"
})
} else {
grubbs_outlier <- "Data tidak cukup"
}
# Ringkasan
outlier_report[[var]] <- list(
n_outliers_iqr = length(outliers_iqr),
n_outliers_z = length(outliers_z),
grubbs_result = grubbs_outlier,
lower_bound = lower_bound,
upper_bound = upper_bound,
outlier_values = unique(round(outliers_iqr, 2))
)
cat(" - Outlier (IQR method):", length(outliers_iqr), "\n")
cat(" - Outlier (Z-score > 3):", length(outliers_z), "\n")
cat(" - Uji Grubbs:", grubbs_outlier, "\n")
cat(" - Batas bawah:", round(lower_bound, 2), "\n")
cat(" - Batas atas:", round(upper_bound, 2), "\n")
if(length(outliers_iqr) > 0) {
cat(" - Nilai outlier:", paste(head(unique(round(outliers_iqr, 2)), 5), collapse = ", "), "\n")
}
cat("\n")
}
return(outlier_report)
}
# Deteksi outlier
outlier_analysis <- detect_outliers(data)##
## === Deteksi Outlier ===
##
## Analisis outlier untuk variabel: Annual_Income
## - Outlier (IQR method): 0
## - Outlier (Z-score > 3): 0
## - Uji Grubbs: Tidak terdeteksi
## - Batas bawah: -375000
## - Batas atas: 2225000
##
## Analisis outlier untuk variabel: Travel_Insurance
## - Outlier (IQR method): 0
## - Outlier (Z-score > 3): 0
## - Uji Grubbs: Tidak terdeteksi
## - Batas bawah: -1.5
## - Batas atas: 2.5
# Visual
visualize_outliers <- function(data) {
numeric_vars <- names(data)[sapply(data, is.numeric)]
# Boxplot untuk setiap variabel numerik
par(mfrow = c(2, 3))
for(var in numeric_vars[1:min(6, length(numeric_vars))]) {
boxplot(data[[var]], main = var, col = "lightblue",
ylab = "Price", outline = TRUE)
grid()
}
par(mfrow = c(1, 1))
# Histogram dengan overlay outlier
for(var in numeric_vars[1:min(3, length(numeric_vars))]) {
# Hitung batas outlier
q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
iqr_val <- IQR(data[[var]], na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Identifikasi outlier
is_outlier <- data[[var]] < lower_bound | data[[var]] > upper_bound
# Plot histogram
hist_data <- ggplot(data.frame(value = data[[var]]), aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", alpha = 0.7) +
geom_density(color = "darkblue", linewidth = 1) +
geom_vline(xintercept = c(lower_bound, upper_bound),
color = "red", linetype = "dashed", linewidth = 1) +
labs(title = paste("Distribusi dan Outlier:", var),
x = var, y = "Density") +
theme_minimal() +
annotate("text", x = lower_bound, y = 0,
label = "Bawah", vjust = 2, color = "red") +
annotate("text", x = upper_bound, y = 0,
label = "Atas", vjust = 2, color = "red")
print(hist_data)
}
# Scatter plot matrix untuk melihat outlier multivariat
if(length(numeric_vars) >= 3) {
pairs(data[, numeric_vars[1:min(4, length(numeric_vars))]],
main = "Scatter Plot Matrix untuk Deteksi Outlier",
pch = 19, col = alpha("blue", 0.6))
}
}
# Jalankan visualisasi
visualize_outliers(data)## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Statistik Deskriptif
## [1] "Statistik Deskriptif:"
## Annual_Income Travel_Insurance
## Min. : 300000 Min. :0.0000
## 1st Qu.: 600000 1st Qu.:0.0000
## Median : 900000 Median :0.0000
## Mean : 932763 Mean :0.3573
## 3rd Qu.:1250000 3rd Qu.:1.0000
## Max. :1800000 Max. :1.0000
# Scatter plot
ggplot(data, aes(x = Annual_Income, y = Travel_Insurance)) +
geom_point(color = "blue", size = 3) +
labs(title = "Hubungan Annual Income dan Travel Insurance",
x = "Annual_Income", y = "Travel_Insurance") +
theme_minimal()# Korelasi
cor_test <- cor.test(data$Annual_Income, data$Travel_Insurance)
print(paste("Korelasi Pearson:", round(cor_test$estimate, 4)))## [1] "Korelasi Pearson: 0.3968"
## [1] "p-value korelasi: 0"
#Analisis Regresi
# MEMBANGUN MODEL REGRESI
model <- lm(Travel_Insurance ~ Annual_Income, data = data)
print("Ringkasan Model Regresi:")## [1] "Ringkasan Model Regresi:"
##
## Call:
## lm(formula = Travel_Insurance ~ Annual_Income, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7697 -0.3408 -0.1389 0.4069 0.9620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.134e-01 2.636e-02 -4.302 1.78e-05 ***
## Annual_Income 5.047e-07 2.621e-08 19.258 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4401 on 1985 degrees of freedom
## Multiple R-squared: 0.1574, Adjusted R-squared: 0.157
## F-statistic: 370.9 on 1 and 1985 DF, p-value: < 2.2e-16
##
## === UJI ASUMSI REGRESI LINEAR ===
# Normalitas Residual
shapiro_test <- shapiro.test(residuals(model))
cat("1. UJI NORMALITAS (Shapiro-Wilk):\n")## 1. UJI NORMALITAS (Shapiro-Wilk):
## Statistik W = 0.908
## p-value = 0
if(shapiro_test$p.value > 0.05) {
cat(" Keputusan: Residual berdistribusi normal\n")
} else {
cat(" Keputusan: Residual tidak normal\n")
}## Keputusan: Residual tidak normal
# Q-Q Plot
qqnorm(residuals(model), main = "Q-Q Plot Residual")
qqline(residuals(model), col = "red")##
## 2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):
## Statistik LM = 22.6864
## p-value = 0
if(bp_test$p.value > 0.05) {
cat(" Keputusan: Varian residual homogen\n")
} else {
cat(" Keputusan: Ada heteroskedastisitas\n")
}## Keputusan: Ada heteroskedastisitas
# Plot Residual vs Fitted
plot(fitted(model), residuals(model),
main = "Residual vs Fitted Values",
xlab = "Fitted Values", ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)##
## 3. UJI AUTOKORELASI (Durbin-Watson):
## Statistik DW = 2.014
## p-value = 0.6224
if(dw_test$p.value > 0.05) {
cat(" Keputusan: Tidak ada autokorelasi\n")
} else {
cat(" Keputusan: Ada autokorelasi\n")
}## Keputusan: Tidak ada autokorelasi
##
## === INTERPRETASI KOEFISIEN ===
intercept <- coef(model)[1]
slope <- coef(model)[2]
cat("Persamaan Regresi: Travel_Insurance =", round(intercept, 2), "+", round(slope, 2), "* Annual_Income\n")## Persamaan Regresi: Travel_Insurance = -0.11 + 0 * Annual_Income
##
## Interpretasi:
## 1. Intercept (β0 = -0.11 ):
## Travel_Insurance ketika Annual_Income = 0 adalah -0.11
## 2. Slope (β1 = 0 ):
cat(" Setiap kenaikan 1 satuan Annual_Income, nilai prediksi Travel_Insurance meningkat sebesar", round(slope, 6))## Setiap kenaikan 1 satuan Annual_Income, nilai prediksi Travel_Insurance meningkat sebesar 1e-06
##
## === ESTIMASI PARAMETER ===
## Interval Kepercayaan 95%:
## Intercept: [-0.165, -0.062]
## Slope: [0, 0]
##
## Uji Hipotesis untuk Slope (β1):
## H0: β1 = 0 (tidak ada hubungan linear)
## H1: β1 ≠ 0 (ada hubungan linear)
summary_model <- summary(model)
slope_pvalue <- summary_model$coefficients[2, 4]
cat(" p-value =", round(slope_pvalue, 6), "\n")## p-value = 0
if(slope_pvalue < 0.05) {
cat(" Keputusan: Tolak H0, ada hubungan linear signifikan\n")
} else {
cat(" Keputusan: Gagal tolak H0, tidak ada hubungan linear signifikan\n")
}## Keputusan: Tolak H0, ada hubungan linear signifikan
##
## Koefisien Determinasi (R²):
## R² = 0.1574
cat(" Artinya:", round(r_squared * 100, 2), "% variasi Travel Insurance dapat dijelaskan oleh Annual Income\n")## Artinya: 15.74 % variasi Travel Insurance dapat dijelaskan oleh Annual Income
# VISUALISASI MODEL
ggplot(data, aes(x = Annual_Income, y = Travel_Insurance)) +
geom_point(color = "blue", size = 3) +
geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
labs(title = "Garis Regresi Linear",
subtitle = paste("Y =", round(intercept, 2), "+", round(slope, 2), "X"),
x = "Annual_Income", y = "Travel_Insurance") +
theme_minimal()## `geom_smooth()` using formula = 'y ~ x'
# PREDIKSI
new_data <- data.frame(Annual_Income = c(400000, 700000))
prediction <- predict(model, newdata = new_data, interval = "confidence")
cat("\n=== PREDIKSI ===\n")##
## === PREDIKSI ===
cat("Untuk Annual Income 400000, prediksi Travel Insurance =", round(prediction[1, "fit"], 2), "\n")## Untuk Annual Income 400000, prediksi Travel Insurance = 0.09
cat("Untuk Annual Income 700000, prediksi Travel Insurance =", round(prediction[2, "fit"], 2), "\n")## Untuk Annual Income 700000, prediksi Travel Insurance = 0.24
##
## === RINGKASAN ANALISIS ===
## 1. Model: Travel_Insurance = β0 + β1*Annual_Income + ε
## 2. Estimasi: Y = -0.113 + 0 * X
## 3. R² = 0.1574 ( 15.7 %)
f_stat <- summary_model$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
cat("4. Uji F (model): p-value =", round(f_pvalue, 6), "\n")## 4. Uji F (model): p-value = 0
## 5. Asumsi:
## - Normalitas: p = 0
## - Homoskedastisitas: p = 0
## - Autokorelasi: p = 0.6224
# Simpan hasil
hasil <- list(
model = model,
coefficients = coef(model),
r_squared = r_squared,
assumptions = list(
normality = shapiro_test$p.value,
homoscedasticity = bp_test$p.value,
autocorrelation = dw_test$p.value
),
confidence_intervals = conf_int
)```