OUTLIER
# 3. Deteksi outlier
detect_outliers <- function(data) {
cat("\n=== DETEKSI OUTLIER ===\n\n")
outlier_report <- list()
for(var in names(data)[sapply(data, is.numeric)]) {
cat("Analisis outlier untuk variabel:", var, "\n")
# Statistik deskriptif
stats <- summary(data[[var]])
iqr_val <- IQR(data[[var]], na.rm = TRUE)
q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Deteksi outlier dengan metode IQR
outliers_iqr <- data[[var]][data[[var]] < lower_bound | data[[var]] > upper_bound]
# Deteksi outlier dengan metode Z-score
z_scores <- scale(data[[var]])
outliers_z <- data[[var]][abs(z_scores) > 3]
# Deteksi outlier dengan metode Grubbs (uji statistik)
if(length(na.omit(data[[var]])) > 6) {
tryCatch({
grubbs_test <- grubbs.test(na.omit(data[[var]]))
grubbs_outlier <- ifelse(grubbs_test$p.value < 0.05, "Terdeteksi", "Tidak terdeteksi")
}, error = function(e) {
grubbs_outlier <- "Tidak dapat dihitung"
})
} else {
grubbs_outlier <- "Data tidak cukup"
}
# Ringkasan
outlier_report[[var]] <- list(
n_outliers_iqr = length(outliers_iqr),
n_outliers_z = length(outliers_z),
grubbs_result = grubbs_outlier,
lower_bound = lower_bound,
upper_bound = upper_bound,
outlier_values = unique(round(outliers_iqr, 2))
)
cat(" - Outlier (IQR method):", length(outliers_iqr), "\n")
cat(" - Outlier (Z-score > 3):", length(outliers_z), "\n")
cat(" - Uji Grubbs:", grubbs_outlier, "\n")
cat(" - Batas bawah:", round(lower_bound, 2), "\n")
cat(" - Batas atas:", round(upper_bound, 2), "\n")
if(length(outliers_iqr) > 0) {
cat(" - Nilai outlier:", paste(head(unique(round(outliers_iqr, 2)), 5), collapse = ", "), "\n")
}
cat("\n")
}
return(outlier_report)
}
# Deteksi outlier
outlier_analysis <- detect_outliers(data)
##
## === DETEKSI OUTLIER ===
##
## Analisis outlier untuk variabel: id
## - Outlier (IQR method): 0
## - Outlier (Z-score > 3): 0
## - Uji Grubbs: Tidak terdeteksi
## - Batas bawah: -498.5
## - Batas atas: 1499.5
##
## Analisis outlier untuk variabel: age
## - Outlier (IQR method): 12
## - Outlier (Z-score > 3): 3
## - Uji Grubbs: Terdeteksi
## - Batas bawah: 7.5
## - Batas atas: 83.5
## - Nilai outlier: 94, 150, -5, 84, 5
##
## Analisis outlier untuk variabel: income
## - Outlier (IQR method): 57
## - Outlier (Z-score > 3): 52
## - Uji Grubbs: Terdeteksi
## - Batas bawah: -5850.62
## - Batas atas: 107870.4
## - Nilai outlier: NA, 5e+05, -10000, 1e+06, 113681
##
## Analisis outlier untuk variabel: blood_pressure_systolic
## - Outlier (IQR method): 26
## - Outlier (Z-score > 3): 22
## - Uji Grubbs: Tidak terdeteksi
## - Batas bawah: 80.59
## - Batas atas: 159.07
## - Nilai outlier: NA, 165.33, 77.27, 161.03, 80.51
##
## Analisis outlier untuk variabel: blood_pressure_diastolic
## - Outlier (IQR method): 12
## - Outlier (Z-score > 3): 1
## - Uji Grubbs: Terdeteksi
## - Batas bawah: 54.21
## - Batas atas: 105.93
## - Nilai outlier: 105.98, 50.82, 106.54, 50.5, 51.61
##
## Analisis outlier untuk variabel: cholesterol
## - Outlier (IQR method): 41
## - Outlier (Z-score > 3): 36
## - Uji Grubbs: Tidak terdeteksi
## - Batas bawah: 90.7
## - Batas atas: 306.12
## - Nilai outlier: NA, 89.86, 318.86, 84.49, 330.87
##
## Analisis outlier untuk variabel: bmi
## - Outlier (IQR method): 11
## - Outlier (Z-score > 3): 4
## - Uji Grubbs: Terdeteksi
## - Batas bawah: 11.21
## - Batas atas: 38.82
## - Nilai outlier: 10.38, 60, 8.55, 10, 10.57
# 4. Visualisasi outlier
visualize_outliers <- function(data) {
cat("\n=== VISUALISASI OUTLIER ===\n")
numeric_vars <- names(data)[sapply(data, is.numeric)]
# Boxplot untuk setiap variabel numerik
par(mfrow = c(2, 3))
for(var in numeric_vars[1:min(6, length(numeric_vars))]) {
boxplot(data[[var]], main = var, col = "lightblue",
ylab = "Nilai", outline = TRUE)
grid()
}
par(mfrow = c(1, 1))
# Histogram dengan overlay outlier
for(var in numeric_vars[1:min(3, length(numeric_vars))]) {
# Hitung batas outlier
q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
iqr_val <- IQR(data[[var]], na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Identifikasi outlier
is_outlier <- data[[var]] < lower_bound | data[[var]] > upper_bound
# Plot histogram
hist_data <- ggplot(data.frame(value = data[[var]]), aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", alpha = 0.7) +
geom_density(color = "darkblue", linewidth = 1) +
geom_vline(xintercept = c(lower_bound, upper_bound),
color = "red", linetype = "dashed", linewidth = 1) +
labs(title = paste("Distribusi dan Outlier:", var),
x = var, y = "Density") +
theme_minimal() +
annotate("text", x = lower_bound, y = 0,
label = "Bawah", vjust = 2, color = "red") +
annotate("text", x = upper_bound, y = 0,
label = "Atas", vjust = 2, color = "red")
print(hist_data)
}
# Scatter plot matrix untuk melihat outlier multivariat
if(length(numeric_vars) >= 3) {
pairs(data[, numeric_vars[1:min(4, length(numeric_vars))]],
main = "Scatter Plot Matrix untuk Deteksi Outlier",
pch = 19, col = alpha("blue", 0.6))
}
}
# Jalankan visualisasi
visualize_outliers(data)
##
## === VISUALISASI OUTLIER ===

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.


## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density()`).


# 5. Metode penanganan outlier
handle_outliers <- function(data, method = "cap") {
cat("\n=== PENANGANAN OUTLIER ===\n")
cat("Metode yang digunakan:", method, "\n\n")
data_processed <- data
for(var in names(data_processed)[sapply(data_processed, is.numeric)]) {
# Hitung batas outlier
q1 <- quantile(data_processed[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data_processed[[var]], 0.75, na.rm = TRUE)
iqr_val <- IQR(data_processed[[var]], na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Identifikasi outlier
is_outlier <- data_processed[[var]] < lower_bound | data_processed[[var]] > upper_bound
n_outliers <- sum(is_outlier, na.rm = TRUE)
if(n_outliers > 0) {
cat("Variabel", var, "memiliki", n_outliers, "outlier\n")
if(method == "cap") {
# Cap outliers (Winsorizing)
data_processed[[var]][data_processed[[var]] < lower_bound] <- lower_bound
data_processed[[var]][data_processed[[var]] > upper_bound] <- upper_bound
cat(" Outlier di-cap ke batas [", round(lower_bound, 2), ", ",
round(upper_bound, 2), "]\n", sep = "")
} else if(method == "remove") {
# Hapus outlier (set sebagai NA)
data_processed[[var]][is_outlier] <- NA
cat(" Outlier dihapus (dijadikan NA)\n")
} else if(method == "transform") {
# Transformasi logaritmik
if(all(data_processed[[var]] > 0, na.rm = TRUE)) {
data_processed[[var]] <- log(data_processed[[var]] + 1)
cat(" Dilakukan transformasi log\n")
} else {
cat(" Transformasi log tidak dapat dilakukan (nilai negatif)\n")
}
} else if(method == "median") {
# Ganti dengan median
median_val <- median(data_processed[[var]], na.rm = TRUE)
data_processed[[var]][is_outlier] <- median_val
cat(" Diganti dengan median:", round(median_val, 2), "\n")
}
}
}
return(data_processed)
}
# Contoh penggunaan metode penanganan outlier
data_capped <- handle_outliers(data, method = "cap")
##
## === PENANGANAN OUTLIER ===
## Metode yang digunakan: cap
##
## Variabel age memiliki 12 outlier
## Outlier di-cap ke batas [7.5, 83.5]
## Variabel income memiliki 7 outlier
## Outlier di-cap ke batas [-5850.62, 107870.4]
## Variabel blood_pressure_systolic memiliki 6 outlier
## Outlier di-cap ke batas [80.59, 159.07]
## Variabel blood_pressure_diastolic memiliki 12 outlier
## Outlier di-cap ke batas [54.21, 105.93]
## Variabel cholesterol memiliki 11 outlier
## Outlier di-cap ke batas [90.7, 306.12]
## Variabel bmi memiliki 11 outlier
## Outlier di-cap ke batas [11.21, 38.82]
data_no_outliers <- handle_outliers(data, method = "remove")
##
## === PENANGANAN OUTLIER ===
## Metode yang digunakan: remove
##
## Variabel age memiliki 12 outlier
## Outlier dihapus (dijadikan NA)
## Variabel income memiliki 7 outlier
## Outlier dihapus (dijadikan NA)
## Variabel blood_pressure_systolic memiliki 6 outlier
## Outlier dihapus (dijadikan NA)
## Variabel blood_pressure_diastolic memiliki 12 outlier
## Outlier dihapus (dijadikan NA)
## Variabel cholesterol memiliki 11 outlier
## Outlier dihapus (dijadikan NA)
## Variabel bmi memiliki 11 outlier
## Outlier dihapus (dijadikan NA)
CONTOH CODE VISUALISASI
# ========================================================
# VISUALISASI HASIL PEMBERSIHAN DATA
# ========================================================
# 1. FUNGSI UTAMA VISUALISASI PERBANDINGAN
visualize_cleaning_results <- function(data_before, data_after, results_list) {
cat("\n=== VISUALISASI HASIL PEMBERSIHAN DATA ===\n")
# Set layout untuk multiple plots
par(mfrow = c(2, 3))
# 1. PERBANDINGAN MISSING VALUES
visualize_missing_comparison <- function(data_before, data_after) {
missing_before <- colSums(is.na(data_before))
missing_after <- colSums(is.na(data_after))
# Hanya ambil variabel dengan missing values
vars_with_missing <- names(missing_before)[missing_before > 0 | missing_after > 0]
if(length(vars_with_missing) > 0) {
comparison_df <- data.frame(
Variable = vars_with_missing,
Before = missing_before[vars_with_missing],
After = missing_after[vars_with_missing]
)
# Konversi ke long format untuk ggplot
comparison_long <- comparison_df %>%
pivot_longer(cols = c(Before, After),
names_to = "Status",
values_to = "Missing_Count")
p1 <- ggplot(comparison_long, aes(x = Variable, y = Missing_Count, fill = Status)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
scale_fill_manual(values = c("Before" = "#E74C3C", "After" = "#2ECC71")) +
labs(title = "Missing Values: Sebelum vs Sesudah",
x = "Variabel",
y = "Jumlah Missing Values",
fill = "Status") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")) +
geom_text(aes(label = Missing_Count),
position = position_dodge(width = 0.7),
vjust = -0.5, size = 3)
print(p1)
} else {
plot.new()
text(0.5, 0.5, "Tidak ada missing values", cex = 1.2)
}
}
# 2. PERBANDINGAN DISTRIBUSI VARIABEL NUMERIK UTAMA
visualize_distribution_comparison <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) > 0) {
# Pilih variabel dengan perubahan terbesar
selected_var <- numeric_vars[1]
# Buat data frame untuk plotting
plot_data <- data.frame(
Value = c(data_before[[selected_var]], data_after[[selected_var]]),
Status = rep(c("Sebelum", "Sesudah"),
c(length(data_before[[selected_var]]),
length(data_after[[selected_var]])))
)
p2 <- ggplot(plot_data, aes(x = Value, fill = Status)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = paste("Distribusi", selected_var, ": Sebelum vs Sesudah"),
x = selected_var,
y = "Densitas",
fill = "Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Tambahkan garis vertikal untuk mean
mean_before <- mean(data_before[[selected_var]], na.rm = TRUE)
mean_after <- mean(data_after[[selected_var]], na.rm = TRUE)
p2 <- p2 +
geom_vline(xintercept = mean_before, color = "#E74C3C",
linetype = "dashed", linewidth = 1) +
geom_vline(xintercept = mean_after, color = "#2ECC71",
linetype = "dashed", linewidth = 1) +
annotate("text", x = mean_before, y = 0,
label = paste("Mean sebelum:", round(mean_before, 2)),
vjust = 2, color = "#E74C3C") +
annotate("text", x = mean_after, y = 0,
label = paste("Mean sesudah:", round(mean_after, 2)),
vjust = 4, color = "#2ECC71")
print(p2)
}
}
# 3. BOXPLOT OUTLIER PERBANDINGAN
visualize_outlier_comparison <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) >= 3) {
# Pilih 3 variabel pertama
selected_vars <- numeric_vars[1:min(3, length(numeric_vars))]
# Gabungkan data untuk plotting
plot_list <- list()
for(i in 1:length(selected_vars)) {
var <- selected_vars[i]
# Hitung outlier bounds
q1 <- quantile(data_before[[var]], 0.25, na.rm = TRUE)
q3 <- quantile(data_before[[var]], 0.75, na.rm = TRUE)
iqr_val <- IQR(data_before[[var]], na.rm = TRUE)
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
# Hitung persentase outlier
outliers_before <- sum(data_before[[var]] < lower_bound |
data_before[[var]] > upper_bound, na.rm = TRUE)
outliers_after <- sum(data_after[[var]] < lower_bound |
data_after[[var]] > upper_bound, na.rm = TRUE)
total_before <- sum(!is.na(data_before[[var]]))
total_after <- sum(!is.na(data_after[[var]]))
percent_before <- round(outliers_before/total_before * 100, 1)
percent_after <- round(outliers_after/total_after * 100, 1)
plot_list[[i]] <- data.frame(
Variable = var,
Status = c("Sebelum", "Sesudah"),
Outlier_Percent = c(percent_before, percent_after)
)
}
plot_df <- do.call(rbind, plot_list)
p3 <- ggplot(plot_df, aes(x = Variable, y = Outlier_Percent, fill = Status)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = "Persentase Outlier (IQR Method)",
x = "Variabel",
y = "Persentase Outlier (%)",
fill = "Status") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5),
plot.title = element_text(hjust = 0.5, face = "bold")) +
geom_text(aes(label = paste0(Outlier_Percent, "%")),
position = position_dodge(width = 0.7),
vjust = -0.5, size = 3) +
ylim(0, max(plot_df$Outlier_Percent) * 1.2)
print(p3)
}
}
# 4. SCATTER PLOT PERBANDINGAN UNTUK DUA VARIABEL
visualize_scatter_comparison <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) >= 2) {
var1 <- numeric_vars[1]
var2 <- numeric_vars[2]
# Gabungkan data
before_data <- data.frame(
x = data_before[[var1]],
y = data_before[[var2]],
Status = "Sebelum"
)
after_data <- data.frame(
x = data_after[[var1]],
y = data_after[[var2]],
Status = "Sesudah"
)
combined_data <- rbind(before_data, after_data)
p4 <- ggplot(combined_data, aes(x = x, y = y, color = Status)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = paste("Scatter Plot:", var1, "vs", var2),
x = var1,
y = var2,
color = "Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
stat_smooth(method = "lm", se = FALSE, alpha = 0.5)
print(p4)
}
}
# 5. PERBANDINGAN STATISTIK DESKRIPTIF
visualize_descriptive_stats <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) > 0) {
# Hitung statistik deskriptif
stats_before <- data.frame(
Variable = character(),
Mean = numeric(),
SD = numeric(),
stringsAsFactors = FALSE
)
stats_after <- data.frame(
Variable = character(),
Mean = numeric(),
SD = numeric(),
stringsAsFactors = FALSE
)
for(var in numeric_vars) {
stats_before <- rbind(stats_before, data.frame(
Variable = var,
Mean = mean(data_before[[var]], na.rm = TRUE),
SD = sd(data_before[[var]], na.rm = TRUE)
))
stats_after <- rbind(stats_after, data.frame(
Variable = var,
Mean = mean(data_after[[var]], na.rm = TRUE),
SD = sd(data_after[[var]], na.rm = TRUE)
))
}
# Gabungkan dan format untuk plotting
stats_combined <- rbind(
cbind(stats_before, Status = "Sebelum"),
cbind(stats_after, Status = "Sesudah")
)
# Plot untuk 3 variabel pertama
selected_vars <- numeric_vars[1:min(3, length(numeric_vars))]
stats_filtered <- stats_combined %>% filter(Variable %in% selected_vars)
p5 <- ggplot(stats_filtered, aes(x = Variable, y = Mean, fill = Status)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD),
position = position_dodge(width = 0.7),
width = 0.2) +
scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = "Perbandingan Mean ± SD",
x = "Variabel",
y = "Nilai Mean",
fill = "Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text.x = element_text(angle = 0, hjust = 0.5)) +
geom_text(aes(label = round(Mean, 2)),
position = position_dodge(width = 0.7),
vjust = -0.5, size = 3)
print(p5)
}
}
# 6. SUMMARY PEMBERSIHAN DATA
visualize_cleaning_summary <- function(results_list) {
# Buat data frame summary
summary_data <- data.frame(
Kategori = c("Missing Values", "Outlier", "Inkonsistensi"),
Sebelum = c(
sum(is.na(data_before)),
ifelse(!is.null(results_list$outlier_analysis),
sum(sapply(results_list$outlier_analysis, function(x) x$n_outliers_iqr)),
0),
ifelse(!is.null(results_list$inconsistencies),
sum(sapply(results_list$inconsistencies, nrow)),
0)
),
Sesudah = c(
sum(is.na(data_after)),
NA, # Outlier setelah penanganan
ifelse(!is.null(results_list$inconsistencies),
sum(sapply(results_list$inconsistencies, nrow)),
0)
)
)
# Hitung persentase perbaikan
summary_data$Perbaikan <- round((summary_data$Sebelum - summary_data$Sesudah) /
summary_data$Sebelum * 100, 1)
summary_data$Perbaikan[is.na(summary_data$Perbaikan)] <- 100
# Visualisasi
p6 <- ggplot(summary_data, aes(x = Kategori)) +
geom_bar(aes(y = Sebelum, fill = "Sebelum"),
stat = "identity", position = "dodge", width = 0.4) +
geom_bar(aes(y = Sesudah, fill = "Sesudah"),
stat = "identity", position = "dodge", width = 0.4) +
scale_fill_manual(name = "Status",
values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = "Ringkasan Hasil Pembersihan Data",
x = "Kategori Masalah",
y = "Jumlah Kasus") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "bottom") +
geom_text(aes(y = Sebelum, label = Sebelum),
position = position_dodge(width = 0.4),
vjust = -0.5, size = 4, color = "#E74C3C") +
geom_text(aes(y = Sesudah, label = ifelse(is.na(Sesudah), "N/A", Sesudah)),
position = position_dodge(width = 0.4),
vjust = -0.5, size = 4, color = "#2ECC71") +
geom_text(aes(y = pmax(Sebelum, Sesudah, na.rm = TRUE),
label = ifelse(Kategori != "Outlier",
paste0(Perbaikan, "%"), "")),
vjust = 2, size = 3.5, fontface = "bold")
print(p6)
}
# Eksekusi semua visualisasi
visualize_missing_comparison(data_before, data_after)
visualize_distribution_comparison(data_before, data_after)
visualize_outlier_comparison(data_before, data_after)
visualize_scatter_comparison(data_before, data_after)
visualize_descriptive_stats(data_before, data_after)
visualize_cleaning_summary(results_list)
# Reset layout
par(mfrow = c(1, 1))
}
# 2. FUNGSI DASHBOARD INTERAKTIF (jika ingin lebih interaktif)
create_interactive_dashboard <- function(data_before, data_after, results_list) {
cat("\n=== MEMBUAT DASHBOARD INTERAKTIF ===\n")
# Install dan load plotly jika belum ada
if (!require("plotly")) {
install.packages("plotly")
library(plotly)
}
# 1. Heatmap Missing Values Interaktif
create_missing_heatmap <- function(data_before, data_after) {
# Hitung missing values
missing_before <- colSums(is.na(data_before))
missing_after <- colSums(is.na(data_after))
# Buat data frame
heat_data <- data.frame(
Variable = names(data_before),
Missing_Before = missing_before,
Missing_After = missing_after,
Reduction = missing_before - missing_after
)
# Buat heatmap interaktif
p <- plot_ly(
x = heat_data$Variable,
y = c("Sebelum", "Sesudah"),
z = as.matrix(rbind(heat_data$Missing_Before, heat_data$Missing_After)),
type = "heatmap",
colorscale = list(c(0, "#2ECC71"), c(1, "#E74C3C")),
hovertemplate = paste(
"Variabel: %{x}<br>",
"Status: %{y}<br>",
"Missing Values: %{z}<br>",
"<extra></extra>"
)
) %>%
layout(
title = "Heatmap Missing Values: Sebelum vs Sesudah",
xaxis = list(title = "Variabel"),
yaxis = list(title = "Status")
)
return(p)
}
# 2. Boxplot Interaktif untuk Outlier
create_interactive_boxplot <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) > 0) {
# Pilih variabel pertama
var <- numeric_vars[1]
# Buat data untuk plotly
plot_data <- data.frame(
Value = c(data_before[[var]], data_after[[var]]),
Status = rep(c("Sebelum", "Sesudah"),
c(length(data_before[[var]]), length(data_after[[var]])))
)
p <- plot_ly(plot_data, y = ~Value, color = ~Status, type = "box",
colors = c("#E74C3C", "#2ECC71")) %>%
layout(
title = paste("Boxplot", var, ": Sebelum vs Sesudah"),
yaxis = list(title = var),
boxmode = "group"
)
return(p)
}
}
# 3. Scatter Plot 3D Interaktif
create_3d_scatter <- function(data_before, data_after) {
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
if(length(numeric_vars) >= 3) {
# Buat data untuk plotting
before_df <- data.frame(
x = data_before[[numeric_vars[1]]],
y = data_before[[numeric_vars[2]]],
z = data_before[[numeric_vars[3]]],
Status = "Sebelum"
)
after_df <- data.frame(
x = data_after[[numeric_vars[1]]],
y = data_after[[numeric_vars[2]]],
z = data_after[[numeric_vars[3]]],
Status = "Sesudah"
)
plot_data <- rbind(before_df, after_df)
p <- plot_ly(plot_data, x = ~x, y = ~y, z = ~z,
color = ~Status, colors = c("#E74C3C", "#2ECC71"),
type = "scatter3d", mode = "markers",
marker = list(size = 3, opacity = 0.6),
hovertemplate = paste(
numeric_vars[1], ": %{x}<br>",
numeric_vars[2], ": %{y}<br>",
numeric_vars[3], ": %{z}<br>",
"<extra></extra>"
)) %>%
layout(
title = "Scatter Plot 3D: Sebelum vs Sesudah",
scene = list(
xaxis = list(title = numeric_vars[1]),
yaxis = list(title = numeric_vars[2]),
zaxis = list(title = numeric_vars[3])
)
)
return(p)
}
}
# Eksekusi pembuatan dashboard
cat("\n1. Membuat Heatmap Missing Values...\n")
heatmap_plot <- create_missing_heatmap(data_before, data_after)
cat("2. Membuat Boxplot Interaktif...\n")
boxplot_plot <- create_interactive_boxplot(data_before, data_after)
cat("3. Membuat Scatter Plot 3D...\n")
scatter_3d_plot <- create_3d_scatter(data_before, data_after)
# Tampilkan plots
print(heatmap_plot)
print(boxplot_plot)
if(!is.null(scatter_3d_plot)) {
print(scatter_3d_plot)
}
cat("\nDashboard interaktif telah dibuat!\n")
}
# 3. FUNGSI LAPORAN PDF
generate_pdf_report <- function(data_before, data_after, results_list, filename = "cleaning_report.pdf") {
cat("\n=== MEMBUAT LAPORAN PDF ===\n")
# Install dan load paket yang diperlukan
if (!require("gridExtra")) {
install.packages("gridExtra")
library(gridExtra)
}
# Buat fungsi untuk membuat table grob
create_summary_table <- function(data_before, data_after) {
# Statistik summary
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
summary_table <- data.frame(
Variabel = character(),
`Mean Sebelum` = numeric(),
`Mean Sesudah` = numeric(),
`SD Sebelum` = numeric(),
`SD Sesudah` = numeric(),
`% Perubahan` = numeric(),
stringsAsFactors = FALSE
)
for(var in numeric_vars[1:min(5, length(numeric_vars))]) {
mean_before <- mean(data_before[[var]], na.rm = TRUE)
mean_after <- mean(data_after[[var]], na.rm = TRUE)
sd_before <- sd(data_before[[var]], na.rm = TRUE)
sd_after <- sd(data_after[[var]], na.rm = TRUE)
percent_change <- round((mean_after - mean_before) / mean_before * 100, 2)
summary_table <- rbind(summary_table, data.frame(
Variabel = var,
`Mean Sebelum` = round(mean_before, 2),
`Mean Sesudah` = round(mean_after, 2),
`SD Sebelum` = round(sd_before, 2),
`SD Sesudah` = round(sd_after, 2),
`% Perubahan` = percent_change
))
}
return(summary_table)
}
# Buat PDF
pdf(filename, width = 11, height = 8.5)
# Halaman 1: Title Page
grid.arrange(
grobs = list(
grid::textGrob("LAPORAN PEMBERSIHAN DATA",
gp = grid::gpar(fontsize = 24, fontface = "bold")),
grid::textGrob("\n\nAnalisis Kualitas Data dan Hasil Pembersihan",
gp = grid::gpar(fontsize = 18)),
grid::textGrob(paste("\n\n\nTanggal:", Sys.Date()),
gp = grid::gpar(fontsize = 14)),
grid::textGrob("\n\n\nRingkasan Eksekusi:",
gp = grid::gpar(fontsize = 16, fontface = "bold")),
grid::tableGrob(create_summary_table(data_before, data_after),
theme = gridExtra::ttheme_default(
base_size = 10,
padding = unit(c(4, 4), "mm")
))
),
ncol = 1,
heights = c(1, 0.5, 0.5, 0.5, 2)
)
# Halaman 2: Visualizations
visualize_cleaning_results(data_before, data_after, results_list)
# Halaman 3: Detailed Statistics
grid.arrange(
grobs = list(
grid::textGrob("STATISTIK DETAIL PEMBERSIHAN",
gp = grid::gpar(fontsize = 18, fontface = "bold")),
grid::tableGrob(data.frame(
Metrik = c("Jumlah Observasi", "Jumlah Variabel",
"Missing Values Awal", "Missing Values Akhir",
"Persentase Data Bersih"),
Nilai = c(
nrow(data_before),
ncol(data_before),
sum(is.na(data_before)),
sum(is.na(data_after)),
paste0(round((1 - sum(is.na(data_after))/(nrow(data_after)*ncol(data_after))) * 100, 2), "%")
)
), theme = gridExtra::ttheme_default(base_size = 12))
),
ncol = 1,
heights = c(0.5, 1)
)
dev.off()
cat("\nLaporan PDF telah disimpan sebagai:", filename, "\n")
}
# 4. FUNGSI UTAMA UNTUK MENJALANKAN SEMUA VISUALISASI
run_all_visualizations <- function(data_before, data_after, results_list) {
cat("========================================\n")
cat("VISUALISASI KOMPREHENSIF HASIL PEMBERSIHAN\n")
cat("========================================\n\n")
# 1. Visualisasi statis (6 panel)
cat("1. Membuat visualisasi statis 6-panel...\n")
visualize_cleaning_results(data_before, data_after, results_list)
# 2. Dashboard interaktif
cat("\n2. Membuat dashboard interaktif...\n")
tryCatch({
create_interactive_dashboard(data_before, data_after, results_list)
}, error = function(e) {
cat("Dashboard interaktif tidak dapat dibuat:", e$message, "\n")
})
# 3. Simpan laporan PDF
cat("\n3. Membuat laporan PDF...\n")
generate_pdf_report(data_before, data_after, results_list)
# 4. Simpan plot individual
cat("\n4. Menyimpan plot individual...\n")
save_individual_plots(data_before, data_after, results_list)
cat("\n========================================\n")
cat("SEMUA VISUALISASI TELAH SELESAI DIBUAT\n")
cat("========================================\n")
}
# 5. FUNGSI UNTUK MENYIMPAN PLOT INDIVIDUAL
save_individual_plots <- function(data_before, data_after, results_list) {
# Buat folder untuk menyimpan plot
if (!dir.exists("cleaning_plots")) {
dir.create("cleaning_plots")
}
# 1. Plot missing values comparison
png("cleaning_plots/missing_values_comparison.png", width = 800, height = 600)
visualize_cleaning_results(data_before, data_after, results_list)
dev.off()
# 2. Distribution plots untuk setiap variabel numerik
numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
for(var in numeric_vars[1:min(5, length(numeric_vars))]) {
png(paste0("cleaning_plots/distribution_", var, ".png"), width = 800, height = 600)
plot_data <- data.frame(
Value = c(data_before[[var]], data_after[[var]]),
Status = rep(c("Sebelum", "Sesudah"),
c(length(data_before[[var]]), length(data_after[[var]])))
)
p <- ggplot(plot_data, aes(x = Value, fill = Status)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = paste("Distribusi", var, ": Sebelum vs Sesudah"),
x = var, y = "Densitas") +
theme_minimal()
print(p)
dev.off()
}
# 3. Boxplot comparison
png("cleaning_plots/boxplot_comparison.png", width = 800, height = 600)
if(length(numeric_vars) > 0) {
# Gabungkan data
plot_data_before <- data_before[, numeric_vars[1:min(3, length(numeric_vars))]]
plot_data_after <- data_after[, numeric_vars[1:min(3, length(numeric_vars))]]
# Convert to long format
plot_data_before_long <- plot_data_before %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
mutate(Status = "Sebelum")
plot_data_after_long <- plot_data_after %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
mutate(Status = "Sesudah")
plot_data_combined <- rbind(plot_data_before_long, plot_data_after_long)
p <- ggplot(plot_data_combined, aes(x = Variable, y = Value, fill = Status)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
labs(title = "Perbandingan Boxplot: Sebelum vs Sesudah",
x = "Variabel", y = "Nilai") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
dev.off()
cat("\nPlot individual telah disimpan di folder 'cleaning_plots/'\n")
}
ANALISIS REGRESI LINEAR SEDERHANA
# ====================================================
# ANALISIS REGRESI LINEAR SEDERHANA
# Lengkap dengan Pengujian Hipotesis
# ====================================================
# 1. MEMUAT DATA CONTOH
# ----------------------
# Membuat data contoh sederhana untuk analisis
# Variabel X (prediktor) dan Y (respons)
set.seed(123) # Untuk hasil yang reproducible
# Data contoh: Hubungan antara Jam Belajar (X) dan Nilai Ujian (Y)
jam_belajar <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
nilai_ujian <- c(55, 58, 63, 65, 68, 70, 73, 75, 78, 80, 82, 85, 88, 90, 92) + rnorm(15, 0, 3)
# Buat data frame
data_studi <- data.frame(
jam_belajar = jam_belajar,
nilai_ujian = nilai_ujian
)
# Tampilkan data
cat("DATA CONTOH:\n")
## DATA CONTOH:
cat("Jam Belajar vs Nilai Ujian\n")
## Jam Belajar vs Nilai Ujian
## jam_belajar nilai_ujian
## 1 1 53.31857
## 2 2 57.30947
## 3 3 67.67612
## 4 4 65.21153
## 5 5 68.38786
## 6 6 75.14519
## 7 7 74.38275
## 8 8 71.20482
## 9 9 75.93944
## 10 10 78.66301
## 11 11 85.67225
## 12 12 86.07944
## 13 13 89.20231
## 14 14 90.33205
## 15 15 90.33248
# 2. ANALISIS DESKRIPTIF
# ----------------------
cat("\n\n=== ANALISIS DESKRIPTIF ===\n")
##
##
## === ANALISIS DESKRIPTIF ===
# Statistik deskriptif
cat("\n1. STATISTIK DESKRIPTIF:\n")
##
## 1. STATISTIK DESKRIPTIF:
desc_stats <- data.frame(
Variabel = c("Jam Belajar (X)", "Nilai Ujian (Y)"),
Mean = c(mean(data_studi$jam_belajar), mean(data_studi$nilai_ujian)),
SD = c(sd(data_studi$jam_belajar), sd(data_studi$nilai_ujian)),
Min = c(min(data_studi$jam_belajar), min(data_studi$nilai_ujian)),
Max = c(max(data_studi$jam_belajar), max(data_studi$nilai_ujian))
)
print(desc_stats)
## Variabel Mean SD Min Max
## 1 Jam Belajar (X) 8.00000 4.472136 1.00000 15.00000
## 2 Nilai Ujian (Y) 75.25715 11.688944 53.31857 90.33248
# Korelasi
cat("\n2. KORELASI:\n")
##
## 2. KORELASI:
correlation <- cor(data_studi$jam_belajar, data_studi$nilai_ujian)
cat("Koefisien Korelasi (r) =", round(correlation, 4), "\n")
## Koefisien Korelasi (r) = 0.9685
# Interpretasi korelasi
if (abs(correlation) >= 0.9) {
cat("Interpretasi: Korelasi sangat kuat\n")
} else if (abs(correlation) >= 0.7) {
cat("Interpretasi: Korelasi kuat\n")
} else if (abs(correlation) >= 0.5) {
cat("Interpretasi: Korelasi sedang\n")
} else if (abs(correlation) >= 0.3) {
cat("Interpretasi: Korelasi lemah\n")
} else {
cat("Interpretasi: Korelasi sangat lemah/tidak ada\n")
}
## Interpretasi: Korelasi sangat kuat
# 3. VISUALISASI DATA
# -------------------
cat("\n\n=== VISUALISASI DATA ===\n")
##
##
## === VISUALISASI DATA ===
# Scatter plot
par(mfrow = c(1, 2)) # Dua plot berdampingan
# Plot 1: Scatter plot tanpa garis regresi
plot(data_studi$jam_belajar, data_studi$nilai_ujian,
main = "Scatter Plot: Jam Belajar vs Nilai Ujian",
xlab = "Jam Belajar (X)",
ylab = "Nilai Ujian (Y)",
pch = 19, col = "blue", cex = 1.5)
grid()
# Plot 2: Scatter plot dengan garis regresi
plot(data_studi$jam_belajar, data_studi$nilai_ujian,
main = "Data dengan Garis Regresi",
xlab = "Jam Belajar (X)",
ylab = "Nilai Ujian (Y)",
pch = 19, col = "blue", cex = 1.5)
grid()

# Kembalikan ke satu plot per window
par(mfrow = c(1, 1))
# 4. REGRESI LINEAR SEDERHANA
# ---------------------------
cat("\n\n=== REGRESI LINEAR SEDERHANA ===\n")
##
##
## === REGRESI LINEAR SEDERHANA ===
# Model regresi linear sederhana
model <- lm(nilai_ujian ~ jam_belajar, data = data_studi)
# Ringkasan model
cat("\n1. RINGKASAN MODEL:\n")
##
## 1. RINGKASAN MODEL:
summary_model <- summary(model)
print(summary_model)
##
## Call:
## lm(formula = nilai_ujian ~ jam_belajar, data = data_studi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2191 -2.2466 0.0798 1.4727 5.0758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.0063 1.6416 33.51 5.25e-14 ***
## jam_belajar 2.5314 0.1806 14.02 3.17e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.021 on 13 degrees of freedom
## Multiple R-squared: 0.938, Adjusted R-squared: 0.9332
## F-statistic: 196.6 on 1 and 13 DF, p-value: 3.167e-09
# Ekstrak komponen penting
cat("\n2. PERSAMAAN REGRESI:\n")
##
## 2. PERSAMAAN REGRESI:
intercept <- coef(model)[1]
slope <- coef(model)[2]
cat("Persamaan Regresi: Y =", round(intercept, 4), "+", round(slope, 4), "* X\n")
## Persamaan Regresi: Y = 55.0063 + 2.5314 * X
cat("Artinya: Setiap penambahan 1 jam belajar, nilai ujian meningkat", round(slope, 2), "poin\n")
## Artinya: Setiap penambahan 1 jam belajar, nilai ujian meningkat 2.53 poin
# Koefisien determinasi
cat("\n3. KOEFISIEN DETERMINASI (R-squared):\n")
##
## 3. KOEFISIEN DETERMINASI (R-squared):
r_squared <- summary_model$r.squared
cat("R-squared =", round(r_squared, 4), "\n")
## R-squared = 0.938
cat("Artinya:", round(r_squared * 100, 2), "% variasi nilai ujian dapat dijelaskan oleh jam belajar\n")
## Artinya: 93.8 % variasi nilai ujian dapat dijelaskan oleh jam belajar
# 5. ASUMSI REGRESI LINEAR
# ------------------------
cat("\n\n=== UJI ASUMSI REGRESI LINEAR ===\n")
##
##
## === UJI ASUMSI REGRESI LINEAR ===
# 5.1 Normalitas Residual
cat("\n1. UJI NORMALITAS RESIDUAL (Shapiro-Wilk):\n")
##
## 1. UJI NORMALITAS RESIDUAL (Shapiro-Wilk):
residuals <- resid(model)
shapiro_test <- shapiro.test(residuals)
cat("Statistik Uji W =", round(shapiro_test$statistic, 4), "\n")
## Statistik Uji W = 0.9529
cat("p-value =", round(shapiro_test$p.value, 4), "\n")
## p-value = 0.5715
if (shapiro_test$p.value > 0.05) {
cat("Keputusan: Residual berdistribusi normal (p > 0.05)\n")
} else {
cat("Keputusan: Residual TIDAK berdistribusi normal (p < 0.05)\n")
}
## Keputusan: Residual berdistribusi normal (p > 0.05)
# Plot normalitas residual
par(mfrow = c(1, 2))
qqnorm(residuals, main = "Q-Q Plot Residual")
qqline(residuals, col = "red")
hist(residuals, main = "Histogram Residual",
xlab = "Residual", col = "lightblue", border = "black")

# 5.2 Homoskedastisitas (Uji Breusch-Pagan)
cat("\n2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):\n")
##
## 2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):
if (!require(lmtest)) {
install.packages("lmtest")
library(lmtest)
}
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bp_test <- bptest(model)
cat("Statistik Uji LM =", round(bp_test$statistic, 4), "\n")
## Statistik Uji LM = 3.2658
cat("p-value =", round(bp_test$p.value, 4), "\n")
## p-value = 0.0707
if (bp_test$p.value > 0.05) {
cat("Keputusan: Varian residual homogen (homoskedastisitas terpenuhi)\n")
} else {
cat("Keputusan: Varian residual TIDAK homogen (ada heteroskedastisitas)\n")
}
## Keputusan: Varian residual homogen (homoskedastisitas terpenuhi)
# Plot residual vs fitted
par(mfrow = c(1, 2))
plot(fitted(model), residuals,
main = "Residual vs Fitted Values",
xlab = "Fitted Values", ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
grid()
# Plot residual vs prediktor
plot(data_studi$jam_belajar, residuals,
main = "Residual vs Prediktor (X)",
xlab = "Jam Belajar", ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
grid()

par(mfrow = c(1, 1))
# 6. PENGUJIAN HIPOTESIS
# ----------------------
cat("\n\n=== PENGUJIAN HIPOTESIS ===\n")
##
##
## === PENGUJIAN HIPOTESIS ===
# 6.1 Hipotesis untuk Koefisien Regresi (Slope)
cat("\n1. UJI HIPOTESIS UNTUK SLOPE (β1):\n")
##
## 1. UJI HIPOTESIS UNTUK SLOPE (β1):
cat(" Hipotesis Nol (H0): β1 = 0 (Tidak ada hubungan linear)\n")
## Hipotesis Nol (H0): β1 = 0 (Tidak ada hubungan linear)
cat(" Hipotesis Alternatif (H1): β1 ≠ 0 (Ada hubungan linear)\n")
## Hipotesis Alternatif (H1): β1 ≠ 0 (Ada hubungan linear)
# Dari output summary model
coef_table <- summary_model$coefficients
slope_p_value <- coef_table[2, 4]
cat("\n Hasil Uji:\n")
##
## Hasil Uji:
cat(" t-statistik =", round(coef_table[2, 3], 4), "\n")
## t-statistik = 14.0201
cat(" p-value =", round(slope_p_value, 6), "\n")
## p-value = 0
alpha <- 0.05
cat(" Tingkat signifikansi (α) =", alpha, "\n")
## Tingkat signifikansi (α) = 0.05
if (slope_p_value < alpha) {
cat("\n KEPUTUSAN: Tolak H0\n")
cat(" KESIMPULAN: Ada bukti statistik yang cukup untuk menyatakan bahwa\n")
cat(" terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian\n")
} else {
cat("\n KEPUTUSAN: Gagal tolak H0\n")
cat(" KESIMPULAN: Tidak ada bukti statistik yang cukup untuk menyatakan bahwa\n")
cat(" terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian\n")
}
##
## KEPUTUSAN: Tolak H0
## KESIMPULAN: Ada bukti statistik yang cukup untuk menyatakan bahwa
## terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian
# 6.2 Interval Kepercayaan
cat("\n2. INTERVAL KEPERCAYAAN 95% UNTUK KOEFISIEN:\n")
##
## 2. INTERVAL KEPERCAYAAN 95% UNTUK KOEFISIEN:
conf_int <- confint(model, level = 0.95)
cat(" Intercept (β0): [", round(conf_int[1, 1], 4), ", ", round(conf_int[1, 2], 4), "]\n", sep = "")
## Intercept (β0): [51.4598, 58.5527]
cat(" Slope (β1): [", round(conf_int[2, 1], 4), ", ", round(conf_int[2, 2], 4), "]\n", sep = "")
## Slope (β1): [2.1413, 2.9214]
# 7. PREDIKSI
# -----------
cat("\n\n=== PREDIKSI ===\n")
##
##
## === PREDIKSI ===
# Prediksi untuk nilai X tertentu
jam_prediksi <- c(5, 10, 15)
prediksi <- predict(model, newdata = data.frame(jam_belajar = jam_prediksi),
interval = "confidence", level = 0.95)
cat("\n1. PREDIKSI NILAI UJIAN:\n")
##
## 1. PREDIKSI NILAI UJIAN:
prediksi_df <- data.frame(
Jam_Belajar = jam_prediksi,
Prediksi_Nilai = round(prediksi[, "fit"], 2),
Batas_Bawah = round(prediksi[, "lwr"], 2),
Batas_Atas = round(prediksi[, "upr"], 2)
)
print(prediksi_df)
## Jam_Belajar Prediksi_Nilai Batas_Bawah Batas_Atas
## 1 5 67.66 65.61 69.71
## 2 10 80.32 78.46 82.18
## 3 15 92.98 89.77 96.19
# Plot dengan interval kepercayaan
x_range <- seq(min(data_studi$jam_belajar), max(data_studi$jam_belajar), length.out = 100)
pred_range <- predict(model, newdata = data.frame(jam_belajar = x_range),
interval = "confidence", level = 0.95)
plot(data_studi$jam_belajar, data_studi$nilai_ujian,
main = "Regresi Linear dengan Interval Kepercayaan",
xlab = "Jam Belajar", ylab = "Nilai Ujian",
pch = 19, col = "blue", cex = 1.2)
# Garis regresi
lines(x_range, pred_range[, "fit"], col = "red", lwd = 2)
# Interval kepercayaan
polygon(c(x_range, rev(x_range)),
c(pred_range[, "lwr"], rev(pred_range[, "upr"])),
col = rgb(1, 0, 0, 0.2), border = NA)
legend("topleft",
legend = c("Data", "Garis Regresi", "95% CI"),
col = c("blue", "red", rgb(1, 0, 0, 0.2)),
pch = c(19, NA, 15),
lty = c(NA, 1, NA),
lwd = c(NA, 2, NA),
pt.cex = c(1.2, NA, 2))

# 8. LAPORAN LENGKAP
# ------------------
cat("\n\n=== LAPORAN ANALISIS LENGKAP ===\n")
##
##
## === LAPORAN ANALISIS LENGKAP ===
cat("====================================\n")
## ====================================
cat("\nA. INFORMASI DATA\n")
##
## A. INFORMASI DATA
cat(" Jumlah observasi:", nrow(data_studi), "\n")
## Jumlah observasi: 15
cat(" Variabel prediktor (X): Jam Belajar\n")
## Variabel prediktor (X): Jam Belajar
cat(" Variabel respons (Y): Nilai Ujian\n")
## Variabel respons (Y): Nilai Ujian
cat("\nB. HASIL REGRESI LINEAR\n")
##
## B. HASIL REGRESI LINEAR
cat(" Persamaan: Y =", round(intercept, 4), "+", round(slope, 4), "X\n")
## Persamaan: Y = 55.0063 + 2.5314 X
cat(" Koefisien determinasi (R²):", round(r_squared, 4), "\n")
## Koefisien determinasi (R²): 0.938
cat(" Koefisien korelasi (r):", round(correlation, 4), "\n")
## Koefisien korelasi (r): 0.9685
cat("\nC. PENGUJIAN HIPOTESIS\n")
##
## C. PENGUJIAN HIPOTESIS
cat(" Hipotesis H0: β1 = 0 (Tidak ada hubungan linear)\n")
## Hipotesis H0: β1 = 0 (Tidak ada hubungan linear)
cat(" Hipotesis H1: β1 ≠ 0 (Ada hubungan linear)\n")
## Hipotesis H1: β1 ≠ 0 (Ada hubungan linear)
cat(" p-value:", round(slope_p_value, 6), "\n")
## p-value: 0
if (slope_p_value < 0.05) {
cat(" KESIMPULAN: Hubungan linear SIGNIFIKAN (p < 0.05)\n")
} else {
cat(" KESIMPULAN: Hubungan linear TIDAK signifikan (p ≥ 0.05)\n")
}
## KESIMPULAN: Hubungan linear SIGNIFIKAN (p < 0.05)
cat("\nD. ASUMSI REGRESI\n")
##
## D. ASUMSI REGRESI
cat(" 1. Normalitas residual: p-value =", round(shapiro_test$p.value, 4))
## 1. Normalitas residual: p-value = 0.5715
if (shapiro_test$p.value > 0.05) {
cat(" (Terpenuhi)\n")
} else {
cat(" (Tidak terpenuhi)\n")
}
## (Terpenuhi)
cat(" 2. Homoskedastisitas: p-value =", round(bp_test$p.value, 4))
## 2. Homoskedastisitas: p-value = 0.0707
if (bp_test$p.value > 0.05) {
cat(" (Terpenuhi)\n")
} else {
cat(" (Tidak terpenuhi)\n")
}
## (Terpenuhi)
cat("\nE. INTERPRETASI PRAKTIS\n")
##
## E. INTERPRETASI PRAKTIS
cat(" Setiap penambahan 1 jam belajar diharapkan meningkatkan nilai ujian sebesar",
round(slope, 2), "poin\n")
## Setiap penambahan 1 jam belajar diharapkan meningkatkan nilai ujian sebesar 2.53 poin
cat(" Model ini menjelaskan", round(r_squared * 100, 1),
"% variasi dalam nilai ujian\n")
## Model ini menjelaskan 93.8 % variasi dalam nilai ujian
# 9. FUNGSI UNTUK ANALISIS UMUM
# ------------------------------
cat("\n\n=== FUNGSI UMUM UNTUK ANALISIS REGRESI ===\n")
##
##
## === FUNGSI UMUM UNTUK ANALISIS REGRESI ===
# Fungsi untuk analisis regresi linear sederhana
analisis_regresi_sederhana <- function(x, y, x_name = "X", y_name = "Y", alpha = 0.05) {
# Buat data frame
data_analisis <- data.frame(x = x, y = y)
# Model regresi
model <- lm(y ~ x, data = data_analisis)
summary_model <- summary(model)
# Hasil utama
intercept <- coef(model)[1]
slope <- coef(model)[2]
r_squared <- summary_model$r.squared
p_value <- summary_model$coefficients[2, 4]
# Uji asumsi
shapiro_test <- shapiro.test(resid(model))
bp_test <- bptest(model)
# Buat laporan
cat("\n=== HASIL ANALISIS REGRESI LINEAR ===\n")
cat("Variabel X:", x_name, "\n")
cat("Variabel Y:", y_name, "\n")
cat("\n1. Persamaan Regresi:\n")
cat(" Y =", round(intercept, 4), "+", round(slope, 4), "X\n")
cat("\n2. Uji Signifikansi:\n")
cat(" R-squared =", round(r_squared, 4), "\n")
cat(" p-value =", round(p_value, 6), "\n")
if (p_value < alpha) {
cat(" Kesimpulan: Hubungan SIGNIFIKAN (p <", alpha, ")\n")
} else {
cat(" Kesimpulan: Hubungan TIDAK signifikan (p ≥", alpha, ")\n")
}
cat("\n3. Uji Asumsi:\n")
cat(" Normalitas residual: p =", round(shapiro_test$p.value, 4))
if (shapiro_test$p.value > alpha) {
cat(" (OK)\n")
} else {
cat(" (VIOLATION)\n")
}
cat(" Homoskedastisitas: p =", round(bp_test$p.value, 4))
if (bp_test$p.value > alpha) {
cat(" (OK)\n")
} else {
cat(" (VIOLATION)\n")
}
# Plot
par(mfrow = c(2, 2))
plot(x, y, main = paste("Scatter Plot:", x_name, "vs", y_name),
xlab = x_name, ylab = y_name, pch = 19, col = "blue")
abline(model, col = "red", lwd = 2)
qqnorm(resid(model), main = "Q-Q Plot Residual")
qqline(resid(model), col = "red")
plot(fitted(model), resid(model),
main = "Residual vs Fitted",
xlab = "Fitted Values", ylab = "Residuals",
pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
hist(resid(model), main = "Histogram Residual",
xlab = "Residuals", col = "lightblue")
par(mfrow = c(1, 1))
return(list(
model = model,
intercept = intercept,
slope = slope,
r_squared = r_squared,
p_value = p_value,
assumptions = list(
normality = shapiro_test$p.value,
homoscedasticity = bp_test$p.value
)
))
}
# Contoh penggunaan fungsi
cat("\nContoh penggunaan fungsi:\n")
##
## Contoh penggunaan fungsi:
cat("hasil <- analisis_regresi_sederhana(jam_belajar, nilai_ujian, 'Jam Belajar', 'Nilai Ujian')\n")
## hasil <- analisis_regresi_sederhana(jam_belajar, nilai_ujian, 'Jam Belajar', 'Nilai Ujian')
# 10. SIMPAN HASIL ANALISIS
# -------------------------
cat("\n\n=== MENYIMPAN HASIL ANALISIS ===\n")
##
##
## === MENYIMPAN HASIL ANALISIS ===
# Buat data frame hasil
hasil_analisis <- data.frame(
Komponen = c("Intercept (β0)", "Slope (β1)", "R-squared", "p-value slope",
"p-value normalitas", "p-value homoskedastisitas"),
Nilai = c(round(intercept, 4), round(slope, 4), round(r_squared, 4),
round(slope_p_value, 6), round(shapiro_test$p.value, 4),
round(bp_test$p.value, 4)),
Interpretasi = c(
"Nilai ujian ketika jam belajar = 0",
"Peningkatan nilai per jam belajar",
"Proporsi variasi yang dijelaskan",
ifelse(slope_p_value < 0.05, "Signifikan", "Tidak signifikan"),
ifelse(shapiro_test$p.value > 0.05, "Normal", "Tidak normal"),
ifelse(bp_test$p.value > 0.05, "Homogen", "Heterogen")
)
)
cat("\nTabel Hasil Analisis:\n")
##
## Tabel Hasil Analisis:
## Komponen Nilai Interpretasi
## 1 Intercept (β0) 55.0063 Nilai ujian ketika jam belajar = 0
## 2 Slope (β1) 2.5314 Peningkatan nilai per jam belajar
## 3 R-squared 0.9380 Proporsi variasi yang dijelaskan
## 4 p-value slope 0.0000 Signifikan
## 5 p-value normalitas 0.5715 Normal
## 6 p-value homoskedastisitas 0.0707 Homogen
# Simpan ke file CSV
write.csv(hasil_analisis, "hasil_regresi.csv", row.names = FALSE)
cat("\nHasil analisis disimpan di 'hasil_regresi.csv'\n")
##
## Hasil analisis disimpan di 'hasil_regresi.csv'
cat("\n====================================\n")
##
## ====================================
cat("ANALISIS REGRESI LINEAR SELESAI\n")
## ANALISIS REGRESI LINEAR SELESAI
cat("====================================\n")
## ====================================