DATA

# Install dan load packages yang diperlukan
#if (!require("tidyverse")) install.packages("tidyverse")
#if (!require("VIM")) install.packages("VIM")
#if (!require("naniar")) install.packages("naniar")
#if (!require("outliers")) install.packages("outliers")
#if (!require("ggplot2")) install.packages("ggplot2")

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(naniar)
library(outliers)
library(ggplot2)
library(mice)
## Warning: package 'mice' was built under R version 4.5.2
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Set seed untuk reproduksibilitas
set.seed(123)

# Buat data simulasi dengan masalah kualitas data
create_sample_data <- function(n = 1000) {
  data <- data.frame(
    id = 1:n,
    age = round(rnorm(n, mean = 45, sd = 15)),
    income = round(rnorm(n, mean = 50000, sd = 20000)),
    blood_pressure_systolic = rnorm(n, mean = 120, sd = 15),
    blood_pressure_diastolic = rnorm(n, mean = 80, sd = 10),
    cholesterol = rnorm(n, mean = 200, sd = 40),
    diabetes = sample(c("Yes", "No", NA), n, replace = TRUE, prob = c(0.1, 0.85, 0.05)),
    smoking_status = sample(c("Never", "Former", "Current", NA), n, replace = TRUE),
    bmi = rnorm(n, mean = 25, sd = 5)
  )
  
  # Tambahkan missing values secara sistematis
  data$income[sample(1:n, 50)] <- NA
  data$cholesterol[sample(1:n, 30)] <- NA
  data$blood_pressure_systolic[sample(1:n, 20)] <- NA
  
  # Tambahkan outlier
  data$income[c(10, 50, 100)] <- c(500000, -10000, 1000000)  # Outlier ekstrem
  data$age[c(200, 300)] <- c(150, -5)  # Usia tidak realistis
  data$bmi[c(150, 250)] <- c(60, 10)  # BMI ekstrem
  
  # Tambahkan inkonsistensi
  data$blood_pressure_diastolic[400] <- 200  # Diastolic lebih tinggi dari systolic
  data$blood_pressure_systolic[400] <- 100
  
  return(data)
}

# Muat data
data <- create_sample_data()
head(data)
##   id age income blood_pressure_systolic blood_pressure_diastolic cholesterol
## 1  1  37  30084                112.3259                 78.49693    207.8620
## 2  2  42  29201                123.5541                 76.72243    226.0045
## 3  3  68  49640                111.8762                 65.51835    226.8402
## 4  4  46  47356                138.2884                 73.02715    148.6337
## 5  5  47   -987                122.6120                105.98490    118.9556
## 6  6  71  70811                110.7710                 79.62585    288.2130
##   diabetes smoking_status      bmi
## 1       No          Never 21.50386
## 2       No          Never 29.98226
## 3      Yes        Current 21.53627
## 4       No         Former 24.48258
## 5       No        Current 28.01933
## 6       No           <NA> 21.95978
tail(data)
##        id age income blood_pressure_systolic blood_pressure_diastolic
## 995   995  50  24400                108.2535                 82.99491
## 996   996  44  51533                135.9145                 65.71525
## 997   997  61  55103                113.3174                 92.67497
## 998   998  25  55549                113.5623                 92.14506
## 999   999  37  60737                137.8352                 73.25144
## 1000 1000  41  40790                132.5144                 91.21022
##      cholesterol diabetes smoking_status      bmi
## 995     142.8945       No        Current 23.99396
## 996     165.9833       No         Former 16.11022
## 997     264.9788       No         Former 25.56115
## 998     194.9345       No          Never 25.77177
## 999     251.0241       No        Current 21.90856
## 1000    207.1798       No        Current 24.88377

DATA HILANG

# 1. Ringkasan missing values
analyze_missing_values <- function(data) {
  cat("=== ANALISIS DATA HILANG ===\n\n")
  
  # Total missing values
  total_missing <- sum(is.na(data))
  cat("Total nilai hilang:", total_missing, "\n")
  cat("Persentase nilai hilang:", round(total_missing / (nrow(data) * ncol(data)) * 100, 2), "%\n\n")
  
  # Missing values per variabel
  missing_per_var <- sapply(data, function(x) sum(is.na(x)))
  missing_df <- data.frame(
    Variable = names(missing_per_var),
    Missing_Count = missing_per_var,
    Missing_Percentage = round(missing_per_var / nrow(data) * 100, 2)
  ) %>%
    arrange(desc(Missing_Count))
  
  print("Missing values per variabel:")
  print(missing_df)
  
  # Pattern missing values
  cat("\nPattern data hilang (5 observasi pertama):\n")
  print(head(md.pattern(data, plot = FALSE), 5))
  
  # Visualisasi missing values
  par(mfrow = c(1, 2))
  
  # Plot 1: Aggregation plot
  tryCatch({
    aggr_plot <- aggr(data, col = c('navyblue', 'red'), 
                     numbers = TRUE, sortVars = TRUE, 
                     labels = names(data), cex.axis = 0.7, 
                     gap = 3, ylab = c("Histogram data hilang", "Pattern"))
  }, error = function(e) {
    cat("Gagal membuat plot aggr, menggunakan alternatif...\n")
  })
  
  # Plot 2: Heatmap missing values
  vis_miss(data, warn_large_data = FALSE) +
    theme_minimal() +
    labs(title = "Heatmap Data Hilang")
  
  par(mfrow = c(1, 1))
  
  return(missing_df)
}

# Jalankan analisis missing values
missing_analysis <- analyze_missing_values(data)
## === ANALISIS DATA HILANG ===
## 
## Total nilai hilang: 381 
## Persentase nilai hilang: 4.23 %
## 
## [1] "Missing values per variabel:"
##                                          Variable Missing_Count
## smoking_status                     smoking_status           230
## diabetes                                 diabetes            51
## income                                     income            50
## cholesterol                           cholesterol            30
## blood_pressure_systolic   blood_pressure_systolic            20
## id                                             id             0
## age                                           age             0
## blood_pressure_diastolic blood_pressure_diastolic             0
## bmi                                           bmi             0
##                          Missing_Percentage
## smoking_status                         23.0
## diabetes                                5.1
## income                                  5.0
## cholesterol                             3.0
## blood_pressure_systolic                 2.0
## id                                      0.0
## age                                     0.0
## blood_pressure_diastolic                0.0
## bmi                                     0.0
## 
## Pattern data hilang (5 observasi pertama):
##     id age blood_pressure_diastolic bmi blood_pressure_systolic cholesterol
## 660  1   1                        1   1                       1           1
## 198  1   1                        1   1                       1           1
## 34   1   1                        1   1                       1           1
## 12   1   1                        1   1                       1           1
## 34   1   1                        1   1                       1           1
##     income diabetes smoking_status  
## 660      1        1              1 0
## 198      1        1              0 1
## 34       1        0              1 1
## 12       1        0              0 2
## 34       0        1              1 1
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##                  Variable Count
##            smoking_status 0.230
##                  diabetes 0.051
##                    income 0.050
##               cholesterol 0.030
##   blood_pressure_systolic 0.020
##                        id 0.000
##                       age 0.000
##  blood_pressure_diastolic 0.000
##                       bmi 0.000

IMPUTASI DATA HILANG

# 2. Metode penanganan data hilang
handle_missing_values <- function(data, method = "mean") {
  cat("\n=== PENANGANAN DATA HILANG ===\n")
  cat("Metode yang digunakan:", method, "\n\n")
  
  data_clean <- data
  
  # Identifikasi variabel numerik dan kategorikal
  numeric_vars <- sapply(data_clean, is.numeric)
  categorical_vars <- sapply(data_clean, function(x) is.character(x) | is.factor(x))
  
  # Tangani missing values untuk variabel numerik
  for(var in names(data_clean)[numeric_vars]) {
    na_count <- sum(is.na(data_clean[[var]]))
    
    if(na_count > 0) {
      cat("Variabel", var, "memiliki", na_count, "nilai hilang\n")
      
      if(method == "mean") {
        # Imputasi dengan mean
        mean_val <- mean(data_clean[[var]], na.rm = TRUE)
        data_clean[[var]][is.na(data_clean[[var]])] <- mean_val
        cat("  Diimputasi dengan mean:", round(mean_val, 2), "\n")
        
      } else if(method == "median") {
        # Imputasi dengan median
        median_val <- median(data_clean[[var]], na.rm = TRUE)
        data_clean[[var]][is.na(data_clean[[var]])] <- median_val
        cat("  Diimputasi dengan median:", median_val, "\n")
        
      } else if(method == "knn") {
        # Imputasi dengan KNN (sederhana)
        require(VIM)
        temp_data <- data_clean[, numeric_vars]
        knn_imputed <- kNN(temp_data, variable = var, k = 5)
        data_clean[[var]] <- knn_imputed[[var]]
        cat("  Diimputasi dengan KNN (k=5)\n")
        
      } else if(method == "delete") {
        # Hapus baris dengan missing values
        cat("  Opsi hapus: akan dihapus dalam langkah terpisah\n")
      }
    }
  }
  
  # Tangani missing values untuk variabel kategorikal
  for(var in names(data_clean)[categorical_vars]) {
    na_count <- sum(is.na(data_clean[[var]]))
    
    if(na_count > 0) {
      cat("Variabel kategorikal", var, "memiliki", na_count, "nilai hilang\n")
      
      # Imputasi dengan modus
      mode_val <- names(sort(table(data_clean[[var]]), decreasing = TRUE))[1]
      data_clean[[var]][is.na(data_clean[[var]])] <- mode_val
      cat("  Diimputasi dengan modus:", mode_val, "\n")
    }
  }
  
  # Opsi: Hapus baris dengan missing values
  if(method == "delete") {
    rows_before <- nrow(data_clean)
    data_clean <- na.omit(data_clean)
    rows_after <- nrow(data_clean)
    cat("\nDihapus", rows_before - rows_after, "baris dengan missing values\n")
  }
  
  # Verifikasi setelah imputasi
  cat("\nMissing values setelah penanganan:\n")
  print(colSums(is.na(data_clean)))
  
  return(data_clean)
}

# Contoh penggunaan metode imputasi yang berbeda
data_mean_imputed <- handle_missing_values(data, method = "mean")
## 
## === PENANGANAN DATA HILANG ===
## Metode yang digunakan: mean 
## 
## Variabel income memiliki 50 nilai hilang
##   Diimputasi dengan mean: 52323.07 
## Variabel blood_pressure_systolic memiliki 20 nilai hilang
##   Diimputasi dengan mean: 119.62 
## Variabel cholesterol memiliki 30 nilai hilang
##   Diimputasi dengan mean: 198.52 
## Variabel kategorikal diabetes memiliki 51 nilai hilang
##   Diimputasi dengan modus: No 
## Variabel kategorikal smoking_status memiliki 230 nilai hilang
##   Diimputasi dengan modus: Never 
## 
## Missing values setelah penanganan:
##                       id                      age                   income 
##                        0                        0                        0 
##  blood_pressure_systolic blood_pressure_diastolic              cholesterol 
##                        0                        0                        0 
##                 diabetes           smoking_status                      bmi 
##                        0                        0                        0
data_median_imputed <- handle_missing_values(data, method = "median")
## 
## === PENANGANAN DATA HILANG ===
## Metode yang digunakan: median 
## 
## Variabel income memiliki 50 nilai hilang
##   Diimputasi dengan median: 51343 
## Variabel blood_pressure_systolic memiliki 20 nilai hilang
##   Diimputasi dengan median: 119.0977 
## Variabel cholesterol memiliki 30 nilai hilang
##   Diimputasi dengan median: 198.4721 
## Variabel kategorikal diabetes memiliki 51 nilai hilang
##   Diimputasi dengan modus: No 
## Variabel kategorikal smoking_status memiliki 230 nilai hilang
##   Diimputasi dengan modus: Never 
## 
## Missing values setelah penanganan:
##                       id                      age                   income 
##                        0                        0                        0 
##  blood_pressure_systolic blood_pressure_diastolic              cholesterol 
##                        0                        0                        0 
##                 diabetes           smoking_status                      bmi 
##                        0                        0                        0

OUTLIER

# 3. Deteksi outlier
detect_outliers <- function(data) {
  cat("\n=== DETEKSI OUTLIER ===\n\n")
  
  outlier_report <- list()
  
  for(var in names(data)[sapply(data, is.numeric)]) {
    cat("Analisis outlier untuk variabel:", var, "\n")
    
    # Statistik deskriptif
    stats <- summary(data[[var]])
    iqr_val <- IQR(data[[var]], na.rm = TRUE)
    q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
    q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
    lower_bound <- q1 - 1.5 * iqr_val
    upper_bound <- q3 + 1.5 * iqr_val
    
    # Deteksi outlier dengan metode IQR
    outliers_iqr <- data[[var]][data[[var]] < lower_bound | data[[var]] > upper_bound]
    
    # Deteksi outlier dengan metode Z-score
    z_scores <- scale(data[[var]])
    outliers_z <- data[[var]][abs(z_scores) > 3]
    
    # Deteksi outlier dengan metode Grubbs (uji statistik)
    if(length(na.omit(data[[var]])) > 6) {
      tryCatch({
        grubbs_test <- grubbs.test(na.omit(data[[var]]))
        grubbs_outlier <- ifelse(grubbs_test$p.value < 0.05, "Terdeteksi", "Tidak terdeteksi")
      }, error = function(e) {
        grubbs_outlier <- "Tidak dapat dihitung"
      })
    } else {
      grubbs_outlier <- "Data tidak cukup"
    }
    
    # Ringkasan
    outlier_report[[var]] <- list(
      n_outliers_iqr = length(outliers_iqr),
      n_outliers_z = length(outliers_z),
      grubbs_result = grubbs_outlier,
      lower_bound = lower_bound,
      upper_bound = upper_bound,
      outlier_values = unique(round(outliers_iqr, 2))
    )
    
    cat("  - Outlier (IQR method):", length(outliers_iqr), "\n")
    cat("  - Outlier (Z-score > 3):", length(outliers_z), "\n")
    cat("  - Uji Grubbs:", grubbs_outlier, "\n")
    cat("  - Batas bawah:", round(lower_bound, 2), "\n")
    cat("  - Batas atas:", round(upper_bound, 2), "\n")
    
    if(length(outliers_iqr) > 0) {
      cat("  - Nilai outlier:", paste(head(unique(round(outliers_iqr, 2)), 5), collapse = ", "), "\n")
    }
    cat("\n")
  }
  
  return(outlier_report)
}

# Deteksi outlier
outlier_analysis <- detect_outliers(data)
## 
## === DETEKSI OUTLIER ===
## 
## Analisis outlier untuk variabel: id 
##   - Outlier (IQR method): 0 
##   - Outlier (Z-score > 3): 0 
##   - Uji Grubbs: Tidak terdeteksi 
##   - Batas bawah: -498.5 
##   - Batas atas: 1499.5 
## 
## Analisis outlier untuk variabel: age 
##   - Outlier (IQR method): 12 
##   - Outlier (Z-score > 3): 3 
##   - Uji Grubbs: Terdeteksi 
##   - Batas bawah: 7.5 
##   - Batas atas: 83.5 
##   - Nilai outlier: 94, 150, -5, 84, 5 
## 
## Analisis outlier untuk variabel: income 
##   - Outlier (IQR method): 57 
##   - Outlier (Z-score > 3): 52 
##   - Uji Grubbs: Terdeteksi 
##   - Batas bawah: -5850.62 
##   - Batas atas: 107870.4 
##   - Nilai outlier: NA, 5e+05, -10000, 1e+06, 113681 
## 
## Analisis outlier untuk variabel: blood_pressure_systolic 
##   - Outlier (IQR method): 26 
##   - Outlier (Z-score > 3): 22 
##   - Uji Grubbs: Tidak terdeteksi 
##   - Batas bawah: 80.59 
##   - Batas atas: 159.07 
##   - Nilai outlier: NA, 165.33, 77.27, 161.03, 80.51 
## 
## Analisis outlier untuk variabel: blood_pressure_diastolic 
##   - Outlier (IQR method): 12 
##   - Outlier (Z-score > 3): 1 
##   - Uji Grubbs: Terdeteksi 
##   - Batas bawah: 54.21 
##   - Batas atas: 105.93 
##   - Nilai outlier: 105.98, 50.82, 106.54, 50.5, 51.61 
## 
## Analisis outlier untuk variabel: cholesterol 
##   - Outlier (IQR method): 41 
##   - Outlier (Z-score > 3): 36 
##   - Uji Grubbs: Tidak terdeteksi 
##   - Batas bawah: 90.7 
##   - Batas atas: 306.12 
##   - Nilai outlier: NA, 89.86, 318.86, 84.49, 330.87 
## 
## Analisis outlier untuk variabel: bmi 
##   - Outlier (IQR method): 11 
##   - Outlier (Z-score > 3): 4 
##   - Uji Grubbs: Terdeteksi 
##   - Batas bawah: 11.21 
##   - Batas atas: 38.82 
##   - Nilai outlier: 10.38, 60, 8.55, 10, 10.57
# 4. Visualisasi outlier
visualize_outliers <- function(data) {
  cat("\n=== VISUALISASI OUTLIER ===\n")
  
  numeric_vars <- names(data)[sapply(data, is.numeric)]
  
  # Boxplot untuk setiap variabel numerik
  par(mfrow = c(2, 3))
  for(var in numeric_vars[1:min(6, length(numeric_vars))]) {
    boxplot(data[[var]], main = var, col = "lightblue", 
            ylab = "Nilai", outline = TRUE)
    grid()
  }
  par(mfrow = c(1, 1))
  
  # Histogram dengan overlay outlier
  for(var in numeric_vars[1:min(3, length(numeric_vars))]) {
    # Hitung batas outlier
    q1 <- quantile(data[[var]], 0.25, na.rm = TRUE)
    q3 <- quantile(data[[var]], 0.75, na.rm = TRUE)
    iqr_val <- IQR(data[[var]], na.rm = TRUE)
    lower_bound <- q1 - 1.5 * iqr_val
    upper_bound <- q3 + 1.5 * iqr_val
    
    # Identifikasi outlier
    is_outlier <- data[[var]] < lower_bound | data[[var]] > upper_bound
    
    # Plot histogram
    hist_data <- ggplot(data.frame(value = data[[var]]), aes(x = value)) +
      geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", alpha = 0.7) +
      geom_density(color = "darkblue", linewidth = 1) +
      geom_vline(xintercept = c(lower_bound, upper_bound), 
                 color = "red", linetype = "dashed", linewidth = 1) +
      labs(title = paste("Distribusi dan Outlier:", var),
           x = var, y = "Density") +
      theme_minimal() +
      annotate("text", x = lower_bound, y = 0, 
               label = "Bawah", vjust = 2, color = "red") +
      annotate("text", x = upper_bound, y = 0, 
               label = "Atas", vjust = 2, color = "red")
    
    print(hist_data)
  }
  
  # Scatter plot matrix untuk melihat outlier multivariat
  if(length(numeric_vars) >= 3) {
    pairs(data[, numeric_vars[1:min(4, length(numeric_vars))]], 
          main = "Scatter Plot Matrix untuk Deteksi Outlier",
          pch = 19, col = alpha("blue", 0.6))
  }
}

# Jalankan visualisasi
visualize_outliers(data)
## 
## === VISUALISASI OUTLIER ===

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density()`).

# 5. Metode penanganan outlier
handle_outliers <- function(data, method = "cap") {
  cat("\n=== PENANGANAN OUTLIER ===\n")
  cat("Metode yang digunakan:", method, "\n\n")
  
  data_processed <- data
  
  for(var in names(data_processed)[sapply(data_processed, is.numeric)]) {
    # Hitung batas outlier
    q1 <- quantile(data_processed[[var]], 0.25, na.rm = TRUE)
    q3 <- quantile(data_processed[[var]], 0.75, na.rm = TRUE)
    iqr_val <- IQR(data_processed[[var]], na.rm = TRUE)
    lower_bound <- q1 - 1.5 * iqr_val
    upper_bound <- q3 + 1.5 * iqr_val
    
    # Identifikasi outlier
    is_outlier <- data_processed[[var]] < lower_bound | data_processed[[var]] > upper_bound
    n_outliers <- sum(is_outlier, na.rm = TRUE)
    
    if(n_outliers > 0) {
      cat("Variabel", var, "memiliki", n_outliers, "outlier\n")
      
      if(method == "cap") {
        # Cap outliers (Winsorizing)
        data_processed[[var]][data_processed[[var]] < lower_bound] <- lower_bound
        data_processed[[var]][data_processed[[var]] > upper_bound] <- upper_bound
        cat("  Outlier di-cap ke batas [", round(lower_bound, 2), ", ", 
            round(upper_bound, 2), "]\n", sep = "")
        
      } else if(method == "remove") {
        # Hapus outlier (set sebagai NA)
        data_processed[[var]][is_outlier] <- NA
        cat("  Outlier dihapus (dijadikan NA)\n")
        
      } else if(method == "transform") {
        # Transformasi logaritmik
        if(all(data_processed[[var]] > 0, na.rm = TRUE)) {
          data_processed[[var]] <- log(data_processed[[var]] + 1)
          cat("  Dilakukan transformasi log\n")
        } else {
          cat("  Transformasi log tidak dapat dilakukan (nilai negatif)\n")
        }
        
      } else if(method == "median") {
        # Ganti dengan median
        median_val <- median(data_processed[[var]], na.rm = TRUE)
        data_processed[[var]][is_outlier] <- median_val
        cat("  Diganti dengan median:", round(median_val, 2), "\n")
      }
    }
  }
  
  return(data_processed)
}

# Contoh penggunaan metode penanganan outlier
data_capped <- handle_outliers(data, method = "cap")
## 
## === PENANGANAN OUTLIER ===
## Metode yang digunakan: cap 
## 
## Variabel age memiliki 12 outlier
##   Outlier di-cap ke batas [7.5, 83.5]
## Variabel income memiliki 7 outlier
##   Outlier di-cap ke batas [-5850.62, 107870.4]
## Variabel blood_pressure_systolic memiliki 6 outlier
##   Outlier di-cap ke batas [80.59, 159.07]
## Variabel blood_pressure_diastolic memiliki 12 outlier
##   Outlier di-cap ke batas [54.21, 105.93]
## Variabel cholesterol memiliki 11 outlier
##   Outlier di-cap ke batas [90.7, 306.12]
## Variabel bmi memiliki 11 outlier
##   Outlier di-cap ke batas [11.21, 38.82]
data_no_outliers <- handle_outliers(data, method = "remove")
## 
## === PENANGANAN OUTLIER ===
## Metode yang digunakan: remove 
## 
## Variabel age memiliki 12 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel income memiliki 7 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel blood_pressure_systolic memiliki 6 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel blood_pressure_diastolic memiliki 12 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel cholesterol memiliki 11 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel bmi memiliki 11 outlier
##   Outlier dihapus (dijadikan NA)

INKONSISTENSI DATA

# 6. Deteksi inkonsistensi logis
detect_inconsistencies <- function(data) {
  cat("\n=== DETEKSI INKONSISTENSI LOGIS ===\n\n")
  
  inconsistencies <- list()
  
  # 1. Cek inkonsistensi nilai (usia tidak realistis)
  if("age" %in% names(data)) {
    age_inconsistencies <- which(data$age < 0 | data$age > 120)
    if(length(age_inconsistencies) > 0) {
      inconsistencies$age <- data.frame(
        id = data$id[age_inconsistencies],
        age = data$age[age_inconsistencies],
        issue = ifelse(data$age[age_inconsistencies] < 0, "Usia negatif", "Usia > 120 tahun")
      )
      cat("Inkonsistensi usia:", length(age_inconsistencies), "kasus\n")
    }
  }
  
  # 2. Cek inkonsistensi tekanan darah (diastolic > systolic)
  if(all(c("blood_pressure_systolic", "blood_pressure_diastolic") %in% names(data))) {
    bp_inconsistencies <- which(data$blood_pressure_diastolic > data$blood_pressure_systolic)
    if(length(bp_inconsistencies) > 0) {
      inconsistencies$blood_pressure <- data.frame(
        id = data$id[bp_inconsistencies],
        systolic = data$blood_pressure_systolic[bp_inconsistencies],
        diastolic = data$blood_pressure_diastolic[bp_inconsistencies],
        issue = "Diastolic > Systolic"
      )
      cat("Inkonsistensi tekanan darah:", length(bp_inconsistencies), "kasus\n")
    }
  }
  
  # 3. Cek inkonsistensi BMI (tidak realistis)
  if("bmi" %in% names(data)) {
    bmi_inconsistencies <- which(data$bmi < 10 | data$bmi > 60)
    if(length(bmi_inconsistencies) > 0) {
      inconsistencies$bmi <- data.frame(
        id = data$id[bmi_inconsistencies],
        bmi = data$bmi[bmi_inconsistencies],
        issue = ifelse(data$bmi[bmi_inconsistencies] < 10, "BMI < 10", "BMI > 60")
      )
      cat("Inkonsistensi BMI:", length(bmi_inconsistencies), "kasus\n")
    }
  }
  
  # 4. Cek inkonsistensi pendapatan (negatif)
  if("income" %in% names(data)) {
    income_inconsistencies <- which(data$income < 0)
    if(length(income_inconsistencies) > 0) {
      inconsistencies$income <- data.frame(
        id = data$id[income_inconsistencies],
        income = data$income[income_inconsistencies],
        issue = "Pendapatan negatif"
      )
      cat("Inkonsistensi pendapatan:", length(income_inconsistencies), "kasus\n")
    }
  }
  
  # 5. Cek duplikasi ID
  if("id" %in% names(data)) {
    duplicate_ids <- data$id[duplicated(data$id)]
    if(length(duplicate_ids) > 0) {
      inconsistencies$duplicate_id <- data.frame(
        id = duplicate_ids,
        issue = "ID duplikat"
      )
      cat("ID duplikat:", length(duplicate_ids), "kasus\n")
    }
  }
  
  # Tampilkan ringkasan
  if(length(inconsistencies) > 0) {
    cat("\nRingkasan inkonsistensi:\n")
    for(issue in names(inconsistencies)) {
      cat("  -", issue, ":", nrow(inconsistencies[[issue]]), "kasus\n")
    }
    
    # Tampilkan contoh inkonsistensi
    cat("\nContoh inkonsistensi (5 pertama):\n")
    for(issue in names(inconsistencies)[1:min(3, length(inconsistencies))]) {
      cat("\n", issue, ":\n", sep = "")
      print(head(inconsistencies[[issue]], 5))
    }
  } else {
    cat("Tidak ditemukan inkonsistensi logis.\n")
  }
  
  return(inconsistencies)
}

# Jalankan deteksi inkonsistensi
inconsistencies <- detect_inconsistencies(data)
## 
## === DETEKSI INKONSISTENSI LOGIS ===
## 
## Inkonsistensi usia: 2 kasus
## Inkonsistensi tekanan darah: 10 kasus
## Inkonsistensi BMI: 3 kasus
## Inkonsistensi pendapatan: 6 kasus
## 
## Ringkasan inkonsistensi:
##   - age : 2 kasus
##   - blood_pressure : 10 kasus
##   - bmi : 3 kasus
##   - income : 6 kasus
## 
## Contoh inkonsistensi (5 pertama):
## 
## age:
##    id age            issue
## 1 200 150 Usia > 120 tahun
## 2 300  -5     Usia negatif
## 
## blood_pressure:
##    id systolic diastolic                issue
## 1  86 85.18854  90.92972 Diastolic > Systolic
## 2 157 86.15198  90.74102 Diastolic > Systolic
## 3 246 88.63662  98.80106 Diastolic > Systolic
## 4 254 90.25569  91.20569 Diastolic > Systolic
## 5 383 77.27180  81.53781 Diastolic > Systolic
## 
## bmi:
##    id      bmi    issue
## 1 163 8.553121 BMI < 10
## 2 384 9.864771 BMI < 10
## 3 964 7.931060 BMI < 10

CONTOH CODE VISUALISASI

# ========================================================
# VISUALISASI HASIL PEMBERSIHAN DATA
# ========================================================

# 1. FUNGSI UTAMA VISUALISASI PERBANDINGAN
visualize_cleaning_results <- function(data_before, data_after, results_list) {
  
  cat("\n=== VISUALISASI HASIL PEMBERSIHAN DATA ===\n")
  
  # Set layout untuk multiple plots
  par(mfrow = c(2, 3))
  
  # 1. PERBANDINGAN MISSING VALUES
  visualize_missing_comparison <- function(data_before, data_after) {
    missing_before <- colSums(is.na(data_before))
    missing_after <- colSums(is.na(data_after))
    
    # Hanya ambil variabel dengan missing values
    vars_with_missing <- names(missing_before)[missing_before > 0 | missing_after > 0]
    
    if(length(vars_with_missing) > 0) {
      comparison_df <- data.frame(
        Variable = vars_with_missing,
        Before = missing_before[vars_with_missing],
        After = missing_after[vars_with_missing]
      )
      
      # Konversi ke long format untuk ggplot
      comparison_long <- comparison_df %>%
        pivot_longer(cols = c(Before, After), 
                     names_to = "Status", 
                     values_to = "Missing_Count")
      
      p1 <- ggplot(comparison_long, aes(x = Variable, y = Missing_Count, fill = Status)) +
        geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
        scale_fill_manual(values = c("Before" = "#E74C3C", "After" = "#2ECC71")) +
        labs(title = "Missing Values: Sebelum vs Sesudah",
             x = "Variabel", 
             y = "Jumlah Missing Values",
             fill = "Status") +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              plot.title = element_text(hjust = 0.5, face = "bold")) +
        geom_text(aes(label = Missing_Count), 
                  position = position_dodge(width = 0.7), 
                  vjust = -0.5, size = 3)
      
      print(p1)
    } else {
      plot.new()
      text(0.5, 0.5, "Tidak ada missing values", cex = 1.2)
    }
  }
  
  # 2. PERBANDINGAN DISTRIBUSI VARIABEL NUMERIK UTAMA
  visualize_distribution_comparison <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) > 0) {
      # Pilih variabel dengan perubahan terbesar
      selected_var <- numeric_vars[1]
      
      # Buat data frame untuk plotting
      plot_data <- data.frame(
        Value = c(data_before[[selected_var]], data_after[[selected_var]]),
        Status = rep(c("Sebelum", "Sesudah"), 
                    c(length(data_before[[selected_var]]), 
                      length(data_after[[selected_var]])))
      )
      
      p2 <- ggplot(plot_data, aes(x = Value, fill = Status)) +
        geom_density(alpha = 0.5) +
        scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
        labs(title = paste("Distribusi", selected_var, ": Sebelum vs Sesudah"),
             x = selected_var,
             y = "Densitas",
             fill = "Status") +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5, face = "bold"))
      
      # Tambahkan garis vertikal untuk mean
      mean_before <- mean(data_before[[selected_var]], na.rm = TRUE)
      mean_after <- mean(data_after[[selected_var]], na.rm = TRUE)
      
      p2 <- p2 + 
        geom_vline(xintercept = mean_before, color = "#E74C3C", 
                   linetype = "dashed", linewidth = 1) +
        geom_vline(xintercept = mean_after, color = "#2ECC71", 
                   linetype = "dashed", linewidth = 1) +
        annotate("text", x = mean_before, y = 0, 
                 label = paste("Mean sebelum:", round(mean_before, 2)),
                 vjust = 2, color = "#E74C3C") +
        annotate("text", x = mean_after, y = 0, 
                 label = paste("Mean sesudah:", round(mean_after, 2)),
                 vjust = 4, color = "#2ECC71")
      
      print(p2)
    }
  }
  
  # 3. BOXPLOT OUTLIER PERBANDINGAN
  visualize_outlier_comparison <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) >= 3) {
      # Pilih 3 variabel pertama
      selected_vars <- numeric_vars[1:min(3, length(numeric_vars))]
      
      # Gabungkan data untuk plotting
      plot_list <- list()
      for(i in 1:length(selected_vars)) {
        var <- selected_vars[i]
        
        # Hitung outlier bounds
        q1 <- quantile(data_before[[var]], 0.25, na.rm = TRUE)
        q3 <- quantile(data_before[[var]], 0.75, na.rm = TRUE)
        iqr_val <- IQR(data_before[[var]], na.rm = TRUE)
        lower_bound <- q1 - 1.5 * iqr_val
        upper_bound <- q3 + 1.5 * iqr_val
        
        # Hitung persentase outlier
        outliers_before <- sum(data_before[[var]] < lower_bound | 
                               data_before[[var]] > upper_bound, na.rm = TRUE)
        outliers_after <- sum(data_after[[var]] < lower_bound | 
                              data_after[[var]] > upper_bound, na.rm = TRUE)
        
        total_before <- sum(!is.na(data_before[[var]]))
        total_after <- sum(!is.na(data_after[[var]]))
        
        percent_before <- round(outliers_before/total_before * 100, 1)
        percent_after <- round(outliers_after/total_after * 100, 1)
        
        plot_list[[i]] <- data.frame(
          Variable = var,
          Status = c("Sebelum", "Sesudah"),
          Outlier_Percent = c(percent_before, percent_after)
        )
      }
      
      plot_df <- do.call(rbind, plot_list)
      
      p3 <- ggplot(plot_df, aes(x = Variable, y = Outlier_Percent, fill = Status)) +
        geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
        scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
        labs(title = "Persentase Outlier (IQR Method)",
             x = "Variabel",
             y = "Persentase Outlier (%)",
             fill = "Status") +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 0, hjust = 0.5),
              plot.title = element_text(hjust = 0.5, face = "bold")) +
        geom_text(aes(label = paste0(Outlier_Percent, "%")), 
                  position = position_dodge(width = 0.7), 
                  vjust = -0.5, size = 3) +
        ylim(0, max(plot_df$Outlier_Percent) * 1.2)
      
      print(p3)
    }
  }
  
  # 4. SCATTER PLOT PERBANDINGAN UNTUK DUA VARIABEL
  visualize_scatter_comparison <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) >= 2) {
      var1 <- numeric_vars[1]
      var2 <- numeric_vars[2]
      
      # Gabungkan data
      before_data <- data.frame(
        x = data_before[[var1]],
        y = data_before[[var2]],
        Status = "Sebelum"
      )
      
      after_data <- data.frame(
        x = data_after[[var1]],
        y = data_after[[var2]],
        Status = "Sesudah"
      )
      
      combined_data <- rbind(before_data, after_data)
      
      p4 <- ggplot(combined_data, aes(x = x, y = y, color = Status)) +
        geom_point(alpha = 0.6, size = 1.5) +
        scale_color_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
        labs(title = paste("Scatter Plot:", var1, "vs", var2),
             x = var1,
             y = var2,
             color = "Status") +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
        stat_smooth(method = "lm", se = FALSE, alpha = 0.5)
      
      print(p4)
    }
  }
  
  # 5. PERBANDINGAN STATISTIK DESKRIPTIF
  visualize_descriptive_stats <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) > 0) {
      # Hitung statistik deskriptif
      stats_before <- data.frame(
        Variable = character(),
        Mean = numeric(),
        SD = numeric(),
        stringsAsFactors = FALSE
      )
      
      stats_after <- data.frame(
        Variable = character(),
        Mean = numeric(),
        SD = numeric(),
        stringsAsFactors = FALSE
      )
      
      for(var in numeric_vars) {
        stats_before <- rbind(stats_before, data.frame(
          Variable = var,
          Mean = mean(data_before[[var]], na.rm = TRUE),
          SD = sd(data_before[[var]], na.rm = TRUE)
        ))
        
        stats_after <- rbind(stats_after, data.frame(
          Variable = var,
          Mean = mean(data_after[[var]], na.rm = TRUE),
          SD = sd(data_after[[var]], na.rm = TRUE)
        ))
      }
      
      # Gabungkan dan format untuk plotting
      stats_combined <- rbind(
        cbind(stats_before, Status = "Sebelum"),
        cbind(stats_after, Status = "Sesudah")
      )
      
      # Plot untuk 3 variabel pertama
      selected_vars <- numeric_vars[1:min(3, length(numeric_vars))]
      stats_filtered <- stats_combined %>% filter(Variable %in% selected_vars)
      
      p5 <- ggplot(stats_filtered, aes(x = Variable, y = Mean, fill = Status)) +
        geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +
        geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD),
                      position = position_dodge(width = 0.7),
                      width = 0.2) +
        scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
        labs(title = "Perbandingan Mean ± SD",
             x = "Variabel",
             y = "Nilai Mean",
             fill = "Status") +
        theme_minimal() +
        theme(plot.title = element_text(hjust = 0.5, face = "bold"),
              axis.text.x = element_text(angle = 0, hjust = 0.5)) +
        geom_text(aes(label = round(Mean, 2)), 
                  position = position_dodge(width = 0.7), 
                  vjust = -0.5, size = 3)
      
      print(p5)
    }
  }
  
  # 6. SUMMARY PEMBERSIHAN DATA
  visualize_cleaning_summary <- function(results_list) {
    # Buat data frame summary
    summary_data <- data.frame(
      Kategori = c("Missing Values", "Outlier", "Inkonsistensi"),
      Sebelum = c(
        sum(is.na(data_before)),
        ifelse(!is.null(results_list$outlier_analysis), 
               sum(sapply(results_list$outlier_analysis, function(x) x$n_outliers_iqr)),
               0),
        ifelse(!is.null(results_list$inconsistencies),
               sum(sapply(results_list$inconsistencies, nrow)),
               0)
      ),
      Sesudah = c(
        sum(is.na(data_after)),
        NA, # Outlier setelah penanganan
        ifelse(!is.null(results_list$inconsistencies),
               sum(sapply(results_list$inconsistencies, nrow)),
               0)
      )
    )
    
    # Hitung persentase perbaikan
    summary_data$Perbaikan <- round((summary_data$Sebelum - summary_data$Sesudah) / 
                                     summary_data$Sebelum * 100, 1)
    summary_data$Perbaikan[is.na(summary_data$Perbaikan)] <- 100
    
    # Visualisasi
    p6 <- ggplot(summary_data, aes(x = Kategori)) +
      geom_bar(aes(y = Sebelum, fill = "Sebelum"), 
               stat = "identity", position = "dodge", width = 0.4) +
      geom_bar(aes(y = Sesudah, fill = "Sesudah"), 
               stat = "identity", position = "dodge", width = 0.4) +
      scale_fill_manual(name = "Status", 
                        values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
      labs(title = "Ringkasan Hasil Pembersihan Data",
           x = "Kategori Masalah",
           y = "Jumlah Kasus") +
      theme_minimal() +
      theme(plot.title = element_text(hjust = 0.5, face = "bold"),
            legend.position = "bottom") +
      geom_text(aes(y = Sebelum, label = Sebelum), 
                position = position_dodge(width = 0.4), 
                vjust = -0.5, size = 4, color = "#E74C3C") +
      geom_text(aes(y = Sesudah, label = ifelse(is.na(Sesudah), "N/A", Sesudah)), 
                position = position_dodge(width = 0.4), 
                vjust = -0.5, size = 4, color = "#2ECC71") +
      geom_text(aes(y = pmax(Sebelum, Sesudah, na.rm = TRUE), 
                    label = ifelse(Kategori != "Outlier", 
                                  paste0(Perbaikan, "%"), "")), 
                vjust = 2, size = 3.5, fontface = "bold")
    
    print(p6)
  }
  
  # Eksekusi semua visualisasi
  visualize_missing_comparison(data_before, data_after)
  visualize_distribution_comparison(data_before, data_after)
  visualize_outlier_comparison(data_before, data_after)
  visualize_scatter_comparison(data_before, data_after)
  visualize_descriptive_stats(data_before, data_after)
  visualize_cleaning_summary(results_list)
  
  # Reset layout
  par(mfrow = c(1, 1))
}

# 2. FUNGSI DASHBOARD INTERAKTIF (jika ingin lebih interaktif)
create_interactive_dashboard <- function(data_before, data_after, results_list) {
  cat("\n=== MEMBUAT DASHBOARD INTERAKTIF ===\n")
  
  # Install dan load plotly jika belum ada
  if (!require("plotly")) {
    install.packages("plotly")
    library(plotly)
  }
  
  # 1. Heatmap Missing Values Interaktif
  create_missing_heatmap <- function(data_before, data_after) {
    # Hitung missing values
    missing_before <- colSums(is.na(data_before))
    missing_after <- colSums(is.na(data_after))
    
    # Buat data frame
    heat_data <- data.frame(
      Variable = names(data_before),
      Missing_Before = missing_before,
      Missing_After = missing_after,
      Reduction = missing_before - missing_after
    )
    
    # Buat heatmap interaktif
    p <- plot_ly(
      x = heat_data$Variable,
      y = c("Sebelum", "Sesudah"),
      z = as.matrix(rbind(heat_data$Missing_Before, heat_data$Missing_After)),
      type = "heatmap",
      colorscale = list(c(0, "#2ECC71"), c(1, "#E74C3C")),
      hovertemplate = paste(
        "Variabel: %{x}<br>",
        "Status: %{y}<br>",
        "Missing Values: %{z}<br>",
        "<extra></extra>"
      )
    ) %>%
      layout(
        title = "Heatmap Missing Values: Sebelum vs Sesudah",
        xaxis = list(title = "Variabel"),
        yaxis = list(title = "Status")
      )
    
    return(p)
  }
  
  # 2. Boxplot Interaktif untuk Outlier
  create_interactive_boxplot <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) > 0) {
      # Pilih variabel pertama
      var <- numeric_vars[1]
      
      # Buat data untuk plotly
      plot_data <- data.frame(
        Value = c(data_before[[var]], data_after[[var]]),
        Status = rep(c("Sebelum", "Sesudah"), 
                    c(length(data_before[[var]]), length(data_after[[var]])))
      )
      
      p <- plot_ly(plot_data, y = ~Value, color = ~Status, type = "box",
                   colors = c("#E74C3C", "#2ECC71")) %>%
        layout(
          title = paste("Boxplot", var, ": Sebelum vs Sesudah"),
          yaxis = list(title = var),
          boxmode = "group"
        )
      
      return(p)
    }
  }
  
  # 3. Scatter Plot 3D Interaktif
  create_3d_scatter <- function(data_before, data_after) {
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    if(length(numeric_vars) >= 3) {
      # Buat data untuk plotting
      before_df <- data.frame(
        x = data_before[[numeric_vars[1]]],
        y = data_before[[numeric_vars[2]]],
        z = data_before[[numeric_vars[3]]],
        Status = "Sebelum"
      )
      
      after_df <- data.frame(
        x = data_after[[numeric_vars[1]]],
        y = data_after[[numeric_vars[2]]],
        z = data_after[[numeric_vars[3]]],
        Status = "Sesudah"
      )
      
      plot_data <- rbind(before_df, after_df)
      
      p <- plot_ly(plot_data, x = ~x, y = ~y, z = ~z,
                   color = ~Status, colors = c("#E74C3C", "#2ECC71"),
                   type = "scatter3d", mode = "markers",
                   marker = list(size = 3, opacity = 0.6),
                   hovertemplate = paste(
                     numeric_vars[1], ": %{x}<br>",
                     numeric_vars[2], ": %{y}<br>",
                     numeric_vars[3], ": %{z}<br>",
                     "<extra></extra>"
                   )) %>%
        layout(
          title = "Scatter Plot 3D: Sebelum vs Sesudah",
          scene = list(
            xaxis = list(title = numeric_vars[1]),
            yaxis = list(title = numeric_vars[2]),
            zaxis = list(title = numeric_vars[3])
          )
        )
      
      return(p)
    }
  }
  
  # Eksekusi pembuatan dashboard
  cat("\n1. Membuat Heatmap Missing Values...\n")
  heatmap_plot <- create_missing_heatmap(data_before, data_after)
  
  cat("2. Membuat Boxplot Interaktif...\n")
  boxplot_plot <- create_interactive_boxplot(data_before, data_after)
  
  cat("3. Membuat Scatter Plot 3D...\n")
  scatter_3d_plot <- create_3d_scatter(data_before, data_after)
  
  # Tampilkan plots
  print(heatmap_plot)
  print(boxplot_plot)
  
  if(!is.null(scatter_3d_plot)) {
    print(scatter_3d_plot)
  }
  
  cat("\nDashboard interaktif telah dibuat!\n")
}

# 3. FUNGSI LAPORAN PDF
generate_pdf_report <- function(data_before, data_after, results_list, filename = "cleaning_report.pdf") {
  cat("\n=== MEMBUAT LAPORAN PDF ===\n")
  
  # Install dan load paket yang diperlukan
  if (!require("gridExtra")) {
    install.packages("gridExtra")
    library(gridExtra)
  }
  
  # Buat fungsi untuk membuat table grob
  create_summary_table <- function(data_before, data_after) {
    # Statistik summary
    numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
    
    summary_table <- data.frame(
      Variabel = character(),
      `Mean Sebelum` = numeric(),
      `Mean Sesudah` = numeric(),
      `SD Sebelum` = numeric(),
      `SD Sesudah` = numeric(),
      `% Perubahan` = numeric(),
      stringsAsFactors = FALSE
    )
    
    for(var in numeric_vars[1:min(5, length(numeric_vars))]) {
      mean_before <- mean(data_before[[var]], na.rm = TRUE)
      mean_after <- mean(data_after[[var]], na.rm = TRUE)
      sd_before <- sd(data_before[[var]], na.rm = TRUE)
      sd_after <- sd(data_after[[var]], na.rm = TRUE)
      
      percent_change <- round((mean_after - mean_before) / mean_before * 100, 2)
      
      summary_table <- rbind(summary_table, data.frame(
        Variabel = var,
        `Mean Sebelum` = round(mean_before, 2),
        `Mean Sesudah` = round(mean_after, 2),
        `SD Sebelum` = round(sd_before, 2),
        `SD Sesudah` = round(sd_after, 2),
        `% Perubahan` = percent_change
      ))
    }
    
    return(summary_table)
  }
  
  # Buat PDF
  pdf(filename, width = 11, height = 8.5)
  
  # Halaman 1: Title Page
  grid.arrange(
    grobs = list(
      grid::textGrob("LAPORAN PEMBERSIHAN DATA", 
                    gp = grid::gpar(fontsize = 24, fontface = "bold")),
      grid::textGrob("\n\nAnalisis Kualitas Data dan Hasil Pembersihan",
                    gp = grid::gpar(fontsize = 18)),
      grid::textGrob(paste("\n\n\nTanggal:", Sys.Date()),
                    gp = grid::gpar(fontsize = 14)),
      grid::textGrob("\n\n\nRingkasan Eksekusi:",
                    gp = grid::gpar(fontsize = 16, fontface = "bold")),
      grid::tableGrob(create_summary_table(data_before, data_after),
                     theme = gridExtra::ttheme_default(
                       base_size = 10,
                       padding = unit(c(4, 4), "mm")
                     ))
    ),
    ncol = 1,
    heights = c(1, 0.5, 0.5, 0.5, 2)
  )
  
  # Halaman 2: Visualizations
  visualize_cleaning_results(data_before, data_after, results_list)
  
  # Halaman 3: Detailed Statistics
  grid.arrange(
    grobs = list(
      grid::textGrob("STATISTIK DETAIL PEMBERSIHAN",
                    gp = grid::gpar(fontsize = 18, fontface = "bold")),
      grid::tableGrob(data.frame(
        Metrik = c("Jumlah Observasi", "Jumlah Variabel", 
                  "Missing Values Awal", "Missing Values Akhir",
                  "Persentase Data Bersih"),
        Nilai = c(
          nrow(data_before),
          ncol(data_before),
          sum(is.na(data_before)),
          sum(is.na(data_after)),
          paste0(round((1 - sum(is.na(data_after))/(nrow(data_after)*ncol(data_after))) * 100, 2), "%")
        )
      ), theme = gridExtra::ttheme_default(base_size = 12))
    ),
    ncol = 1,
    heights = c(0.5, 1)
  )
  
  dev.off()
  
  cat("\nLaporan PDF telah disimpan sebagai:", filename, "\n")
}

# 4. FUNGSI UTAMA UNTUK MENJALANKAN SEMUA VISUALISASI
run_all_visualizations <- function(data_before, data_after, results_list) {
  cat("========================================\n")
  cat("VISUALISASI KOMPREHENSIF HASIL PEMBERSIHAN\n")
  cat("========================================\n\n")
  
  # 1. Visualisasi statis (6 panel)
  cat("1. Membuat visualisasi statis 6-panel...\n")
  visualize_cleaning_results(data_before, data_after, results_list)
  
  # 2. Dashboard interaktif
  cat("\n2. Membuat dashboard interaktif...\n")
  tryCatch({
    create_interactive_dashboard(data_before, data_after, results_list)
  }, error = function(e) {
    cat("Dashboard interaktif tidak dapat dibuat:", e$message, "\n")
  })
  
  # 3. Simpan laporan PDF
  cat("\n3. Membuat laporan PDF...\n")
  generate_pdf_report(data_before, data_after, results_list)
  
  # 4. Simpan plot individual
  cat("\n4. Menyimpan plot individual...\n")
  save_individual_plots(data_before, data_after, results_list)
  
  cat("\n========================================\n")
  cat("SEMUA VISUALISASI TELAH SELESAI DIBUAT\n")
  cat("========================================\n")
}

# 5. FUNGSI UNTUK MENYIMPAN PLOT INDIVIDUAL
save_individual_plots <- function(data_before, data_after, results_list) {
  # Buat folder untuk menyimpan plot
  if (!dir.exists("cleaning_plots")) {
    dir.create("cleaning_plots")
  }
  
  # 1. Plot missing values comparison
  png("cleaning_plots/missing_values_comparison.png", width = 800, height = 600)
  visualize_cleaning_results(data_before, data_after, results_list)
  dev.off()
  
  # 2. Distribution plots untuk setiap variabel numerik
  numeric_vars <- names(data_before)[sapply(data_before, is.numeric)]
  
  for(var in numeric_vars[1:min(5, length(numeric_vars))]) {
    png(paste0("cleaning_plots/distribution_", var, ".png"), width = 800, height = 600)
    
    plot_data <- data.frame(
      Value = c(data_before[[var]], data_after[[var]]),
      Status = rep(c("Sebelum", "Sesudah"), 
                  c(length(data_before[[var]]), length(data_after[[var]])))
    )
    
    p <- ggplot(plot_data, aes(x = Value, fill = Status)) +
      geom_density(alpha = 0.5) +
      scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
      labs(title = paste("Distribusi", var, ": Sebelum vs Sesudah"),
           x = var, y = "Densitas") +
      theme_minimal()
    
    print(p)
    dev.off()
  }
  
  # 3. Boxplot comparison
  png("cleaning_plots/boxplot_comparison.png", width = 800, height = 600)
  if(length(numeric_vars) > 0) {
    # Gabungkan data
    plot_data_before <- data_before[, numeric_vars[1:min(3, length(numeric_vars))]]
    plot_data_after <- data_after[, numeric_vars[1:min(3, length(numeric_vars))]]
    
    # Convert to long format
    plot_data_before_long <- plot_data_before %>%
      pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
      mutate(Status = "Sebelum")
    
    plot_data_after_long <- plot_data_after %>%
      pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
      mutate(Status = "Sesudah")
    
    plot_data_combined <- rbind(plot_data_before_long, plot_data_after_long)
    
    p <- ggplot(plot_data_combined, aes(x = Variable, y = Value, fill = Status)) +
      geom_boxplot(alpha = 0.7) +
      scale_fill_manual(values = c("Sebelum" = "#E74C3C", "Sesudah" = "#2ECC71")) +
      labs(title = "Perbandingan Boxplot: Sebelum vs Sesudah",
           x = "Variabel", y = "Nilai") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
    print(p)
  }
  dev.off()
  
  cat("\nPlot individual telah disimpan di folder 'cleaning_plots/'\n")
}

ANALISIS REGRESI LINEAR SEDERHANA

# ====================================================
# ANALISIS REGRESI LINEAR SEDERHANA
# Lengkap dengan Pengujian Hipotesis
# ====================================================

# 1. MEMUAT DATA CONTOH
# ----------------------
# Membuat data contoh sederhana untuk analisis
# Variabel X (prediktor) dan Y (respons)

set.seed(123) # Untuk hasil yang reproducible

# Data contoh: Hubungan antara Jam Belajar (X) dan Nilai Ujian (Y)
jam_belajar <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
nilai_ujian <- c(55, 58, 63, 65, 68, 70, 73, 75, 78, 80, 82, 85, 88, 90, 92) + rnorm(15, 0, 3)

# Buat data frame
data_studi <- data.frame(
  jam_belajar = jam_belajar,
  nilai_ujian = nilai_ujian
)

# Tampilkan data
cat("DATA CONTOH:\n")
## DATA CONTOH:
cat("Jam Belajar vs Nilai Ujian\n")
## Jam Belajar vs Nilai Ujian
print(data_studi)
##    jam_belajar nilai_ujian
## 1            1    53.31857
## 2            2    57.30947
## 3            3    67.67612
## 4            4    65.21153
## 5            5    68.38786
## 6            6    75.14519
## 7            7    74.38275
## 8            8    71.20482
## 9            9    75.93944
## 10          10    78.66301
## 11          11    85.67225
## 12          12    86.07944
## 13          13    89.20231
## 14          14    90.33205
## 15          15    90.33248
# 2. ANALISIS DESKRIPTIF
# ----------------------
cat("\n\n=== ANALISIS DESKRIPTIF ===\n")
## 
## 
## === ANALISIS DESKRIPTIF ===
# Statistik deskriptif
cat("\n1. STATISTIK DESKRIPTIF:\n")
## 
## 1. STATISTIK DESKRIPTIF:
desc_stats <- data.frame(
  Variabel = c("Jam Belajar (X)", "Nilai Ujian (Y)"),
  Mean = c(mean(data_studi$jam_belajar), mean(data_studi$nilai_ujian)),
  SD = c(sd(data_studi$jam_belajar), sd(data_studi$nilai_ujian)),
  Min = c(min(data_studi$jam_belajar), min(data_studi$nilai_ujian)),
  Max = c(max(data_studi$jam_belajar), max(data_studi$nilai_ujian))
)
print(desc_stats)
##          Variabel     Mean        SD      Min      Max
## 1 Jam Belajar (X)  8.00000  4.472136  1.00000 15.00000
## 2 Nilai Ujian (Y) 75.25715 11.688944 53.31857 90.33248
# Korelasi
cat("\n2. KORELASI:\n")
## 
## 2. KORELASI:
correlation <- cor(data_studi$jam_belajar, data_studi$nilai_ujian)
cat("Koefisien Korelasi (r) =", round(correlation, 4), "\n")
## Koefisien Korelasi (r) = 0.9685
# Interpretasi korelasi
if (abs(correlation) >= 0.9) {
  cat("Interpretasi: Korelasi sangat kuat\n")
} else if (abs(correlation) >= 0.7) {
  cat("Interpretasi: Korelasi kuat\n")
} else if (abs(correlation) >= 0.5) {
  cat("Interpretasi: Korelasi sedang\n")
} else if (abs(correlation) >= 0.3) {
  cat("Interpretasi: Korelasi lemah\n")
} else {
  cat("Interpretasi: Korelasi sangat lemah/tidak ada\n")
}
## Interpretasi: Korelasi sangat kuat
# 3. VISUALISASI DATA
# -------------------
cat("\n\n=== VISUALISASI DATA ===\n")
## 
## 
## === VISUALISASI DATA ===
# Scatter plot
par(mfrow = c(1, 2)) # Dua plot berdampingan

# Plot 1: Scatter plot tanpa garis regresi
plot(data_studi$jam_belajar, data_studi$nilai_ujian,
     main = "Scatter Plot: Jam Belajar vs Nilai Ujian",
     xlab = "Jam Belajar (X)",
     ylab = "Nilai Ujian (Y)",
     pch = 19, col = "blue", cex = 1.5)
grid()

# Plot 2: Scatter plot dengan garis regresi
plot(data_studi$jam_belajar, data_studi$nilai_ujian,
     main = "Data dengan Garis Regresi",
     xlab = "Jam Belajar (X)",
     ylab = "Nilai Ujian (Y)",
     pch = 19, col = "blue", cex = 1.5)
grid()

# Kembalikan ke satu plot per window
par(mfrow = c(1, 1))

# 4. REGRESI LINEAR SEDERHANA
# ---------------------------
cat("\n\n=== REGRESI LINEAR SEDERHANA ===\n")
## 
## 
## === REGRESI LINEAR SEDERHANA ===
# Model regresi linear sederhana
model <- lm(nilai_ujian ~ jam_belajar, data = data_studi)

# Ringkasan model
cat("\n1. RINGKASAN MODEL:\n")
## 
## 1. RINGKASAN MODEL:
summary_model <- summary(model)
print(summary_model)
## 
## Call:
## lm(formula = nilai_ujian ~ jam_belajar, data = data_studi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2191 -2.2466  0.0798  1.4727  5.0758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.0063     1.6416   33.51 5.25e-14 ***
## jam_belajar   2.5314     0.1806   14.02 3.17e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.021 on 13 degrees of freedom
## Multiple R-squared:  0.938,  Adjusted R-squared:  0.9332 
## F-statistic: 196.6 on 1 and 13 DF,  p-value: 3.167e-09
# Ekstrak komponen penting
cat("\n2. PERSAMAAN REGRESI:\n")
## 
## 2. PERSAMAAN REGRESI:
intercept <- coef(model)[1]
slope <- coef(model)[2]
cat("Persamaan Regresi: Y =", round(intercept, 4), "+", round(slope, 4), "* X\n")
## Persamaan Regresi: Y = 55.0063 + 2.5314 * X
cat("Artinya: Setiap penambahan 1 jam belajar, nilai ujian meningkat", round(slope, 2), "poin\n")
## Artinya: Setiap penambahan 1 jam belajar, nilai ujian meningkat 2.53 poin
# Koefisien determinasi
cat("\n3. KOEFISIEN DETERMINASI (R-squared):\n")
## 
## 3. KOEFISIEN DETERMINASI (R-squared):
r_squared <- summary_model$r.squared
cat("R-squared =", round(r_squared, 4), "\n")
## R-squared = 0.938
cat("Artinya:", round(r_squared * 100, 2), "% variasi nilai ujian dapat dijelaskan oleh jam belajar\n")
## Artinya: 93.8 % variasi nilai ujian dapat dijelaskan oleh jam belajar
# 5. ASUMSI REGRESI LINEAR
# ------------------------
cat("\n\n=== UJI ASUMSI REGRESI LINEAR ===\n")
## 
## 
## === UJI ASUMSI REGRESI LINEAR ===
# 5.1 Normalitas Residual
cat("\n1. UJI NORMALITAS RESIDUAL (Shapiro-Wilk):\n")
## 
## 1. UJI NORMALITAS RESIDUAL (Shapiro-Wilk):
residuals <- resid(model)
shapiro_test <- shapiro.test(residuals)
cat("Statistik Uji W =", round(shapiro_test$statistic, 4), "\n")
## Statistik Uji W = 0.9529
cat("p-value =", round(shapiro_test$p.value, 4), "\n")
## p-value = 0.5715
if (shapiro_test$p.value > 0.05) {
  cat("Keputusan: Residual berdistribusi normal (p > 0.05)\n")
} else {
  cat("Keputusan: Residual TIDAK berdistribusi normal (p < 0.05)\n")
}
## Keputusan: Residual berdistribusi normal (p > 0.05)
# Plot normalitas residual
par(mfrow = c(1, 2))
qqnorm(residuals, main = "Q-Q Plot Residual")
qqline(residuals, col = "red")
hist(residuals, main = "Histogram Residual", 
     xlab = "Residual", col = "lightblue", border = "black")

# 5.2 Homoskedastisitas (Uji Breusch-Pagan)
cat("\n2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):\n")
## 
## 2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):
if (!require(lmtest)) {
  install.packages("lmtest")
  library(lmtest)
}
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bp_test <- bptest(model)
cat("Statistik Uji LM =", round(bp_test$statistic, 4), "\n")
## Statistik Uji LM = 3.2658
cat("p-value =", round(bp_test$p.value, 4), "\n")
## p-value = 0.0707
if (bp_test$p.value > 0.05) {
  cat("Keputusan: Varian residual homogen (homoskedastisitas terpenuhi)\n")
} else {
  cat("Keputusan: Varian residual TIDAK homogen (ada heteroskedastisitas)\n")
}
## Keputusan: Varian residual homogen (homoskedastisitas terpenuhi)
# Plot residual vs fitted
par(mfrow = c(1, 2))
plot(fitted(model), residuals,
     main = "Residual vs Fitted Values",
     xlab = "Fitted Values", ylab = "Residuals",
     pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
grid()

# Plot residual vs prediktor
plot(data_studi$jam_belajar, residuals,
     main = "Residual vs Prediktor (X)",
     xlab = "Jam Belajar", ylab = "Residuals",
     pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)
grid()

par(mfrow = c(1, 1))

# 6. PENGUJIAN HIPOTESIS
# ----------------------
cat("\n\n=== PENGUJIAN HIPOTESIS ===\n")
## 
## 
## === PENGUJIAN HIPOTESIS ===
# 6.1 Hipotesis untuk Koefisien Regresi (Slope)
cat("\n1. UJI HIPOTESIS UNTUK SLOPE (β1):\n")
## 
## 1. UJI HIPOTESIS UNTUK SLOPE (β1):
cat("   Hipotesis Nol (H0): β1 = 0 (Tidak ada hubungan linear)\n")
##    Hipotesis Nol (H0): β1 = 0 (Tidak ada hubungan linear)
cat("   Hipotesis Alternatif (H1): β1 ≠ 0 (Ada hubungan linear)\n")
##    Hipotesis Alternatif (H1): β1 ≠ 0 (Ada hubungan linear)
# Dari output summary model
coef_table <- summary_model$coefficients
slope_p_value <- coef_table[2, 4]

cat("\n   Hasil Uji:\n")
## 
##    Hasil Uji:
cat("   t-statistik =", round(coef_table[2, 3], 4), "\n")
##    t-statistik = 14.0201
cat("   p-value =", round(slope_p_value, 6), "\n")
##    p-value = 0
alpha <- 0.05
cat("   Tingkat signifikansi (α) =", alpha, "\n")
##    Tingkat signifikansi (α) = 0.05
if (slope_p_value < alpha) {
  cat("\n   KEPUTUSAN: Tolak H0\n")
  cat("   KESIMPULAN: Ada bukti statistik yang cukup untuk menyatakan bahwa\n")
  cat("               terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian\n")
} else {
  cat("\n   KEPUTUSAN: Gagal tolak H0\n")
  cat("   KESIMPULAN: Tidak ada bukti statistik yang cukup untuk menyatakan bahwa\n")
  cat("               terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian\n")
}
## 
##    KEPUTUSAN: Tolak H0
##    KESIMPULAN: Ada bukti statistik yang cukup untuk menyatakan bahwa
##                terdapat hubungan linear yang signifikan antara jam belajar dan nilai ujian
# 6.2 Interval Kepercayaan
cat("\n2. INTERVAL KEPERCAYAAN 95% UNTUK KOEFISIEN:\n")
## 
## 2. INTERVAL KEPERCAYAAN 95% UNTUK KOEFISIEN:
conf_int <- confint(model, level = 0.95)
cat("   Intercept (β0): [", round(conf_int[1, 1], 4), ", ", round(conf_int[1, 2], 4), "]\n", sep = "")
##    Intercept (β0): [51.4598, 58.5527]
cat("   Slope (β1):     [", round(conf_int[2, 1], 4), ", ", round(conf_int[2, 2], 4), "]\n", sep = "")
##    Slope (β1):     [2.1413, 2.9214]
# 7. PREDIKSI
# -----------
cat("\n\n=== PREDIKSI ===\n")
## 
## 
## === PREDIKSI ===
# Prediksi untuk nilai X tertentu
jam_prediksi <- c(5, 10, 15)
prediksi <- predict(model, newdata = data.frame(jam_belajar = jam_prediksi),
                    interval = "confidence", level = 0.95)

cat("\n1. PREDIKSI NILAI UJIAN:\n")
## 
## 1. PREDIKSI NILAI UJIAN:
prediksi_df <- data.frame(
  Jam_Belajar = jam_prediksi,
  Prediksi_Nilai = round(prediksi[, "fit"], 2),
  Batas_Bawah = round(prediksi[, "lwr"], 2),
  Batas_Atas = round(prediksi[, "upr"], 2)
)
print(prediksi_df)
##   Jam_Belajar Prediksi_Nilai Batas_Bawah Batas_Atas
## 1           5          67.66       65.61      69.71
## 2          10          80.32       78.46      82.18
## 3          15          92.98       89.77      96.19
# Plot dengan interval kepercayaan
x_range <- seq(min(data_studi$jam_belajar), max(data_studi$jam_belajar), length.out = 100)
pred_range <- predict(model, newdata = data.frame(jam_belajar = x_range),
                      interval = "confidence", level = 0.95)

plot(data_studi$jam_belajar, data_studi$nilai_ujian,
     main = "Regresi Linear dengan Interval Kepercayaan",
     xlab = "Jam Belajar", ylab = "Nilai Ujian",
     pch = 19, col = "blue", cex = 1.2)

# Garis regresi
lines(x_range, pred_range[, "fit"], col = "red", lwd = 2)

# Interval kepercayaan
polygon(c(x_range, rev(x_range)), 
        c(pred_range[, "lwr"], rev(pred_range[, "upr"])),
        col = rgb(1, 0, 0, 0.2), border = NA)

legend("topleft", 
       legend = c("Data", "Garis Regresi", "95% CI"),
       col = c("blue", "red", rgb(1, 0, 0, 0.2)),
       pch = c(19, NA, 15),
       lty = c(NA, 1, NA),
       lwd = c(NA, 2, NA),
       pt.cex = c(1.2, NA, 2))

# 8. LAPORAN LENGKAP
# ------------------
cat("\n\n=== LAPORAN ANALISIS LENGKAP ===\n")
## 
## 
## === LAPORAN ANALISIS LENGKAP ===
cat("====================================\n")
## ====================================
cat("\nA. INFORMASI DATA\n")
## 
## A. INFORMASI DATA
cat("   Jumlah observasi:", nrow(data_studi), "\n")
##    Jumlah observasi: 15
cat("   Variabel prediktor (X): Jam Belajar\n")
##    Variabel prediktor (X): Jam Belajar
cat("   Variabel respons (Y): Nilai Ujian\n")
##    Variabel respons (Y): Nilai Ujian
cat("\nB. HASIL REGRESI LINEAR\n")
## 
## B. HASIL REGRESI LINEAR
cat("   Persamaan: Y =", round(intercept, 4), "+", round(slope, 4), "X\n")
##    Persamaan: Y = 55.0063 + 2.5314 X
cat("   Koefisien determinasi (R²):", round(r_squared, 4), "\n")
##    Koefisien determinasi (R²): 0.938
cat("   Koefisien korelasi (r):", round(correlation, 4), "\n")
##    Koefisien korelasi (r): 0.9685
cat("\nC. PENGUJIAN HIPOTESIS\n")
## 
## C. PENGUJIAN HIPOTESIS
cat("   Hipotesis H0: β1 = 0 (Tidak ada hubungan linear)\n")
##    Hipotesis H0: β1 = 0 (Tidak ada hubungan linear)
cat("   Hipotesis H1: β1 ≠ 0 (Ada hubungan linear)\n")
##    Hipotesis H1: β1 ≠ 0 (Ada hubungan linear)
cat("   p-value:", round(slope_p_value, 6), "\n")
##    p-value: 0
if (slope_p_value < 0.05) {
  cat("   KESIMPULAN: Hubungan linear SIGNIFIKAN (p < 0.05)\n")
} else {
  cat("   KESIMPULAN: Hubungan linear TIDAK signifikan (p ≥ 0.05)\n")
}
##    KESIMPULAN: Hubungan linear SIGNIFIKAN (p < 0.05)
cat("\nD. ASUMSI REGRESI\n")
## 
## D. ASUMSI REGRESI
cat("   1. Normalitas residual: p-value =", round(shapiro_test$p.value, 4))
##    1. Normalitas residual: p-value = 0.5715
if (shapiro_test$p.value > 0.05) {
  cat(" (Terpenuhi)\n")
} else {
  cat(" (Tidak terpenuhi)\n")
}
##  (Terpenuhi)
cat("   2. Homoskedastisitas: p-value =", round(bp_test$p.value, 4))
##    2. Homoskedastisitas: p-value = 0.0707
if (bp_test$p.value > 0.05) {
  cat(" (Terpenuhi)\n")
} else {
  cat(" (Tidak terpenuhi)\n")
}
##  (Terpenuhi)
cat("\nE. INTERPRETASI PRAKTIS\n")
## 
## E. INTERPRETASI PRAKTIS
cat("   Setiap penambahan 1 jam belajar diharapkan meningkatkan nilai ujian sebesar", 
    round(slope, 2), "poin\n")
##    Setiap penambahan 1 jam belajar diharapkan meningkatkan nilai ujian sebesar 2.53 poin
cat("   Model ini menjelaskan", round(r_squared * 100, 1), 
    "% variasi dalam nilai ujian\n")
##    Model ini menjelaskan 93.8 % variasi dalam nilai ujian
# 9. FUNGSI UNTUK ANALISIS UMUM
# ------------------------------
cat("\n\n=== FUNGSI UMUM UNTUK ANALISIS REGRESI ===\n")
## 
## 
## === FUNGSI UMUM UNTUK ANALISIS REGRESI ===
# Fungsi untuk analisis regresi linear sederhana
analisis_regresi_sederhana <- function(x, y, x_name = "X", y_name = "Y", alpha = 0.05) {
  # Buat data frame
  data_analisis <- data.frame(x = x, y = y)
  
  # Model regresi
  model <- lm(y ~ x, data = data_analisis)
  summary_model <- summary(model)
  
  # Hasil utama
  intercept <- coef(model)[1]
  slope <- coef(model)[2]
  r_squared <- summary_model$r.squared
  p_value <- summary_model$coefficients[2, 4]
  
  # Uji asumsi
  shapiro_test <- shapiro.test(resid(model))
  bp_test <- bptest(model)
  
  # Buat laporan
  cat("\n=== HASIL ANALISIS REGRESI LINEAR ===\n")
  cat("Variabel X:", x_name, "\n")
  cat("Variabel Y:", y_name, "\n")
  cat("\n1. Persamaan Regresi:\n")
  cat("   Y =", round(intercept, 4), "+", round(slope, 4), "X\n")
  
  cat("\n2. Uji Signifikansi:\n")
  cat("   R-squared =", round(r_squared, 4), "\n")
  cat("   p-value =", round(p_value, 6), "\n")
  
  if (p_value < alpha) {
    cat("   Kesimpulan: Hubungan SIGNIFIKAN (p <", alpha, ")\n")
  } else {
    cat("   Kesimpulan: Hubungan TIDAK signifikan (p ≥", alpha, ")\n")
  }
  
  cat("\n3. Uji Asumsi:\n")
  cat("   Normalitas residual: p =", round(shapiro_test$p.value, 4))
  if (shapiro_test$p.value > alpha) {
    cat(" (OK)\n")
  } else {
    cat(" (VIOLATION)\n")
  }
  
  cat("   Homoskedastisitas: p =", round(bp_test$p.value, 4))
  if (bp_test$p.value > alpha) {
    cat(" (OK)\n")
  } else {
    cat(" (VIOLATION)\n")
  }
  
  # Plot
  par(mfrow = c(2, 2))
  plot(x, y, main = paste("Scatter Plot:", x_name, "vs", y_name),
       xlab = x_name, ylab = y_name, pch = 19, col = "blue")
  abline(model, col = "red", lwd = 2)
  
  qqnorm(resid(model), main = "Q-Q Plot Residual")
  qqline(resid(model), col = "red")
  
  plot(fitted(model), resid(model),
       main = "Residual vs Fitted",
       xlab = "Fitted Values", ylab = "Residuals",
       pch = 19, col = "blue")
  abline(h = 0, col = "red", lty = 2)
  
  hist(resid(model), main = "Histogram Residual",
       xlab = "Residuals", col = "lightblue")
  
  par(mfrow = c(1, 1))
  
  return(list(
    model = model,
    intercept = intercept,
    slope = slope,
    r_squared = r_squared,
    p_value = p_value,
    assumptions = list(
      normality = shapiro_test$p.value,
      homoscedasticity = bp_test$p.value
    )
  ))
}

# Contoh penggunaan fungsi
cat("\nContoh penggunaan fungsi:\n")
## 
## Contoh penggunaan fungsi:
cat("hasil <- analisis_regresi_sederhana(jam_belajar, nilai_ujian, 'Jam Belajar', 'Nilai Ujian')\n")
## hasil <- analisis_regresi_sederhana(jam_belajar, nilai_ujian, 'Jam Belajar', 'Nilai Ujian')
# 10. SIMPAN HASIL ANALISIS
# -------------------------
cat("\n\n=== MENYIMPAN HASIL ANALISIS ===\n")
## 
## 
## === MENYIMPAN HASIL ANALISIS ===
# Buat data frame hasil
hasil_analisis <- data.frame(
  Komponen = c("Intercept (β0)", "Slope (β1)", "R-squared", "p-value slope",
               "p-value normalitas", "p-value homoskedastisitas"),
  Nilai = c(round(intercept, 4), round(slope, 4), round(r_squared, 4),
            round(slope_p_value, 6), round(shapiro_test$p.value, 4),
            round(bp_test$p.value, 4)),
  Interpretasi = c(
    "Nilai ujian ketika jam belajar = 0",
    "Peningkatan nilai per jam belajar",
    "Proporsi variasi yang dijelaskan",
    ifelse(slope_p_value < 0.05, "Signifikan", "Tidak signifikan"),
    ifelse(shapiro_test$p.value > 0.05, "Normal", "Tidak normal"),
    ifelse(bp_test$p.value > 0.05, "Homogen", "Heterogen")
  )
)

cat("\nTabel Hasil Analisis:\n")
## 
## Tabel Hasil Analisis:
print(hasil_analisis)
##                    Komponen   Nilai                       Interpretasi
## 1            Intercept (β0) 55.0063 Nilai ujian ketika jam belajar = 0
## 2                Slope (β1)  2.5314  Peningkatan nilai per jam belajar
## 3                 R-squared  0.9380   Proporsi variasi yang dijelaskan
## 4             p-value slope  0.0000                         Signifikan
## 5        p-value normalitas  0.5715                             Normal
## 6 p-value homoskedastisitas  0.0707                            Homogen
# Simpan ke file CSV
write.csv(hasil_analisis, "hasil_regresi.csv", row.names = FALSE)
cat("\nHasil analisis disimpan di 'hasil_regresi.csv'\n")
## 
## Hasil analisis disimpan di 'hasil_regresi.csv'
cat("\n====================================\n")
## 
## ====================================
cat("ANALISIS REGRESI LINEAR SELESAI\n")
## ANALISIS REGRESI LINEAR SELESAI
cat("====================================\n")
## ====================================