#Read Data

# MEMUAT PACKAGE DAN DATA
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
library(mice)
## Warning: package 'mice' was built under R version 4.5.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Set seed untuk reproduksibilitas
set.seed(123)

# Muat data
data <- read.csv("C:/Users/ASUS/Downloads/AnReg/best sellin books 2023.csv")
head(data)
##   id                                                                 Book.name
## 1 #1 Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones
## 2 #2                           Iron Flame (Standard Edition) (The Empyrean, 2)
## 3 #3                                                                     Spare
## 4 #4                                                               Fourth Wing
## 5 #5                                                           The Woman in Me
## 6 #6                                             Lessons in Chemistry: A Novel
##                             Author             Rating reviews.count      form
## 1                      James Clear 4.8 out of 5 stars        145747 Hardcover
## 2                   Rebecca Yarros 4.7 out of 5 stars        395512 Hardcover
## 3 Prince Harry  The Duke of Sussex 4.5 out of 5 stars        116101 Hardcover
## 4                   Rebecca Yarros 4.8 out of 5 stars        472618 Paperback
## 5                   Britney Spears 4.4 out of 5 stars         51520 Hardcover
## 6                    Bonnie Garmus 4.6 out of 5 stars        322650 Hardcover
##    price Reading.age Print.Length  Publishing.date                        Genre
## 1 $18.88                      320       16/10/2018             Self-Improvement
## 2 $11.05                      640       07/11/2023 Fiction & Action & Adventure
## 3 $11.99                      416 January 10, 2023        Biographies & Memoirs
## 4 $13.62                      544       17/09/2024 Fiction & Action & Adventure
## 5 $11.37                      288 October 24, 2023        Biographies & Memoirs
## 6 $13.94                      400    April 5, 2022 Fiction & Action & Adventure

#Missing Data

# Ringkasan missing values
analyze_missing_values <- function(data) {
  cat(" ANALISIS DATA HILANG \n\n")
  
  # Total missing values
  total_missing <- sum(is.na(data))
  cat("Total nilai hilang:", total_missing, "\n")
  cat("Persentase nilai hilang:", round(total_missing / (nrow(data) * ncol(data)) * 100, 2), "%\n\n")

 # Missing values per variabel
  missing_per_var <- sapply(data, function(x) sum(is.na(x)))
  missing_df <- data.frame(
    Variable = names(missing_per_var),
    Missing_Count = missing_per_var,
    Missing_Percentage = round(missing_per_var / nrow(data) * 100, 2)
  ) %>%
    arrange(desc(Missing_Count))
  
  print("Missing values per variabel:")
  print(missing_df)
  
  # Pattern missing values
  cat("\nPattern data hilang (5 observasi pertama):\n")
  print(head(md.pattern(data, plot = FALSE), 5))
  
  # Visualisasi missing values
  par(mfrow = c(1, 2))
  
  # Plot 1: Aggregation plot
  tryCatch({
    aggr_plot <- aggr(data, col = c('navyblue', 'red'), 
                     numbers = TRUE, sortVars = TRUE, 
                     labels = names(data), cex.axis = 0.7, 
                     gap = 3, ylab = c("Histogram data hilang", "Pattern"))
  }, error = function(e) {
    cat("Gagal membuat plot aggr, menggunakan alternatif...\n")
  })
  
  # Plot 2: Heatmap missing values
  vis_miss(data, warn_large_data = FALSE) +
    theme_minimal() +
    labs(title = "Heatmap Data Hilang")
  
  par(mfrow = c(1, 1))
  
  return(missing_df)
}

# Jalankan analisis missing values
missing_analysis <- analyze_missing_values(data)
##  ANALISIS DATA HILANG 
## 
## Total nilai hilang: 2 
## Persentase nilai hilang: 0.18 %
## 
## [1] "Missing values per variabel:"
##                        Variable Missing_Count Missing_Percentage
## Print.Length       Print.Length             2                  2
## id                           id             0                  0
## Book.name             Book.name             0                  0
## Author                   Author             0                  0
## Rating                   Rating             0                  0
## reviews.count     reviews.count             0                  0
## form                       form             0                  0
## price                     price             0                  0
## Reading.age         Reading.age             0                  0
## Publishing.date Publishing.date             0                  0
## Genre                     Genre             0                  0
## 
## Pattern data hilang (5 observasi pertama):
##    id Book.name Author Rating reviews.count form price Reading.age
## 98  1         1      1      1             1    1     1           1
## 2   1         1      1      1             1    1     1           1
##     0         0      0      0             0    0     0           0
##    Publishing.date Genre Print.Length  
## 98               1     1            1 0
## 2                1     1            0 1
##                  0     0            2 2
## Gagal membuat plot aggr, menggunakan alternatif...

#Penanganan Data Hilang

# Metode penanganan data hilang
handle_missing_values <- function(data, method = "mean") {
  cat("\n PENANGANAN DATA HILANG \n")
  cat("Metode yang digunakan:", method, "\n\n")
  
  data_clean <- data
  
  # Identifikasi variabel numerik dan kategorikal
  numeric_vars <- sapply(data_clean, is.numeric)
  categorical_vars <- sapply(data_clean, function(x) is.character(x) | is.factor(x))
  
  # Tangani missing values untuk variabel numerik
  for(var in names(data_clean)[numeric_vars]) {
    na_count <- sum(is.na(data_clean[[var]]))
    
    if(na_count > 0) {
      cat("Variabel", var, "memiliki", na_count, "nilai hilang\n")
      
      if(method == "mean") {
        # Imputasi dengan mean
        mean_val <- mean(data_clean[[var]], na.rm = TRUE)
        data_clean[[var]][is.na(data_clean[[var]])] <- mean_val
        cat("  Diimputasi dengan mean:", round(mean_val, 2), "\n")
        
      } else if(method == "median") {
        # Imputasi dengan median
        median_val <- median(data_clean[[var]], na.rm = TRUE)
        data_clean[[var]][is.na(data_clean[[var]])] <- median_val
        cat("  Diimputasi dengan median:", median_val, "\n")
        
      } else if(method == "knn") {
        # Imputasi dengan KNN (sederhana)
        require(VIM)
        temp_data <- data_clean[, numeric_vars]
        knn_imputed <- kNN(temp_data, variable = var, k = 5)
        data_clean[[var]] <- knn_imputed[[var]]
        cat("  Diimputasi dengan KNN (k=5)\n")
        
      } else if(method == "delete") {
        # Hapus baris dengan missing values
        cat("  Opsi hapus: akan dihapus dalam langkah terpisah\n")
      }
    }
  }
  
  # Tangani missing values untuk variabel kategorikal
  for(var in names(data_clean)[categorical_vars]) {
    na_count <- sum(is.na(data_clean[[var]]))
    
    if(na_count > 0) {
      cat("Variabel kategorikal", var, "memiliki", na_count, "nilai hilang\n")
      
      # Imputasi dengan modus
      mode_val <- names(sort(table(data_clean[[var]]), decreasing = TRUE))[1]
      data_clean[[var]][is.na(data_clean[[var]])] <- mode_val
      cat("  Diimputasi dengan modus:", mode_val, "\n")
    }
  }
  
  # Opsi: Hapus baris dengan missing values
  if(method == "delete") {
    rows_before <- nrow(data_clean)
    data_clean <- na.omit(data_clean)
    rows_after <- nrow(data_clean)
    cat("\nDihapus", rows_before - rows_after, "baris dengan missing values\n")
  }
  
  # Verifikasi setelah imputasi
  cat("\nMissing values setelah penanganan:\n")
  print(colSums(is.na(data_clean)))
  
  return(data_clean)
}

# Contoh penggunaan metode imputasi yang berbeda
data_mean_imputed <- handle_missing_values(data, method = "mean")
## 
##  PENANGANAN DATA HILANG 
## Metode yang digunakan: mean 
## 
## Variabel Print.Length memiliki 2 nilai hilang
##   Diimputasi dengan mean: 361.22 
## 
## Missing values setelah penanganan:
##              id       Book.name          Author          Rating   reviews.count 
##               0               0               0               0               0 
##            form           price     Reading.age    Print.Length Publishing.date 
##               0               0               0               0               0 
##           Genre 
##               0
data_median_imputed <- handle_missing_values(data, method = "median")
## 
##  PENANGANAN DATA HILANG 
## Metode yang digunakan: median 
## 
## Variabel Print.Length memiliki 2 nilai hilang
##   Diimputasi dengan median: 344 
## 
## Missing values setelah penanganan:
##              id       Book.name          Author          Rating   reviews.count 
##               0               0               0               0               0 
##            form           price     Reading.age    Print.Length Publishing.date 
##               0               0               0               0               0 
##           Genre 
##               0

#Outlier

visualize_outliers <- function(data) {
  
  cat("\n VISUALISASI OUTLIER \n")
  
  # Bersihkan karakter non-numerik 
  data$price <- as.numeric(gsub("[^0-9.]", "", data$price))
  data$reviews.count <- as.numeric(gsub("[^0-9.]", "", data$reviews.count))
  
  # Cek apakah semua NA
  if(all(is.na(data$price)) | all(is.na(data$reviews.count))) {
    stop("Kolom price atau reviews.count menjadi NA semua setelah konversi. Cek format data!")
  }
  
  # Hapus NA supaya plot tidak error
  clean_data <- na.omit(data[, c("price", "reviews.count")])
  
  #  BOX PLOT
  par(mfrow = c(1, 2))
  
  boxplot(clean_data$price,
          main = "Boxplot Price",
          col = "lightblue",
          ylab = "Price",
          outline = TRUE)
  grid()
  
  boxplot(clean_data$reviews.count,
          main = "Boxplot Reviews Count",
          col = "lightgreen",
          ylab = "Reviews Count",
          outline = TRUE)
  grid()
  
  par(mfrow = c(1, 1))
  
  # HISTOGRAM
  plot_hist_outlier <- function(variable, var_name) {
    
    q1 <- quantile(variable, 0.25)
    q3 <- quantile(variable, 0.75)
    iqr_val <- IQR(variable)
    
    lower_bound <- q1 - 1.5 * iqr_val
    upper_bound <- q3 + 1.5 * iqr_val
    
    ggplot(data.frame(value = variable), aes(x = value)) +
      geom_histogram(aes(y = after_stat(density)),
                     bins = 30,
                     fill = "lightblue",
                     alpha = 0.7) +
      geom_density(color = "darkblue", linewidth = 1) +
      geom_vline(xintercept = c(lower_bound, upper_bound),
                 color = "red",
                 linetype = "dashed",
                 linewidth = 1) +
      labs(title = paste("Distribusi & Outlier:", var_name),
           x = var_name,
           y = "Density") +
      theme_minimal()
  }
  
  print(plot_hist_outlier(clean_data$price, "Price"))
  print(plot_hist_outlier(clean_data$reviews.count, "Reviews Count"))
  
  # ================= SCATTER =================
  print(
    ggplot(clean_data, aes(x = price, y = reviews.count)) +
      geom_point(color = "blue", alpha = 0.6, size = 3) +
      labs(title = "Scatter Plot: Price vs Reviews Count",
           x = "Price",
           y = "Reviews Count") +
      theme_minimal()
  )
}

# Jalankan
visualize_outliers(data)
## 
##  VISUALISASI OUTLIER

#Penanganan Outlier

# Metode penanganan outlier
handle_outliers <- function(data, method = "cap") {
  cat("\n PENANGANAN OUTLIER \n")
  cat("Metode yang digunakan:", method, "\n\n")
  
  data_processed <- data
  
  for(var in names(data_processed)[sapply(data_processed, is.numeric)]) {
    # Hitung batas outlier
    q1 <- quantile(data_processed[[var]], 0.25, na.rm = TRUE)
    q3 <- quantile(data_processed[[var]], 0.75, na.rm = TRUE)
    iqr_val <- IQR(data_processed[[var]], na.rm = TRUE)
    lower_bound <- q1 - 1.5 * iqr_val
    upper_bound <- q3 + 1.5 * iqr_val
    
    # Identifikasi outlier
    is_outlier <- data_processed[[var]] < lower_bound | data_processed[[var]] > upper_bound
    n_outliers <- sum(is_outlier, na.rm = TRUE)
    
    if(n_outliers > 0) {
      cat("Variabel", var, "memiliki", n_outliers, "outlier\n")
      
      if(method == "cap") {
        # Cap outliers (Winsorizing)
        data_processed[[var]][data_processed[[var]] < lower_bound] <- lower_bound
        data_processed[[var]][data_processed[[var]] > upper_bound] <- upper_bound
        cat("  Outlier di-cap ke batas [", round(lower_bound, 2), ", ", 
            round(upper_bound, 2), "]\n", sep = "")
        
      } else if(method == "remove") {
        # Hapus outlier (set sebagai NA)
        data_processed[[var]][is_outlier] <- NA
        cat("  Outlier dihapus (dijadikan NA)\n")
        
      } else if(method == "transform") {
        # Transformasi logaritmik
        if(all(data_processed[[var]] > 0, na.rm = TRUE)) {
          data_processed[[var]] <- log(data_processed[[var]] + 1)
          cat("  Dilakukan transformasi log\n")
        } else {
          cat("  Transformasi log tidak dapat dilakukan (nilai negatif)\n")
        }
        
      } else if(method == "median") {
        # Ganti dengan median
        median_val <- median(data_processed[[var]], na.rm = TRUE)
        data_processed[[var]][is_outlier] <- median_val
        cat("  Diganti dengan median:", round(median_val, 2), "\n")
      }
    }
  }
  
  return(data_processed)
}

# Contoh penggunaan metode penanganan outlier
data_capped <- handle_outliers(data, method = "cap")
## 
##  PENANGANAN OUTLIER 
## Metode yang digunakan: cap 
## 
## Variabel reviews.count memiliki 11 outlier
##   Outlier di-cap ke batas [-128434.2, 283837.8]
## Variabel Print.Length memiliki 3 outlier
##   Outlier di-cap ke batas [-152, 808]
data_no_outliers <- handle_outliers(data, method = "remove")
## 
##  PENANGANAN OUTLIER 
## Metode yang digunakan: remove 
## 
## Variabel reviews.count memiliki 11 outlier
##   Outlier dihapus (dijadikan NA)
## Variabel Print.Length memiliki 3 outlier
##   Outlier dihapus (dijadikan NA)

#Analisis Deskriptif dan Visualisasi

# ANALISIS DESKRIPTIF DAN VISUALISASI
print("Statistik Deskriptif:")
## [1] "Statistik Deskriptif:"
summary(data)
##       id             Book.name            Author             Rating         
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  reviews.count        form              price           Reading.age       
##  Min.   :  3296   Length:100         Length:100         Length:100        
##  1st Qu.: 26168   Class :character   Class :character   Class :character  
##  Median : 77085   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :111733                                                           
##  3rd Qu.:129236                                                           
##  Max.   :653111                                                           
##                                                                           
##   Print.Length    Publishing.date       Genre          
##  Min.   :  24.0   Length:100         Length:100        
##  1st Qu.: 208.0   Class :character   Class :character  
##  Median : 344.0   Mode  :character   Mode  :character  
##  Mean   : 361.2                                        
##  3rd Qu.: 448.0                                        
##  Max.   :2896.0                                        
##  NA's   :2
# Scatter plot
ggplot(data, aes(x = price, y = reviews.count)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Hubungan price dan reviews.count",
       x = "price", y = "review.count") +
  theme_minimal()

str(data$price)
##  chr [1:100] "$18.88" "$11.05" "$11.99" "$13.62" "$11.37" "$13.94" "$13.99" ...
str(data$reviews.count)
##  int [1:100] 145747 395512 116101 472618 51520 322650 87375 82564 228242 19946 ...
# Bersihkan price 
data$price <- gsub("\\$", "", data$price)
data$price <- gsub(",", "", data$price)
data$price <- as.numeric(data$price)

# Pastikan reviews.count numeric juga
data$reviews.count <- as.numeric(data$reviews.count)


# Korelasi
cor_test <- cor.test(data$price, data$reviews.count)
print(paste("Korelasi Pearson:", round(cor_test$estimate, 4)))
## [1] "Korelasi Pearson: -0.0837"
print(paste("p-value korelasi:", round(cor_test$p.value, 4)))
## [1] "p-value korelasi: 0.4077"

#Membangun Model

#MEMBANGUN MODEL REGRESI
model <- lm(reviews.count ~ price, data = data)

cat("Ringkasan Model Regresi:\n")
## Ringkasan Model Regresi:
print(summary(model))
## 
## Call:
## lm(formula = reviews.count ~ price, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -114972  -86607  -33963   21690  538407 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   127742      22872   5.585 2.09e-07 ***
## price          -1346       1618  -0.832    0.408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123500 on 98 degrees of freedom
## Multiple R-squared:  0.007007,   Adjusted R-squared:  -0.003125 
## F-statistic: 0.6916 on 1 and 98 DF,  p-value: 0.4077
# UJI ASUMSI REGRESI LINEAR
cat("\n UJI ASUMSI REGRESI LINEAR \n")
## 
##  UJI ASUMSI REGRESI LINEAR
# Normalitas Residual
shapiro_test <- shapiro.test(residuals(model))

cat("\n1. UJI NORMALITAS (Shapiro-Wilk):\n")
## 
## 1. UJI NORMALITAS (Shapiro-Wilk):
cat("   Statistik W =", round(shapiro_test$statistic, 4), "\n")
##    Statistik W = 0.7791
cat("   p-value =", round(shapiro_test$p.value, 4), "\n")
##    p-value = 0
if(shapiro_test$p.value > 0.05) {
  cat("   Keputusan: Residual berdistribusi normal\n")
} else {
  cat("   Keputusan: Residual tidak berdistribusi normal\n")
}
##    Keputusan: Residual tidak berdistribusi normal
# Q-Q Plot
qqnorm(residuals(model), main = "Q-Q Plot Residual")
qqline(residuals(model), col = "red")

# Homoskedastisitas
bp_test <- bptest(model)

cat("\n2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):\n")
## 
## 2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):
cat("   Statistik LM =", round(bp_test$statistic, 4), "\n")
##    Statistik LM = 0.5164
cat("   p-value =", round(bp_test$p.value, 4), "\n")
##    p-value = 0.4724
if(bp_test$p.value > 0.05) {
  cat("   Keputusan: Varian residual homogen (tidak ada heteroskedastisitas)\n")
} else {
  cat("   Keputusan: Terjadi heteroskedastisitas\n")
}
##    Keputusan: Varian residual homogen (tidak ada heteroskedastisitas)
# Plot Residual vs Fitted
plot(fitted(model), residuals(model),
     main = "Residual vs Fitted Values",
     xlab = "Fitted Values",
     ylab = "Residuals",
     pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)

# UJI AUTOKORELASI (Durbin-Watson)
dw_test <- dwtest(model)

cat("\n3. UJI AUTOKORELASI (Durbin-Watson):\n")
## 
## 3. UJI AUTOKORELASI (Durbin-Watson):
cat("   Statistik DW =", round(dw_test$statistic, 4), "\n")
##    Statistik DW = 1.753
cat("   p-value =", round(dw_test$p.value, 4), "\n")
##    p-value = 0.105
if(dw_test$p.value > 0.05) {
  cat("   Keputusan: Tidak terdapat autokorelasi\n")
} else {
  cat("   Keputusan: Terdapat autokorelasi\n")
}
##    Keputusan: Tidak terdapat autokorelasi
# INTERPRETASI KOEFISIEN
cat("\n INTERPRETASI KOEFISIEN \n")
## 
##  INTERPRETASI KOEFISIEN
intercept <- coef(model)[1]
slope <- coef(model)[2]

cat("Persamaan Regresi: reviews.count =", 
    round(intercept, 2), "+", 
    round(slope, 2), "* price\n")
## Persamaan Regresi: reviews.count = 127741.8 + -1345.54 * price
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("1. Intercept (β0 =", round(intercept, 2), "):\n")
## 1. Intercept (β0 = 127741.8 ):
cat("   Perkiraan jumlah reviews.count ketika price = 0 adalah", 
    round(intercept, 2), "\n")
##    Perkiraan jumlah reviews.count ketika price = 0 adalah 127741.8
cat("2. Slope (β1 =", round(slope, 2), "):\n")
## 2. Slope (β1 = -1345.54 ):
cat("   Setiap kenaikan 1 satuan price akan mengubah reviews.count sebesar", 
    round(slope, 2), "\n")
##    Setiap kenaikan 1 satuan price akan mengubah reviews.count sebesar -1345.54
# ESTIMASI PARAMETER DAN INFERENSI
cat("\n ESTIMASI PARAMETER \n")
## 
##  ESTIMASI PARAMETER
conf_int <- confint(model, level = 0.95)

cat("Interval Kepercayaan 95%:\n")
## Interval Kepercayaan 95%:
cat("   Intercept: [", round(conf_int[1,1], 3), ", ", 
    round(conf_int[1,2], 3), "]\n", sep = "")
##    Intercept: [82352.39, 173131.1]
cat("   Slope:     [", round(conf_int[2,1], 3), ", ", 
    round(conf_int[2,2], 3), "]\n", sep = "")
##    Slope:     [-4556.43, 1865.357]
# Uji hipotesis untuk slope
cat("\nUji Hipotesis untuk Slope (β1):\n")
## 
## Uji Hipotesis untuk Slope (β1):
cat("   H0: β1 = 0 (tidak ada hubungan linear antara price dan reviews.count)\n")
##    H0: β1 = 0 (tidak ada hubungan linear antara price dan reviews.count)
cat("   H1: β1 ≠ 0 (ada hubungan linear antara price dan reviews.count)\n")
##    H1: β1 ≠ 0 (ada hubungan linear antara price dan reviews.count)
summary_model <- summary(model)
slope_pvalue <- summary_model$coefficients[2, 4]

cat("   p-value =", round(slope_pvalue, 6), "\n")
##    p-value = 0.407658
if(slope_pvalue < 0.05) {
  cat("   Keputusan: Tolak H0, terdapat hubungan linear yang signifikan\n")
} else {
  cat("   Keputusan: Gagal tolak H0, tidak terdapat hubungan linear yang signifikan\n")
}
##    Keputusan: Gagal tolak H0, tidak terdapat hubungan linear yang signifikan
#  KOEFISIEN DETERMINASI
r_squared <- summary_model$r.squared

cat("\nKoefisien Determinasi (R²):\n")
## 
## Koefisien Determinasi (R²):
cat("   R² =", round(r_squared, 4), "\n")
##    R² = 0.007
cat("   Artinya:", round(r_squared * 100, 2), 
    "% variasi reviews.count dapat dijelaskan oleh price\n")
##    Artinya: 0.7 % variasi reviews.count dapat dijelaskan oleh price
#  VISUALISASI MODEL
ggplot(data, aes(x = price, y = reviews.count)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
  labs(title = "Regresi Linear: Price vs Reviews Count",
       subtitle = paste("Y =", round(intercept, 2), 
                        "+", round(slope, 2), "X"),
       x = "Price",
       y = "Reviews Count") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# PREDIKSI
new_data <- data.frame(price = c(10, 20))  # ganti sesuai harga yang ingin diprediksi

prediction <- predict(model, newdata = new_data, interval = "confidence")

cat("\n=== PREDIKSI ===\n")
## 
## === PREDIKSI ===
cat("Untuk price =", new_data$price[1],
    ", prediksi reviews.count =", round(prediction[1, "fit"], 2), "\n")
## Untuk price = 10 , prediksi reviews.count = 114286.4
cat("Untuk price =", new_data$price[2],
    ", prediksi reviews.count =", round(prediction[2, "fit"], 2), "\n")
## Untuk price = 20 , prediksi reviews.count = 100831
# DIAGNOSTIC PLOTS
par(mfrow = c(2, 2))
plot(model, which = 1:4)

par(mfrow = c(1, 1))

cat("\n RINGKASAN ANALISIS \n")
## 
##  RINGKASAN ANALISIS
cat("1. Model: reviews.count = β0 + β1 * price + ε\n")
## 1. Model: reviews.count = β0 + β1 * price + ε
cat("2. Estimasi: Y =",
    round(intercept, 3), "+",
    round(slope, 3), "* price\n")
## 2. Estimasi: Y = 127741.8 + -1345.536 * price
cat("3. R² =", round(r_squared, 4),
    "(", round(r_squared * 100, 1), "%)\n")
## 3. R² = 0.007 ( 0.7 %)
# Uji F (ambil p-value yang benar)
f_stat <- summary_model$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)

cat("4. Uji F (model): p-value =",
    round(f_pvalue, 6), "\n")
## 4. Uji F (model): p-value = 0.407658
cat("5. Asumsi:\n")
## 5. Asumsi:
cat("   - Normalitas: p =", round(shapiro_test$p.value, 4), "\n")
##    - Normalitas: p = 0
cat("   - Homoskedastisitas: p =", round(bp_test$p.value, 4), "\n")
##    - Homoskedastisitas: p = 0.4724
cat("   - Autokorelasi: p =", round(dw_test$p.value, 4), "\n")
##    - Autokorelasi: p = 0.105
hasil <- list(
  model = model,
  coefficients = coef(model),
  r_squared = r_squared,
  f_pvalue = f_pvalue,
  assumptions = list(
    normality = shapiro_test$p.value,
    homoscedasticity = bp_test$p.value,
    autocorrelation = dw_test$p.value
  ),
  confidence_intervals = conf_int
)

cat("\nAnalisis regresi linear sederhana selesai!\n")
## 
## Analisis regresi linear sederhana selesai!