# 1. MEMUAT PACKAGE DAN DATA
library(ggplot2)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Membaca data
data <- read.csv("~/Kuliah/sem/analisis regresi/university_student_retention_dataset_2134.csv")

# Melihat apakah attendance_rate (X) berpengaruh terhadap gpa_current (Y)

# 2. DATA CLEANING 
data_clean <- data %>% filter(!is.na(gpa_current), !is.na(attendance_rate)) 

# 3. ANALISIS DESKRIPTIF
cat("\n=== Statistik Deskriptif ===\n")
## 
## === Statistik Deskriptif ===
summary(data_clean[, c("gpa_current", "attendance_rate")])
##   gpa_current    attendance_rate 
##  Min.   :1.500   Min.   :0.5000  
##  1st Qu.:2.140   1st Qu.:0.6400  
##  Median :2.770   Median :0.7500  
##  Mean   :2.757   Mean   :0.7528  
##  3rd Qu.:3.370   3rd Qu.:0.8700  
##  Max.   :4.000   Max.   :1.0000
# Scatter plot
ggplot(data_clean, aes(x = attendance_rate, y = gpa_current)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Hubungan Attendance Rate dan GPA Current",
       x = "Attendance Rate",
       y = "GPA Current") +
  theme_minimal()

# Korelasi Pearson
cor_test <- cor.test(data_clean$attendance_rate, data_clean$gpa_current)
cat("\n=== Korelasi Pearson ===\n")
## 
## === Korelasi Pearson ===
cat("Korelasi =", round(cor_test$estimate, 4), "\n")
## Korelasi = 0.0051
cat("p-value =", round(cor_test$p.value, 6), "\n")
## p-value = 0.815401
# 4. MEMBANGUN MODEL REGRESI
model <- lm(gpa_current ~ attendance_rate, data = data_clean)

cat("\n=== Ringkasan Model Regresi ===\n")
## 
## === Ringkasan Model Regresi ===
summary(model)
## 
## Call:
## lm(formula = gpa_current ~ attendance_rate, data = data_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26256 -0.61984  0.01491  0.61409  1.24327 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.73798    0.08318  32.916   <2e-16 ***
## attendance_rate  0.02534    0.10855   0.233    0.815    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7201 on 2132 degrees of freedom
## Multiple R-squared:  2.557e-05,  Adjusted R-squared:  -0.0004435 
## F-statistic: 0.05452 on 1 and 2132 DF,  p-value: 0.8154
# 5. UJI ASUMSI REGRESI LINEAR
cat("\n=== UJI ASUMSI REGRESI LINEAR ===\n")
## 
## === UJI ASUMSI REGRESI LINEAR ===
# 5.1 Normalitas residual
shapiro_test <- shapiro.test(residuals(model))
cat("\n1. UJI NORMALITAS (Shapiro-Wilk):\n")
## 
## 1. UJI NORMALITAS (Shapiro-Wilk):
cat("   W =", round(shapiro_test$statistic, 4), "\n")
##    W = 0.9549
cat("   p-value =", round(shapiro_test$p.value, 6), "\n")
##    p-value = 0
if(shapiro_test$p.value > 0.05) {
  cat("   Keputusan: Residual berdistribusi normal\n")
} else {
  cat("   Keputusan: Residual tidak normal\n")
}
##    Keputusan: Residual tidak normal
# Q-Q Plot
qqnorm(residuals(model), main = "Q-Q Plot Residual")
qqline(residuals(model), col = "red")

# 5.2 Homoskedastisitas
bp_test <- bptest(model)
cat("\n2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):\n")
## 
## 2. UJI HOMOSKEDASTISITAS (Breusch-Pagan):
cat("   LM =", round(bp_test$statistic, 4), "\n")
##    LM = 0.0546
cat("   p-value =", round(bp_test$p.value, 6), "\n")
##    p-value = 0.815318
if(bp_test$p.value > 0.05) {
  cat("   Keputusan: Varian residual homogen\n")
} else {
  cat("   Keputusan: Ada heteroskedastisitas\n")
}
##    Keputusan: Varian residual homogen
# Plot residual vs fitted
plot(fitted(model), residuals(model),
     main = "Residual vs Fitted Values",
     xlab = "Fitted Values",
     ylab = "Residuals",
     pch = 19, col = "blue")
abline(h = 0, col = "red", lty = 2)

# 5.3 Autokorelasi
dw_test <- dwtest(model)
cat("\n3. UJI AUTOKORELASI (Durbin-Watson):\n")
## 
## 3. UJI AUTOKORELASI (Durbin-Watson):
cat("   DW =", round(dw_test$statistic, 4), "\n")
##    DW = 1.9511
cat("   p-value =", round(dw_test$p.value, 6), "\n")
##    p-value = 0.129258
if(dw_test$p.value > 0.05) {
  cat("   Keputusan: Tidak ada autokorelasi\n")
} else {
  cat("   Keputusan: Ada autokorelasi\n")
}
##    Keputusan: Tidak ada autokorelasi
# 6. INTERPRETASI KOEFISIEN
cat("\n=== INTERPRETASI KOEFISIEN ===\n")
## 
## === INTERPRETASI KOEFISIEN ===
intercept <- coef(model)[1]
slope <- coef(model)[2]

cat("Persamaan Regresi:\n")
## Persamaan Regresi:
cat("gpa_current =", round(intercept, 3), "+", round(slope, 3), "* attendance_rate\n")
## gpa_current = 2.738 + 0.025 * attendance_rate
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("1. Intercept (β0 =", round(intercept, 3), "):\n")
## 1. Intercept (β0 = 2.738 ):
cat("   GPA saat attendance_rate = 0 adalah", round(intercept, 3), "\n")
##    GPA saat attendance_rate = 0 adalah 2.738
cat("2. Slope (β1 =", round(slope, 3), "):\n")
## 2. Slope (β1 = 0.025 ):
cat("   Setiap kenaikan 1 unit attendance_rate, GPA berubah sebesar",
    round(slope, 3), "poin\n")
##    Setiap kenaikan 1 unit attendance_rate, GPA berubah sebesar 0.025 poin
# 7. INTERVAL KEPERCAYAAN
cat("\n=== INTERVAL KEPERCAYAAN 95% ===\n")
## 
## === INTERVAL KEPERCAYAAN 95% ===
conf_int <- confint(model, level = 0.95)
cat("Intercept: [", round(conf_int[1,1], 3), ", ", round(conf_int[1,2], 3), "]\n", sep="")
## Intercept: [2.575, 2.901]
cat("Slope:     [", round(conf_int[2,1], 3), ", ", round(conf_int[2,2], 3), "]\n", sep="")
## Slope:     [-0.188, 0.238]
# 8. UJI HIPOTESIS UNTUK SLOPE
summary_model <- summary(model)
r_squared <- summary_model$r.squared
slope_pvalue <- summary_model$coefficients[2, 4]


cat("\n=== UJI HIPOTESIS (β1) ===\n")
## 
## === UJI HIPOTESIS (β1) ===
cat("H0: β1 = 0 (tidak ada hubungan linear)\n")
## H0: β1 = 0 (tidak ada hubungan linear)
cat("H1: β1 ≠ 0 (ada hubungan linear)\n")
## H1: β1 ≠ 0 (ada hubungan linear)
cat("p-value =", round(slope_pvalue, 6), "\n")
## p-value = 0.815401
if(slope_pvalue < 0.05) {
  cat("Keputusan: Tolak H0, hubungan signifikan\n")
} else {
  cat("Keputusan: Gagal tolak H0, hubungan tidak signifikan\n")
}
## Keputusan: Gagal tolak H0, hubungan tidak signifikan
# 9. KOEFISIEN DETERMINASI
new_data <- data.frame(attendance_rate = c(0.6, 0.8, 1.0))

prediction <- predict(model, newdata = new_data, interval = "confidence", level = 0.95)

cat("\n=== PREDIKSI + INTERVAL KEPERCAYAAN 95% ===\n")
## 
## === PREDIKSI + INTERVAL KEPERCAYAAN 95% ===
for(i in 1:nrow(new_data)) {
  cat("Attendance_rate =", new_data$attendance_rate[i], "\n")
  cat("  Prediksi GPA =", round(prediction[i, "fit"], 3), "\n")
  cat("  CI 95%: [", round(prediction[i, "lwr"], 3), ", ", round(prediction[i, "upr"], 3), "]\n\n", sep = "")
}
## Attendance_rate = 0.6 
##   Prediksi GPA = 2.753 
##   CI 95%: [2.709, 2.798]
## 
## Attendance_rate = 0.8 
##   Prediksi GPA = 2.758 
##   CI 95%: [2.726, 2.79]
## 
## Attendance_rate = 1 
##   Prediksi GPA = 2.763 
##   CI 95%: [2.702, 2.824]
# Tampilkan tabel prediksi
hasil_prediksi <- data.frame(new_data, prediction)
print(hasil_prediksi)
##   attendance_rate      fit      lwr      upr
## 1             0.6 2.753186 2.708556 2.797815
## 2             0.8 2.758254 2.726075 2.790434
## 3             1.0 2.763323 2.702461 2.824186
# 10. VISUALISASI MODEL
ggplot(data_clean, aes(x = attendance_rate, y = gpa_current)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
  labs(title = "Garis Regresi Linear",
       subtitle = paste("Y =", round(intercept, 2), "+", round(slope, 2), "X"),
       x = "Attendance Rate",
       y = "GPA Current") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 9. PREDIKSI
new_data <- data.frame(attendance_rate = c(0.6, 0.8))
prediction <- predict(model, newdata = new_data, interval = "confidence")

cat("\n=== PREDIKSI ===\n")
## 
## === PREDIKSI ===
cat("Untuk attendance_rate = 0.6, prediksi GPA =", round(prediction[1, "fit"], 2), "\n")
## Untuk attendance_rate = 0.6, prediksi GPA = 2.75
cat("Untuk attendance_rate = 0.8, prediksi GPA =", round(prediction[2, "fit"], 2), "\n")
## Untuk attendance_rate = 0.8, prediksi GPA = 2.76
# 10. DIAGNOSTIC PLOTS
par(mfrow = c(2, 2))
plot(model, which = 1:4)

par(mfrow = c(1, 1))

# 11. RINGKASAN LENGKAP
cat("\n=== RINGKASAN ANALISIS ===\n")
## 
## === RINGKASAN ANALISIS ===
cat("1. Model: gpa_current = β0 + β1*attendance_rate + ε\n")
## 1. Model: gpa_current = β0 + β1*attendance_rate + ε
cat("2. Estimasi: Y =", round(intercept, 3), "+", round(slope, 3), "* X\n")
## 2. Estimasi: Y = 2.738 + 0.025 * X
cat("3. R² =", round(r_squared, 4), "(", round(r_squared*100, 1), "%)\n")
## 3. R² = 0 ( 0 %)
# Uji F (model) -> yang benar ambil p-value dari F-test
f_pvalue <- pf(summary_model$fstatistic[1],
               summary_model$fstatistic[2],
               summary_model$fstatistic[3],
               lower.tail = FALSE)

cat("4. Uji F (model): p-value =", round(f_pvalue, 6), "\n")
## 4. Uji F (model): p-value = 0.815401
cat("5. Asumsi:\n")
## 5. Asumsi:
cat("   - Normalitas: p =", round(shapiro_test$p.value, 4), "\n")
##    - Normalitas: p = 0
cat("   - Homoskedastisitas: p =", round(bp_test$p.value, 4), "\n")
##    - Homoskedastisitas: p = 0.8153
cat("   - Autokorelasi: p =", round(dw_test$p.value, 4), "\n")
##    - Autokorelasi: p = 0.1293
# Simpan hasil
hasil <- list(
  model = model,
  coefficients = coef(model),
  r_squared = r_squared,
  assumptions = list(
    normality = shapiro_test$p.value,
    homoscedasticity = bp_test$p.value,
    autocorrelation = dw_test$p.value
  ),
  confidence_intervals = conf_int
)

print("Analisis regresi linear sederhana selesai!")
## [1] "Analisis regresi linear sederhana selesai!"