# Load libraries
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(kableExtra)
library(nortest)

# Membaca data
data <- readxl::read_excel("data fix new.xlsx", sheet = "Sheet1")
colnames(data) <- c("y", "x1", "x2", "x3", "x4")

# 1. Statistika Deskriptif
cat("1. STATISTIKA DESKRIPTIF\n")
## 1. STATISTIKA DESKRIPTIF
desc_stats <- data.frame(
  Variabel = colnames(data),
  Mean = sapply(data, mean),
  Median = sapply(data, median),
  SD = sapply(data, sd),
  Min = sapply(data, min),
  Max = sapply(data, max)
)
print(kable(desc_stats, format = "simple", digits = 4))
## 
## 
##      Variabel       Mean    Median        SD       Min        Max
## ---  ---------  --------  --------  --------  --------  ---------
## y    y            0.7353    0.7515    0.3471    0.1043     2.4040
## x1   x1           0.7353    0.7274    0.0732    0.5469     0.8974
## x2   x2          56.0294   46.6667   20.0227   33.3333    96.6667
## x3   x3           0.7300    0.7222    0.0907    0.7222     1.7803
## x4   x4          69.0508   81.8182   26.6570   27.2727   100.0000
# 2. Scatter plot untuk variabel respon dengan setiap predictor
cat("\n2. SCATTER PLOT VARIABEL RESPON DENGAN PREDIKTOR\n")
## 
## 2. SCATTER PLOT VARIABEL RESPON DENGAN PREDIKTOR
# Fungsi untuk membuat scatter plot
create_scatter <- function(predictor) {
  ggplot(data, aes(x = .data[[predictor]], y = y)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, color = "red") +
    labs(title = paste("Scatter Plot y vs", predictor),
         x = predictor, y = "y") +
    theme_minimal()
}

# Buat scatter plot untuk setiap predictor
scatter_plots <- lapply(colnames(data)[-1], create_scatter)
print(scatter_plots)
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 0.71691
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 2.799e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 0.71691
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.0052905
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 1.1308
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf in foreign function call (arg 5)

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 100.36
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 57.939
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 2.9246e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 1551.9

# 3. Regresi Nonparametrik Spline Linear dengan 1 titik knot
cat("\n3. REGRESI NONPARAMETRIK SPLINE LINEAR 1 TITIK KNOT\n")
## 
## 3. REGRESI NONPARAMETRIK SPLINE LINEAR 1 TITIK KNOT
# Pilih predictor yang paling berkorelasi dengan y
correlations <- sapply(data[-1], function(x) cor(x, data$y))
best_predictor <- names(which.max(abs(correlations)))
cat("Predictor terpilih:", best_predictor, "\n")
## Predictor terpilih: x2
# Fungsi untuk spline linear dengan knot tertentu
linear_spline <- function(x, knots) {
  basis <- matrix(x, ncol = 1)
  for (k in knots) {
    basis <- cbind(basis, pmax(0, x - k))
  }
  return(basis)
}

# 1 knot pada median
knot1 <- median(data[[best_predictor]])
X_spline1 <- linear_spline(data[[best_predictor]], knot1)
model_spline1 <- lm(y ~ X_spline1, data = data)

# Ringkasan model
cat("Model Spline Linear dengan 1 Knot (pada median):\n")
## Model Spline Linear dengan 1 Knot (pada median):
print(summary(model_spline1))
## 
## Call:
## lm(formula = y ~ X_spline1, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64340 -0.19160  0.00166  0.14187  1.67251 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.180461   0.251574  -0.717  0.47443   
## X_spline11   0.019541   0.005879   3.324  0.00115 **
## X_spline12  -0.014676   0.006730  -2.181  0.03097 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3138 on 133 degrees of freedom
## Multiple R-squared:  0.1948, Adjusted R-squared:  0.1827 
## F-statistic: 16.09 on 2 and 133 DF,  p-value: 5.528e-07
# Plot hasil
plot_data1 <- data.frame(x = data[[best_predictor]], y = data$y)
plot_data1$predicted <- predict(model_spline1)

ggplot(plot_data1, aes(x = x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = predicted), color = "blue", size = 1) +
  geom_vline(xintercept = knot1, linetype = "dashed", color = "red") +
  labs(title = "Spline Linear dengan 1 Knot",
       x = best_predictor, y = "y") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 4. Regresi Nonparametrik Spline Linear dengan 2 titik knot
cat("\n4. REGRESI NONPARAMETRIK SPLINE LINEAR 2 TITIK KNOT\n")
## 
## 4. REGRESI NONPARAMETRIK SPLINE LINEAR 2 TITIK KNOT
# 2 knots pada kuartil 1 dan 3
knots2 <- quantile(data[[best_predictor]], probs = c(0.33, 0.67))
X_spline2 <- linear_spline(data[[best_predictor]], knots2)
model_spline2 <- lm(y ~ X_spline2, data = data)

# Ringkasan model
cat("Model Spline Linear dengan 2 Knots (pada kuartil 1 dan 3):\n")
## Model Spline Linear dengan 2 Knots (pada kuartil 1 dan 3):
print(summary(model_spline2))
## 
## Call:
## lm(formula = y ~ X_spline2, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64870 -0.18188 -0.01642  0.10855  1.58947 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.431202   0.259128  -1.664 0.098475 .  
## X_spline21   0.026694   0.006214   4.296 3.35e-05 ***
## X_spline22  -0.092345   0.027245  -3.389 0.000924 ***
## X_spline23   0.075021   0.025545   2.937 0.003914 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3052 on 132 degrees of freedom
## Multiple R-squared:  0.2442, Adjusted R-squared:  0.227 
## F-statistic: 14.22 on 3 and 132 DF,  p-value: 4.402e-08
# Plot hasil
plot_data2 <- data.frame(x = data[[best_predictor]], y = data$y)
plot_data2$predicted <- predict(model_spline2)

ggplot(plot_data2, aes(x = x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = predicted), color = "blue", size = 1) +
  geom_vline(xintercept = knots2, linetype = "dashed", color = "red") +
  labs(title = "Spline Linear dengan 2 Knots",
       x = best_predictor, y = "y") +
  theme_minimal()

# 5. Regresi Nonparametrik Spline Linear dengan 3 titik knot
cat("\n5. REGRESI NONPARAMETRIK SPLINE LINEAR 3 TITIK KNOT\n")
## 
## 5. REGRESI NONPARAMETRIK SPLINE LINEAR 3 TITIK KNOT
# 3 knots pada persentil 25, 50, 75
knots3 <- quantile(data[[best_predictor]], probs = c(0.25, 0.5, 0.75))
X_spline3 <- linear_spline(data[[best_predictor]], knots3)
model_spline3 <- lm(y ~ X_spline3, data = data)

# Ringkasan model
cat("Model Spline Linear dengan 3 Knots (pada persentil 25, 50, 75):\n")
## Model Spline Linear dengan 3 Knots (pada persentil 25, 50, 75):
print(summary(model_spline3))
## 
## Call:
## lm(formula = y ~ X_spline3, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62754 -0.16373 -0.04877  0.09408  1.69998 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.497899   1.110602  -3.150 0.002026 ** 
## X_spline31   0.114170   0.031433   3.632 0.000402 ***
## X_spline32  -0.450414   0.145366  -3.098 0.002381 ** 
## X_spline33   0.344593   0.114791   3.002 0.003213 ** 
## X_spline34  -0.020046   0.009419  -2.128 0.035188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3007 on 131 degrees of freedom
## Multiple R-squared:  0.272,  Adjusted R-squared:  0.2498 
## F-statistic: 12.24 on 4 and 131 DF,  p-value: 1.753e-08
# Plot hasil
plot_data3 <- data.frame(x = data[[best_predictor]], y = data$y)
plot_data3$predicted <- predict(model_spline3)

ggplot(plot_data3, aes(x = x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = predicted), color = "blue", size = 1) +
  geom_vline(xintercept = knots3, linetype = "dashed", color = "red") +
  labs(title = "Spline Linear dengan 3 Knots",
       x = best_predictor, y = "y") +
  theme_minimal()

# 6. Regresi Nonparametrik Spline Linear dengan kombinasi knot
cat("\n6. REGRESI NONPARAMETRIK SPLINE LINEAR DENGAN KOMBINASI KNOT\n")
## 
## 6. REGRESI NONPARAMETRIK SPLINE LINEAR DENGAN KOMBINASI KNOT
# Coba beberapa kombinasi knot dan pilih yang terbaik berdasarkan AIC
knot_combinations <- list(
  c(0.3, 0.6),
  c(0.2, 0.5, 0.8),
  c(0.4, 0.7),
  c(0.25, 0.5, 0.75)
)

best_aic <- Inf
best_model <- NULL
best_knots <- NULL

for (knots in knot_combinations) {
  actual_knots <- quantile(data[[best_predictor]], probs = knots)
  X_spline <- linear_spline(data[[best_predictor]], actual_knots)
  model <- lm(y ~ X_spline, data = data)
  current_aic <- AIC(model)
  
  if (current_aic < best_aic) {
    best_aic <- current_aic
    best_model <- model
    best_knots <- actual_knots
  }
}

# Ringkasan model terbaik
cat("Model Spline Terbaik berdasarkan AIC dengan Knots pada:\n")
## Model Spline Terbaik berdasarkan AIC dengan Knots pada:
print(best_knots)
##      20%      50%      80% 
## 36.66667 46.66667 80.00000
cat("\nRingkasan Model Terbaik:\n")
## 
## Ringkasan Model Terbaik:
print(summary(best_model))
## 
## Call:
## lm(formula = y ~ X_spline, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62754 -0.16373 -0.04877  0.09408  1.69998 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.497899   1.110602  -3.150 0.002026 ** 
## X_spline1    0.114170   0.031433   3.632 0.000402 ***
## X_spline2   -0.112604   0.036342  -3.098 0.002381 ** 
## X_spline3    0.006783   0.008757   0.774 0.440039    
## X_spline4   -0.020046   0.009419  -2.128 0.035188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3007 on 131 degrees of freedom
## Multiple R-squared:  0.272,  Adjusted R-squared:  0.2498 
## F-statistic: 12.24 on 4 and 131 DF,  p-value: 1.753e-08
# Plot hasil
plot_data_best <- data.frame(x = data[[best_predictor]], y = data$y)
plot_data_best$predicted <- predict(best_model)

ggplot(plot_data_best, aes(x = x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = predicted), color = "blue", size = 1) +
  geom_vline(xintercept = best_knots, linetype = "dashed", color = "red") +
  labs(title = "Spline Linear dengan Kombinasi Knot Terbaik",
       x = best_predictor, y = "y") +
  theme_minimal()

# 7. Model Spline Terbaik
cat("\n7. MODEL SPLINE TERBAIK\n")
## 
## 7. MODEL SPLINE TERBAIK
# Bandingkan semua model spline yang telah dibuat
models <- list(
  "1 Knot" = model_spline1,
  "2 Knots" = model_spline2,
  "3 Knots" = model_spline3,
  "Best Combination" = best_model
)

# Buat tabel perbandingan
comparison <- data.frame(
  Model = names(models),
  AIC = sapply(models, AIC),
  BIC = sapply(models, BIC),
  R_squared = sapply(models, function(m) summary(m)$r.squared),
  Adj_R_squared = sapply(models, function(m) summary(m)$adj.r.squared)
)

print(kable(comparison, format = "simple", digits = 4))
## 
## 
##                    Model                   AIC       BIC   R_squared   Adj_R_squared
## -----------------  -----------------  --------  --------  ----------  --------------
## 1 Knot             1 Knot              75.7017   87.3523      0.1948          0.1827
## 2 Knots            2 Knots             69.0939   83.6571      0.2442          0.2270
## 3 Knots            3 Knots             65.9931   83.4690      0.2720          0.2498
## Best Combination   Best Combination    65.9931   83.4690      0.2720          0.2498
# Pilih model terbaik berdasarkan AIC
best_spline_model <- models[[which.min(comparison$AIC)]]
cat("\nModel Spline Terbaik adalah:", names(models)[which.min(comparison$AIC)], "\n")
## 
## Model Spline Terbaik adalah: Best Combination
print(summary(best_spline_model))
## 
## Call:
## lm(formula = y ~ X_spline, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62754 -0.16373 -0.04877  0.09408  1.69998 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.497899   1.110602  -3.150 0.002026 ** 
## X_spline1    0.114170   0.031433   3.632 0.000402 ***
## X_spline2   -0.112604   0.036342  -3.098 0.002381 ** 
## X_spline3    0.006783   0.008757   0.774 0.440039    
## X_spline4   -0.020046   0.009419  -2.128 0.035188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3007 on 131 degrees of freedom
## Multiple R-squared:  0.272,  Adjusted R-squared:  0.2498 
## F-statistic: 12.24 on 4 and 131 DF,  p-value: 1.753e-08
# 8. Uji Normalitas Residual
cat("\n8. UJI NORMALITAS RESIDUAL\n")
## 
## 8. UJI NORMALITAS RESIDUAL
residuals <- resid(best_spline_model)

# Shapiro-Wilk test
shapiro_test <- shapiro.test(residuals)
cat("Shapiro-Wilk Test for Normality:\n")
## Shapiro-Wilk Test for Normality:
print(shapiro_test)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.88961, p-value = 1.282e-08
# Anderson-Darling test
ad_test <- ad.test(residuals)
cat("\nAnderson-Darling Test for Normality:\n")
## 
## Anderson-Darling Test for Normality:
print(ad_test)
## 
##  Anderson-Darling normality test
## 
## data:  residuals
## A = 2.8066, p-value = 4.248e-07
# QQ Plot
qqnorm(residuals)
qqline(residuals, col = "red")
title("QQ Plot of Residuals")

# Histogram residuals
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals", col = "lightblue")

# 9. Uji Serentak
cat("\n9. UJI SERENTAK\n")
## 
## 9. UJI SERENTAK
# Uji F untuk signifikansi model
cat("Uji F untuk Signifikansi Model:\n")
## Uji F untuk Signifikansi Model:
print(anova(best_spline_model))
## Analysis of Variance Table
## 
## Response: y
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## X_spline    4  4.4252 1.10631  12.237 1.753e-08 ***
## Residuals 131 11.8436 0.09041                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 10. Uji Parsial
cat("\n10. UJI PARSIAL\n")
## 
## 10. UJI PARSIAL
# Uji t untuk setiap koefisien
cat("Uji t untuk Koefisien Model:\n")
## Uji t untuk Koefisien Model:
print(summary(best_spline_model)$coefficients)
##                 Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept) -3.497899020 1.110601978 -3.1495523 0.0020259809
## X_spline1    0.114169919 0.031433119  3.6321537 0.0004021378
## X_spline2   -0.112603560 0.036341603 -3.0984753 0.0023808660
## X_spline3    0.006782523 0.008757446  0.7744864 0.4400387869
## X_spline4   -0.020046256 0.009419099 -2.1282562 0.0351883359