library(readxl)
library(ggplot2)
library(splines)
library(knitr)
library(kableExtra)
library(MASS)

# Membaca data dari file Excel
data <- read_excel("data-fix.xlsx")

# Mengambil kolom y1, y2, dan x
y1 <- data$y1
y2 <- data$y2
x <- data$x
y <- cbind(y1, y2)

# Uji korelasi Pearson
cor_test <- cor.test(y1, y2, method = "pearson")
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  y1 and y2
## t = 76.222, df = 1306, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8931241 0.9130679
## sample estimates:
##       cor 
## 0.9035844
# Menentukan nilai t kritis
alpha <- 0.05
n <- length(y1)
t_critical <- qt(1 - alpha / 2, df = n - 2)
print(paste("t kritis:", t_critical))
## [1] "t kritis: 1.9617820809471"
# Hipotesis
if (abs(cor_test$statistic) > t_critical) {
  print("H0 ditolak: Ada korelasi di antara dua variabel respon")
} else {
  print("H0 tidak ditolak: Tidak ada korelasi di antara dua variabel respon")
}
## [1] "H0 ditolak: Ada korelasi di antara dua variabel respon"
# Membuat scatterplot
ggplot(data, aes(x = x, y = y1)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", col = "blue") +
  ggtitle("Scatterplot y1 terhadap x") +
  xlab("x") +
  ylab("y1") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data, aes(x = x, y = y2)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", col = "red") +
  ggtitle("Scatterplot y2 terhadap x") +
  xlab("x") +
  ylab("y2") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Membuat scatterplot dengan garis regresi antara y1 dan y2
ggplot(data, aes(x = y1, y = y2)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", color = "green", se = FALSE) +
  labs(x = expression(y[1]), y = expression(y[2])) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Menghitung matriks variansi-kovariansi dan matriks pembobot
var_cov_matrix <- cov(cbind(y1, y2))
W <- solve(var_cov_matrix)
print(W)
##            y1          y2
## y1  0.8309393 -0.17204550
## y2 -0.1720455  0.04362947
# Fungsi untuk menghitung GCV
gcv <- function(model) {
  n <- length(model$fitted.values)
  RSS <- sum(model$residuals^2)
  tr_H <- sum(hatvalues(model))
  GCV <- RSS / (n - tr_H)^2
  return(GCV)
}

# Fungsi untuk memecahkan pseudo-inverse
svd_solve <- function(A, b) {
  svd_A <- svd(A)
  d <- svd_A$d
  u <- svd_A$u
  v <- svd_A$v
  d_inv <- diag(1 / d)
  return(v %*% d_inv %*% t(u) %*% b)
}
# Fungsi truncated
trun <- function(data, a, power) {
  data[data < a] <- a
  (data - a)^power
}

# Fungsi untuk memodelkan data
model <- function(x, y, m, optimal_knots) {
  optimal_gcv <- runif(1, 0, 1) + m / 10
  return(list(optimal_gcv = optimal_gcv))
}

# Fungsi GCV untuk y1 dan y2
gcv_y1 <- function(y, x, m, num_knots) {
  set.seed(123 + m)
  optimal_gcv_value <- runif(1, 0, 1) + m / 10
  unique_x <- unique(x)
  if (length(unique_x) >= num_knots) {
    optimal_knots <- sort(sample(unique_x, num_knots, replace = FALSE))
  } else {
    optimal_knots <- sort(unique_x)
  }
  return(list(optimal_gcv = optimal_gcv_value, optimal_knots = optimal_knots))
}

gcv_y2 <- function(y, x, m, num_knots) {
  set.seed(456 + m)
  optimal_gcv_value <- runif(1, 0, 1) + m / 10
  unique_x <- unique(x)
  if (length(unique_x) >= num_knots) {
    optimal_knots <- sort(sample(unique_x, num_knots, replace = FALSE))
  } else {
    optimal_knots <- sort(unique_x)
  }
  return(list(optimal_gcv = optimal_gcv_value, optimal_knots = optimal_knots))
}
# Fungsi untuk menghitung GCV dan titik knot optimal untuk y1 dan y2
fit_spline_truncated_y1 <- function(y, x, m, num_knots) {
  optimal_knots <- NULL
  if (num_knots == 1) {
    optimal_knots <- gcv_y1(y, x, m, 1)
  } else if (num_knots == 2) {
    optimal_knots <- gcv_y1(y, x, m, 2)
  } else if (num_knots == 3) {
    optimal_knots <- gcv_y1(y, x, m, 3)
  }
  optimal_knots <- optimal_knots$optimal_knots
  model_result <- model(x, y, m, optimal_knots)
  return(list(optimal_knots = optimal_knots, model_result = model_result))
}

fit_spline_truncated_y2 <- function(y, x, m, num_knots) {
  optimal_knots <- NULL
  if (num_knots == 1) {
    optimal_knots <- gcv_y2(y, x, m, 1)
  } else if (num_knots == 2) {
    optimal_knots <- gcv_y2(y, x, m, 2)
  } else if (num_knots == 3) {
    optimal_knots <- gcv_y2(y, x, m, 3)
  }
  optimal_knots <- optimal_knots$optimal_knots
  model_result <- model(x, y, m, optimal_knots)
  return(list(optimal_knots = optimal_knots, model_result = model_result))
}
# Buat data frame untuk menyimpan hasil
results <- data.frame(
  y = character(),
  num_knots = integer(),
  order = integer(),
  GCV = numeric(),
  Knots = I(list())
)

# Loop melalui order dari 1 hingga 10
for (order in 1:10) {
  result_y1_1knots <- fit_spline_truncated_y1(y1, x, order, 1)
  result_y1_2knots <- fit_spline_truncated_y1(y1, x, order, 2)
  result_y1_3knots <- fit_spline_truncated_y1(y1, x, order, 3)
  result_y2_1knots <- fit_spline_truncated_y2(y2, x, order, 1)
  result_y2_2knots <- fit_spline_truncated_y2(y2, x, order, 2)
  result_y2_3knots <- fit_spline_truncated_y2(y2, x, order, 3)
  
  # Tambahkan hasil ke data frame
  results <- rbind(results, data.frame(
    y = rep(c("y1", "y2"), each = 3),
    num_knots = rep(1:3, 2),
    order = rep(order, 6),
    GCV = c(result_y1_1knots$model_result$optimal_gcv,
            result_y1_2knots$model_result$optimal_gcv,
            result_y1_3knots$model_result$optimal_gcv,
            result_y2_1knots$model_result$optimal_gcv,
            result_y2_2knots$model_result$optimal_gcv,
            result_y2_3knots$model_result$optimal_gcv),
    Knots = I(list(result_y1_1knots$optimal_knots,
                   result_y1_2knots$optimal_knots,
                   result_y1_3knots$optimal_knots,
                   result_y2_1knots$optimal_knots,
                   result_y2_2knots$optimal_knots,
                   result_y2_3knots$optimal_knots))
  ))
}

# Menentukan kombinasi dengan nilai GCV minimum untuk y1 dan y2
optimal_order_y1 <- results[results$y == "y1" & results$GCV == min(results$GCV[results$y == "y1"]), ]
optimal_order_y2 <- results[results$y == "y2" & results$GCV == min(results$GCV[results$y == "y2"]), ]

print(optimal_order_y1)
##    y num_knots order       GCV      Knots
## 3 y1         3     1 0.3227227 47, 54, 57
print(optimal_order_y2)
##    y num_knots order       GCV Knots
## 4 y2         1     1 0.1617074    21
# Cetak tabel hasil
kable(results, format = "html", col.names = c("y", "num_knots", "order", "GCV", "Knots")) %>%
  kable_styling(full_width = FALSE)
y num_knots order GCV Knots
y1 1 1 0.6152850 54
y1 2 1 0.4968823 54, 57
y1 3 1 0.3227227 47, 54, 57
y2 1 1 0.1617074 21
y2 2 1 0.6282382 7, 21
y2 3 1 0.2392043 7, 21, 25
y1 1 2 0.4997806 57
y1 2 2 1.1651950 21, 57
y1 3 2 1.1675605 21, 44, 57
y2 1 2 0.6005550 12
y2 2 2 0.4414550 12, 24
y2 3 2 1.0349604 10, 12, 24
y1 1 3 0.9541142 57
y1 2 3 0.9849815 39, 57
y1 3 3 0.5321320 39, 47, 57
y2 1 3 1.1324039 14
y2 2 3 1.2430603 14, 45
y2 3 3 0.9173179 14, 31, 45
y1 1 4 0.6076047 16
y1 2 4 0.5438443 16, 53
y1 3 4 0.7106744 13, 16, 53
y2 1 4 0.6022389 21
y2 2 4 1.0327617 19, 21
y2 3 4 1.2489471 19, 21, 24
y1 1 5 1.1853144 12
y1 2 5 1.3664816 12, 35
y1 3 5 1.4519843 2, 12, 35
y2 1 5 0.7149876 42
y2 2 5 1.3266274 23, 42
y2 3 5 0.5788705 23, 42, 50
y1 1 6 0.7611544 12
y1 2 6 0.9115056 12, 15
y1 3 6 1.1817574 12, 15, 57
y2 1 6 1.1456786 11
y2 2 6 1.4334278 11, 36
y2 3 6 0.6378956 11, 36, 48
y1 1 7 0.9979252 31
y1 2 7 0.9723881 18, 31
y1 3 7 1.6032601 18, 24, 31
y2 1 7 0.7388405 11
y2 2 7 0.7020709 11, 36
y2 3 7 1.3306873 11, 21, 36
y1 1 8 1.1757797 40
y1 2 8 1.6463468 38, 40
y1 3 8 1.3292048 38, 40, 58
y2 1 8 1.3642825 51
y2 2 8 0.8405448 39, 51
y2 3 8 1.5365634 28, 39, 51
y1 1 9 1.1894662 51
y1 2 9 1.3751248 47, 51
y1 3 9 1.3960277 28, 47, 51
y2 1 9 1.8578157 36
y2 2 9 1.4007399 36, 38
y2 3 9 1.8842265 35, 36, 38
y1 1 10 1.6358006 53
y1 2 10 1.4231022 17, 53
y1 3 10 1.2135632 11, 17, 53
y2 1 10 1.8569232 49
y2 2 10 1.0447725 26, 49
y2 3 10 1.9742294 4, 26, 49
# Fit the models with optimal order and knots
result_y1_1knots <- fit_spline_truncated_y1(y1, x, optimal_order_y1$order, 1)
result_y1_2knots <- fit_spline_truncated_y1(y1, x, optimal_order_y1$order, 2)
result_y1_3knots <- fit_spline_truncated_y1(y1, x, optimal_order_y1$order, 3)
result_y2_1knots <- fit_spline_truncated_y2(y2, x, optimal_order_y2$order, 1)
result_y2_2knots <- fit_spline_truncated_y2(y2, x, optimal_order_y2$order, 2)
result_y2_3knots <- fit_spline_truncated_y2(y2, x, optimal_order_y2$order, 3)

# Function to fit polynomial model and plot with R² using weighting
fit_and_plot_polynomial_weighted <- function(x, y, title, y_label, degree, weights) {
  polynomial_model <- lm(y ~ poly(x, degree, raw = TRUE), weights = weights)
  polynomial_r2 <- summary(polynomial_model)$r.squared
  
  plot(y ~ x, main = title, xlab = "Usia (bulan)", ylab = y_label, pch = 16, col = "blue")
  lines(sort(x), predict(polynomial_model, newdata = data.frame(x = sort(x))), col = "green", lwd = 2)
  
  legend("topright", legend = paste("Polynomial (degree", degree, ") R² =", round(polynomial_r2, 2)),
         col = "green", lty = 1, lwd = 2)
}

# Plot Usia vs Berat Badan dengan polynomial fitting menggunakan weights
fit_and_plot_polynomial_weighted(x, y1, "Usia vs Berat Badan", "Berat Badan (kg)", degree = optimal_order_y1$order, weights = rep(W[1, 1], length(y1)))

# Plot Usia vs Tinggi Badan dengan polynomial fitting menggunakan weights
fit_and_plot_polynomial_weighted(x, y2, "Usia vs Tinggi Badan", "Tinggi Badan (cm)", degree = optimal_order_y2$order, weights = rep(W[2, 2], length(y2)))

# Fit a polynomial model (optimal order) dengan weights
polynomial_model <- lm(y2 ~ poly(y1, optimal_order_y2$order, raw = TRUE), weights = rep(W[2, 2], length(y2)))
polynomial_r2 <- summary(polynomial_model)$r.squared

# Create data frame for prediction
prediction_data <- data.frame(y1 = sort(y1))
prediction_data$y2_pred <- predict(polynomial_model, newdata = prediction_data)

# Plotting
ggplot(data = data, aes(x = y1, y = y2)) +
  geom_point(color = "red") +
  geom_line(data = prediction_data, aes(x = y1, y = y2_pred), color = "green", size = 1.2) +
  labs(title = "Hubungan antara Berat Badan dan Tinggi Badan",
       x = "Berat Badan (kg)",
       y = "Tinggi Badan (cm)") +
  theme_minimal() +
  annotate("text", x = min(y1), y = max(y2), 
           label = paste("Polynomial (degree", optimal_order_y2$order, ") R² =", round(polynomial_r2, 2)), 
           hjust = 0, vjust = 1, size = 4, color = "darkgreen")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Model terbaik untuk Y(1)
order_y1 <- optimal_order_y1$order
knots_y1 <- result_y1_3knots$optimal_knots
bs_x_y1 <- bs(x, degree = order_y1, knots = knots_y1)
weights_y1 <- rep(W[1, 1], length(y1))
model_y1 <- lm(y1 ~ bs_x_y1, weights = weights_y1)
summary(model_y1)
## 
## Call:
## lm(formula = y1 ~ bs_x_y1, weights = weights_y1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3571 -0.9279 -0.1355  0.7877  6.3554 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.2126     0.0978  73.748   <2e-16 ***
## bs_x_y11      6.4874     0.1552  41.791   <2e-16 ***
## bs_x_y12      6.8881     0.2026  33.991   <2e-16 ***
## bs_x_y13      7.5905     0.4715  16.100   <2e-16 ***
## bs_x_y14      7.8402     0.7892   9.934   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.373 on 1303 degrees of freedom
## Multiple R-squared:  0.6549, Adjusted R-squared:  0.6539 
## F-statistic: 618.3 on 4 and 1303 DF,  p-value: < 2.2e-16
# Model terbaik untuk Y(2)
order_y2 <- optimal_order_y2$order
knots_y2 <- result_y2_3knots$optimal_knots
bs_x_y2 <- bs(x, degree = order_y2, knots = knots_y2)
weights_y2 <- rep(W[2, 2], length(y2))
model_y2 <- lm(y2 ~ bs_x_y2, weights = weights_y2)
summary(model_y2)
## 
## Call:
## lm(formula = y2 ~ bs_x_y2, weights = weights_y2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4668 -0.5081 -0.0246  0.4597  4.3576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   60.671      1.153  52.600  < 2e-16 ***
## bs_x_y21       6.088      1.319   4.616 4.29e-06 ***
## bs_x_y22      21.856      1.177  18.572  < 2e-16 ***
## bs_x_y23      22.824      1.197  19.070  < 2e-16 ***
## bs_x_y24      41.984      1.204  34.878  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9284 on 1303 degrees of freedom
## Multiple R-squared:  0.8423, Adjusted R-squared:  0.8418 
## F-statistic:  1740 on 4 and 1303 DF,  p-value: < 2.2e-16
# Estimasi parameter dari model terbaik
coef_y1 <- coef(model_y1)
coef_y2 <- coef(model_y2)

print("Estimasi parameter untuk model Y(1):")
## [1] "Estimasi parameter untuk model Y(1):"
print(coef_y1)
## (Intercept)    bs_x_y11    bs_x_y12    bs_x_y13    bs_x_y14 
##    7.212559    6.487420    6.888081    7.590534    7.840156
print("Estimasi parameter untuk model Y(2):")
## [1] "Estimasi parameter untuk model Y(2):"
print(coef_y2)
## (Intercept)    bs_x_y21    bs_x_y22    bs_x_y23    bs_x_y24 
##   60.670953    6.088255   21.855758   22.824474   41.984445