Moving Beyond Linearity - Data Auto-ISLR

Penulis

Rosita Ria Rusesta - M0501251016

Eksplorasi Data

library(ISLR)
library(tidyverse)
library(broom)
library(broom.helpers)
library(skimr)
library(performance)
library(rsample)
library(yardstick)
library(DataExplorer)
library(ggplot2)


head(Auto)
  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500
Auto <- Auto %>%
  mutate(origin = recode(origin,
                         `1` = "Amerika",
                         `2` = "Eropa",
                         `3` = "Jepang"))

data<-Auto[ , c(1,4,8)]
plot_intro(data = data,
           ggtheme = theme_classic(),
           theme_config = list(axis.line=element_blank(),
                               axis.ticks=element_blank(),
                               axis.text.x=element_blank(),
                               axis.title=element_blank()
           )  
)

data %>%
  group_by(origin) %>%
  summarise(
    count = n(),
    mean_mpg = mean(mpg, na.rm = TRUE),
    sd_mpg   = sd(mpg, na.rm = TRUE),
    min_mpg  = min(mpg, na.rm = TRUE),
    max_mpg  = max(mpg, na.rm = TRUE)
  )
# A tibble: 3 × 6
  origin  count mean_mpg sd_mpg min_mpg max_mpg
  <chr>   <int>    <dbl>  <dbl>   <dbl>   <dbl>
1 Amerika   245     20.0   6.44     9      39  
2 Eropa      68     27.6   6.58    16.2    44.3
3 Jepang     79     30.5   6.09    18      46.6
ggplot(Auto, aes(x = origin, y = mpg, fill = origin)) +
  geom_boxplot(alpha = 0.6, color = "black") +  # boxplot dengan warna soft
  geom_jitter(width = 0.2, alpha = 0.7, color = "black", size = 1.5) +  # titik sebaran
  labs(
    title = "Sebaran mpg per origin",
    x = "",
    y = "mpg"
  ) +
  scale_fill_brewer(palette = "Pastel1") +  # warna pastel lembut
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 11, face = "bold")
  )

ggplot(Auto, aes(x = origin, y = horsepower, fill = origin)) +
  geom_boxplot(alpha = 0.6, color = "black") +  # boxplot dengan warna soft
  geom_jitter(width = 0.2, alpha = 0.7, color = "black", size = 1.5) +  # titik sebaran
  labs(
    title = "Sebaran horsepower per origin",
    x = "",
    y = "mpg"
  ) +
  scale_fill_brewer(palette = "Pastel1") +  # warna pastel lembut
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 11, face = "bold")
  )

ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~ origin) +
  labs(
    title = "Scatter Plot MPG vs Horsepower per Origin",
    x = "Horsepower",
    y = "MPG"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    strip.text = element_text(face = "bold", size = 12)
  )

Fungsi Tangga (Piecewise Constant)

# pastikan origin factor
Auto$origin <- factor(Auto$origin, labels = c("Amerika", "Eropa", "Jepang"))

# buat step cut
Auto <- Auto %>% mutate(hp_cut = cut(horsepower, 12))

# model per origin + prediksi
models <- Auto %>%
  group_by(origin) %>%
  group_modify(~{
    mod <- lm(mpg ~ hp_cut, data = .x)
    prd <- predict(mod, interval = "predict")
    prd <- as.data.frame(prd)   # ubah ke data frame
    colnames(prd) <- c("pred", "lwr", "upr")  # rename agar konsisten
    bind_cols(.x, prd)
  })

# plot
ggplot(models, aes(x = horsepower, y = mpg)) +
  geom_point(color = "coral", alpha = 0.7) +
  geom_line(aes(y = pred), color = "blue", linewidth = 1.3) +
  labs(
    title = "Step Function mpg ~ horsepower per Origin",
    x = "Horsepower",
    y = "MPG"
  ) +
  facet_wrap(~ origin) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    strip.text = element_text(size = 12, face = "bold")
  )

ggplot(models, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.6) +
  geom_line(aes(y = pred), linewidth = 1.3) +
  labs(
    title = "Step Function mpg ~ horsepower per Origin",
    x = "Horsepower",
    y = "MPG",
    color = "Origin"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Regresi Polinomial

# pastikan origin factor
Auto$origin <- factor(Auto$origin, labels = c("US", "Europe", "Japan"))

# ---- Fit model polynomial 4 per origin + buat prediksi ----
models_poly <- Auto %>%
  group_by(origin) %>%
  group_modify(~{

    # fit model polynomial 4
    mod <- lm(mpg ~ poly(horsepower, 4), data = .x)

    # buat prediksi untuk horsepower yang ada
    prd <- predict(mod,
                   newdata = data.frame(horsepower = .x$horsepower),
                   interval = "predict")

    prd <- as.data.frame(prd)
    colnames(prd) <- c("pred", "lwr", "upr")

    # urutkan horsepower agar garis mulus
    ix <- order(.x$horsepower)

    # kembalikan data + prediksi + index urut
    bind_cols(.x, prd) %>% mutate(ix = ix)
  })

# ---- Plot ----
ggplot(models_poly, aes(x = horsepower, y = mpg)) +
  geom_point(color = "coral", alpha = 0.7) +
  geom_line(aes(y = pred), color = "blue", linewidth = 1.3) +
  labs(
    title = "Regresi Polinomial Ordo 4 per Origin",
    x = "Horsepower",
    y = "MPG"
  ) +
  facet_wrap(~ origin) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    strip.text = element_text(size = 12, face = "bold")
  )

# ---- Plot dalam 1 grafik ----
ggplot(models_poly, aes(x = horsepower, y = mpg, color = origin)) +
  geom_point(alpha = 0.6) +
  geom_line(aes(y = pred), linewidth = 1.3) +
  labs(
    title = "Regresi Polinomial Orde 4 (Dalam 1 Grafik)",
    x = "Horsepower",
    y = "MPG",
    color = "Origin"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )

Regresi Spline

library(splines)
library(dplyr)

# Data
df <- Auto  # atau dataset Anda
df$origin <- factor(df$origin)

# ----------------------------
# 1. FUNGSI: memilih knot optimal per origin
# ----------------------------
pilih_knot_optimal <- function(data_origin) {
  hp <- data_origin$horsepower
  # kandidat knot (kuantil)
  kandidat <- quantile(hp, probs = c(0.25, 0.5, 0.75))
  
  # coba model dengan 0 sampai 3 knot
  model_list <- list()
  AIC_list <- c()
  
  # Model tanpa knot (hanya polynomial spline)
  m0 <- lm(mpg ~ bs(hp, df = 4), data = data_origin)
  model_list[[1]] <- m0
  AIC_list[1] <- AIC(m0)
  
  # Model dengan 1–3 knot
  for (k in 1:3) {
    knots <- kandidat[1:k]
    m <- lm(mpg ~ bs(hp, knots = knots), data = data_origin)
    model_list[[k+1]] <- m
    AIC_list[k+1] <- AIC(m)
  }
  
  idx <- which.min(AIC_list)
  return(list(model = model_list[[idx]], knot = ifelse(idx==1, NA, kandidat[1:(idx-1)])))
}

# ----------------------------
# 2. FIT model per origin
# ----------------------------
origin_list <- split(df, df$origin)

hasil <- lapply(origin_list, pilih_knot_optimal)

# ----------------------------
# 3. PLOT GABUNG (multi-line dalam 1 grafik)
# ----------------------------

cols <- c("red", "blue", "darkgreen")   # warna per origin
names(cols) <- levels(df$origin)

plot(df$horsepower, df$mpg,
     pch = 19, col = cols[df$origin],
     main = "Spline Regresi MPG ~ Horsepower per Origin (Knot Optimal)",
     xlab = "Horsepower", ylab = "MPG")

# garis spline per origin
for (o in levels(df$origin)) {
  dat_o <- origin_list[[o]]
  hp_sorted <- sort(dat_o$horsepower)
  
  model_o <- hasil[[o]]$model
  pred_o <- predict(model_o,
                    newdata = data.frame(hp = hp_sorted))
  
  lines(hp_sorted, pred_o,
        col = cols[o], lwd = 2)
  
  # tambah garis vertikal jika ada knot
  knot_o <- hasil[[o]]$knot
  if (!is.na(knot_o[1])) {
    abline(v = knot_o, col = cols[o], lty = 2)
  }
}

legend("topright",
       legend = paste0("Origin ", levels(df$origin)),
       col = cols, lwd = 2)