Set-up - Menyiapkan tools untuk import, cleaning, ringkasan, missing, dan imputasi.

# =====================================================
# Instalasi paket (jalankan sekali saja jika belum terpasang)
# =====================================================
# install.packages(c(
#   "tidyverse",   # Manipulasi & visualisasi data
#   "janitor",     # Pembersihan nama variabel & data
#   "skimr",       # Ringkasan eksploratif dataset
#   "naniar",      # Analisis nilai hilang (missing values)
#   "lubridate",   # Manipulasi data tanggal & waktu
#   "stringr"      # Pengolahan string/teks
# ))

# =====================================================
# Memuat library yang dibutuhkan
# =====================================================
library(tidyverse)   # Data wrangling dan visualisasi (dplyr, ggplot2, dll.)
library(janitor)     # Membersihkan nama kolom dan struktur data
library(skimr)       # Menyediakan ringkasan statistik cepat (EDA)
library(naniar)      # Visualisasi dan analisis missing values
library(lubridate)   # Pengolahan dan ekstraksi komponen tanggal
library(stringr)     # Operasi string yang konsisten dan efisien

Import Data - memastikan data terbaca benar (angka tidak jadi teks karena format Indonesia).

# Load library yang diperlukan
library(tidyverse)  # untuk read_csv() dan manipulasi data

# URL dataset
land_path <- "https://raw.githubusercontent.com/dsciencelabs/dataset/refs/heads/master/sinarmas_land.csv"

land_raw <- read.csv(
  land_path,                 # Path / lokasi file data lahan
  sep = ";",                 # Pemisah kolom
  dec = ",",                 # Format desimal
  stringsAsFactors = FALSE   # Karakter tetap sebagai string
)

head(land_raw)

Rapikan Nama Kolom - nama kolom konsisten (snake_case), memudahkan coding.

land <- land_raw %>% clean_names()

names(proj)
## NULL
## NULL

Rename Kolom - Mengubah nama kolom yang dianggap perlu untuk diubah.

library(dplyr)

land <- land %>%
  rename(
    date   = tanggal,
    region = lokasi,
    project_type = project_pype
  )

names(land)
##  [1] "date"                          "region"                       
##  [3] "project_type"                  "sales_channel"                
##  [5] "interest_rate"                 "consumer_conf"                
##  [7] "marketing_spend"               "construction_progress"        
##  [9] "material_index"                "unit_price_sqm"               
## [11] "cost_sqm"                      "avg_unit_size_sqm"            
## [13] "units_sold"                    "revenue"                      
## [15] "construction_cost"             "profit"                       
## [17] "profit_margin"                 "water_use_m3"                 
## [19] "energy_kwh"                    "co2_ton"                      
## [21] "sediment_tss_mg_l"             "esg_score"                    
## [23] "esg_flag"                      "schedule_delay_days"          
## [25] "cost_overrun_pct"              "at_risk"                      
## [27] "kpi_target"                    "sigma_assumed"                
## [29] "revenue_billion_idr"           "construction_cost_billion_idr"
## [31] "profit_billion_idr"            "business_unit_id"             
## [33] "time_id"                       "row_id"

Hapus Duplikasi Baris - mencegah data “terhitung dua kali”.

land <- land %>% distinct()

# Check hasil
head(land)

Check Data Hilang

library(dplyr)

check_na_overview <- function(df) {
  tibble(
    column = names(df),
    na_count = sapply(df, function(x) sum(is.na(x))),
    na_pct = round(sapply(df, function(x) mean(is.na(x))) * 100, 2)
  ) %>%
    arrange(desc(na_pct))
}

na_land <- check_na_overview(land)

na_land

Imputasi Data Hilang

library(dplyr)
library(tidyr)
library(stringr)

fix_missing <- function(df, cat_fill = "unknown", num_fill = 0) {
  df %>%
    # pastikan factor jadi character supaya aman diproses
    mutate(across(where(is.factor), as.character)) %>%
    # rapikan text
    mutate(across(where(is.character), ~ str_squish(.x))) %>%
    # isi missing kategori
    mutate(across(where(is.character), ~ replace_na(.x, cat_fill))) %>%
    # isi missing numerik
    mutate(across(where(is.numeric), ~ replace_na(.x, num_fill)))
}

land <- fix_missing(land)

Parse Tanggal - (jika ada kolom date): bisa digunakan untuk analisis time series (bulanan, harian, mingguan, dll).

land <- land %>%
  mutate(
    date = as.Date(date, format = "%d/%m/%y"),
    year  = lubridate::year(date),
    month = lubridate::month(date)
  )

head(land)

Analisis Univariat

Histogram

library(ggplot2)

ggplot(land, aes(x = interest_rate)) +
  
  # Histogram (pakai density, bukan frekuensi)
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20,
    na.rm = TRUE,
    fill = "#8ECAE6",
    color = "white",
    alpha = 0.8
  ) +
  
  # Density curve
  geom_density(
    na.rm = TRUE,
    color = "#023047",
    linewidth = 1.2
  ) +
  
  # Garis median
  geom_vline(
    aes(xintercept = median(interest_rate, na.rm = TRUE)),
    color = "#D62828",
    linetype = "dashed",
    linewidth = 1
  ) +
  
  labs(
    title = "Distribusi Interest Rate Proyek",
    subtitle = "Histogram (Bar) + Density",
    x = "Interest Rate (%)",
    y = "Density",
    caption = "Garis putus-putus merah menunjukkan nilai median"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(color = "grey40"),
    plot.caption = element_text(size = 9, color = "grey40")
  )

Density Plot

library(ggplot2)

ggplot(land, aes(x = interest_rate)) +
  geom_density(
    na.rm = TRUE,
    fill = "#8ECAE6",
    color = "#023047",
    alpha = 0.6,
    linewidth = 1
  ) +
  geom_vline(
    aes(xintercept = median(interest_rate, na.rm = TRUE)),
    color = "#D62828",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Density Plot Interest Rate Proyek",
    subtitle = "Distribusi Tidak Normal (Skewed & Bounded)",
    x = "Interest Rate (%)",
    y = "Density",
    caption = "Garis putus-putus merah menunjukkan nilai median"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(color = "grey40"),
    plot.caption = element_text(size = 9, color = "grey40")
  )

# install.packages("plotly") # jika belum terpasang
library(plotly)

# Ambil data & buang NA
x <- land$interest_rate
x <- x[!is.na(x)]

# Hitung density & median
d <- density(x)
med <- median(x)

# Plotly density plot
p <- plot_ly() %>%
  add_lines(
    x = d$x,
    y = d$y,
    line = list(color = "#023047", width = 3),
    fill = "tozeroy",
    fillcolor = "rgba(142,202,230,0.5)",
    name = "Density"
  ) %>%
  add_segments(
    x = med, xend = med,
    y = 0, yend = max(d$y),
    line = list(color = "#D62828", width = 2, dash = "dash"),
    name = "Median"
  ) %>%
  layout(
    title = list(
      text = "Density Plot Interest Rate Proyek<br><sup>Distribusi Tidak Normal (Skewed & Bounded)</sup>"
    ),
    xaxis = list(title = "Interest Rate (%)"),
    yaxis = list(title = "Density"),
    showlegend = TRUE
  )

p

Bar+Density Plot

library(plotly)

# Ambil data & buang NA
x <- land$interest_rate
x <- x[!is.na(x)]

# Hitung density & median
d <- density(x)
med <- median(x)

p <- plot_ly() %>%
  add_histogram(
    x = x,
    nbinsx = 20,
    histnorm = "probability density",
    marker = list(
      color = "rgba(142,202,230,0.7)",
      line = list(color = "white", width = 1)
    ),
    name = "Histogram"
  ) %>%
  add_lines(
    x = d$x,
    y = d$y,
    line = list(color = "#023047", width = 3),
    name = "Density"
  ) %>%
  add_segments(
    x = med, xend = med,
    y = 0, yend = max(d$y),
    line = list(color = "#D62828", width = 2, dash = "dash"),
    name = "Median"
  ) %>%
  layout(
    title = list(
      text = "Distribusi Interest Rate Proyek<br><sup>Histogram (Bar) + Density</sup>"
    ),
    xaxis = list(title = "Interest Rate (%)"),
    yaxis = list(title = "Density"),
    bargap = 0.05,
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.2,
      yanchor = "top"
    )
  )

p

Boxplot

library(ggplot2)
library(ggrepel)

# Statistik utama
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), TRUE)[1]))
  )
))

ggplot(land, aes(x = "", y = interest_rate)) +
  geom_boxplot(fill = "#8ECAE6", alpha = 0.8) +

  # Titik statistik
  geom_point(data = s, aes(x = 1, y = value), color = "#D62828", size = 3) +

  # Label + panah otomatis (ANTI OVERLAP)
  geom_text_repel(
    data = s,
    aes(x = 1, y = value, label = paste0(label, ": ", round(value, 2))),
    nudge_x = 0.25,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Boxplot Interest Rate Proyek",
    subtitle = "Statistik Utama Ditampilkan Tanpa Overlap",
    y = "Interest Rate (%)",
    x = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Offset Y untuk label (ANTI OVERLAP)
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25


# Geser posisi X (HANYA INI YANG BERUBAH)
x_box   <- 1
x_arrow <- 1.30     # ujung panah
x_text  <- 1.48     # >>> teks lebih ke kanan

fig <- plot_ly() %>%

  # Boxplot + OUTLIER
  add_boxplot(
    x = rep(x_box, length(land$interest_rate)),
    y = land$interest_rate,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.8,
    width = 0.4,
    boxpoints = "outliers",          # <<< OUTLIER DITAMPILKAN
    jitter = 0,
    pointpos = 0,
    marker = list(
      color = "#FB8500",             # warna outlier (beda dari statistik)
      size = 7
    ),
    showlegend = FALSE
  ) %>%

  # Titik statistik (Mean, Median, dst)
  add_markers(
    x = rep(x_box, nrow(s)),
    y = s$value,
    marker = list(color = "#D62828", size = 12),
    showlegend = FALSE
  ) %>%

  # Teks statistik
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  layout(
    title = list(
      text = "Boxplot Interest Rate Proyek<br>
              <sup>Dengan Outlier dan Statistik Utama</sup>",
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 1.9),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_box,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    annotations = lapply(
      1:nrow(s),
      function(i) {
        list(
          x = x_arrow,
          y = s$y_label[i],
          ax = x_box,
          ay = s$value[i],
          xref = "x",
          yref = "y",
          axref = "x",
          ayref = "y",
          showarrow = TRUE,
          arrowhead = 3,
          arrowwidth = 1,
          arrowcolor = "grey40",
          text = ""
        )
      }
    ),
    margin = list(
      l = 0,
      r = 0,
      t = 90,
      b = 10
    )
  )

fig

Violin Plot

library(ggplot2)
library(ggrepel)

# =====================================================
# Statistik utama
# =====================================================
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), decreasing = TRUE)[1]))
  )
))

# =====================================================
# Violin Plot murni + Statistik
# =====================================================
ggplot(land, aes(x = "", y = interest_rate)) +

  # Violin plot (tanpa boxplot)
  geom_violin(
    fill = "#8ECAE6",
    alpha = 0.8,
    trim = FALSE
  ) +

  # Titik statistik
  geom_point(
    data = s,
    aes(x = 1, y = value),
    color = "#D62828",
    size = 3
  ) +

  # Label statistik (anti overlap)
  geom_text_repel(
    data = s,
    aes(
      x = 1,
      y = value,
      label = paste0(label, ": ", round(value, 2))
    ),
    nudge_x = 0.28,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Violin Plot Interest Rate Proyek",
    subtitle = "Distribusi Interest Rate dengan Statistik Utama",
    y = "Interest Rate (%)",
    x = NULL
  ) +

  theme_minimal() +
  theme(
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  )

library(plotly)

# =====================================================
# 1. DATA
# =====================================================
y <- land$interest_rate
y <- y[!is.na(y)]

# =====================================================
# 2. DETEKSI OUTLIER (IQR METHOD)
# =====================================================
Q1  <- quantile(y, 0.25)
Q3  <- quantile(y, 0.75)
IQR <- Q3 - Q1

lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR

is_outlier <- y < lower | y > upper

# =====================================================
# 3. STATISTIK UTAMA
# =====================================================
s <- data.frame(
  label = c("Max", "Mean", "Median", "Mode", "Min"),
  value = c(
    max(y),
    mean(y),
    median(y),
    as.numeric(names(sort(table(y), decreasing = TRUE)[1])),
    min(y)
  )
)

# Offset Y (ANTI OVERLAP)
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25

# =====================================================
# 4. POSISI X
# =====================================================
x_violin <- 1
x_arrow  <- 1.30
x_text   <- 1.52

# =====================================================
# 5. VIOLIN PLOT
# =====================================================
fig <- plot_ly() %>%

  # ---- VIOLIN (DENSITY)
  add_trace(
    type = "violin",
    x = rep(x_violin, length(y)),
    y = y,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.85,
    width = 0.55,
    points = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- NON-OUTLIER (ABU-ABU PEKAT)
  add_markers(
    x = rep(x_violin, sum(!is_outlier)),
    y = y[!is_outlier],
    marker = list(
      color = "rgba(90,90,90,0.9)",
      size = 6
    ),
    showlegend = FALSE
  ) %>%

  # ---- OUTLIER (MERAH)
  add_markers(
    x = rep(x_violin, sum(is_outlier)),
    y = y[is_outlier],
    marker = list(
      color = "#D62828",
      size = 9
    ),
    showlegend = FALSE
  ) %>%

  # ---- TITIK STATISTIK
  add_markers(
    x = rep(x_violin, nrow(s)),
    y = s$value,
    marker = list(color = "#FB8500", size = 13),
    showlegend = FALSE
  ) %>%

  # ---- TEKS STATISTIK
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  # =====================================================
  # 6. LAYOUT + PANAH
  # =====================================================
  layout(
    title = list(
      text = "Violin Plot Interest Rate Proyek<br>
              <sup>Outlier (IQR), Distribusi, dan Statistik Utama</sup>",
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 1.95),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_violin,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    annotations = lapply(
      1:nrow(s),
      function(i) {
        list(
          x = x_arrow,
          y = s$y_label[i],
          ax = x_violin,
          ay = s$value[i],
          xref = "x",
          yref = "y",
          axref = "x",
          ayref = "y",
          showarrow = TRUE,
          arrowhead = 3,
          arrowwidth = 1,
          arrowcolor = "grey40",
          text = ""
        )
      }
    ),
    margin = list(
      l = 0,
      r = 0,
      t = 90,
      b = 10
    )
  )

fig

Boxplot+Violin Plot

library(ggplot2)
library(ggrepel)

# =====================================================
# Statistik utama
# =====================================================
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), decreasing = TRUE)[1]))
  )
))

# =====================================================
# Violin Plot + Statistik
# =====================================================
ggplot(land, aes(x = "", y = interest_rate)) +

  # Violin plot
  geom_violin(
    fill = "#8ECAE6",
    alpha = 0.8,
    trim = FALSE
  ) +

  # (Opsional tapi recommended) garis median di tengah violin
  geom_boxplot(
    width = 0.08,
    fill = "white",
    outlier.shape = NA,
    alpha = 0.7
  ) +

  # Titik statistik
  geom_point(
    data = s,
    aes(x = 1, y = value),
    color = "#D62828",
    size = 3
  ) +

  # Label statistik (anti overlap)
  geom_text_repel(
    data = s,
    aes(
      x = 1,
      y = value,
      label = paste0(label, ": ", round(value, 2))
    ),
    nudge_x = 0.28,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Violin Plot Interest Rate Proyek",
    subtitle = "Distribusi Data dengan Statistik Utama",
    y = "Interest Rate (%)",
    x = NULL
  ) +

  theme_minimal() +
  theme(
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  )

library(plotly)

# =====================================================
# 1. DATA (BERSIHKAN NA)
# =====================================================
# y = data numerik utama yang akan diplot
y <- land$interest_rate
y <- y[!is.na(y)]

# =====================================================
# 2. DETEKSI OUTLIER BERDASARKAN ATURAN IQR
# =====================================================
# CATATAN PENTING:
# - Outlier TIDAK sama dengan min / max
# - Outlier ditentukan oleh batas statistik (IQR),
#   bukan urutan nilai

Q1  <- quantile(y, 0.25)
Q3  <- quantile(y, 0.75)
IQR <- Q3 - Q1

# Batas whisker boxplot
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Klasifikasi data
is_outlier_low  <- y < lower_bound   # outlier ekstrem RENDAH
is_outlier_high <- y > upper_bound   # outlier ekstrem TINGGI
is_normal       <- !(is_outlier_low | is_outlier_high)

# =====================================================
# 3. STATISTIK UTAMA (DESKRIPTIF)
# =====================================================
# Statistik ini DESKRIPTIF,
# BUKAN penentu outlier

s <- data.frame(
  label = c("Max", "Mean", "Median", "Mode", "Min"),
  value = c(
    max(y),
    mean(y),
    median(y),
    as.numeric(names(sort(table(y), decreasing = TRUE)[1])),
    min(y)
  )
)

# Offset Y agar teks tidak overlap
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25

# =====================================================
# 4. POSISI X (UNTUK OVERLAY & ANOTASI)
# =====================================================
x_main  <- 1       # posisi violin & box
x_arrow <- 1.30    # ujung panah
x_text  <- 1.55    # posisi teks (lebih ke kanan)

# =====================================================
# 5. PLOTLY: VIOLIN + BOX + DATA POINT
# =====================================================
fig <- plot_ly() %>%

  # ---- VIOLIN (SEBARAN DATA)
  add_trace(
    type = "violin",
    x = rep(x_main, length(y)),
    y = y,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.45,
    width = 0.6,
    points = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- BOXPLOT (RINGKASAN STATISTIK)
  # Whisker berhenti di batas IQR (BUKAN min/max)
  add_boxplot(
    x = rep(x_main, length(y)),
    y = y,
    width = 0.22,
    fillcolor = "rgba(255,255,255,0.9)",
    line = list(color = "#2C3E50"),
    boxpoints = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- DATA NORMAL (ABU-ABU)
  add_markers(
    x = rep(x_main, sum(is_normal)),
    y = y[is_normal],
    marker = list(
      color = "rgba(80,80,80,0.9)",
      size = 6
    ),
    name = "Normal (Dalam IQR)",
    showlegend = TRUE
  ) %>%

  # ---- OUTLIER EKSTREM RENDAH (BIRU)
  add_markers(
    x = rep(x_main, sum(is_outlier_low)),
    y = y[is_outlier_low],
    marker = list(
      color = "#1F77B4",
      size = 10,
      line = list(color = "black", width = 0.6)
    ),
    name = "Outlier Ekstrem Rendah",
    showlegend = TRUE
  ) %>%

  # ---- OUTLIER EKSTREM TINGGI (MERAH)
  add_markers(
    x = rep(x_main, sum(is_outlier_high)),
    y = y[is_outlier_high],
    marker = list(
      color = "#D62828",
      size = 10,
      line = list(color = "black", width = 0.6)
    ),
    name = "Outlier Ekstrem Tinggi",
    showlegend = TRUE
  ) %>%

  # ---- TITIK STATISTIK UTAMA
  add_markers(
    x = rep(x_main, nrow(s)),
    y = s$value,
    marker = list(color = "#FB8500", size = 13),
    showlegend = FALSE
  ) %>%

  # ---- TEKS STATISTIK
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  # =====================================================
  # 6. LAYOUT + PANAH PENJELAS
  # =====================================================
  layout(
    title = list(
      text = paste0(
        "Box + Violin Overlay Interest Rate Proyek<br>",
        "<sup>",
        "Catatan: Outlier ditentukan oleh IQR, bukan oleh Min/Max",
        "</sup>"
      ),
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 2.0),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_main,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    margin = list(l = 0, r = 0, t = 110, b = 10)
  )

fig

Pie Chart

library(dplyr)
library(plotly)

# =====================================================
# 1. HITUNG FREKUENSI DAN PERSENTASE
# =====================================================
donut_data <- land %>%
  count(project_type, name = "frekuensi") %>%
  mutate(persen = round(frekuensi / sum(frekuensi) * 100, 1))

# =====================================================
# 2. BUAT DONUT CHART INTERAKTIF DENGAN POSISI TURUN
# =====================================================
fig <- plot_ly(
  donut_data,
  labels = ~project_type,
  values = ~frekuensi,
  type = 'pie',
  hole = 0.4,
  textposition = 'inside',
  textinfo = 'label+percent',
  hovertemplate = "<b>%{label}</b><br>Frekuensi: %{value}<br>Persentase: %{percent}<extra></extra>",
  marker = list(line = list(color = '#FFFFFF', width = 2)),
  domain = list(
    x = c(0, 1),      # full horizontal
    y = c(0.05, 0.90) # turunkan chart sedikit (awal 0.0, akhir 1.0)
  )
)

# =====================================================
# 3. LAYOUT
# =====================================================
fig <- fig %>%
  layout(
    title = list(
      text = "Distribusi Project Type (Donut Chart)",
      x = 0.5,
      y = 0.95
    ),
    showlegend = F
  )

fig

Bar Chart

library(dplyr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# =====================================================
# 1. HITUNG FREKUENSI + PERSENTASE KUMULATIF
# =====================================================
pareto_project <- land %>%
  count(project_type, name = "frekuensi") %>%
  arrange(desc(frekuensi)) %>%
  mutate(
    cum_freq = cumsum(frekuensi),
    cum_pct  = cum_freq / sum(frekuensi)
  )

# =====================================================
# 2. PARETO CHART
# =====================================================
ggplot(pareto_project,
       aes(x = reorder(project_type, -frekuensi))) +

  # ---- BAR (FREKUENSI)
  geom_col(
    aes(y = frekuensi, fill = project_type),
    width = 0.8,
    alpha = 0.9
  ) +

  # ---- LABEL FREKUENSI
  geom_text(
    aes(y = frekuensi, label = frekuensi),
    vjust = -0.4,
    size = 3,
    color = "black"
  ) +

  # ---- GARIS KUMULATIF (%)
  geom_line(
    aes(
      y = cum_pct * max(frekuensi),
      group = 1
    ),
    color = "#D62828",
    linewidth = 1.2
  ) +

  geom_point(
    aes(y = cum_pct * max(frekuensi)),
    color = "#D62828",
    size = 2.5
  ) +

  # ---- LABEL % KUMULATIF
  geom_text(
    aes(
      y = cum_pct * max(frekuensi),
      label = percent(cum_pct, accuracy = 1)
    ),
    vjust = -0.8,
    color = "#D62828",
    size = 3
  ) +

  # =====================================================
  # 3. AXIS & TEMA
  # =====================================================
  scale_y_continuous(
    name = "Frekuensi",
    sec.axis = sec_axis(
      ~ . / max(pareto_project$frekuensi),
      labels = percent_format(accuracy = 1),
      name = "Persentase Kumulatif"
    )
  ) +

  labs(
    title = "Pareto Chart Project Type",
    x = "Project Type"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(
      angle = 45,
      hjust = 1
    ),
    plot.title = element_text(face = "bold")
  )

library(dplyr)
library(plotly)

# =====================================================
# 1. HITUNG FREKUENSI + KUMULATIF
# =====================================================
pareto_project <- land %>%
  count(project_type, name = "frekuensi") %>%
  arrange(desc(frekuensi)) %>%
  mutate(
    cum_freq = cumsum(frekuensi),
    cum_pct  = cum_freq / sum(frekuensi)
  )

max_freq <- max(pareto_project$frekuensi)

# =====================================================
# 2. PLOT PARETO
# =====================================================
fig <- plot_ly(pareto_project, x = ~project_type)

# ---- BAR (FREKUENSI)
fig <- fig %>%
  add_bars(
    y = ~frekuensi,
    color = ~project_type,
    text = ~frekuensi,
    textposition = "outside",
    hovertemplate =
      "<b>%{x}</b><br>" ~
      "Frekuensi: %{y}<extra></extra>",
    showlegend = FALSE
  )

# ---- GARIS PARETO (FIX WARNING)
fig <- fig %>%
  add_trace(
    type = "scatter",
    mode = "lines+markers",   # <<< PENTING
    y = ~cum_pct * max_freq,
    line = list(color = "#D62828", width = 3),
    marker = list(size = 7, color = "#D62828"),
    customdata = ~round(cum_pct * 100, 1),
    hovertemplate =
      "<b>%{x}</b><br>" ~
      "Kumulatif: %{customdata}%<extra></extra>",
    name = "Persentase Kumulatif"
  )

# =====================================================
# 3. LAYOUT
# =====================================================
fig <- fig %>%
  layout(
    title = list(
      text = "Pareto Chart Project Type<br>
              <sup>Frekuensi & Persentase Kumulatif</sup>",
      x = 0.05
    ),
    xaxis = list(
      title = "Project Type",
      categoryorder = "array",
      categoryarray = pareto_project$project_type
    ),
    yaxis = list(
      title = "Frekuensi",
      rangemode = "tozero"
    ),
    margin = list(t = 90)
  )

fig

Analisis Bivariat

Line Plot (Numerik vs Numerik)

library(tidyverse)
library(lubridate)
library(plotly)

land$date <- as.Date(land$date)

# GANTI ini kalau nama kolom revenue kamu beda
revenue_col <- "revenue"

# pastikan kolom revenue numeric
land[[revenue_col]] <- as.numeric(land[[revenue_col]])

data_rev_yearly <- land %>%
  mutate(year = year(date)) %>%
  group_by(year, region) %>%
  summarise(
    revenue = sum(.data[[revenue_col]], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(region, year)

p <- plot_ly()

p <- p %>%
  add_trace(
    data = data_rev_yearly,
    x = ~year,
    y = ~revenue,
    color = ~region,
    type = "scatter",
    mode = "lines+markers",
    name = ~region,
    line = list(shape = "spline", smoothing = 1.2, width = 3),
    marker = list(size = 6, opacity = 0.7),
    opacity = 0.8
  ) %>%
  layout(
    title = list(text = "<b>Annual Revenue per Region</b>", x = 0.5),
    xaxis = list(title = "", dtick = 1, tickangle = -45),
    yaxis = list(title = "Revenue", rangemode = "tozero"),
    legend = list(orientation = "h", x = 0.5, xanchor = "center", y = -0.15),
    margin = list(b = 60)
  )

p

Scatter Plot (Numerik vs Numerik)

# ==============================
# LIBRARY
# ==============================
library(tidyverse)
library(plotly)
library(scales)

# ==============================
# DATA PREPARATION
# ==============================
scatter_sales <- land %>%
  select(units_sold, profit) %>%
  drop_na()

# ==============================
# LINEAR MODEL
# ==============================
lm_model <- lm(profit ~ units_sold, data = scatter_sales)
lm_sum   <- summary(lm_model)

# Koefisien
intercept <- lm_sum$coefficients[1, 1]
slope     <- lm_sum$coefficients[2, 1]
p_value   <- lm_sum$coefficients[2, 4]
r_squared <- lm_sum$r.squared

# Format teks model
eq_text <- paste0(
  "Profit = ",
  formatC(intercept, format = "f", digits = 2),
  " + ",
  formatC(slope, format = "f", digits = 2),
  " × Units Sold"
)

stat_text <- paste0(
  "R² = ", round(r_squared, 3),
  "<br>p-value = ", formatC(p_value, format = "e", digits = 2)
)

# Prediksi garis regresi
scatter_sales <- scatter_sales %>%
  mutate(profit_pred = predict(lm_model))

# ==============================
# SCATTER PLOT + REGRESSION LINE
# ==============================
p <- plot_ly() %>%
  add_markers(
    data = scatter_sales,
    x = ~units_sold,
    y = ~profit,
    marker = list(size = 9, opacity = 0.7),
    hovertemplate = paste(
      "Units Sold: %{x:,}<br>",
      "Profit: %{y:,.2f}<br>",
      "<extra></extra>"
    ),
    name = "Observed"
  ) %>%
  add_lines(
    data = scatter_sales,
    x = ~units_sold,
    y = ~profit_pred,
    line = list(width = 2),
    name = "Linear Fit"
  ) %>%
  layout(
    title = list(
      text = "<b>Units Sold vs Profit with Linear Regression</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Units Sold",
      zeroline = TRUE
    ),
    yaxis = list(
      title = "Profit",
      tickformat = ",.2f",
      zeroline = TRUE
    ),
    annotations = list(
      list(
        x = 0.02,
        y = 0.98,
        xref = "paper",
        yref = "paper",
        text = paste0(
          "<b>Linear Model</b><br>",
          eq_text, "<br>",
          stat_text
        ),
        showarrow = FALSE,
        align = "left",
        bgcolor = "rgba(255,255,255,0.85)",
        bordercolor = "black",
        borderwidth = 1
      )
    ),
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.25
    )
  )
p

Boxplot (Numerik vs Kategorik)

library(plotly)

fig <- plot_ly(
  data = land,
  x = ~project_type,
  y = ~interest_rate,
  type = "box",
  boxpoints = "outliers",
  fillcolor = "#8ECAE6",
  opacity = 0.8,
  line = list(color = "black"),
  marker = list(
    color = "#D62828",   # warna outlier
    size = 6
  )
) %>%
  layout(
    title = list(
      text = "<b>Interest Rate per Jenis Proyek</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Jenis Proyek"
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      ticksuffix = "%"
    ),
    template = "simple_white"
  )

fig

Violin Plot (Numerik vs Kategorik)

library(plotly)

fig <- plot_ly(
  data = land,
  x = ~project_type,
  y = ~interest_rate,
  type = "violin",
  fillcolor = "#8ECAE6",
  opacity = 0.8,
  line = list(color = "black"),
  box = list(visible = TRUE),      # boxplot di dalam violin
  meanline = list(visible = TRUE), # garis mean
  points = "outliers",             # tampilkan outlier
  marker = list(
    color = "#D62828",
    size = 6
  )
) %>%
  layout(
    title = list(
      text = "<b>Interest Rate per Jenis Proyek</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Jenis Proyek"
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      ticksuffix = "%"
    ),
    template = "simple_white"
  )

fig

Bar Chart (Kategorik vs Kategorik)

library(dplyr)
library(plotly)

# Hitung units sold per project type & sales channel
bar_data <- land %>%
  group_by(project_type, sales_channel) %>%
  summarise(
    units_sold = sum(units_sold, na.rm = TRUE),
    .groups = "drop"
  )

# Urutkan project_type dari total units sold tertinggi
order_project <- bar_data %>%
  group_by(project_type) %>%
  summarise(total_units = sum(units_sold), .groups = "drop") %>%
  arrange(desc(total_units))

bar_data <- bar_data %>%
  mutate(
    project_type = factor(
      project_type,
      levels = order_project$project_type
    )
  )

fig <- plot_ly(
  data = bar_data,
  x = ~project_type,
  y = ~units_sold,
  color = ~sales_channel,
  type = "bar"
) %>%
  layout(
    title = list(
      text = "<b>Units Sold berdasarkan Project Type<br><sup>Dilapisi Sales Channel</sup></b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Project Type",
      categoryorder = "array"
    ),
    yaxis = list(
      title = "Units Sold"
    ),
    barmode = "stack",
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.3
    ),
    template = "simple_white"
  )

Analisis Multivariat

Heatmap Korelasi

library(tidyverse)

# ======================
# 1. Ambil semua kolom numerik
# ======================
land_num <- land %>% select(where(is.numeric))

# ======================
# 2. Pilih kolom finansial
# ======================
keywords <- c("price", "capital", "sale", "profit", "margin", "cost", "total")
finance_vars <- land_num %>%
  select(any_of(names(land_num)[str_detect(names(land_num), paste(keywords, collapse = "|"))]))



# ======================
# 3. Hitung korelasi
# ======================
corr_mat <- round(cor(finance_vars, use = "complete.obs"), 2)

# ======================
# 4. Ubah ke long format dan ambil upper-right triangle
# ======================
corr_long <- as.data.frame(as.table(corr_mat))
colnames(corr_long) <- c("Var1", "Var2", "Correlation")

# Buat faktor untuk menentukan urutan kolom
levels_order <- colnames(finance_vars)
corr_long <- corr_long %>%
  mutate(
    Var1 = factor(Var1, levels = levels_order),
    Var2 = factor(Var2, levels = levels_order)
  ) %>%
  filter(as.numeric(Var1) < as.numeric(Var2))  # hanya atas kanan

# ======================
# 5. Plot heatmap
# ======================
ggplot(corr_long, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#B2182B", mid = "white", high = "#2166AC", midpoint = 0) +
  geom_text(aes(label = Correlation), size = 3) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title = element_blank()
  ) +
  labs(title = "Upper-Right Triangle Correlation - Variabel Finansial")

Cluster Plot

library(tidyverse)
library(plotly)

# ======================
# 1. Ambil kolom numerik
# ======================
land_num <- land %>% select(where(is.numeric))

# ======================
# 2. Pilih variabel finansial
# ======================
keywords <- c("price", "capital", "sale", "profit", "margin", "cost", "total")
finance_vars <- land_num %>%
  select(any_of(names(land_num)[str_detect(names(land_num), paste(keywords, collapse = "|"))]))

if (ncol(finance_vars) == 0) {
  message("⚠️ Tidak ada variabel numerik finansial ditemukan. Cluster plot tidak dapat dibuat.")
} else {
  
  # ======================
  # 3. Standarisasi
  # ======================
  finance_scaled <- scale(finance_vars)
  
  # ======================
  # 4. K-means clustering
  # ======================
  set.seed(123)
  k <- 3
  km_res <- kmeans(finance_scaled, centers = k, nstart = 25)
  
  # ======================
  # 5. PCA untuk 2D visualisasi
  # ======================
  pca_res <- prcomp(finance_scaled)
  pca_plot <- data.frame(
    PC1 = pca_res$x[,1],
    PC2 = pca_res$x[,2],
    Cluster = factor(km_res$cluster)
  )
  
  # ======================
  # 6. Tambahkan hover data asli
  # ======================
  hover_text <- apply(finance_vars, 1, function(row){
    paste(paste(names(finance_vars), ": ", round(row, 2)), collapse = "<br>")
  })
  
  pca_plot$hover_text <- paste("Cluster: ", pca_plot$Cluster, "<br>", hover_text)
  
  # ======================
  # 7. Plotly interactive cluster plot
  # ======================
  plot_ly(
    data = pca_plot,
    x = ~PC1,
    y = ~PC2,
    color = ~Cluster,
    colors = c("#1F77B4", "#FF7F0E", "#2CA02C"),
    type = "scatter",
    mode = "markers",
    text = ~hover_text,
    hoverinfo = "text"
  ) %>%
    layout(
      title = paste("Interactive Cluster Plot (k =", k, ") - Variabel Finansial"),
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2")
    )
}

Bubble Plot

library(tidyverse)
library(plotly)

# ======================
# 1. Pilih kolom yang dibutuhkan
# ======================
finance_vars <- land %>%
  select(date, year, month, revenue, construction_cost, profit)

# ======================
# 2. Group per Tahun & Bulan
# ======================
finance_agg <- finance_vars %>%
  group_by(year, month) %>%
  summarise(
    total_revenue = sum(revenue, na.rm = TRUE),
    total_cost = sum(construction_cost, na.rm = TRUE),
    total_profit = sum(profit, na.rm = TRUE),
    .groups = "drop"
  )

# ======================
# 3. Buat hover text
# ======================
finance_agg <- finance_agg %>%
  mutate(
    hover_text = paste(
      "Tahun: ", year,
      "<br>Bulan: ", month,
      "<br>Total Revenue: ", round(total_revenue, 2),
      "<br>Total Cost: ", round(total_cost, 2),
      "<br>Total Profit: ", round(total_profit, 2)
    )
  )

# ======================
# 4. Bubble Plot interaktif
# ======================
plot_ly(
  data = finance_agg,
  x = ~total_revenue,
  y = ~total_cost,
  size = ~total_profit,
  sizes = c(10, 50),
  type = "scatter",
  mode = "markers",
  marker = list(sizemode = "area", opacity = 0.7, color = ~year, colorscale = "Viridis"),
  text = ~hover_text,
  hoverinfo = "text"
) %>%
  layout(
    title = "Bubble Plot per Tahun & Bulan (Agregat Finansial)",
    xaxis = list(title = "Total Revenue"),
    yaxis = list(title = "Total Cost")
  )
## Warning: `line.width` does not currently support multiple values.

Statistik Inferensial Parametrik

Uji Z

library(tidyverse)

# ======================
# 1. Tentukan data profit
# ======================
# Misal kita ingin uji rata-rata profit tahun 2026
current_year <- 2026
profit_data <- land %>%
  filter(year == current_year) %>%
  pull(profit)

# ======================
# 2. Tentukan target (mu0) dari rata-rata historis
# ======================
# Misal rata-rata profit tahun sebelumnya (2025)
historical_year <- current_year - 1
mu0 <- land %>%
  filter(year == historical_year) %>%
  summarise(mean_profit = mean(profit, na.rm = TRUE)) %>%
  pull(mean_profit)

cat("Target (mu0) berdasarkan rata-rata historis tahun", historical_year, "=", round(mu0,2), "\n")
## Target (mu0) berdasarkan rata-rata historis tahun 2025 = 213560617698
## Target (mu0) berdasarkan rata-rata historis tahun 2025 = 213560617698
# ======================
# 3. Hitung Uji Z
# ======================
n <- length(profit_data)
x_bar <- mean(profit_data, na.rm = TRUE)
sigma <- sd(profit_data, na.rm = TRUE)  # jika populasi sigma tidak diketahui, pakai SD sampel
z_stat <- (x_bar - mu0) / (sigma / sqrt(n))

# ======================
# 4. Hitung p-value
# ======================
p_value <- 2 * (1 - pnorm(abs(z_stat)))

# ======================
# 5. Tampilkan hasil
# ======================
cat("Uji Z:\n")
## Uji Z:
## Uji Z:
cat("Z =", round(z_stat,3), "\n")
## Z = -0.635
## Z = -0.635
cat("p-value =", round(p_value,5), "\n")
## p-value = 0.52567
## p-value = 0.52567
# ======================
# 6. Interpretasi singkat
# ======================
if(p_value < 0.05){
  cat("Hasil: Rata-rata profit tahun", current_year, "berbeda signifikan dari rata-rata historis.\n")
} else {
  cat("Hasil: Rata-rata profit tahun", current_year, "tidak berbeda signifikan dari rata-rata historis.\n")
}
## Hasil: Rata-rata profit tahun 2026 tidak berbeda signifikan dari rata-rata historis.
## Hasil: Rata-rata profit tahun 2026 tidak berbeda signifikan dari rata-rata historis.

Uji t

library(tidyverse)

# ======================
# 1. Pilih dua project_type
# ======================
types <- unique(land$project_type)[1:2]  # ambil 2 project_type pertama
profit_group <- land %>%
  filter(project_type %in% types) %>%
  select(project_type, profit)

# ======================
# 2. Cek ringkasan data
# ======================
profit_group %>% group_by(project_type) %>% summarise(
  mean_profit = mean(profit, na.rm = TRUE),
  sd_profit = sd(profit, na.rm = TRUE),
  n = n()
)
# ======================
# 3. Uji t independen
# ======================
t_test <- t.test(profit ~ project_type, data = profit_group, var.equal = TRUE) # asumsi varian sama

# ======================
# 4. Tampilkan hasil
# ======================
t_test
## 
##  Two Sample t-test
## 
## data:  profit by project_type
## t = -3.9333, df = 725, p-value = 9.184e-05
## alternative hypothesis: true difference in means between group Commercial and group Residential is not equal to 0
## 95 percent confidence interval:
##  -52492156922 -17537708738
## sample estimates:
##  mean in group Commercial mean in group Residential 
##              196420443165              231435375995
## 
##  Two Sample t-test
## 
## data:  profit by project_type
## t = -3.9333, df = 725, p-value = 9.184e-05
## alternative hypothesis: true difference in means between group Commercial and group Residential is not equal to 0
## 95 percent confidence interval:
##  -52492156922 -17537708738
## sample estimates:
##  mean in group Commercial mean in group Residential 
##              196420443165              231435375995
# ======================
# 5. Interpretasi singkat
# ======================
if(t_test$p.value < 0.05){
  cat("Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.\n")
} else {
  cat("Hasil: Rata-rata profit antara kedua project_type tidak berbeda signifikan.\n")
}
## Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.
## Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.

ANOVA

library(tidyverse)

# ======================
# 1. Cek jumlah grup
# ======================
unique(land$region)  # pastikan ada lebih dari 2 region
## [1] "Jabodetabek" "Jawa"        "Kalimantan"  "Sulawesi"    "Sumatra"
## [1] "Jabodetabek" "Jawa"        "Kalimantan"  "Sulawesi"    "Sumatra"
# ======================
# 2. Ringkasan profit per region
# ======================
land %>%
  group_by(region) %>%
  summarise(
    mean_profit = mean(profit, na.rm = TRUE),
    sd_profit = sd(profit, na.rm = TRUE),
    n = n()
  )
# ======================
# 3. ANOVA satu arah
# ======================
anova_model <- aov(profit ~ region, data = land)

# ======================
# 4. Lihat hasil ANOVA
# ======================
summary(anova_model)
##              Df    Sum Sq   Mean Sq F value   Pr(>F)    
## region        4 5.411e+23 1.353e+23   11.58 3.44e-09 ***
## Residuals   960 1.121e+25 1.168e+22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##              Df    Sum Sq   Mean Sq F value   Pr(>F)    
## region        4 5.411e+23 1.353e+23   11.58 3.44e-09 ***
## Residuals   960 1.121e+25 1.168e+22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ======================
# 5. Jika signifikan, lanjut post-hoc Tukey HSD
# ======================
if(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05){
  tukey_result <- TukeyHSD(anova_model)
  print(tukey_result)
  cat("Hasil: Ada perbedaan profit signifikan antar region.\n")
} else {
  cat("Hasil: Tidak ada perbedaan profit signifikan antar region.\n")
}
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = profit ~ region, data = land)
## 
## $region
##                                diff          lwr          upr     p adj
## Jawa-Jabodetabek       -16042509464 -46106271328  14021252400 0.5899057
## Kalimantan-Jabodetabek -58645811782 -88709573645 -28582049918 0.0000012
## Sulawesi-Jabodetabek   -56705333130 -86769094994 -26641571267 0.0000031
## Sumatra-Jabodetabek    -48954264174 -79018026037 -18890502310 0.0000936
## Kalimantan-Jawa        -42603302318 -72667064181 -12539540454 0.0010850
## Sulawesi-Jawa          -40662823666 -70726585530 -10599061803 0.0021477
## Sumatra-Jawa           -32911754710 -62975516573  -2847992846 0.0237312
## Sulawesi-Kalimantan      1940478651 -28123283212  32004240515 0.9997833
## Sumatra-Kalimantan       9691547608 -20372214256  39755309472 0.9039191
## Sumatra-Sulawesi         7751068957 -22312692907  37814830820 0.9554618
## 
## Hasil: Ada perbedaan profit signifikan antar region.
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = profit ~ region, data = land)
## 
## $region
##                                diff          lwr          upr     p adj
## Jawa-Jabodetabek       -16042509464 -46106271328  14021252400 0.5899057
## Kalimantan-Jabodetabek -58645811782 -88709573645 -28582049918 0.0000012
## Sulawesi-Jabodetabek   -56705333130 -86769094994 -26641571267 0.0000031
## Sumatra-Jabodetabek    -48954264174 -79018026037 -18890502310 0.0000936
## Kalimantan-Jawa        -42603302318 -72667064181 -12539540454 0.0010850
## Sulawesi-Jawa          -40662823666 -70726585530 -10599061803 0.0021477
## Sumatra-Jawa           -32911754710 -62975516573  -2847992846 0.0237312
## Sulawesi-Kalimantan      1940478651 -28123283212  32004240515 0.9997833
## Sumatra-Kalimantan       9691547608 -20372214256  39755309472 0.9039191
## Sumatra-Sulawesi         7751068957 -22312692907  37814830820 0.9554618
## 
## Hasil: Ada perbedaan profit signifikan antar region.

Statistik Inferensial Non-Parametrik

Mann–Whitney U Test

library(tidyverse)

# ======================
# 1. Pilih dua project_type
# ======================
types <- unique(land$project_type)[1:2]  # ambil dua project_type pertama
profit_group <- land %>%
  filter(project_type %in% types) %>%
  select(project_type, profit)

# ======================
# 2. Ringkasan data
# ======================
profit_group %>%
  group_by(project_type) %>%
  summarise(
    mean_profit = mean(profit, na.rm = TRUE),
    median_profit = median(profit, na.rm = TRUE),
    sd_profit = sd(profit, na.rm = TRUE),
    n = n()
  )
# ======================
# 3. Mann–Whitney U Test
# ======================
# di R: wilcox.test untuk Mann-Whitney
mw_test <- wilcox.test(profit ~ project_type, data = profit_group, exact = FALSE)

mw_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  profit by project_type
## W = 47590, p-value = 0.0001139
## alternative hypothesis: true location shift is not equal to 0
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  profit by project_type
## W = 47590, p-value = 0.0001139
## alternative hypothesis: true location shift is not equal to 0
# ======================
# 4. Interpretasi
# ======================
if(mw_test$p.value < 0.05){
  cat("Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).\n")
} else {
  cat("Hasil: Profit antara kedua project_type tidak berbeda signifikan.\n")
}
## Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).
## Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).

Kruskal–Wallis Test

library(tidyverse)
if(!require(PMCMRplus)) install.packages("PMCMRplus")
## Loading required package: PMCMRplus
## Loading required package: PMCMRplus
library(PMCMRplus)

# ======================
# 1️⃣ Bersihkan data
# ======================
land_clean <- land %>%
  filter(!is.na(profit), is.finite(profit), !is.na(region)) %>%
  mutate(region = factor(region))  # pastikan region = faktor

# ======================
# 2️⃣ Hapus grup dengan <2 observasi
# ======================
valid_groups <- land_clean %>%
  group_by(region) %>%
  summarise(n = n()) %>%
  filter(n >= 2) %>%
  pull(region)

land_clean2 <- land_clean %>%
  filter(region %in% valid_groups)

# ======================
# 3️⃣ Pastikan data valid
# ======================
if(nrow(land_clean2) == 0 | length(levels(land_clean2$region)) < 2){
  stop("⚠️ Tidak cukup data untuk menjalankan Kruskal-Wallis Test.")
}

# ======================
# 4️⃣ Kruskal-Wallis Test
# ======================
kruskal_test <- kruskal.test(profit ~ region, data = land_clean2)
print(kruskal_test)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  profit by region
## Kruskal-Wallis chi-squared = 41.678, df = 4, p-value = 1.945e-08
## 
##  Kruskal-Wallis rank sum test
## 
## data:  profit by region
## Kruskal-Wallis chi-squared = 41.678, df = 4, p-value = 1.945e-08
# ======================
# 5️⃣ Post-hoc Nemenyi jika signifikan
# ======================
if(kruskal_test$p.value < 0.05){
  posthoc <- kwAllPairsNemenyiTest(profit ~ region, data = land_clean2)
  print(posthoc)
}
## Warning in kwAllPairsNemenyiTest.default(c(1.5212e+11, 4.17566e+11,
## 2.20799e+11, : Ties are present, p-values are not corrected.
## 
##  Pairwise comparisons using Tukey-Kramer-Nemenyi all-pairs test with Tukey-Dist approximation
## data: profit by region
##            Jabodetabek Jawa    Kalimantan Sulawesi
## Jawa       0.80438     -       -          -       
## Kalimantan 6.8e-06     0.00108 -          -       
## Sulawesi   2.3e-05     0.00276 0.99924    -       
## Sumatra    0.00027     0.01716 0.93911    0.98451
## 
## P value adjustment method: single-step
## alternative hypothesis: two.sided
## Warning in kwAllPairsNemenyiTest.default(c(1.5212e+11, 4.17566e+11,
## 2.20799e+11, : Ties are present, p-values are not corrected.
## 
##  Pairwise comparisons using Tukey-Kramer-Nemenyi all-pairs test with Tukey-Dist approximation
## data: profit by region
##            Jabodetabek Jawa    Kalimantan Sulawesi
## Jawa       0.80438     -       -          -       
## Kalimantan 6.8e-06     0.00108 -          -       
## Sulawesi   2.3e-05     0.00276 0.99924    -       
## Sumatra    0.00027     0.01716 0.93911    0.98451
## 
## P value adjustment method: single-step
## alternative hypothesis: two.sided
# ======================
# 6️⃣ Interpretasi & Kesimpulan
# ======================
cat("\nKesimpulan:\n")
## 
## Kesimpulan:
## 
## Kesimpulan:
if(kruskal_test$p.value < 0.05){
  cat("Profit berbeda signifikan antar region (p =", round(kruskal_test$p.value,5), ").\n")
  cat("Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.\n")
} else {
  cat("Profit tidak berbeda signifikan antar region (p =", round(kruskal_test$p.value,5), ").\n")
}
## Profit berbeda signifikan antar region (p = 0 ).
## Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.
## Profit berbeda signifikan antar region (p = 0 ).
## Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.

Chi-Square Test

library(tidyverse)

# ======================
# 1️⃣ Buat tabel kontingensi
# ======================
contingency_table <- table(land$project_type, land$region)
contingency_table
##              
##               Jabodetabek Jawa Kalimantan Sulawesi Sumatra
##   Commercial           47   50         32       54      52
##   Industrial           16   19         18       22      22
##   MixedUse             34   26         35       19      27
##   Residential          96   98        108       98      92
##              
##               Jabodetabek Jawa Kalimantan Sulawesi Sumatra
##   Commercial           47   50         32       54      52
##   Industrial           16   19         18       22      22
##   MixedUse             34   26         35       19      27
##   Residential          96   98        108       98      92
# ======================
# 2️⃣ Chi-Square Test
# ======================
chi_test <- chisq.test(contingency_table)
chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 15.427, df = 12, p-value = 0.2189
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 15.427, df = 12, p-value = 0.2189
# ======================
# 3️⃣ Interpretasi & Kesimpulan
# ======================
cat("\nKesimpulan:\n")
## 
## Kesimpulan:
## 
## Kesimpulan:
if(chi_test$p.value < 0.05){
  cat("Ada hubungan signifikan antara project_type dan region (p =", round(chi_test$p.value,5), ").\n")
} else {
  cat("Tidak ada hubungan signifikan antara project_type dan region (p =", round(chi_test$p.value,5), ").\n")
}
## Tidak ada hubungan signifikan antara project_type dan region (p = 0.21893 ).
## Tidak ada hubungan signifikan antara project_type dan region (p = 0.21893 ).

Optimisasi

# ======================
# Optimisasi Portofolio Proyek
# ======================

library(tidyverse)
library(lpSolve)  # package untuk linear programming

# ======================
# 1️⃣ Formulasi Masalah
# ======================

## Objective Function
# Maksimalkan total profit proyek
# f(x) = Σ (profit_i * x_i), i = 1..n
# x_i = 1 jika proyek dipilih, 0 jika tidak

## Decision Variables
# x_i ∈ {0,1}  → pilih/tidak pilih proyek i

## Constraints
# 1. Total construction_cost tidak boleh melebihi budget
budget <- 1e12  # contoh: 1 triliun

# 2. ESG minimal rata-rata
min_esg <- 70  # ESG score minimal

# ======================
# 2️⃣ Siapkan data
# ======================
proj_data <- land %>%
  select(row_id, profit, construction_cost, esg_score) %>%
  filter(!is.na(profit), !is.na(construction_cost), !is.na(esg_score))

n_proj <- nrow(proj_data)

# Koefisien objective: profit
f.obj <- proj_data$profit

# Matriks constraints
# 1) Budget constraint
f.con <- matrix(proj_data$construction_cost, nrow = 1)

# 2) ESG minimal (average ESG ≥ min_esg)
# Σ (esg_score * x_i) / Σ x_i ≥ min_esg  → LP linear approximation:
# Σ ((esg_score - min_esg) * x_i) ≥ 0
f.con <- rbind(f.con, proj_data$esg_score - min_esg)

# Direction of constraints
f.dir <- c("<=", ">=")

# Right-hand side
f.rhs <- c(budget, 0)

# Decision variable type: binary
binary.vec <- rep(1, n_proj)

# ======================
# 3️⃣ Jalankan Optimisasi
# ======================
lp_result <- lp(
  direction = "max",
  objective.in = f.obj,
  const.mat = f.con,
  const.dir = f.dir,
  const.rhs = f.rhs,
  all.bin = TRUE
)

# ======================
# 4️⃣ Hasil Optimisasi
# ======================
selected_proj <- proj_data %>%
  mutate(selected = lp_result$solution) %>%
  filter(selected == 1)

selected_proj %>%
  summarise(
    total_profit = sum(profit),
    total_cost = sum(construction_cost),
    avg_esg = mean(esg_score)
  )
# Tampilkan proyek yang dipilih
selected_proj
# ======================
# 5️⃣ Analisis Risiko dan ESG
# ======================
risk_analysis <- selected_proj %>%
  summarise(
    avg_profit_margin = mean(profit / construction_cost * 100, na.rm = TRUE),
    max_cost = max(construction_cost),
    min_esg = min(esg_score),
    avg_esg = mean(esg_score)
  )

risk_analysis