Set-up - Menyiapkan tools untuk import, cleaning, ringkasan, missing, dan imputasi.

# =====================================================
# Instalasi paket (jalankan sekali saja jika belum terpasang)
# =====================================================
# install.packages(c(
#   "tidyverse",   # Manipulasi & visualisasi data
#   "janitor",     # Pembersihan nama variabel & data
#   "skimr",       # Ringkasan eksploratif dataset
#   "naniar",      # Analisis nilai hilang (missing values)
#   "lubridate",   # Manipulasi data tanggal & waktu
#   "stringr"      # Pengolahan string/teks
# ))

# =====================================================
# Memuat library yang dibutuhkan
# =====================================================
library(tidyverse)   # Data wrangling dan visualisasi (dplyr, ggplot2, dll.)
library(janitor)     # Membersihkan nama kolom dan struktur data
library(skimr)       # Menyediakan ringkasan statistik cepat (EDA)
library(naniar)      # Visualisasi dan analisis missing values
library(lubridate)   # Pengolahan dan ekstraksi komponen tanggal
library(stringr)     # Operasi string yang konsisten dan efisien

Import Data - memastikan data terbaca benar (angka tidak jadi teks karena format Indonesia).

# Load library yang diperlukan
library(tidyverse)  # untuk read_csv() dan manipulasi data

# URL dataset
land_path <- "https://raw.githubusercontent.com/dsciencelabs/dataset/refs/heads/master/sinarmas_land.csv"

land_raw <- read.csv(
  land_path,                 # Path / lokasi file data lahan
  sep = ";",                 # Pemisah kolom
  dec = ",",                 # Format desimal
  stringsAsFactors = FALSE   # Karakter tetap sebagai string
)

head(land_raw)

Rapikan Nama Kolom - nama kolom konsisten (snake_case), memudahkan coding.

land <- land_raw %>% clean_names()

names(proj)

## NULL

## NULL

Rename Kolom - Mengubah nama kolom yang dianggap perlu untuk diubah.

library(dplyr)

land <- land %>%
  rename(
    date   = tanggal,
    region = lokasi,
    project_type = project_pype
  )

names(land)

##  [1] "date"                          "region"                       
##  [3] "project_type"                  "sales_channel"                
##  [5] "interest_rate"                 "consumer_conf"                
##  [7] "marketing_spend"               "construction_progress"        
##  [9] "material_index"                "unit_price_sqm"               
## [11] "cost_sqm"                      "avg_unit_size_sqm"            
## [13] "units_sold"                    "revenue"                      
## [15] "construction_cost"             "profit"                       
## [17] "profit_margin"                 "water_use_m3"                 
## [19] "energy_kwh"                    "co2_ton"                      
## [21] "sediment_tss_mg_l"             "esg_score"                    
## [23] "esg_flag"                      "schedule_delay_days"          
## [25] "cost_overrun_pct"              "at_risk"                      
## [27] "kpi_target"                    "sigma_assumed"                
## [29] "revenue_billion_idr"           "construction_cost_billion_idr"
## [31] "profit_billion_idr"            "business_unit_id"             
## [33] "time_id"                       "row_id"

Hapus Duplikasi Baris - mencegah data “terhitung dua kali”.

land <- land %>% distinct()

# Check hasil
head(land)

Check Data Hilang

library(dplyr)

check_na_overview <- function(df) {
  tibble(
    column = names(df),
    na_count = sapply(df, function(x) sum(is.na(x))),
    na_pct = round(sapply(df, function(x) mean(is.na(x))) * 100, 2)
  ) %>%
    arrange(desc(na_pct))
}

na_land <- check_na_overview(land)

na_land

Imputasi Data Hilang

library(dplyr)
library(tidyr)
library(stringr)

fix_missing <- function(df, cat_fill = "unknown", num_fill = 0) {
  df %>%
    # pastikan factor jadi character supaya aman diproses
    mutate(across(where(is.factor), as.character)) %>%
    # rapikan text
    mutate(across(where(is.character), ~ str_squish(.x))) %>%
    # isi missing kategori
    mutate(across(where(is.character), ~ replace_na(.x, cat_fill))) %>%
    # isi missing numerik
    mutate(across(where(is.numeric), ~ replace_na(.x, num_fill)))
}

land <- fix_missing(land)

Parse Tanggal - (jika ada kolom date): bisa digunakan untuk analisis time series (bulanan, harian, mingguan, dll).

land <- land %>%
  mutate(
    date = as.Date(date, format = "%d/%m/%y"),
    year  = lubridate::year(date),
    month = lubridate::month(date)
  )

head(land)

Analisis Univariat

Histogram

library(ggplot2)

ggplot(land, aes(x = interest_rate)) +
  
  # Histogram (pakai density, bukan frekuensi)
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20,
    na.rm = TRUE,
    fill = "#8ECAE6",
    color = "white",
    alpha = 0.8
  ) +
  
  # Density curve
  geom_density(
    na.rm = TRUE,
    color = "#023047",
    linewidth = 1.2
  ) +
  
  # Garis median
  geom_vline(
    aes(xintercept = median(interest_rate, na.rm = TRUE)),
    color = "#D62828",
    linetype = "dashed",
    linewidth = 1
  ) +
  
  labs(
    title = "Distribusi Interest Rate Proyek",
    subtitle = "Histogram (Bar) + Density",
    x = "Interest Rate (%)",
    y = "Density",
    caption = "Garis putus-putus merah menunjukkan nilai median"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(color = "grey40"),
    plot.caption = element_text(size = 9, color = "grey40")
  )

Density Plot

library(ggplot2)

ggplot(land, aes(x = interest_rate)) +
  geom_density(
    na.rm = TRUE,
    fill = "#8ECAE6",
    color = "#023047",
    alpha = 0.6,
    linewidth = 1
  ) +
  geom_vline(
    aes(xintercept = median(interest_rate, na.rm = TRUE)),
    color = "#D62828",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Density Plot Interest Rate Proyek",
    subtitle = "Distribusi Tidak Normal (Skewed & Bounded)",
    x = "Interest Rate (%)",
    y = "Density",
    caption = "Garis putus-putus merah menunjukkan nilai median"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(color = "grey40"),
    plot.caption = element_text(size = 9, color = "grey40")
  )

# install.packages("plotly") # jika belum terpasang
library(plotly)

# Ambil data & buang NA
x <- land$interest_rate
x <- x[!is.na(x)]

# Hitung density & median
d <- density(x)
med <- median(x)

# Plotly density plot
p <- plot_ly() %>%
  add_lines(
    x = d$x,
    y = d$y,
    line = list(color = "#023047", width = 3),
    fill = "tozeroy",
    fillcolor = "rgba(142,202,230,0.5)",
    name = "Density"
  ) %>%
  add_segments(
    x = med, xend = med,
    y = 0, yend = max(d$y),
    line = list(color = "#D62828", width = 2, dash = "dash"),
    name = "Median"
  ) %>%
  layout(
    title = list(
      text = "Density Plot Interest Rate Proyek<br><sup>Distribusi Tidak Normal (Skewed & Bounded)</sup>"
    ),
    xaxis = list(title = "Interest Rate (%)"),
    yaxis = list(title = "Density"),
    showlegend = TRUE
  )

p

Bar+Density Plot

library(plotly)

# Ambil data & buang NA
x <- land$interest_rate
x <- x[!is.na(x)]

# Hitung density & median
d <- density(x)
med <- median(x)

p <- plot_ly() %>%
  add_histogram(
    x = x,
    nbinsx = 20,
    histnorm = "probability density",
    marker = list(
      color = "rgba(142,202,230,0.7)",
      line = list(color = "white", width = 1)
    ),
    name = "Histogram"
  ) %>%
  add_lines(
    x = d$x,
    y = d$y,
    line = list(color = "#023047", width = 3),
    name = "Density"
  ) %>%
  add_segments(
    x = med, xend = med,
    y = 0, yend = max(d$y),
    line = list(color = "#D62828", width = 2, dash = "dash"),
    name = "Median"
  ) %>%
  layout(
    title = list(
      text = "Distribusi Interest Rate Proyek<br><sup>Histogram (Bar) + Density</sup>"
    ),
    xaxis = list(title = "Interest Rate (%)"),
    yaxis = list(title = "Density"),
    bargap = 0.05,
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.2,
      yanchor = "top"
    )
  )

p

Boxplot

library(ggplot2)
library(ggrepel)

# Statistik utama
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), TRUE)[1]))
  )
))

ggplot(land, aes(x = "", y = interest_rate)) +
  geom_boxplot(fill = "#8ECAE6", alpha = 0.8) +

  # Titik statistik
  geom_point(data = s, aes(x = 1, y = value), color = "#D62828", size = 3) +

  # Label + panah otomatis (ANTI OVERLAP)
  geom_text_repel(
    data = s,
    aes(x = 1, y = value, label = paste0(label, ": ", round(value, 2))),
    nudge_x = 0.25,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Boxplot Interest Rate Proyek",
    subtitle = "Statistik Utama Ditampilkan Tanpa Overlap",
    y = "Interest Rate (%)",
    x = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Offset Y untuk label (ANTI OVERLAP)
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25


# Geser posisi X (HANYA INI YANG BERUBAH)
x_box   <- 1
x_arrow <- 1.30     # ujung panah
x_text  <- 1.48     # >>> teks lebih ke kanan

fig <- plot_ly() %>%

  # Boxplot + OUTLIER
  add_boxplot(
    x = rep(x_box, length(land$interest_rate)),
    y = land$interest_rate,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.8,
    width = 0.4,
    boxpoints = "outliers",          # <<< OUTLIER DITAMPILKAN
    jitter = 0,
    pointpos = 0,
    marker = list(
      color = "#FB8500",             # warna outlier (beda dari statistik)
      size = 7
    ),
    showlegend = FALSE
  ) %>%

  # Titik statistik (Mean, Median, dst)
  add_markers(
    x = rep(x_box, nrow(s)),
    y = s$value,
    marker = list(color = "#D62828", size = 12),
    showlegend = FALSE
  ) %>%

  # Teks statistik
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  layout(
    title = list(
      text = "Boxplot Interest Rate Proyek<br>
              <sup>Dengan Outlier dan Statistik Utama</sup>",
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 1.9),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_box,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    annotations = lapply(
      1:nrow(s),
      function(i) {
        list(
          x = x_arrow,
          y = s$y_label[i],
          ax = x_box,
          ay = s$value[i],
          xref = "x",
          yref = "y",
          axref = "x",
          ayref = "y",
          showarrow = TRUE,
          arrowhead = 3,
          arrowwidth = 1,
          arrowcolor = "grey40",
          text = ""
        )
      }
    ),
    margin = list(
      l = 0,
      r = 0,
      t = 90,
      b = 10
    )
  )

fig

Violin Plot

library(ggplot2)
library(ggrepel)

# =====================================================
# Statistik utama
# =====================================================
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), decreasing = TRUE)[1]))
  )
))

# =====================================================
# Violin Plot murni + Statistik
# =====================================================
ggplot(land, aes(x = "", y = interest_rate)) +

  # Violin plot (tanpa boxplot)
  geom_violin(
    fill = "#8ECAE6",
    alpha = 0.8,
    trim = FALSE
  ) +

  # Titik statistik
  geom_point(
    data = s,
    aes(x = 1, y = value),
    color = "#D62828",
    size = 3
  ) +

  # Label statistik (anti overlap)
  geom_text_repel(
    data = s,
    aes(
      x = 1,
      y = value,
      label = paste0(label, ": ", round(value, 2))
    ),
    nudge_x = 0.28,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Violin Plot Interest Rate Proyek",
    subtitle = "Distribusi Interest Rate dengan Statistik Utama",
    y = "Interest Rate (%)",
    x = NULL
  ) +

  theme_minimal() +
  theme(
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  )

library(plotly)

# =====================================================
# 1. DATA
# =====================================================
y <- land$interest_rate
y <- y[!is.na(y)]

# =====================================================
# 2. DETEKSI OUTLIER (IQR METHOD)
# =====================================================
Q1  <- quantile(y, 0.25)
Q3  <- quantile(y, 0.75)
IQR <- Q3 - Q1

lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR

is_outlier <- y < lower | y > upper

# =====================================================
# 3. STATISTIK UTAMA
# =====================================================
s <- data.frame(
  label = c("Max", "Mean", "Median", "Mode", "Min"),
  value = c(
    max(y),
    mean(y),
    median(y),
    as.numeric(names(sort(table(y), decreasing = TRUE)[1])),
    min(y)
  )
)

# Offset Y (ANTI OVERLAP)
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25

# =====================================================
# 4. POSISI X
# =====================================================
x_violin <- 1
x_arrow  <- 1.30
x_text   <- 1.52

# =====================================================
# 5. VIOLIN PLOT
# =====================================================
fig <- plot_ly() %>%

  # ---- VIOLIN (DENSITY)
  add_trace(
    type = "violin",
    x = rep(x_violin, length(y)),
    y = y,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.85,
    width = 0.55,
    points = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- NON-OUTLIER (ABU-ABU PEKAT)
  add_markers(
    x = rep(x_violin, sum(!is_outlier)),
    y = y[!is_outlier],
    marker = list(
      color = "rgba(90,90,90,0.9)",
      size = 6
    ),
    showlegend = FALSE
  ) %>%

  # ---- OUTLIER (MERAH)
  add_markers(
    x = rep(x_violin, sum(is_outlier)),
    y = y[is_outlier],
    marker = list(
      color = "#D62828",
      size = 9
    ),
    showlegend = FALSE
  ) %>%

  # ---- TITIK STATISTIK
  add_markers(
    x = rep(x_violin, nrow(s)),
    y = s$value,
    marker = list(color = "#FB8500", size = 13),
    showlegend = FALSE
  ) %>%

  # ---- TEKS STATISTIK
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  # =====================================================
  # 6. LAYOUT + PANAH
  # =====================================================
  layout(
    title = list(
      text = "Violin Plot Interest Rate Proyek<br>
              <sup>Outlier (IQR), Distribusi, dan Statistik Utama</sup>",
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 1.95),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_violin,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    annotations = lapply(
      1:nrow(s),
      function(i) {
        list(
          x = x_arrow,
          y = s$y_label[i],
          ax = x_violin,
          ay = s$value[i],
          xref = "x",
          yref = "y",
          axref = "x",
          ayref = "y",
          showarrow = TRUE,
          arrowhead = 3,
          arrowwidth = 1,
          arrowcolor = "grey40",
          text = ""
        )
      }
    ),
    margin = list(
      l = 0,
      r = 0,
      t = 90,
      b = 10
    )
  )

fig

Boxplot+Violin Plot

library(ggplot2)
library(ggrepel)

# =====================================================
# Statistik utama
# =====================================================
s <- with(land, data.frame(
  label = c("Min", "Max", "Mean", "Median", "Mode"),
  value = c(
    min(interest_rate, na.rm = TRUE),
    max(interest_rate, na.rm = TRUE),
    mean(interest_rate, na.rm = TRUE),
    median(interest_rate, na.rm = TRUE),
    as.numeric(names(sort(table(interest_rate), decreasing = TRUE)[1]))
  )
))

# =====================================================
# Violin Plot + Statistik
# =====================================================
ggplot(land, aes(x = "", y = interest_rate)) +

  # Violin plot
  geom_violin(
    fill = "#8ECAE6",
    alpha = 0.8,
    trim = FALSE
  ) +

  # (Opsional tapi recommended) garis median di tengah violin
  geom_boxplot(
    width = 0.08,
    fill = "white",
    outlier.shape = NA,
    alpha = 0.7
  ) +

  # Titik statistik
  geom_point(
    data = s,
    aes(x = 1, y = value),
    color = "#D62828",
    size = 3
  ) +

  # Label statistik (anti overlap)
  geom_text_repel(
    data = s,
    aes(
      x = 1,
      y = value,
      label = paste0(label, ": ", round(value, 2))
    ),
    nudge_x = 0.28,
    direction = "y",
    segment.color = "grey40",
    segment.size = 0.4,
    size = 3,
    box.padding = 0.4,
    point.padding = 0.3
  ) +

  labs(
    title = "Violin Plot Interest Rate Proyek",
    subtitle = "Distribusi Data dengan Statistik Utama",
    y = "Interest Rate (%)",
    x = NULL
  ) +

  theme_minimal() +
  theme(
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  )

library(plotly)

# =====================================================
# 1. DATA (BERSIHKAN NA)
# =====================================================
# y = data numerik utama yang akan diplot
y <- land$interest_rate
y <- y[!is.na(y)]

# =====================================================
# 2. DETEKSI OUTLIER BERDASARKAN ATURAN IQR
# =====================================================
# CATATAN PENTING:
# - Outlier TIDAK sama dengan min / max
# - Outlier ditentukan oleh batas statistik (IQR),
#   bukan urutan nilai

Q1  <- quantile(y, 0.25)
Q3  <- quantile(y, 0.75)
IQR <- Q3 - Q1

# Batas whisker boxplot
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Klasifikasi data
is_outlier_low  <- y < lower_bound   # outlier ekstrem RENDAH
is_outlier_high <- y > upper_bound   # outlier ekstrem TINGGI
is_normal       <- !(is_outlier_low | is_outlier_high)

# =====================================================
# 3. STATISTIK UTAMA (DESKRIPTIF)
# =====================================================
# Statistik ini DESKRIPTIF,
# BUKAN penentu outlier

s <- data.frame(
  label = c("Max", "Mean", "Median", "Mode", "Min"),
  value = c(
    max(y),
    mean(y),
    median(y),
    as.numeric(names(sort(table(y), decreasing = TRUE)[1])),
    min(y)
  )
)

# Offset Y agar teks tidak overlap
s$y_label <- s$value
s$y_label[s$label == "Mean"]   <- s$value[s$label == "Mean"] + 0.25
s$y_label[s$label == "Median"] <- s$value[s$label == "Median"] - 0.25
s$y_label[s$label == "Mode"]   <- s$value[s$label == "Mode"] + 0.25
s$y_label[s$label == "Min"]    <- s$value[s$label == "Min"] - 0.25

# =====================================================
# 4. POSISI X (UNTUK OVERLAY & ANOTASI)
# =====================================================
x_main  <- 1       # posisi violin & box
x_arrow <- 1.30    # ujung panah
x_text  <- 1.55    # posisi teks (lebih ke kanan)

# =====================================================
# 5. PLOTLY: VIOLIN + BOX + DATA POINT
# =====================================================
fig <- plot_ly() %>%

  # ---- VIOLIN (SEBARAN DATA)
  add_trace(
    type = "violin",
    x = rep(x_main, length(y)),
    y = y,
    fillcolor = "#8ECAE6",
    line = list(color = "#2C3E50"),
    opacity = 0.45,
    width = 0.6,
    points = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- BOXPLOT (RINGKASAN STATISTIK)
  # Whisker berhenti di batas IQR (BUKAN min/max)
  add_boxplot(
    x = rep(x_main, length(y)),
    y = y,
    width = 0.22,
    fillcolor = "rgba(255,255,255,0.9)",
    line = list(color = "#2C3E50"),
    boxpoints = FALSE,
    showlegend = FALSE
  ) %>%

  # ---- DATA NORMAL (ABU-ABU)
  add_markers(
    x = rep(x_main, sum(is_normal)),
    y = y[is_normal],
    marker = list(
      color = "rgba(80,80,80,0.9)",
      size = 6
    ),
    name = "Normal (Dalam IQR)",
    showlegend = TRUE
  ) %>%

  # ---- OUTLIER EKSTREM RENDAH (BIRU)
  add_markers(
    x = rep(x_main, sum(is_outlier_low)),
    y = y[is_outlier_low],
    marker = list(
      color = "#1F77B4",
      size = 10,
      line = list(color = "black", width = 0.6)
    ),
    name = "Outlier Ekstrem Rendah",
    showlegend = TRUE
  ) %>%

  # ---- OUTLIER EKSTREM TINGGI (MERAH)
  add_markers(
    x = rep(x_main, sum(is_outlier_high)),
    y = y[is_outlier_high],
    marker = list(
      color = "#D62828",
      size = 10,
      line = list(color = "black", width = 0.6)
    ),
    name = "Outlier Ekstrem Tinggi",
    showlegend = TRUE
  ) %>%

  # ---- TITIK STATISTIK UTAMA
  add_markers(
    x = rep(x_main, nrow(s)),
    y = s$value,
    marker = list(color = "#FB8500", size = 13),
    showlegend = FALSE
  ) %>%

  # ---- TEKS STATISTIK
  add_text(
    x = rep(x_text, nrow(s)),
    y = s$y_label,
    text = paste0(s$label, ": ", round(s$value, 2)),
    textposition = "middle left",
    showlegend = FALSE
  ) %>%

  # =====================================================
  # 6. LAYOUT + PANAH PENJELAS
  # =====================================================
  layout(
    title = list(
      text = paste0(
        "Box + Violin Overlay Interest Rate Proyek<br>",
        "<sup>",
        "Catatan: Outlier ditentukan oleh IQR, bukan oleh Min/Max",
        "</sup>"
      ),
      x = 0.05
    ),
    xaxis = list(
      range = c(0.6, 2.0),
      showticklabels = FALSE,
      showgrid = FALSE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      zeroline = FALSE
    ),
    shapes = lapply(
      1:nrow(s),
      function(i) {
        list(
          type = "line",
          x0 = x_main,
          x1 = x_arrow,
          y0 = s$value[i],
          y1 = s$y_label[i],
          line = list(color = "grey40", width = 1)
        )
      }
    ),
    margin = list(l = 0, r = 0, t = 110, b = 10)
  )

fig

Pie Chart

library(dplyr)
library(plotly)

# =====================================================
# 1. HITUNG FREKUENSI DAN PERSENTASE
# =====================================================
donut_data <- land %>%
  count(project_type, name = "frekuensi") %>%
  mutate(persen = round(frekuensi / sum(frekuensi) * 100, 1))

# =====================================================
# 2. BUAT DONUT CHART INTERAKTIF DENGAN POSISI TURUN
# =====================================================
fig <- plot_ly(
  donut_data,
  labels = ~project_type,
  values = ~frekuensi,
  type = 'pie',
  hole = 0.4,
  textposition = 'inside',
  textinfo = 'label+percent',
  hovertemplate = "<b>%{label}</b><br>Frekuensi: %{value}<br>Persentase: %{percent}<extra></extra>",
  marker = list(line = list(color = '#FFFFFF', width = 2)),
  domain = list(
    x = c(0, 1),      # full horizontal
    y = c(0.05, 0.90) # turunkan chart sedikit (awal 0.0, akhir 1.0)
  )
)

# =====================================================
# 3. LAYOUT
# =====================================================
fig <- fig %>%
  layout(
    title = list(
      text = "Distribusi Project Type (Donut Chart)",
      x = 0.5,
      y = 0.95
    ),
    showlegend = F
  )

fig

Bar Chart

library(dplyr)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# =====================================================
# 1. HITUNG FREKUENSI + PERSENTASE KUMULATIF
# =====================================================
pareto_project <- land %>%
  count(project_type, name = "frekuensi") %>%
  arrange(desc(frekuensi)) %>%
  mutate(
    cum_freq = cumsum(frekuensi),
    cum_pct  = cum_freq / sum(frekuensi)
  )

# =====================================================
# 2. PARETO CHART
# =====================================================
ggplot(pareto_project,
       aes(x = reorder(project_type, -frekuensi))) +

  # ---- BAR (FREKUENSI)
  geom_col(
    aes(y = frekuensi, fill = project_type),
    width = 0.8,
    alpha = 0.9
  ) +

  # ---- LABEL FREKUENSI
  geom_text(
    aes(y = frekuensi, label = frekuensi),
    vjust = -0.4,
    size = 3,
    color = "black"
  ) +

  # ---- GARIS KUMULATIF (%)
  geom_line(
    aes(
      y = cum_pct * max(frekuensi),
      group = 1
    ),
    color = "#D62828",
    linewidth = 1.2
  ) +

  geom_point(
    aes(y = cum_pct * max(frekuensi)),
    color = "#D62828",
    size = 2.5
  ) +

  # ---- LABEL % KUMULATIF
  geom_text(
    aes(
      y = cum_pct * max(frekuensi),
      label = percent(cum_pct, accuracy = 1)
    ),
    vjust = -0.8,
    color = "#D62828",
    size = 3
  ) +

  # =====================================================
  # 3. AXIS & TEMA
  # =====================================================
  scale_y_continuous(
    name = "Frekuensi",
    sec.axis = sec_axis(
      ~ . / max(pareto_project$frekuensi),
      labels = percent_format(accuracy = 1),
      name = "Persentase Kumulatif"
    )
  ) +

  labs(
    title = "Pareto Chart Project Type",
    x = "Project Type"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(
      angle = 45,
      hjust = 1
    ),
    plot.title = element_text(face = "bold")
  )

library(dplyr)
library(plotly)

# =====================================================
# 1. HITUNG FREKUENSI + KUMULATIF
# =====================================================
pareto_project <- land %>%
  count(project_type, name = "frekuensi") %>%
  arrange(desc(frekuensi)) %>%
  mutate(
    cum_freq = cumsum(frekuensi),
    cum_pct  = cum_freq / sum(frekuensi)
  )

max_freq <- max(pareto_project$frekuensi)

# =====================================================
# 2. PLOT PARETO
# =====================================================
fig <- plot_ly(pareto_project, x = ~project_type)

# ---- BAR (FREKUENSI)
fig <- fig %>%
  add_bars(
    y = ~frekuensi,
    color = ~project_type,
    text = ~frekuensi,
    textposition = "outside",
    hovertemplate =
      "<b>%{x}</b><br>" ~
      "Frekuensi: %{y}<extra></extra>",
    showlegend = FALSE
  )

# ---- GARIS PARETO (FIX WARNING)
fig <- fig %>%
  add_trace(
    type = "scatter",
    mode = "lines+markers",   # <<< PENTING
    y = ~cum_pct * max_freq,
    line = list(color = "#D62828", width = 3),
    marker = list(size = 7, color = "#D62828"),
    customdata = ~round(cum_pct * 100, 1),
    hovertemplate =
      "<b>%{x}</b><br>" ~
      "Kumulatif: %{customdata}%<extra></extra>",
    name = "Persentase Kumulatif"
  )

# =====================================================
# 3. LAYOUT
# =====================================================
fig <- fig %>%
  layout(
    title = list(
      text = "Pareto Chart Project Type<br>
              <sup>Frekuensi & Persentase Kumulatif</sup>",
      x = 0.05
    ),
    xaxis = list(
      title = "Project Type",
      categoryorder = "array",
      categoryarray = pareto_project$project_type
    ),
    yaxis = list(
      title = "Frekuensi",
      rangemode = "tozero"
    ),
    margin = list(t = 90)
  )

fig

Analisis Bivariat

Line Plot (Numerik vs Numerik)

library(tidyverse)
library(lubridate)
library(plotly)

land$date <- as.Date(land$date)

# GANTI ini kalau nama kolom revenue kamu beda
revenue_col <- "revenue"

# pastikan kolom revenue numeric
land[[revenue_col]] <- as.numeric(land[[revenue_col]])

data_rev_yearly <- land %>%
  mutate(year = year(date)) %>%
  group_by(year, region) %>%
  summarise(
    revenue = sum(.data[[revenue_col]], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(region, year)

p <- plot_ly()

p <- p %>%
  add_trace(
    data = data_rev_yearly,
    x = ~year,
    y = ~revenue,
    color = ~region,
    type = "scatter",
    mode = "lines+markers",
    name = ~region,
    line = list(shape = "spline", smoothing = 1.2, width = 3),
    marker = list(size = 6, opacity = 0.7),
    opacity = 0.8
  ) %>%
  layout(
    title = list(text = "<b>Annual Revenue per Region</b>", x = 0.5),
    xaxis = list(title = "", dtick = 1, tickangle = -45),
    yaxis = list(title = "Revenue", rangemode = "tozero"),
    legend = list(orientation = "h", x = 0.5, xanchor = "center", y = -0.15),
    margin = list(b = 60)
  )

p

Scatter Plot (Numerik vs Numerik)

# ==============================
# LIBRARY
# ==============================
library(tidyverse)
library(plotly)
library(scales)

# ==============================
# DATA PREPARATION
# ==============================
scatter_sales <- land %>%
  select(units_sold, profit) %>%
  drop_na()

# ==============================
# LINEAR MODEL
# ==============================
lm_model <- lm(profit ~ units_sold, data = scatter_sales)
lm_sum   <- summary(lm_model)

# Koefisien
intercept <- lm_sum$coefficients[1, 1]
slope     <- lm_sum$coefficients[2, 1]
p_value   <- lm_sum$coefficients[2, 4]
r_squared <- lm_sum$r.squared

# Format teks model
eq_text <- paste0(
  "Profit = ",
  formatC(intercept, format = "f", digits = 2),
  " + ",
  formatC(slope, format = "f", digits = 2),
  " × Units Sold"
)

stat_text <- paste0(
  "R² = ", round(r_squared, 3),
  "<br>p-value = ", formatC(p_value, format = "e", digits = 2)
)

# Prediksi garis regresi
scatter_sales <- scatter_sales %>%
  mutate(profit_pred = predict(lm_model))

# ==============================
# SCATTER PLOT + REGRESSION LINE
# ==============================
p <- plot_ly() %>%
  add_markers(
    data = scatter_sales,
    x = ~units_sold,
    y = ~profit,
    marker = list(size = 9, opacity = 0.7),
    hovertemplate = paste(
      "Units Sold: %{x:,}<br>",
      "Profit: %{y:,.2f}<br>",
      "<extra></extra>"
    ),
    name = "Observed"
  ) %>%
  add_lines(
    data = scatter_sales,
    x = ~units_sold,
    y = ~profit_pred,
    line = list(width = 2),
    name = "Linear Fit"
  ) %>%
  layout(
    title = list(
      text = "<b>Units Sold vs Profit with Linear Regression</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Units Sold",
      zeroline = TRUE
    ),
    yaxis = list(
      title = "Profit",
      tickformat = ",.2f",
      zeroline = TRUE
    ),
    annotations = list(
      list(
        x = 0.02,
        y = 0.98,
        xref = "paper",
        yref = "paper",
        text = paste0(
          "<b>Linear Model</b><br>",
          eq_text, "<br>",
          stat_text
        ),
        showarrow = FALSE,
        align = "left",
        bgcolor = "rgba(255,255,255,0.85)",
        bordercolor = "black",
        borderwidth = 1
      )
    ),
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.25
    )
  )
p

Boxplot (Numerik vs Kategorik)

library(plotly)

fig <- plot_ly(
  data = land,
  x = ~project_type,
  y = ~interest_rate,
  type = "box",
  boxpoints = "outliers",
  fillcolor = "#8ECAE6",
  opacity = 0.8,
  line = list(color = "black"),
  marker = list(
    color = "#D62828",   # warna outlier
    size = 6
  )
) %>%
  layout(
    title = list(
      text = "<b>Interest Rate per Jenis Proyek</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Jenis Proyek"
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      ticksuffix = "%"
    ),
    template = "simple_white"
  )

fig

Violin Plot (Numerik vs Kategorik)

library(plotly)

fig <- plot_ly(
  data = land,
  x = ~project_type,
  y = ~interest_rate,
  type = "violin",
  fillcolor = "#8ECAE6",
  opacity = 0.8,
  line = list(color = "black"),
  box = list(visible = TRUE),      # boxplot di dalam violin
  meanline = list(visible = TRUE), # garis mean
  points = "outliers",             # tampilkan outlier
  marker = list(
    color = "#D62828",
    size = 6
  )
) %>%
  layout(
    title = list(
      text = "<b>Interest Rate per Jenis Proyek</b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Jenis Proyek"
    ),
    yaxis = list(
      title = "Interest Rate (%)",
      ticksuffix = "%"
    ),
    template = "simple_white"
  )

fig

Bar Chart (Kategorik vs Kategorik)

library(dplyr)
library(plotly)

# Hitung units sold per project type & sales channel
bar_data <- land %>%
  group_by(project_type, sales_channel) %>%
  summarise(
    units_sold = sum(units_sold, na.rm = TRUE),
    .groups = "drop"
  )

# Urutkan project_type dari total units sold tertinggi
order_project <- bar_data %>%
  group_by(project_type) %>%
  summarise(total_units = sum(units_sold), .groups = "drop") %>%
  arrange(desc(total_units))

bar_data <- bar_data %>%
  mutate(
    project_type = factor(
      project_type,
      levels = order_project$project_type
    )
  )

fig <- plot_ly(
  data = bar_data,
  x = ~project_type,
  y = ~units_sold,
  color = ~sales_channel,
  type = "bar"
) %>%
  layout(
    title = list(
      text = "<b>Units Sold berdasarkan Project Type<br><sup>Dilapisi Sales Channel</sup></b>",
      x = 0.5
    ),
    xaxis = list(
      title = "Project Type",
      categoryorder = "array"
    ),
    yaxis = list(
      title = "Units Sold"
    ),
    barmode = "stack",
    legend = list(
      orientation = "h",
      x = 0.5,
      xanchor = "center",
      y = -0.3
    ),
    template = "simple_white"
  )

Analisis Multivariat

Heatmap Korelasi

library(tidyverse)

# ======================
# 1. Ambil semua kolom numerik
# ======================
land_num <- land %>% select(where(is.numeric))

# ======================
# 2. Pilih kolom finansial
# ======================
keywords <- c("price", "capital", "sale", "profit", "margin", "cost", "total")
finance_vars <- land_num %>%
  select(any_of(names(land_num)[str_detect(names(land_num), paste(keywords, collapse = "|"))]))



# ======================
# 3. Hitung korelasi
# ======================
corr_mat <- round(cor(finance_vars, use = "complete.obs"), 2)

# ======================
# 4. Ubah ke long format dan ambil upper-right triangle
# ======================
corr_long <- as.data.frame(as.table(corr_mat))
colnames(corr_long) <- c("Var1", "Var2", "Correlation")

# Buat faktor untuk menentukan urutan kolom
levels_order <- colnames(finance_vars)
corr_long <- corr_long %>%
  mutate(
    Var1 = factor(Var1, levels = levels_order),
    Var2 = factor(Var2, levels = levels_order)
  ) %>%
  filter(as.numeric(Var1) < as.numeric(Var2))  # hanya atas kanan

# ======================
# 5. Plot heatmap
# ======================
ggplot(corr_long, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#B2182B", mid = "white", high = "#2166AC", midpoint = 0) +
  geom_text(aes(label = Correlation), size = 3) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title = element_blank()
  ) +
  labs(title = "Upper-Right Triangle Correlation - Variabel Finansial")

Cluster Plot

library(tidyverse)
library(plotly)

# ======================
# 1. Ambil kolom numerik
# ======================
land_num <- land %>% select(where(is.numeric))

# ======================
# 2. Pilih variabel finansial
# ======================
keywords <- c("price", "capital", "sale", "profit", "margin", "cost", "total")
finance_vars <- land_num %>%
  select(any_of(names(land_num)[str_detect(names(land_num), paste(keywords, collapse = "|"))]))

if (ncol(finance_vars) == 0) {
  message("⚠️ Tidak ada variabel numerik finansial ditemukan. Cluster plot tidak dapat dibuat.")
} else {
  
  # ======================
  # 3. Standarisasi
  # ======================
  finance_scaled <- scale(finance_vars)
  
  # ======================
  # 4. K-means clustering
  # ======================
  set.seed(123)
  k <- 3
  km_res <- kmeans(finance_scaled, centers = k, nstart = 25)
  
  # ======================
  # 5. PCA untuk 2D visualisasi
  # ======================
  pca_res <- prcomp(finance_scaled)
  pca_plot <- data.frame(
    PC1 = pca_res$x[,1],
    PC2 = pca_res$x[,2],
    Cluster = factor(km_res$cluster)
  )
  
  # ======================
  # 6. Tambahkan hover data asli
  # ======================
  hover_text <- apply(finance_vars, 1, function(row){
    paste(paste(names(finance_vars), ": ", round(row, 2)), collapse = "<br>")
  })
  
  pca_plot$hover_text <- paste("Cluster: ", pca_plot$Cluster, "<br>", hover_text)
  
  # ======================
  # 7. Plotly interactive cluster plot
  # ======================
  plot_ly(
    data = pca_plot,
    x = ~PC1,
    y = ~PC2,
    color = ~Cluster,
    colors = c("#1F77B4", "#FF7F0E", "#2CA02C"),
    type = "scatter",
    mode = "markers",
    text = ~hover_text,
    hoverinfo = "text"
  ) %>%
    layout(
      title = paste("Interactive Cluster Plot (k =", k, ") - Variabel Finansial"),
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2")
    )
}

Bubble Plot

library(tidyverse)
library(plotly)

# ======================
# 1. Pilih kolom yang dibutuhkan
# ======================
finance_vars <- land %>%
  select(date, year, month, revenue, construction_cost, profit)

# ======================
# 2. Group per Tahun & Bulan
# ======================
finance_agg <- finance_vars %>%
  group_by(year, month) %>%
  summarise(
    total_revenue = sum(revenue, na.rm = TRUE),
    total_cost = sum(construction_cost, na.rm = TRUE),
    total_profit = sum(profit, na.rm = TRUE),
    .groups = "drop"
  )

# ======================
# 3. Buat hover text
# ======================
finance_agg <- finance_agg %>%
  mutate(
    hover_text = paste(
      "Tahun: ", year,
      "<br>Bulan: ", month,
      "<br>Total Revenue: ", round(total_revenue, 2),
      "<br>Total Cost: ", round(total_cost, 2),
      "<br>Total Profit: ", round(total_profit, 2)
    )
  )

# ======================
# 4. Bubble Plot interaktif
# ======================
plot_ly(
  data = finance_agg,
  x = ~total_revenue,
  y = ~total_cost,
  size = ~total_profit,
  sizes = c(10, 50),
  type = "scatter",
  mode = "markers",
  marker = list(sizemode = "area", opacity = 0.7, color = ~year, colorscale = "Viridis"),
  text = ~hover_text,
  hoverinfo = "text"
) %>%
  layout(
    title = "Bubble Plot per Tahun & Bulan (Agregat Finansial)",
    xaxis = list(title = "Total Revenue"),
    yaxis = list(title = "Total Cost")
  )

## Warning: `line.width` does not currently support multiple values.

Statistik Inferensial Parametrik

Uji Z

library(tidyverse)

# ======================
# 1. Tentukan data profit
# ======================
# Misal kita ingin uji rata-rata profit tahun 2026
current_year <- 2026
profit_data <- land %>%
  filter(year == current_year) %>%
  pull(profit)

# ======================
# 2. Tentukan target (mu0) dari rata-rata historis
# ======================
# Misal rata-rata profit tahun sebelumnya (2025)
historical_year <- current_year - 1
mu0 <- land %>%
  filter(year == historical_year) %>%
  summarise(mean_profit = mean(profit, na.rm = TRUE)) %>%
  pull(mean_profit)

cat("Target (mu0) berdasarkan rata-rata historis tahun", historical_year, "=", round(mu0,2), "\n")

## Target (mu0) berdasarkan rata-rata historis tahun 2025 = 213560617698

## Target (mu0) berdasarkan rata-rata historis tahun 2025 = 213560617698
# ======================
# 3. Hitung Uji Z
# ======================
n <- length(profit_data)
x_bar <- mean(profit_data, na.rm = TRUE)
sigma <- sd(profit_data, na.rm = TRUE)  # jika populasi sigma tidak diketahui, pakai SD sampel
z_stat <- (x_bar - mu0) / (sigma / sqrt(n))

# ======================
# 4. Hitung p-value
# ======================
p_value <- 2 * (1 - pnorm(abs(z_stat)))

# ======================
# 5. Tampilkan hasil
# ======================
cat("Uji Z:\n")

## Uji Z:

## Uji Z:
cat("Z =", round(z_stat,3), "\n")

## Z = -0.635

## Z = -0.635
cat("p-value =", round(p_value,5), "\n")

## p-value = 0.52567

## p-value = 0.52567
# ======================
# 6. Interpretasi singkat
# ======================
if(p_value < 0.05){
  cat("Hasil: Rata-rata profit tahun", current_year, "berbeda signifikan dari rata-rata historis.\n")
} else {
  cat("Hasil: Rata-rata profit tahun", current_year, "tidak berbeda signifikan dari rata-rata historis.\n")
}

## Hasil: Rata-rata profit tahun 2026 tidak berbeda signifikan dari rata-rata historis.

## Hasil: Rata-rata profit tahun 2026 tidak berbeda signifikan dari rata-rata historis.

Uji t

library(tidyverse)

# ======================
# 1. Pilih dua project_type
# ======================
types <- unique(land$project_type)[1:2]  # ambil 2 project_type pertama
profit_group <- land %>%
  filter(project_type %in% types) %>%
  select(project_type, profit)

# ======================
# 2. Cek ringkasan data
# ======================
profit_group %>% group_by(project_type) %>% summarise(
  mean_profit = mean(profit, na.rm = TRUE),
  sd_profit = sd(profit, na.rm = TRUE),
  n = n()
)

# ======================
# 3. Uji t independen
# ======================
t_test <- t.test(profit ~ project_type, data = profit_group, var.equal = TRUE) # asumsi varian sama

# ======================
# 4. Tampilkan hasil
# ======================
t_test

## 
##  Two Sample t-test
## 
## data:  profit by project_type
## t = -3.9333, df = 725, p-value = 9.184e-05
## alternative hypothesis: true difference in means between group Commercial and group Residential is not equal to 0
## 95 percent confidence interval:
##  -52492156922 -17537708738
## sample estimates:
##  mean in group Commercial mean in group Residential 
##              196420443165              231435375995

## 
##  Two Sample t-test
## 
## data:  profit by project_type
## t = -3.9333, df = 725, p-value = 9.184e-05
## alternative hypothesis: true difference in means between group Commercial and group Residential is not equal to 0
## 95 percent confidence interval:
##  -52492156922 -17537708738
## sample estimates:
##  mean in group Commercial mean in group Residential 
##              196420443165              231435375995
# ======================
# 5. Interpretasi singkat
# ======================
if(t_test$p.value < 0.05){
  cat("Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.\n")
} else {
  cat("Hasil: Rata-rata profit antara kedua project_type tidak berbeda signifikan.\n")
}

## Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.

## Hasil: Rata-rata profit antara kedua project_type berbeda signifikan.

ANOVA

library(tidyverse)

# ======================
# 1. Cek jumlah grup
# ======================
unique(land$region)  # pastikan ada lebih dari 2 region

## [1] "Jabodetabek" "Jawa"        "Kalimantan"  "Sulawesi"    "Sumatra"

## [1] "Jabodetabek" "Jawa"        "Kalimantan"  "Sulawesi"    "Sumatra"
# ======================
# 2. Ringkasan profit per region
# ======================
land %>%
  group_by(region) %>%
  summarise(
    mean_profit = mean(profit, na.rm = TRUE),
    sd_profit = sd(profit, na.rm = TRUE),
    n = n()
  )

# ======================
# 3. ANOVA satu arah
# ======================
anova_model <- aov(profit ~ region, data = land)

# ======================
# 4. Lihat hasil ANOVA
# ======================
summary(anova_model)

##              Df    Sum Sq   Mean Sq F value   Pr(>F)    
## region        4 5.411e+23 1.353e+23   11.58 3.44e-09 ***
## Residuals   960 1.121e+25 1.168e+22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##              Df    Sum Sq   Mean Sq F value   Pr(>F)    
## region        4 5.411e+23 1.353e+23   11.58 3.44e-09 ***
## Residuals   960 1.121e+25 1.168e+22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ======================
# 5. Jika signifikan, lanjut post-hoc Tukey HSD
# ======================
if(summary(anova_model)[[1]]$`Pr(>F)`[1] < 0.05){
  tukey_result <- TukeyHSD(anova_model)
  print(tukey_result)
  cat("Hasil: Ada perbedaan profit signifikan antar region.\n")
} else {
  cat("Hasil: Tidak ada perbedaan profit signifikan antar region.\n")
}

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = profit ~ region, data = land)
## 
## $region
##                                diff          lwr          upr     p adj
## Jawa-Jabodetabek       -16042509464 -46106271328  14021252400 0.5899057
## Kalimantan-Jabodetabek -58645811782 -88709573645 -28582049918 0.0000012
## Sulawesi-Jabodetabek   -56705333130 -86769094994 -26641571267 0.0000031
## Sumatra-Jabodetabek    -48954264174 -79018026037 -18890502310 0.0000936
## Kalimantan-Jawa        -42603302318 -72667064181 -12539540454 0.0010850
## Sulawesi-Jawa          -40662823666 -70726585530 -10599061803 0.0021477
## Sumatra-Jawa           -32911754710 -62975516573  -2847992846 0.0237312
## Sulawesi-Kalimantan      1940478651 -28123283212  32004240515 0.9997833
## Sumatra-Kalimantan       9691547608 -20372214256  39755309472 0.9039191
## Sumatra-Sulawesi         7751068957 -22312692907  37814830820 0.9554618
## 
## Hasil: Ada perbedaan profit signifikan antar region.

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = profit ~ region, data = land)
## 
## $region
##                                diff          lwr          upr     p adj
## Jawa-Jabodetabek       -16042509464 -46106271328  14021252400 0.5899057
## Kalimantan-Jabodetabek -58645811782 -88709573645 -28582049918 0.0000012
## Sulawesi-Jabodetabek   -56705333130 -86769094994 -26641571267 0.0000031
## Sumatra-Jabodetabek    -48954264174 -79018026037 -18890502310 0.0000936
## Kalimantan-Jawa        -42603302318 -72667064181 -12539540454 0.0010850
## Sulawesi-Jawa          -40662823666 -70726585530 -10599061803 0.0021477
## Sumatra-Jawa           -32911754710 -62975516573  -2847992846 0.0237312
## Sulawesi-Kalimantan      1940478651 -28123283212  32004240515 0.9997833
## Sumatra-Kalimantan       9691547608 -20372214256  39755309472 0.9039191
## Sumatra-Sulawesi         7751068957 -22312692907  37814830820 0.9554618
## 
## Hasil: Ada perbedaan profit signifikan antar region.

Statistik Inferensial Non-Parametrik

Mann–Whitney U Test

library(tidyverse)

# ======================
# 1. Pilih dua project_type
# ======================
types <- unique(land$project_type)[1:2]  # ambil dua project_type pertama
profit_group <- land %>%
  filter(project_type %in% types) %>%
  select(project_type, profit)

# ======================
# 2. Ringkasan data
# ======================
profit_group %>%
  group_by(project_type) %>%
  summarise(
    mean_profit = mean(profit, na.rm = TRUE),
    median_profit = median(profit, na.rm = TRUE),
    sd_profit = sd(profit, na.rm = TRUE),
    n = n()
  )

# ======================
# 3. Mann–Whitney U Test
# ======================
# di R: wilcox.test untuk Mann-Whitney
mw_test <- wilcox.test(profit ~ project_type, data = profit_group, exact = FALSE)

mw_test

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  profit by project_type
## W = 47590, p-value = 0.0001139
## alternative hypothesis: true location shift is not equal to 0

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  profit by project_type
## W = 47590, p-value = 0.0001139
## alternative hypothesis: true location shift is not equal to 0
# ======================
# 4. Interpretasi
# ======================
if(mw_test$p.value < 0.05){
  cat("Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).\n")
} else {
  cat("Hasil: Profit antara kedua project_type tidak berbeda signifikan.\n")
}

## Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).

## Hasil: Profit antara kedua project_type berbeda signifikan (Mann-Whitney U Test).

Kruskal–Wallis Test

library(tidyverse)
if(!require(PMCMRplus)) install.packages("PMCMRplus")

## Loading required package: PMCMRplus

## Loading required package: PMCMRplus
library(PMCMRplus)

# ======================
# 1️⃣ Bersihkan data
# ======================
land_clean <- land %>%
  filter(!is.na(profit), is.finite(profit), !is.na(region)) %>%
  mutate(region = factor(region))  # pastikan region = faktor

# ======================
# 2️⃣ Hapus grup dengan <2 observasi
# ======================
valid_groups <- land_clean %>%
  group_by(region) %>%
  summarise(n = n()) %>%
  filter(n >= 2) %>%
  pull(region)

land_clean2 <- land_clean %>%
  filter(region %in% valid_groups)

# ======================
# 3️⃣ Pastikan data valid
# ======================
if(nrow(land_clean2) == 0 | length(levels(land_clean2$region)) < 2){
  stop("⚠️ Tidak cukup data untuk menjalankan Kruskal-Wallis Test.")
}

# ======================
# 4️⃣ Kruskal-Wallis Test
# ======================
kruskal_test <- kruskal.test(profit ~ region, data = land_clean2)
print(kruskal_test)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  profit by region
## Kruskal-Wallis chi-squared = 41.678, df = 4, p-value = 1.945e-08

## 
##  Kruskal-Wallis rank sum test
## 
## data:  profit by region
## Kruskal-Wallis chi-squared = 41.678, df = 4, p-value = 1.945e-08
# ======================
# 5️⃣ Post-hoc Nemenyi jika signifikan
# ======================
if(kruskal_test$p.value < 0.05){
  posthoc <- kwAllPairsNemenyiTest(profit ~ region, data = land_clean2)
  print(posthoc)
}

## Warning in kwAllPairsNemenyiTest.default(c(1.5212e+11, 4.17566e+11,
## 2.20799e+11, : Ties are present, p-values are not corrected.

## 
##  Pairwise comparisons using Tukey-Kramer-Nemenyi all-pairs test with Tukey-Dist approximation

## data: profit by region

##            Jabodetabek Jawa    Kalimantan Sulawesi
## Jawa       0.80438     -       -          -       
## Kalimantan 6.8e-06     0.00108 -          -       
## Sulawesi   2.3e-05     0.00276 0.99924    -       
## Sumatra    0.00027     0.01716 0.93911    0.98451

## 
## P value adjustment method: single-step

## alternative hypothesis: two.sided

## Warning in kwAllPairsNemenyiTest.default(c(1.5212e+11, 4.17566e+11,
## 2.20799e+11, : Ties are present, p-values are not corrected.
## 
##  Pairwise comparisons using Tukey-Kramer-Nemenyi all-pairs test with Tukey-Dist approximation
## data: profit by region
##            Jabodetabek Jawa    Kalimantan Sulawesi
## Jawa       0.80438     -       -          -       
## Kalimantan 6.8e-06     0.00108 -          -       
## Sulawesi   2.3e-05     0.00276 0.99924    -       
## Sumatra    0.00027     0.01716 0.93911    0.98451
## 
## P value adjustment method: single-step
## alternative hypothesis: two.sided
# ======================
# 6️⃣ Interpretasi & Kesimpulan
# ======================
cat("\nKesimpulan:\n")

## 
## Kesimpulan:

## 
## Kesimpulan:
if(kruskal_test$p.value < 0.05){
  cat("Profit berbeda signifikan antar region (p =", round(kruskal_test$p.value,5), ").\n")
  cat("Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.\n")
} else {
  cat("Profit tidak berbeda signifikan antar region (p =", round(kruskal_test$p.value,5), ").\n")
}

## Profit berbeda signifikan antar region (p = 0 ).
## Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.

## Profit berbeda signifikan antar region (p = 0 ).
## Post-hoc Nemenyi dapat digunakan untuk mengetahui pasangan region yang berbeda signifikan.

Chi-Square Test

library(tidyverse)

# ======================
# 1️⃣ Buat tabel kontingensi
# ======================
contingency_table <- table(land$project_type, land$region)
contingency_table

##              
##               Jabodetabek Jawa Kalimantan Sulawesi Sumatra
##   Commercial           47   50         32       54      52
##   Industrial           16   19         18       22      22
##   MixedUse             34   26         35       19      27
##   Residential          96   98        108       98      92

##              
##               Jabodetabek Jawa Kalimantan Sulawesi Sumatra
##   Commercial           47   50         32       54      52
##   Industrial           16   19         18       22      22
##   MixedUse             34   26         35       19      27
##   Residential          96   98        108       98      92
# ======================
# 2️⃣ Chi-Square Test
# ======================
chi_test <- chisq.test(contingency_table)
chi_test

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 15.427, df = 12, p-value = 0.2189

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 15.427, df = 12, p-value = 0.2189
# ======================
# 3️⃣ Interpretasi & Kesimpulan
# ======================
cat("\nKesimpulan:\n")

## 
## Kesimpulan:

## 
## Kesimpulan:
if(chi_test$p.value < 0.05){
  cat("Ada hubungan signifikan antara project_type dan region (p =", round(chi_test$p.value,5), ").\n")
} else {
  cat("Tidak ada hubungan signifikan antara project_type dan region (p =", round(chi_test$p.value,5), ").\n")
}

## Tidak ada hubungan signifikan antara project_type dan region (p = 0.21893 ).

## Tidak ada hubungan signifikan antara project_type dan region (p = 0.21893 ).

Optimisasi

# ======================
# Optimisasi Portofolio Proyek
# ======================

library(tidyverse)
library(lpSolve)  # package untuk linear programming

# ======================
# 1️⃣ Formulasi Masalah
# ======================

## Objective Function
# Maksimalkan total profit proyek
# f(x) = Σ (profit_i * x_i), i = 1..n
# x_i = 1 jika proyek dipilih, 0 jika tidak

## Decision Variables
# x_i ∈ {0,1}  → pilih/tidak pilih proyek i

## Constraints
# 1. Total construction_cost tidak boleh melebihi budget
budget <- 1e12  # contoh: 1 triliun

# 2. ESG minimal rata-rata
min_esg <- 70  # ESG score minimal

# ======================
# 2️⃣ Siapkan data
# ======================
proj_data <- land %>%
  select(row_id, profit, construction_cost, esg_score) %>%
  filter(!is.na(profit), !is.na(construction_cost), !is.na(esg_score))

n_proj <- nrow(proj_data)

# Koefisien objective: profit
f.obj <- proj_data$profit

# Matriks constraints
# 1) Budget constraint
f.con <- matrix(proj_data$construction_cost, nrow = 1)

# 2) ESG minimal (average ESG ≥ min_esg)
# Σ (esg_score * x_i) / Σ x_i ≥ min_esg  → LP linear approximation:
# Σ ((esg_score - min_esg) * x_i) ≥ 0
f.con <- rbind(f.con, proj_data$esg_score - min_esg)

# Direction of constraints
f.dir <- c("<=", ">=")

# Right-hand side
f.rhs <- c(budget, 0)

# Decision variable type: binary
binary.vec <- rep(1, n_proj)

# ======================
# 3️⃣ Jalankan Optimisasi
# ======================
lp_result <- lp(
  direction = "max",
  objective.in = f.obj,
  const.mat = f.con,
  const.dir = f.dir,
  const.rhs = f.rhs,
  all.bin = TRUE
)

# ======================
# 4️⃣ Hasil Optimisasi
# ======================
selected_proj <- proj_data %>%
  mutate(selected = lp_result$solution) %>%
  filter(selected == 1)

selected_proj %>%
  summarise(
    total_profit = sum(profit),
    total_cost = sum(construction_cost),
    avg_esg = mean(esg_score)
  )

# Tampilkan proyek yang dipilih
selected_proj

# ======================
# 5️⃣ Analisis Risiko dan ESG
# ======================
risk_analysis <- selected_proj %>%
  summarise(
    avg_profit_margin = mean(profit / construction_cost * 100, na.rm = TRUE),
    max_cost = max(construction_cost),
    min_esg = min(esg_score),
    avg_esg = mean(esg_score)
  )

risk_analysis

EDA

Jenndy

2026-02-06

Set-up - Menyiapkan tools untuk import, cleaning, ringkasan, missing, dan imputasi.

Import Data - memastikan data terbaca benar (angka tidak jadi teks karena format Indonesia).

Rapikan Nama Kolom - nama kolom konsisten (snake_case), memudahkan coding.

Rename Kolom - Mengubah nama kolom yang dianggap perlu untuk diubah.

Hapus Duplikasi Baris - mencegah data “terhitung dua kali”.

Check Data Hilang

Imputasi Data Hilang

Parse Tanggal - (jika ada kolom date): bisa digunakan untuk analisis time series (bulanan, harian, mingguan, dll).

Analisis Univariat

Histogram

Density Plot

Bar+Density Plot

Boxplot

Violin Plot

Boxplot+Violin Plot

Pie Chart

Bar Chart

Analisis Bivariat

Line Plot (Numerik vs Numerik)

Scatter Plot (Numerik vs Numerik)

Boxplot (Numerik vs Kategorik)

Violin Plot (Numerik vs Kategorik)

Bar Chart (Kategorik vs Kategorik)

Analisis Multivariat

Heatmap Korelasi

Cluster Plot

Bubble Plot

Statistik Inferensial Parametrik

Uji Z

Uji t

ANOVA

Statistik Inferensial Non-Parametrik

Mann–Whitney U Test

Kruskal–Wallis Test

Chi-Square Test

Optimisasi