Week 11 EDA

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.3

library(lubridate)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.5.3

library(patchwork)

## Warning: package 'patchwork' was built under R version 4.5.3

library(knitr)

# Load data dari file Excel
df_raw <- read_excel("05_AGRICULTURE_HARVEST.xlsx", sheet = "Harvest")

# Cek sekilas
glimpse(df_raw)

## Rows: 1,000
## Columns: 8
## $ farmer_id         <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location          <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date     <dttm> 2023-07-05, 2023-09-04, 2023-08-17, 2023-07-06, 202…
## $ harvest_date      <dttm> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-08, 202…
## $ yield             <dbl> 3.755435, 5.002442, 7.575964, 0.500000, 5.804625, 4.…
## $ rainfall          <dbl> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature       <dbl> 25.62675, 29.30899, 27.39532, 25.49576, 26.49621, 29…
## $ fertilizer_amount <dbl> 357, 266, 118, 293, 152, 54, 348, 313, 290, 383, 398…

head(df_raw)

## # A tibble: 6 × 8
##   farmer_id location      planting_date       harvest_date        yield rainfall
##   <chr>     <chr>         <dttm>              <dttm>              <dbl>    <dbl>
## 1 F00001    Sulawesi      2023-07-05 00:00:00 2023-10-09 00:00:00  3.76     2069
## 2 F00002    Sulawesi      2023-09-04 00:00:00 2024-01-04 00:00:00  5.00     2172
## 3 F00003    Borneo        2023-08-17 00:00:00 2024-01-01 00:00:00  7.58     2595
## 4 F00004    Nusa Tenggara 2023-07-06 00:00:00 2023-12-08 00:00:00  0.5     10000
## 5 F00005    Borneo        2023-08-29 00:00:00 2023-12-29 00:00:00  5.80     1489
## 6 F00006    Sumatra       2023-08-24 00:00:00 2024-02-07 00:00:00  4.17     1401
## # ℹ 2 more variables: temperature <dbl>, fertilizer_amount <dbl>

# Ringkasan statistik
summary(df_raw)

##   farmer_id           location         planting_date                
##  Length:1000        Length:1000        Min.   :2023-06-01 00:00:00  
##  Class :character   Class :character   1st Qu.:2023-07-18 00:00:00  
##  Mode  :character   Mode  :character   Median :2023-09-11 00:00:00  
##                                        Mean   :2023-09-07 14:49:55  
##                                        3rd Qu.:2023-10-27 00:00:00  
##                                        Max.   :2023-12-17 00:00:00  
##                                                                     
##   harvest_date                     yield           rainfall      temperature   
##  Min.   :2023-09-03 00:00:00   Min.   :-5.000   Min.   :    0   Min.   :25.00  
##  1st Qu.:2023-11-30 18:00:00   1st Qu.: 5.097   1st Qu.: 1464   1st Qu.:26.74  
##  Median :2024-01-21 12:00:00   Median : 5.886   Median : 1974   Median :28.46  
##  Mean   :2024-01-18 15:38:52   Mean   : 6.000   Mean   : 2077   Mean   :28.47  
##  3rd Qu.:2024-03-10 00:00:00   3rd Qu.: 6.821   3rd Qu.: 2484   3rd Qu.:30.24  
##  Max.   :2024-06-05 00:00:00   Max.   :20.000   Max.   :10000   Max.   :31.99  
##                                NA's   :20       NA's   :29      NA's   :20     
##  fertilizer_amount
##  Min.   : 50.0    
##  1st Qu.:168.0    
##  Median :268.0    
##  Mean   :272.3    
##  3rd Qu.:380.5    
##  Max.   :499.0    
##

# Cek missing values per kolom
cat("=== MISSING VALUES ===\n")

## === MISSING VALUES ===

colSums(is.na(df_raw))

##         farmer_id          location     planting_date      harvest_date 
##                 0                 0                 0                 0 
##             yield          rainfall       temperature fertilizer_amount 
##                20                29                20                 0

# Cek nilai unik di kolom location (deteksi inkonsistensi)
cat("\n=== NILAI UNIK LOCATION ===\n")

## 
## === NILAI UNIK LOCATION ===

unique(df_raw$location)

##  [1] "Sulawesi"      "Borneo"        "Nusa Tenggara" "Sumatra"      
##  [5] "Java"          "nusa tenggara" "sumatra"       "borneo"       
##  [9] "java"          "sulawesi"

# Cek nilai ekstrem di yield
cat("\n=== NILAI YIELD MENCURIGAKAN ===\n")

## 
## === NILAI YIELD MENCURIGAKAN ===

df_raw %>% filter(yield <= 0 | yield >= 15) %>% 
  select(farmer_id, location, yield, rainfall) %>% 
  print()

## # A tibble: 21 × 4
##    farmer_id location      yield rainfall
##    <chr>     <chr>         <dbl>    <dbl>
##  1 F00013    Nusa Tenggara    20    10000
##  2 F00031    Borneo           20    10000
##  3 F00043    Nusa Tenggara    20        0
##  4 F00120    Sumatra          -5    10000
##  5 F00156    Borneo           20    10000
##  6 F00167    Java             -5        0
##  7 F00172    Borneo           -5    10000
##  8 F00347    Sumatra          -5        0
##  9 F00350    Sulawesi         20        0
## 10 F00363    Nusa Tenggara    20    10000
## # ℹ 11 more rows

df_clean <- df_raw %>%
  
  # --- 3.1 Standardisasi kapitalisasi kolom location ---
  mutate(location = str_to_title(location)) %>%
  
  # --- 3.2 Konversi serial Excel ke tanggal ---
  # Serial Excel dihitung dari 1 Jan 1900; R menggunakan origin "1899-12-30"
  mutate(
    planting_date = as.Date(planting_date, origin = "1899-12-30"),
    harvest_date  = as.Date(harvest_date,  origin = "1899-12-30")
  ) %>%
  
  # --- 3.3 Hitung durasi tanam (hari) ---
  mutate(growth_days = as.numeric(harvest_date - planting_date)) %>%
  
  # --- 3.4 Hapus baris dengan yield tidak valid ---
  # Yield negatif (-5) dan nilai absurd (20 ton/ha untuk padi sangat tidak realistis)
  # Batas wajar: 0.5–12 ton/ha untuk pertanian tropis
  filter(!is.na(yield)) %>%
  filter(yield > 0.5 & yield < 15) %>%
  
  # --- 3.5 Hapus baris dengan rainfall tidak valid ---
  # Nilai 0, 5000, 10000 mm tidak realistis untuk musim tanam
  filter(!is.na(rainfall)) %>%
  filter(rainfall > 100 & rainfall < 4000) %>%
  
  # --- 3.6 Hapus baris dengan temperature NA ---
  filter(!is.na(temperature))

# Cek hasil cleaning
cat("=== UKURAN DATA ===\n")

## === UKURAN DATA ===

cat("Sebelum cleaning:", nrow(df_raw), "baris\n")

## Sebelum cleaning: 1000 baris

cat("Setelah cleaning:", nrow(df_clean), "baris\n")

## Setelah cleaning: 906 baris

cat("Baris dihapus   :", nrow(df_raw) - nrow(df_clean), "baris\n\n")

## Baris dihapus   : 94 baris

glimpse(df_clean)

## Rows: 906
## Columns: 9
## $ farmer_id         <chr> "F00001", "F00002", "F00003", "F00005", "F00006", "F…
## $ location          <chr> "Sulawesi", "Sulawesi", "Borneo", "Borneo", "Sumatra…
## $ planting_date     <date> 2023-07-05, 2023-09-04, 2023-08-17, 2023-08-29, 202…
## $ harvest_date      <date> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-29, 202…
## $ yield             <dbl> 3.755435, 5.002442, 7.575964, 5.804625, 4.172883, 6.…
## $ rainfall          <dbl> 2069, 2172, 2595, 1489, 1401, 1601, 1723, 1680, 1624…
## $ temperature       <dbl> 25.62675, 29.30899, 27.39532, 26.49621, 29.39553, 25…
## $ fertilizer_amount <dbl> 357, 266, 118, 152, 54, 348, 313, 290, 398, 271, 468…
## $ growth_days       <dbl> 96, 122, 137, 122, 167, 173, 116, 136, 155, 132, 157…

df_final <- df_clean %>%
  
  # --- 4.1 Ekstrak bulan & tahun tanam ---
  mutate(
    planting_month = month(planting_date, label = TRUE, abbr = FALSE),
    planting_year  = year(planting_date)
  ) %>%
  
  # --- 4.2 Kategorisasi yield (rendah/sedang/tinggi) ---
  mutate(yield_category = case_when(
    yield < 4.5  ~ "Rendah",
    yield < 6.5  ~ "Sedang",
    TRUE         ~ "Tinggi"
  )) %>%
  mutate(yield_category = factor(yield_category, levels = c("Rendah", "Sedang", "Tinggi"))) %>%
  
  # --- 4.3 Kategorisasi suhu ---
  mutate(temp_category = case_when(
    temperature < 27 ~ "Sejuk (<27°C)",
    temperature < 30 ~ "Hangat (27–30°C)",
    TRUE             ~ "Panas (>30°C)"
  )) %>%
  mutate(temp_category = factor(temp_category,
                                levels = c("Sejuk (<27°C)", "Hangat (27–30°C)", "Panas (>30°C)"))) %>%
  
  # --- 4.4 Log-transform rainfall (untuk normalisasi distribusi) ---
  mutate(log_rainfall = log(rainfall))

# Ringkasan data final
summary(df_final %>% select(yield, rainfall, temperature, growth_days, fertilizer_amount))

##      yield           rainfall     temperature     growth_days   
##  Min.   : 2.285   Min.   :1003   Min.   :25.01   Min.   : 90.0  
##  1st Qu.: 5.160   1st Qu.:1458   1st Qu.:26.74   1st Qu.:109.2  
##  Median : 5.909   Median :1972   Median :28.45   Median :133.0  
##  Mean   : 5.958   Mean   :1980   Mean   :28.46   Mean   :133.6  
##  3rd Qu.: 6.802   3rd Qu.:2468   3rd Qu.:30.20   3rd Qu.:157.0  
##  Max.   :10.041   Max.   :2998   Max.   :31.99   Max.   :179.0  
##  fertilizer_amount
##  Min.   : 50.0    
##  1st Qu.:163.5    
##  Median :266.0    
##  Mean   :271.5    
##  3rd Qu.:379.0    
##  Max.   :499.0

p1 <- df_final %>%
  ggplot(aes(x = reorder(location, yield, median), y = yield, fill = location)) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21, outlier.size = 1.5) +
  geom_jitter(width = 0.15, alpha = 0.2, size = 0.8) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
               fill = "white", color = "black") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Distribusi Hasil Panen per Pulau",
    subtitle = "Berlian putih = rata-rata | Garis tengah = median",
    x = NULL, y = "Yield (ton/ha)",
    fill = "Pulau"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"))

p1

p2 <- df_final %>%
  ggplot(aes(x = rainfall, y = yield, color = location)) +
  geom_point(alpha = 0.5, size = 1.8) +
  geom_smooth(method = "loess", se = FALSE, color = "gray30", linewidth = 1) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Curah Hujan vs Hasil Panen",
    subtitle = "Garis abu-abu = tren keseluruhan (LOESS)",
    x = "Curah Hujan (mm)", y = "Yield (ton/ha)",
    color = "Pulau"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

p2

## `geom_smooth()` using formula = 'y ~ x'

p3 <- df_final %>%
  ggplot(aes(x = fertilizer_amount, y = yield)) +
  geom_point(aes(color = location), alpha = 0.4, size = 1.5) +
  geom_smooth(method = "lm", se = TRUE, color = "firebrick", linewidth = 1.2) +
  facet_wrap(~location, ncol = 3) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Pengaruh Jumlah Pupuk terhadap Hasil Panen (per Pulau)",
    subtitle = "Garis merah = regresi linear | Bayangan = confidence interval 95%",
    x = "Jumlah Pupuk (kg/ha)", y = "Yield (ton/ha)"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold"))

p3

## `geom_smooth()` using formula = 'y ~ x'

p4 <- df_final %>%
  ggplot(aes(x = yield_category, y = growth_days, fill = yield_category)) +
  geom_violin(alpha = 0.7, trim = FALSE) +
  geom_boxplot(width = 0.15, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("Rendah" = "#d73027", "Sedang" = "#fee08b", "Tinggi" = "#1a9850")) +
  labs(
    title = "Lama Masa Tanam berdasarkan Kategori Hasil Panen",
    subtitle = "Apakah masa tanam lebih lama menghasilkan panen lebih baik?",
    x = "Kategori Yield", y = "Durasi Tanam (hari)"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"))

p4

p5 <- df_final %>%
  group_by(location, planting_month) %>%
  summarise(mean_yield = mean(yield), .groups = "drop") %>%
  ggplot(aes(x = planting_month, y = location, fill = mean_yield)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(mean_yield, 1)), size = 3, fontface = "bold") +
  scale_fill_gradient2(low = "#d73027", mid = "#fee08b", high = "#1a9850",
                       midpoint = median(df_final$yield)) +
  labs(
    title = "Rata-rata Yield per Pulau & Bulan Tanam",
    subtitle = "Hijau = hasil tinggi | Merah = hasil rendah",
    x = "Bulan Tanam", y = NULL, fill = "Avg Yield\n(ton/ha)"
  ) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 30, hjust = 1))

p5

df_final %>%
  group_by(location) %>%
  summarise(
    `Jumlah Data`     = n(),
    `Rata-rata Yield` = round(mean(yield), 2),
    `Median Yield`    = round(median(yield), 2),
    `Std Dev Yield`   = round(sd(yield), 2),
    `Avg Curah Hujan` = round(mean(rainfall)),
    `Avg Suhu (°C)`   = round(mean(temperature), 1),
    `Avg Lama Tanam`  = round(mean(growth_days))
  ) %>%
  arrange(desc(`Rata-rata Yield`)) %>%
  kable(caption = "Ringkasan Statistik per Pulau")

Ringkasan Statistik per Pulau
location	Jumlah Data	Rata-rata Yield	Median Yield	Std Dev Yield	Avg Curah Hujan	Avg Suhu (°C)	Avg Lama Tanam
Borneo	188	6.04	5.87	1.27	2026	28.6	131
Sulawesi	168	5.98	5.90	1.23	1960	28.4	134
Nusa Tenggara	185	5.93	5.96	1.26	1959	28.4	134
Java	183	5.92	6.01	1.19	1961	28.3	135
Sumatra	182	5.91	5.87	1.20	1990	28.6	135

Week 11 EDA

Ahmad Rasyid Maulana

2026-05-11