library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(lubridate)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.5.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
library(knitr)
# Load data dari file Excel
df_raw <- read_excel("05_AGRICULTURE_HARVEST.xlsx", sheet = "Harvest")
# Cek sekilas
glimpse(df_raw)
## Rows: 1,000
## Columns: 8
## $ farmer_id <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date <dttm> 2023-07-05, 2023-09-04, 2023-08-17, 2023-07-06, 202…
## $ harvest_date <dttm> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-08, 202…
## $ yield <dbl> 3.755435, 5.002442, 7.575964, 0.500000, 5.804625, 4.…
## $ rainfall <dbl> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature <dbl> 25.62675, 29.30899, 27.39532, 25.49576, 26.49621, 29…
## $ fertilizer_amount <dbl> 357, 266, 118, 293, 152, 54, 348, 313, 290, 383, 398…
head(df_raw)
## # A tibble: 6 × 8
## farmer_id location planting_date harvest_date yield rainfall
## <chr> <chr> <dttm> <dttm> <dbl> <dbl>
## 1 F00001 Sulawesi 2023-07-05 00:00:00 2023-10-09 00:00:00 3.76 2069
## 2 F00002 Sulawesi 2023-09-04 00:00:00 2024-01-04 00:00:00 5.00 2172
## 3 F00003 Borneo 2023-08-17 00:00:00 2024-01-01 00:00:00 7.58 2595
## 4 F00004 Nusa Tenggara 2023-07-06 00:00:00 2023-12-08 00:00:00 0.5 10000
## 5 F00005 Borneo 2023-08-29 00:00:00 2023-12-29 00:00:00 5.80 1489
## 6 F00006 Sumatra 2023-08-24 00:00:00 2024-02-07 00:00:00 4.17 1401
## # ℹ 2 more variables: temperature <dbl>, fertilizer_amount <dbl>
# Ringkasan statistik
summary(df_raw)
## farmer_id location planting_date
## Length:1000 Length:1000 Min. :2023-06-01 00:00:00
## Class :character Class :character 1st Qu.:2023-07-18 00:00:00
## Mode :character Mode :character Median :2023-09-11 00:00:00
## Mean :2023-09-07 14:49:55
## 3rd Qu.:2023-10-27 00:00:00
## Max. :2023-12-17 00:00:00
##
## harvest_date yield rainfall temperature
## Min. :2023-09-03 00:00:00 Min. :-5.000 Min. : 0 Min. :25.00
## 1st Qu.:2023-11-30 18:00:00 1st Qu.: 5.097 1st Qu.: 1464 1st Qu.:26.74
## Median :2024-01-21 12:00:00 Median : 5.886 Median : 1974 Median :28.46
## Mean :2024-01-18 15:38:52 Mean : 6.000 Mean : 2077 Mean :28.47
## 3rd Qu.:2024-03-10 00:00:00 3rd Qu.: 6.821 3rd Qu.: 2484 3rd Qu.:30.24
## Max. :2024-06-05 00:00:00 Max. :20.000 Max. :10000 Max. :31.99
## NA's :20 NA's :29 NA's :20
## fertilizer_amount
## Min. : 50.0
## 1st Qu.:168.0
## Median :268.0
## Mean :272.3
## 3rd Qu.:380.5
## Max. :499.0
##
# Cek missing values per kolom
cat("=== MISSING VALUES ===\n")
## === MISSING VALUES ===
colSums(is.na(df_raw))
## farmer_id location planting_date harvest_date
## 0 0 0 0
## yield rainfall temperature fertilizer_amount
## 20 29 20 0
# Cek nilai unik di kolom location (deteksi inkonsistensi)
cat("\n=== NILAI UNIK LOCATION ===\n")
##
## === NILAI UNIK LOCATION ===
unique(df_raw$location)
## [1] "Sulawesi" "Borneo" "Nusa Tenggara" "Sumatra"
## [5] "Java" "nusa tenggara" "sumatra" "borneo"
## [9] "java" "sulawesi"
# Cek nilai ekstrem di yield
cat("\n=== NILAI YIELD MENCURIGAKAN ===\n")
##
## === NILAI YIELD MENCURIGAKAN ===
df_raw %>% filter(yield <= 0 | yield >= 15) %>%
select(farmer_id, location, yield, rainfall) %>%
print()
## # A tibble: 21 × 4
## farmer_id location yield rainfall
## <chr> <chr> <dbl> <dbl>
## 1 F00013 Nusa Tenggara 20 10000
## 2 F00031 Borneo 20 10000
## 3 F00043 Nusa Tenggara 20 0
## 4 F00120 Sumatra -5 10000
## 5 F00156 Borneo 20 10000
## 6 F00167 Java -5 0
## 7 F00172 Borneo -5 10000
## 8 F00347 Sumatra -5 0
## 9 F00350 Sulawesi 20 0
## 10 F00363 Nusa Tenggara 20 10000
## # ℹ 11 more rows
df_clean <- df_raw %>%
# --- 3.1 Standardisasi kapitalisasi kolom location ---
mutate(location = str_to_title(location)) %>%
# --- 3.2 Konversi serial Excel ke tanggal ---
# Serial Excel dihitung dari 1 Jan 1900; R menggunakan origin "1899-12-30"
mutate(
planting_date = as.Date(planting_date, origin = "1899-12-30"),
harvest_date = as.Date(harvest_date, origin = "1899-12-30")
) %>%
# --- 3.3 Hitung durasi tanam (hari) ---
mutate(growth_days = as.numeric(harvest_date - planting_date)) %>%
# --- 3.4 Hapus baris dengan yield tidak valid ---
# Yield negatif (-5) dan nilai absurd (20 ton/ha untuk padi sangat tidak realistis)
# Batas wajar: 0.5–12 ton/ha untuk pertanian tropis
filter(!is.na(yield)) %>%
filter(yield > 0.5 & yield < 15) %>%
# --- 3.5 Hapus baris dengan rainfall tidak valid ---
# Nilai 0, 5000, 10000 mm tidak realistis untuk musim tanam
filter(!is.na(rainfall)) %>%
filter(rainfall > 100 & rainfall < 4000) %>%
# --- 3.6 Hapus baris dengan temperature NA ---
filter(!is.na(temperature))
# Cek hasil cleaning
cat("=== UKURAN DATA ===\n")
## === UKURAN DATA ===
cat("Sebelum cleaning:", nrow(df_raw), "baris\n")
## Sebelum cleaning: 1000 baris
cat("Setelah cleaning:", nrow(df_clean), "baris\n")
## Setelah cleaning: 906 baris
cat("Baris dihapus :", nrow(df_raw) - nrow(df_clean), "baris\n\n")
## Baris dihapus : 94 baris
glimpse(df_clean)
## Rows: 906
## Columns: 9
## $ farmer_id <chr> "F00001", "F00002", "F00003", "F00005", "F00006", "F…
## $ location <chr> "Sulawesi", "Sulawesi", "Borneo", "Borneo", "Sumatra…
## $ planting_date <date> 2023-07-05, 2023-09-04, 2023-08-17, 2023-08-29, 202…
## $ harvest_date <date> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-29, 202…
## $ yield <dbl> 3.755435, 5.002442, 7.575964, 5.804625, 4.172883, 6.…
## $ rainfall <dbl> 2069, 2172, 2595, 1489, 1401, 1601, 1723, 1680, 1624…
## $ temperature <dbl> 25.62675, 29.30899, 27.39532, 26.49621, 29.39553, 25…
## $ fertilizer_amount <dbl> 357, 266, 118, 152, 54, 348, 313, 290, 398, 271, 468…
## $ growth_days <dbl> 96, 122, 137, 122, 167, 173, 116, 136, 155, 132, 157…
df_final <- df_clean %>%
# --- 4.1 Ekstrak bulan & tahun tanam ---
mutate(
planting_month = month(planting_date, label = TRUE, abbr = FALSE),
planting_year = year(planting_date)
) %>%
# --- 4.2 Kategorisasi yield (rendah/sedang/tinggi) ---
mutate(yield_category = case_when(
yield < 4.5 ~ "Rendah",
yield < 6.5 ~ "Sedang",
TRUE ~ "Tinggi"
)) %>%
mutate(yield_category = factor(yield_category, levels = c("Rendah", "Sedang", "Tinggi"))) %>%
# --- 4.3 Kategorisasi suhu ---
mutate(temp_category = case_when(
temperature < 27 ~ "Sejuk (<27°C)",
temperature < 30 ~ "Hangat (27–30°C)",
TRUE ~ "Panas (>30°C)"
)) %>%
mutate(temp_category = factor(temp_category,
levels = c("Sejuk (<27°C)", "Hangat (27–30°C)", "Panas (>30°C)"))) %>%
# --- 4.4 Log-transform rainfall (untuk normalisasi distribusi) ---
mutate(log_rainfall = log(rainfall))
# Ringkasan data final
summary(df_final %>% select(yield, rainfall, temperature, growth_days, fertilizer_amount))
## yield rainfall temperature growth_days
## Min. : 2.285 Min. :1003 Min. :25.01 Min. : 90.0
## 1st Qu.: 5.160 1st Qu.:1458 1st Qu.:26.74 1st Qu.:109.2
## Median : 5.909 Median :1972 Median :28.45 Median :133.0
## Mean : 5.958 Mean :1980 Mean :28.46 Mean :133.6
## 3rd Qu.: 6.802 3rd Qu.:2468 3rd Qu.:30.20 3rd Qu.:157.0
## Max. :10.041 Max. :2998 Max. :31.99 Max. :179.0
## fertilizer_amount
## Min. : 50.0
## 1st Qu.:163.5
## Median :266.0
## Mean :271.5
## 3rd Qu.:379.0
## Max. :499.0
p1 <- df_final %>%
ggplot(aes(x = reorder(location, yield, median), y = yield, fill = location)) +
geom_boxplot(alpha = 0.8, outlier.shape = 21, outlier.size = 1.5) +
geom_jitter(width = 0.15, alpha = 0.2, size = 0.8) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3,
fill = "white", color = "black") +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Distribusi Hasil Panen per Pulau",
subtitle = "Berlian putih = rata-rata | Garis tengah = median",
x = NULL, y = "Yield (ton/ha)",
fill = "Pulau"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold"))
p1

p2 <- df_final %>%
ggplot(aes(x = rainfall, y = yield, color = location)) +
geom_point(alpha = 0.5, size = 1.8) +
geom_smooth(method = "loess", se = FALSE, color = "gray30", linewidth = 1) +
scale_color_brewer(palette = "Set2") +
labs(
title = "Curah Hujan vs Hasil Panen",
subtitle = "Garis abu-abu = tren keseluruhan (LOESS)",
x = "Curah Hujan (mm)", y = "Yield (ton/ha)",
color = "Pulau"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))
p2
## `geom_smooth()` using formula = 'y ~ x'

p3 <- df_final %>%
ggplot(aes(x = fertilizer_amount, y = yield)) +
geom_point(aes(color = location), alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "firebrick", linewidth = 1.2) +
facet_wrap(~location, ncol = 3) +
scale_color_brewer(palette = "Set2") +
labs(
title = "Pengaruh Jumlah Pupuk terhadap Hasil Panen (per Pulau)",
subtitle = "Garis merah = regresi linear | Bayangan = confidence interval 95%",
x = "Jumlah Pupuk (kg/ha)", y = "Yield (ton/ha)"
) +
theme_minimal(base_size = 11) +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
strip.text = element_text(face = "bold"))
p3
## `geom_smooth()` using formula = 'y ~ x'

p4 <- df_final %>%
ggplot(aes(x = yield_category, y = growth_days, fill = yield_category)) +
geom_violin(alpha = 0.7, trim = FALSE) +
geom_boxplot(width = 0.15, fill = "white", outlier.shape = NA) +
scale_fill_manual(values = c("Rendah" = "#d73027", "Sedang" = "#fee08b", "Tinggi" = "#1a9850")) +
labs(
title = "Lama Masa Tanam berdasarkan Kategori Hasil Panen",
subtitle = "Apakah masa tanam lebih lama menghasilkan panen lebih baik?",
x = "Kategori Yield", y = "Durasi Tanam (hari)"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold"))
p4

p5 <- df_final %>%
group_by(location, planting_month) %>%
summarise(mean_yield = mean(yield), .groups = "drop") %>%
ggplot(aes(x = planting_month, y = location, fill = mean_yield)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = round(mean_yield, 1)), size = 3, fontface = "bold") +
scale_fill_gradient2(low = "#d73027", mid = "#fee08b", high = "#1a9850",
midpoint = median(df_final$yield)) +
labs(
title = "Rata-rata Yield per Pulau & Bulan Tanam",
subtitle = "Hijau = hasil tinggi | Merah = hasil rendah",
x = "Bulan Tanam", y = NULL, fill = "Avg Yield\n(ton/ha)"
) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 30, hjust = 1))
p5

df_final %>%
group_by(location) %>%
summarise(
`Jumlah Data` = n(),
`Rata-rata Yield` = round(mean(yield), 2),
`Median Yield` = round(median(yield), 2),
`Std Dev Yield` = round(sd(yield), 2),
`Avg Curah Hujan` = round(mean(rainfall)),
`Avg Suhu (°C)` = round(mean(temperature), 1),
`Avg Lama Tanam` = round(mean(growth_days))
) %>%
arrange(desc(`Rata-rata Yield`)) %>%
kable(caption = "Ringkasan Statistik per Pulau")
Ringkasan Statistik per Pulau
| Borneo |
188 |
6.04 |
5.87 |
1.27 |
2026 |
28.6 |
131 |
| Sulawesi |
168 |
5.98 |
5.90 |
1.23 |
1960 |
28.4 |
134 |
| Nusa Tenggara |
185 |
5.93 |
5.96 |
1.26 |
1959 |
28.4 |
134 |
| Java |
183 |
5.92 |
6.01 |
1.19 |
1961 |
28.3 |
135 |
| Sumatra |
182 |
5.91 |
5.87 |
1.20 |
1990 |
28.6 |
135 |