# LOAD LIBRARIES
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library (rlang)
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
library (vctrs)
##
## Attaching package: 'vctrs'
##
## The following object is masked from 'package:dplyr':
##
## data_frame
##
## The following object is masked from 'package:tibble':
##
## data_frame
# LOAD DATA
# Read the raw harvest records from the CSV file into a data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file location exists.
agriculture_harvest <- read.csv(
  file = "C:/Users/RAIHAN PUTR/OneDrive/Kuliah/Pengantar Sains Data/ADE/05_AGRICULTURE_HARVEST.csv"
)
# DATA EXPLORATION
# Compact overview of the raw data: row/column counts plus the type and first
# few values of every column.
dplyr::glimpse(agriculture_harvest)
## Rows: 1,000
## Columns: 8
## $ farmer_id <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date <chr> "2023-07-05 00:00:00", "2023-09-04 00:00:00", "2023-…
## $ harvest_date <chr> "2023-10-09 00:00:00", "2024-01-04 00:00:00", "2024-…
## $ yield <dbl> 3.755435, 5.002442, 7.575964, 0.500000, 5.804625, 4.…
## $ rainfall <int> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature <dbl> 25.62675, 29.30899, 27.39532, 25.49576, 26.49621, 29…
## $ fertilizer_amount <int> 357, 266, 118, 293, 152, 54, 348, 313, 290, 383, 398…
# Per-column summary statistics (also reports the number of NAs per column).
summary(object = agriculture_harvest)
## farmer_id location planting_date harvest_date
## Length :1000 Length :1000 Length :1000 Length :1000
## N.unique :1000 N.unique : 10 N.unique : 197 N.unique : 256
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 6 Min.nchar: 4 Min.nchar: 19 Min.nchar: 19
## Max.nchar: 6 Max.nchar: 13 Max.nchar: 19 Max.nchar: 19
##
##
## yield rainfall temperature fertilizer_amount
## Min. :-5.000 Min. : 0 Min. :25.00 Min. : 50.0
## 1st Qu.: 5.097 1st Qu.: 1464 1st Qu.:26.74 1st Qu.:168.0
## Median : 5.886 Median : 1974 Median :28.46 Median :268.0
## Mean : 6.000 Mean : 2077 Mean :28.47 Mean :272.3
## 3rd Qu.: 6.821 3rd Qu.: 2484 3rd Qu.:30.24 3rd Qu.:380.5
## Max. :20.000 Max. :10000 Max. :31.99 Max. :499.0
## NAs :20 NAs :29 NAs :20
# Count missing values per column: is.na() yields a logical matrix and
# colSums() adds up the TRUE entries of each column.
colSums(
  x = is.na(agriculture_harvest)
)
## farmer_id location planting_date harvest_date
## 0 0 0 0
## yield rainfall temperature fertilizer_amount
## 20 29 20 0
# REMOVE MISSING VALUES
# Drop every row that contains an NA in any column. Called without column
# arguments, drop_na() checks all columns.
agriculture_clean <- agriculture_harvest %>%
  drop_na()
# DATA TRANSFORMATION
# Builds on 'agriculture_clean', the NA-free data frame created earlier.
# NOTE(review): the summary of the raw data shows yield ranging from -5 to 20
# and rainfall reaching 10000. Such values are not filtered here; confirm
# whether they are genuine measurements before relying on the groups below.
agriculture_transformed <- agriculture_clean %>%
  # 1. Type conversion (dates): parse the timestamp strings into plain Date
  #    objects, discarding the time-of-day part.
  mutate(
    planting_date = as.Date(planting_date),
    harvest_date = as.Date(harvest_date)
  ) %>%
  # 2. Text normalisation (location): unify capitalisation
  #    (e.g. "nusa tenggara" -> "Nusa Tenggara").
  mutate(location = str_to_title(location)) %>%
  # 3. Feature engineering: number of days between planting and harvest.
  mutate(
    planting_duration = as.numeric(
      difftime(harvest_date, planting_date, units = "days")
    )
  ) %>%
  # 4. Rounding: shorten long decimal values for tidier output.
  mutate(
    yield = round(yield, digits = 2),
    temperature = round(temperature, digits = 1)
  ) %>%
  # 5. Categorical binning (optional): group yield into categories to simplify
  #    visualisation. This step runs after the rounding above, so the groups
  #    are derived from the rounded yield. Conditions are checked in order, so
  #    the second branch only sees values that are already >= 5.
  mutate(
    yield_group = case_when(
      yield < 5 ~ "Rendah",
      yield < 8 ~ "Sedang",
      yield >= 8 ~ "Tinggi"
    )
  )
# Check the result of the transformation: column types and first values.
dplyr::glimpse(agriculture_transformed)
## Rows: 931
## Columns: 10
## $ farmer_id <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date <date> 2023-07-05, 2023-09-04, 2023-08-17, 2023-07-06, 202…
## $ harvest_date <date> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-08, 202…
## $ yield <dbl> 3.76, 5.00, 7.58, 0.50, 5.80, 4.17, 6.01, 5.52, 5.60…
## $ rainfall <int> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature <dbl> 25.6, 29.3, 27.4, 25.5, 26.5, 29.4, 25.2, 29.1, 31.7…
## $ fertilizer_amount <int> 357, 266, 118, 293, 152, 54, 348, 313, 290, 398, 271…
## $ planting_duration <dbl> 96, 122, 137, 155, 122, 167, 173, 116, 136, 155, 132…
## $ yield_group <chr> "Rendah", "Sedang", "Sedang", "Rendah", "Sedang", "R…