# LOAD LIBRARIES

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library (rlang)
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
library (vctrs)
## 
## Attaching package: 'vctrs'
## 
## The following object is masked from 'package:dplyr':
## 
##     data_frame
## 
## The following object is masked from 'package:tibble':
## 
##     data_frame

# LOAD DATA

# Read the harvest records from CSV into a base data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script
# presumably only runs on the original author's computer — consider a
# project-relative path.
agriculture_harvest <- read.csv(
  file = "C:/Users/RAIHAN PUTR/OneDrive/Kuliah/Pengantar Sains Data/ADE/05_AGRICULTURE_HARVEST.csv"
)

# DATA EXPLORATION

# Print every column's type together with a preview of its first values.
dplyr::glimpse(x = agriculture_harvest)
## Rows: 1,000
## Columns: 8
## $ farmer_id         <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location          <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date     <chr> "2023-07-05 00:00:00", "2023-09-04 00:00:00", "2023-…
## $ harvest_date      <chr> "2023-10-09 00:00:00", "2024-01-04 00:00:00", "2024-…
## $ yield             <dbl> 3.755435, 5.002442, 7.575964, 0.500000, 5.804625, 4.…
## $ rainfall          <int> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature       <dbl> 25.62675, 29.30899, 27.39532, 25.49576, 26.49621, 29…
## $ fertilizer_amount <int> 357, 266, 118, 293, 152, 54, 348, 313, 290, 383, 398…
# Per-column summary statistics for the raw data.
agriculture_harvest %>% summary()
##      farmer_id         location      planting_date     harvest_date 
##  Length   :1000   Length   :1000   Length   :1000   Length   :1000  
##  N.unique :1000   N.unique :  10   N.unique : 197   N.unique : 256  
##  N.blank  :   0   N.blank  :   0   N.blank  :   0   N.blank  :   0  
##  Min.nchar:   6   Min.nchar:   4   Min.nchar:  19   Min.nchar:  19  
##  Max.nchar:   6   Max.nchar:  13   Max.nchar:  19   Max.nchar:  19  
##                                                                     
##                                                                     
##      yield           rainfall      temperature    fertilizer_amount
##  Min.   :-5.000   Min.   :    0   Min.   :25.00   Min.   : 50.0    
##  1st Qu.: 5.097   1st Qu.: 1464   1st Qu.:26.74   1st Qu.:168.0    
##  Median : 5.886   Median : 1974   Median :28.46   Median :268.0    
##  Mean   : 6.000   Mean   : 2077   Mean   :28.47   Mean   :272.3    
##  3rd Qu.: 6.821   3rd Qu.: 2484   3rd Qu.:30.24   3rd Qu.:380.5    
##  Max.   :20.000   Max.   :10000   Max.   :31.99   Max.   :499.0    
##  NAs    :20       NAs    :29      NAs    :20
# Count the missing values (NA) in each column.
agriculture_harvest %>%
  is.na() %>%
  colSums()
##         farmer_id          location     planting_date      harvest_date 
##                 0                 0                 0                 0 
##             yield          rainfall       temperature fertilizer_amount 
##                20                29                20                 0

# REMOVE MISSING VALUES

# Keep only the rows that have no NA in any column (drop_na() without column
# arguments checks every column).
# NOTE(review): the summary output above still shows implausible non-missing
# values (a yield of -5, rainfall of 0 and 10000). drop_na() does not remove
# those — confirm whether they are sentinel/outlier values that need their own
# filter.
agriculture_clean <- agriculture_harvest %>%
  drop_na()
# DATA TRANSFORMATION

# Build the analysis table from `agriculture_clean` (the NA-free data created
# earlier in this script).
agriculture_transformed <- agriculture_clean %>%
  mutate(
    # 1. Type conversion (dates)
    # Parse the "YYYY-MM-DD hh:mm:ss" strings into plain Date objects, which
    # drops the time-of-day part.
    planting_date = as.Date(planting_date),
    harvest_date  = as.Date(harvest_date),

    # 2. Text normalisation (location)
    # Title-case the location names so differently capitalised spellings
    # collapse into one (e.g. "nusa tenggara" -> "Nusa Tenggara").
    location = str_to_title(location),

    # 3. Feature engineering
    # Number of days between planting and harvest; the unit is stated
    # explicitly instead of relying on the default of Date subtraction.
    planting_duration = as.numeric(
      difftime(harvest_date, planting_date, units = "days")
    ),

    # 4. Categorisation
    # Bin yield into "Rendah" (low, < 5), "Sedang" (medium, 5 to < 8) and
    # "Tinggi" (high, >= 8) to simplify visualisation.
    # This is computed BEFORE `yield` is rounded below, so a borderline value
    # such as 4.996 is binned by its measured value rather than by the rounded
    # 5.00. case_when() uses the first matching condition, so the second
    # branch only sees values >= 5.
    yield_group = case_when(
      yield < 5 ~ "Rendah",
      yield < 8 ~ "Sedang",
      yield >= 8 ~ "Tinggi"
    ),

    # 5. Rounding
    # Shorten long decimals for tidier display.
    yield       = round(yield, 2),
    temperature = round(temperature, 1)
  )

# Inspect the result of the transformation.
dplyr::glimpse(x = agriculture_transformed)
## Rows: 931
## Columns: 10
## $ farmer_id         <chr> "F00001", "F00002", "F00003", "F00004", "F00005", "F…
## $ location          <chr> "Sulawesi", "Sulawesi", "Borneo", "Nusa Tenggara", "…
## $ planting_date     <date> 2023-07-05, 2023-09-04, 2023-08-17, 2023-07-06, 202…
## $ harvest_date      <date> 2023-10-09, 2024-01-04, 2024-01-01, 2023-12-08, 202…
## $ yield             <dbl> 3.76, 5.00, 7.58, 0.50, 5.80, 4.17, 6.01, 5.52, 5.60…
## $ rainfall          <int> 2069, 2172, 2595, 10000, 1489, 1401, 1601, 1723, 168…
## $ temperature       <dbl> 25.6, 29.3, 27.4, 25.5, 26.5, 29.4, 25.2, 29.1, 31.7…
## $ fertilizer_amount <int> 357, 266, 118, 293, 152, 54, 348, 313, 290, 398, 271…
## $ planting_duration <dbl> 96, 122, 137, 155, 122, 167, 173, 116, 136, 155, 132…
## $ yield_group       <chr> "Rendah", "Sedang", "Sedang", "Rendah", "Sedang", "R…