# 1. Memuat Library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
data <- read.csv("D:/SEMESTER ^6^ DPEN/Data Mining/Projek/weatherHistory.csv")
glimpse(data)
## Rows: 96,453
## Columns: 12
## $ Formatted.Date <chr> "2006-04-01 00:00:00.000 +0200", "2006-04-01 …
## $ Summary <chr> "Partly Cloudy", "Partly Cloudy", "Mostly Clo…
## $ Precip.Type <chr> "rain", "rain", "rain", "rain", "rain", "rain…
## $ Temperature..C. <dbl> 9.472222, 9.355556, 9.377778, 8.288889, 8.755…
## $ Apparent.Temperature..C. <dbl> 7.388889, 7.227778, 9.377778, 5.944444, 6.977…
## $ Humidity <dbl> 0.89, 0.86, 0.89, 0.83, 0.83, 0.85, 0.95, 0.8…
## $ Wind.Speed..km.h. <dbl> 14.1197, 14.2646, 3.9284, 14.1036, 11.0446, 1…
## $ Wind.Bearing..degrees. <dbl> 251, 259, 204, 269, 259, 258, 259, 260, 259, …
## $ Visibility..km. <dbl> 15.8263, 15.8263, 14.9569, 15.8263, 15.8263, …
## $ Loud.Cover <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pressure..millibars. <dbl> 1015.13, 1015.63, 1015.94, 1016.41, 1016.51, …
## $ Daily.Summary <chr> "Partly cloudy throughout the day.", "Partly …
summary(data)
## Formatted.Date Summary Precip.Type Temperature..C.
## Length:96453 Length:96453 Length:96453 Min. :-21.822
## Class :character Class :character Class :character 1st Qu.: 4.689
## Mode :character Mode :character Mode :character Median : 12.000
## Mean : 11.933
## 3rd Qu.: 18.839
## Max. : 39.906
## Apparent.Temperature..C. Humidity Wind.Speed..km.h.
## Min. :-27.717 Min. :0.0000 Min. : 0.000
## 1st Qu.: 2.311 1st Qu.:0.6000 1st Qu.: 5.828
## Median : 12.000 Median :0.7800 Median : 9.966
## Mean : 10.855 Mean :0.7349 Mean :10.811
## 3rd Qu.: 18.839 3rd Qu.:0.8900 3rd Qu.:14.136
## Max. : 39.344 Max. :1.0000 Max. :63.853
## Wind.Bearing..degrees. Visibility..km. Loud.Cover Pressure..millibars.
## Min. : 0.0 Min. : 0.00 Min. :0 Min. : 0
## 1st Qu.:116.0 1st Qu.: 8.34 1st Qu.:0 1st Qu.:1012
## Median :180.0 Median :10.05 Median :0 Median :1016
## Mean :187.5 Mean :10.35 Mean :0 Mean :1003
## 3rd Qu.:290.0 3rd Qu.:14.81 3rd Qu.:0 3rd Qu.:1021
## Max. :359.0 Max. :16.10 Max. :0 Max. :1046
## Daily.Summary
## Length:96453
## Class :character
## Mode :character
##
##
##
# Menyalin ke variabel kerja
data_clean <- data
# Membuat label target berdasarkan "Summary"
data_clean$RainLabel <- ifelse(grepl("Rain", data_clean$Summary), "Rain", "NotRain")
data_clean$RainLabel <- as.factor(data_clean$RainLabel)
# Mengecek distribusi kelas
table(data_clean$RainLabel)
##
## NotRain Rain
## 96380 73
set.seed(123)
index <- createDataPartition(data_clean$RainLabel, p = 0.7, list = FALSE)
train_data <- data_clean[index, ]
test_data <- data_clean[-index, ]
# Menghapus kolom non-numerik yang tidak diperlukan
rf_model <- randomForest(RainLabel ~ . -Summary -Formatted.Date -Daily.Summary,
data = train_data, ntree = 100, importance = TRUE)
# Melakukan prediksi
pred_rf <- predict(rf_model, newdata = test_data)
# Confusion Matrix
confusionMatrix(data = pred_rf, reference = test_data$RainLabel, positive = "Rain")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NotRain Rain
## NotRain 28914 17
## Rain 0 4
##
## Accuracy : 0.9994
## 95% CI : (0.9991, 0.9997)
## No Information Rate : 0.9993
## P-Value [Acc > NIR] : 0.2268698
##
## Kappa : 0.3198
##
## Mcnemar's Test P-Value : 0.0001042
##
## Sensitivity : 0.1904762
## Specificity : 1.0000000
## Pos Pred Value : 1.0000000
## Neg Pred Value : 0.9994124
## Prevalence : 0.0007258
## Detection Rate : 0.0001382
## Detection Prevalence : 0.0001382
## Balanced Accuracy : 0.5952381
##
## 'Positive' Class : Rain
##