Pendahuluan

Dataset Electric Vehicle Population Data digunakan untuk menganalisis karakteristik kendaraan listrik berdasarkan berbagai atribut kendaraan. Variabel target yang digunakan adalah Electric Vehicle Type yang terdiri dari Battery Electric Vehicle (BEV) dan Plug-in Hybrid Electric Vehicle (PHEV).

Import Library

library(readxl)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(randomForest)

Import Data

# Read the workbook (relative path, so it is resolved against the current
# working directory) and report the number of rows and columns.
data <- read_excel(path = "Electric_Vehicle_Population_Data.xlsx")

dim(data)
## [1] 279780     17
str(data)
## tibble [279,780 × 17] (S3: tbl_df/tbl/data.frame)
##  $ VIN (1-10)                                       : chr [1:279780] "JN1AZ0CP5C" "JTMABABA7P" "1N4AZ1CP1J" "5UX43EU09S" ...
##  $ County                                           : chr [1:279780] "Stevens" "Yakima" "King" "Kitsap" ...
##  $ City                                             : chr [1:279780] "Colville" "Yakima" "Seattle" "Poulsbo" ...
##  $ State                                            : chr [1:279780] "WA" "WA" "WA" "WA" ...
##  $ Postal Code                                      : num [1:279780] 99114 98903 98122 98370 98597 ...
##  $ Model Year                                       : num [1:279780] 2012 2023 2018 2025 2015 ...
##  $ Make                                             : chr [1:279780] "NISSAN" "SUBARU" "NISSAN" "BMW" ...
##  $ Model                                            : chr [1:279780] "LEAF" "SOLTERRA" "LEAF" "X5" ...
##  $ Electric Vehicle Type                            : chr [1:279780] "Battery Electric Vehicle (BEV)" "Battery Electric Vehicle (BEV)" "Battery Electric Vehicle (BEV)" "Plug-in Hybrid Electric Vehicle (PHEV)" ...
##  $ Clean Alternative Fuel Vehicle (CAFV) Eligibility: chr [1:279780] "Clean Alternative Fuel Vehicle Eligible" "Eligibility unknown as battery range has not been researched" "Clean Alternative Fuel Vehicle Eligible" "Clean Alternative Fuel Vehicle Eligible" ...
##  $ Electric Range                                   : num [1:279780] 73 0 151 40 87 84 81 21 0 19 ...
##  $ Legislative District                             : num [1:279780] 7 15 37 23 2 22 32 44 9 23 ...
##  $ DOL Vehicle ID                                   : num [1:279780] 1.53e+08 2.54e+08 3.33e+08 2.68e+08 4.74e+08 ...
##  $ Vehicle Location                                 : chr [1:279780] "POINT (-117.90454 48.54657)" "POINT (-120.71847 46.55029)" "POINT (-122.31009 47.60803)" "POINT (-122.64681 47.73689)" ...
##  $ Electric Utility                                 : chr [1:279780] "AVISTA CORP" "PACIFICORP" "CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA)" "PUGET SOUND ENERGY INC" ...
##  $ 2020 Census Tract                                : num [1:279780] 5.31e+10 5.31e+10 5.30e+10 5.30e+10 5.31e+10 ...
##  $ ...17                                            : logi [1:279780] NA NA NA NA NA NA ...

Preprocessing Data

Memilih variabel yang digunakan dalam analisis.

# Keep only the six columns used in the analysis, giving each one a
# syntactic name at selection time, then fix the column types: the three
# numeric attributes as numeric and the three categorical ones as factors.
data <- data %>%
  select(
    Model_Year = `Model Year`,
    Make,
    Electric_Vehicle_Type = `Electric Vehicle Type`,
    CAFV_Eligibility = `Clean Alternative Fuel Vehicle (CAFV) Eligibility`,
    Electric_Range = `Electric Range`,
    Legislative_District = `Legislative District`
  ) %>%
  mutate(
    Model_Year = as.numeric(Model_Year),
    Electric_Range = as.numeric(Electric_Range),
    Legislative_District = as.numeric(Legislative_District),
    Make = as.factor(Make),
    CAFV_Eligibility = as.factor(CAFV_Eligibility),
    Electric_Vehicle_Type = as.factor(Electric_Vehicle_Type)
  )

Penanganan Missing Value

# Replace the missing entries of a numeric vector with the median of its
# non-missing entries.
fill_with_median <- function(x) {
  replace(x, is.na(x), median(x, na.rm = TRUE))
}

# Impute the two numeric columns, then drop every row that still has a
# missing value in any remaining column.
data <- data %>%
  mutate(
    Electric_Range = fill_with_median(Electric_Range),
    Legislative_District = fill_with_median(Legislative_District)
  ) %>%
  na.omit()

Distribusi Jenis Kendaraan

# Bar chart: number of vehicles per electric vehicle type, bars filled by type.
ggplot(
  data = data,
  mapping = aes(x = Electric_Vehicle_Type, fill = Electric_Vehicle_Type)
) +
  geom_bar() +
  theme_minimal()

## Distribusi Tahun Produksi

# Histogram of model year using 20 bins with black bar outlines.
ggplot(data = data, mapping = aes(x = Model_Year)) +
  geom_histogram(bins = 20, color = "black") +
  theme_minimal()

## Distribusi Electric Range

# Histogram of electric range using 30 bins with black bar outlines.
ggplot(data = data, mapping = aes(x = Electric_Range)) +
  geom_histogram(bins = 30, color = "black") +
  theme_minimal()

## Top 10 Merek Kendaraan

# Count vehicles per make, largest first, and keep the ten most common makes.
top_make <- data %>%
  count(Make, sort = TRUE) %>%
  head(n = 10)

# Horizontal bar chart of those makes, ordered by their count.
ggplot(data = top_make, mapping = aes(x = reorder(Make, n), y = n)) +
  geom_col() +
  coord_flip()

# Pembagian Data Training dan Testing

# Reproducible 80/20 split of the rows into a training and a testing set.
set.seed(123)

# seq_len() yields an empty index set for an empty data frame (1:0 would
# yield c(1, 0)), and floor() makes the training-set size an explicit whole
# number instead of passing a fractional size to sample().
n_rows <- nrow(data)
index <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))

# Rows drawn by `index` form the training set; all remaining rows are held
# out for testing.
train_data <- data[index, ]

test_data <- data[-index, ]

Decision Tree

# Fit a classification tree (method = "class") that predicts
# Electric_Vehicle_Type from every other column of the training set,
# then draw the fitted tree.
tree_model <- rpart(
  Electric_Vehicle_Type ~ .,
  data = train_data,
  method = "class"
)

rpart.plot(tree_model)

## Confusion Matrix

# Predict a class label for every held-out row with the fitted tree.
tree_pred <- predict(tree_model, newdata = test_data, type = "class")

# Cross-tabulate actual labels (rows) against predicted labels (columns).
cm_tree <- table(
  test_data$Electric_Vehicle_Type,
  tree_pred,
  dnn = c("Aktual", "Prediksi")
)

cm_tree
##                                         Prediksi
## Aktual                                   Battery Electric Vehicle (BEV)
##   Battery Electric Vehicle (BEV)                                  44727
##   Plug-in Hybrid Electric Vehicle (PHEV)                            127
##                                         Prediksi
## Aktual                                   Plug-in Hybrid Electric Vehicle (PHEV)
##   Battery Electric Vehicle (BEV)                                             61
##   Plug-in Hybrid Electric Vehicle (PHEV)                                  11041

Evaluasi Decision Tree

# Metrics for the decision tree. The first class of the confusion matrix
# (row 1 / column 1) is treated as the positive class; rows are the actual
# labels and columns the predicted ones.
TP <- cm_tree[1, 1]  # actual positive, predicted positive
FP <- cm_tree[2, 1]  # actual negative, predicted positive
FN <- cm_tree[1, 2]  # actual positive, predicted negative

# Share of all test rows that lie on the diagonal (correct predictions).
accuracy_tree <- sum(diag(cm_tree)) / sum(cm_tree)
precision_tree <- TP / (TP + FP)
recall_tree <- TP / (TP + FN)

# Harmonic mean of precision and recall.
f1_tree <- (2 * precision_tree * recall_tree) / (precision_tree + recall_tree)

accuracy_tree
## [1] 0.9966402
precision_tree
## [1] 0.9971686
recall_tree
## [1] 0.998638
f1_tree
## [1] 0.9979028

Random Forest

# Fit a random forest of 100 trees that predicts Electric_Vehicle_Type from
# every other column of the training set. importance = TRUE makes the fit
# also record variable-importance measures.
rf_model <- randomForest(
  Electric_Vehicle_Type ~ .,
  data = train_data,
  ntree = 100,
  importance = TRUE
)

# Print the model summary (call, OOB error estimate, confusion matrix).
rf_model
## 
## Call:
##  randomForest(formula = Electric_Vehicle_Type ~ ., data = train_data,      ntree = 100, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##                                        Battery Electric Vehicle (BEV)
## Battery Electric Vehicle (BEV)                                 179086
## Plug-in Hybrid Electric Vehicle (PHEV)                              1
##                                        Plug-in Hybrid Electric Vehicle (PHEV)
## Battery Electric Vehicle (BEV)                                             10
## Plug-in Hybrid Electric Vehicle (PHEV)                                  44727
##                                         class.error
## Battery Electric Vehicle (BEV)         5.583598e-05
## Plug-in Hybrid Electric Vehicle (PHEV) 2.235736e-05

Random Forest

# The random forest has already been fitted above with exactly these
# settings. Fitting it a second time here, without resetting the seed, would
# replace it with a different forest than the one whose summary was printed
# and would double the training time, so the existing model is reused.
stopifnot(inherits(rf_model, "randomForest"))

Prediksi Random Forest

# Predict a class label for every held-out row with the random forest.
rf_pred <- predict(rf_model, newdata = test_data)

Confusion Matrix Random Forest

# Cross-tabulate actual labels (rows) against the random forest's predicted
# labels (columns).
cm_rf <- table(
  test_data$Electric_Vehicle_Type,
  rf_pred,
  dnn = c("Aktual", "Prediksi")
)

cm_rf
##                                         Prediksi
## Aktual                                   Battery Electric Vehicle (BEV)
##   Battery Electric Vehicle (BEV)                                  44786
##   Plug-in Hybrid Electric Vehicle (PHEV)                              0
##                                         Prediksi
## Aktual                                   Plug-in Hybrid Electric Vehicle (PHEV)
##   Battery Electric Vehicle (BEV)                                              2
##   Plug-in Hybrid Electric Vehicle (PHEV)                                  11168

Evaluasi Random Forest

# Metrics for the random forest. The first class of the confusion matrix
# (row 1 / column 1) is treated as the positive class; rows are the actual
# labels and columns the predicted ones.
TP <- cm_rf[1, 1]  # actual positive, predicted positive
FP <- cm_rf[2, 1]  # actual negative, predicted positive
FN <- cm_rf[1, 2]  # actual positive, predicted negative

# Share of all test rows that lie on the diagonal (correct predictions).
accuracy_rf <- sum(diag(cm_rf)) / sum(cm_rf)
precision_rf <- TP / (TP + FP)
recall_rf <- TP / (TP + FN)

# Harmonic mean of precision and recall.
f1_rf <- 2 * (precision_rf * recall_rf) / (precision_rf + recall_rf)

accuracy_rf
## [1] 0.9999643
precision_rf
## [1] 1
recall_rf
## [1] 0.9999553
f1_rf
## [1] 0.9999777

Variable Importance

# Plot the variable-importance measures stored in the fitted random forest.
varImpPlot(rf_model)

# Perbandingan Model

# Collect each model's metrics in the order accuracy, precision, recall, F1.
metrics_tree <- c(accuracy_tree, precision_tree, recall_tree, f1_tree)
metrics_rf <- c(accuracy_rf, precision_rf, recall_rf, f1_rf)

# Comparison table: one row per model, one column per metric.
hasil <- data.frame(
  Model = c("Decision Tree", "Random Forest"),
  Accuracy = c(metrics_tree[1], metrics_rf[1]),
  Precision = c(metrics_tree[2], metrics_rf[2]),
  Recall = c(metrics_tree[3], metrics_rf[3]),
  F1_Score = c(metrics_tree[4], metrics_rf[4])
)

hasil
##           Model  Accuracy Precision    Recall  F1_Score
## 1 Decision Tree 0.9966402 0.9971686 0.9986380 0.9979028
## 2 Random Forest 0.9999643 1.0000000 0.9999553 0.9999777

Kesimpulan

Berdasarkan hasil analisis, model Random Forest memberikan performa terbaik dengan nilai Accuracy sebesar 99,996%, Precision sebesar 100%, Recall sebesar 99,996%, dan F1-Score sebesar 99,998%. Variabel yang paling berpengaruh dalam klasifikasi adalah CAFV Eligibility dan Electric Range.