Dataset Electric Vehicle Population Data digunakan untuk menganalisis karakteristik kendaraan listrik berdasarkan berbagai atribut kendaraan. Variabel target yang digunakan adalah Electric Vehicle Type, yang terdiri dari dua kelas: Battery Electric Vehicle (BEV) dan Plug-in Hybrid Electric Vehicle (PHEV).
# Import Library
library(readxl)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(randomForest)
# Read the EV population workbook into a tibble, then report its dimensions
# and the type of every column.
data <- read_excel(path = "Electric_Vehicle_Population_Data.xlsx")
dim(data)
## [1] 279780 17
str(data)
## tibble [279,780 × 17] (S3: tbl_df/tbl/data.frame)
## $ VIN (1-10) : chr [1:279780] "JN1AZ0CP5C" "JTMABABA7P" "1N4AZ1CP1J" "5UX43EU09S" ...
## $ County : chr [1:279780] "Stevens" "Yakima" "King" "Kitsap" ...
## $ City : chr [1:279780] "Colville" "Yakima" "Seattle" "Poulsbo" ...
## $ State : chr [1:279780] "WA" "WA" "WA" "WA" ...
## $ Postal Code : num [1:279780] 99114 98903 98122 98370 98597 ...
## $ Model Year : num [1:279780] 2012 2023 2018 2025 2015 ...
## $ Make : chr [1:279780] "NISSAN" "SUBARU" "NISSAN" "BMW" ...
## $ Model : chr [1:279780] "LEAF" "SOLTERRA" "LEAF" "X5" ...
## $ Electric Vehicle Type : chr [1:279780] "Battery Electric Vehicle (BEV)" "Battery Electric Vehicle (BEV)" "Battery Electric Vehicle (BEV)" "Plug-in Hybrid Electric Vehicle (PHEV)" ...
## $ Clean Alternative Fuel Vehicle (CAFV) Eligibility: chr [1:279780] "Clean Alternative Fuel Vehicle Eligible" "Eligibility unknown as battery range has not been researched" "Clean Alternative Fuel Vehicle Eligible" "Clean Alternative Fuel Vehicle Eligible" ...
## $ Electric Range : num [1:279780] 73 0 151 40 87 84 81 21 0 19 ...
## $ Legislative District : num [1:279780] 7 15 37 23 2 22 32 44 9 23 ...
## $ DOL Vehicle ID : num [1:279780] 1.53e+08 2.54e+08 3.33e+08 2.68e+08 4.74e+08 ...
## $ Vehicle Location : chr [1:279780] "POINT (-117.90454 48.54657)" "POINT (-120.71847 46.55029)" "POINT (-122.31009 47.60803)" "POINT (-122.64681 47.73689)" ...
## $ Electric Utility : chr [1:279780] "AVISTA CORP" "PACIFICORP" "CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA)" "PUGET SOUND ENERGY INC" ...
## $ 2020 Census Tract : num [1:279780] 5.31e+10 5.31e+10 5.30e+10 5.30e+10 5.31e+10 ...
## $ ...17 : logi [1:279780] NA NA NA NA NA NA ...
Memilih variabel yang digunakan dalam analisis.
# Keep the six columns used in the analysis and give them syntactic
# (underscore-separated) names in the same step.
data <- data %>%
  select(
    Model_Year = `Model Year`,
    Make,
    Electric_Vehicle_Type = `Electric Vehicle Type`,
    CAFV_Eligibility = `Clean Alternative Fuel Vehicle (CAFV) Eligibility`,
    Electric_Range = `Electric Range`,
    Legislative_District = `Legislative District`
  )
# Coerce column types: the three measurement/code columns become numeric,
# and the three text columns (including the target) become factors.
data <- data %>%
  mutate(
    across(c(Model_Year, Electric_Range, Legislative_District), as.numeric),
    across(c(Make, CAFV_Eligibility, Electric_Vehicle_Type), as.factor)
  )
# Fill missing Electric_Range and Legislative_District values with the
# respective column median, then drop any row that still contains an NA in
# one of the remaining columns.
# NOTE(review): Legislative_District looks like a district code rather than a
# quantity; confirm that median imputation is intended for it.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split performed later in the script.
data <- data %>%
  mutate(
    Electric_Range = replace(
      Electric_Range,
      is.na(Electric_Range),
      median(Electric_Range, na.rm = TRUE)
    ),
    Legislative_District = replace(
      Legislative_District,
      is.na(Legislative_District),
      median(Legislative_District, na.rm = TRUE)
    )
  ) %>%
  na.omit()
# Bar chart of the number of vehicles per Electric_Vehicle_Type; each bar is
# filled by its own class.
data %>%
  ggplot(aes(Electric_Vehicle_Type, fill = Electric_Vehicle_Type)) +
  geom_bar() +
  theme_minimal()
## Distribusi Tahun Produksi
# Histogram of Model_Year using 20 bins with black bar outlines.
data %>%
  ggplot(aes(Model_Year)) +
  geom_histogram(bins = 20, color = "black") +
  theme_minimal()
## Distribusi Electric Range
# Histogram of Electric_Range using 30 bins with black bar outlines.
data %>%
  ggplot(aes(Electric_Range)) +
  geom_histogram(bins = 30, color = "black") +
  theme_minimal()
## Top 10 Merek Kendaraan
# Count vehicles per Make, order the counts from largest to smallest and keep
# the first ten rows.
top_make <- data %>%
  count(Make, sort = TRUE) %>%
  slice_head(n = 10)

# Horizontal bar chart of those makes, with bars ordered by their count.
top_make %>%
  ggplot(aes(x = reorder(Make, n), y = n)) +
  geom_col() +
  coord_flip()
# Pembagian Data Training dan Testing
# Reproducible 80/20 split of `data` into training and test sets.
set.seed(123)
n_obs <- nrow(data)

# seq_len() stays empty for a zero-row table (1:0 would yield c(1, 0)), and
# floor() makes the integer sample size explicit instead of relying on
# sample() silently truncating a fractional `size`.
index <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))

train_data <- data[index, ]
# Select the complement by positive index: data[-index, ] would return zero
# rows (not all rows) if `index` were ever empty.
test_data <- data[setdiff(seq_len(n_obs), index), ]
# Fit a classification tree that predicts Electric_Vehicle_Type from every
# other column of the training set, then draw the fitted tree.
tree_model <- rpart(Electric_Vehicle_Type ~ ., data = train_data, method = "class")
rpart.plot(tree_model)
## Confusion Matrix
# Predict a class label for every test row with the decision tree.
tree_pred <- predict(tree_model, newdata = test_data, type = "class")

# Cross-tabulate actual classes (rows, "Aktual") against predicted classes
# (columns, "Prediksi") and print the resulting confusion matrix.
cm_tree <- table(Aktual = test_data$Electric_Vehicle_Type, Prediksi = tree_pred)
cm_tree
## Prediksi
## Aktual Battery Electric Vehicle (BEV)
## Battery Electric Vehicle (BEV) 44727
## Plug-in Hybrid Electric Vehicle (PHEV) 127
## Prediksi
## Aktual Plug-in Hybrid Electric Vehicle (PHEV)
## Battery Electric Vehicle (BEV) 61
## Plug-in Hybrid Electric Vehicle (PHEV) 11041
# Accuracy: correctly classified test rows (the diagonal) over all test rows.
accuracy_tree <- sum(diag(cm_tree)) / sum(cm_tree)

# The first class of the confusion matrix is treated as the positive class
# (BEV in the printed table). Rows are actual classes, columns are predictions.
TP <- cm_tree[1, 1] # actual positive, predicted positive
FP <- cm_tree[2, 1] # actual negative, predicted positive
FN <- cm_tree[1, 2] # actual positive, predicted negative

precision_tree <- TP / (TP + FP)
recall_tree <- TP / (TP + FN)
# F1 is the harmonic mean of precision and recall.
f1_tree <- 2 * precision_tree * recall_tree / (precision_tree + recall_tree)

accuracy_tree
## [1] 0.9966402
precision_tree
## [1] 0.9971686
recall_tree
## [1] 0.998638
f1_tree
## [1] 0.9979028
# Fit a 100-tree random forest that predicts Electric_Vehicle_Type from every
# other training column. importance = TRUE stores the variable-importance
# measures in the fitted object. Printing the model shows its summary.
rf_model <- randomForest(Electric_Vehicle_Type ~ ., data = train_data,
                         ntree = 100, importance = TRUE)
rf_model
##
## Call:
## randomForest(formula = Electric_Vehicle_Type ~ ., data = train_data, ntree = 100, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Battery Electric Vehicle (BEV)
## Battery Electric Vehicle (BEV) 179086
## Plug-in Hybrid Electric Vehicle (PHEV) 1
## Plug-in Hybrid Electric Vehicle (PHEV)
## Battery Electric Vehicle (BEV) 10
## Plug-in Hybrid Electric Vehicle (PHEV) 44727
## class.error
## Battery Electric Vehicle (BEV) 5.583598e-05
## Plug-in Hybrid Electric Vehicle (PHEV) 2.235736e-05
# Evaluate the random forest fitted earlier in the script on the test set.
# The forest is intentionally not refitted here: a second randomForest() call
# without resetting the seed grows a different forest, so the printed model
# summary would describe one model while these predictions came from another,
# and training time would double.
# NOTE(review): the confusion-matrix counts recorded after this chunk came
# from the refitted forest and may shift slightly on re-run.
rf_pred <- predict(rf_model, newdata = test_data)

# Cross-tabulate actual classes (rows, "Aktual") against predicted classes
# (columns, "Prediksi") and print the resulting confusion matrix.
cm_rf <- table(Aktual = test_data$Electric_Vehicle_Type, Prediksi = rf_pred)
cm_rf
## Prediksi
## Aktual Battery Electric Vehicle (BEV)
## Battery Electric Vehicle (BEV) 44786
## Plug-in Hybrid Electric Vehicle (PHEV) 0
## Prediksi
## Aktual Plug-in Hybrid Electric Vehicle (PHEV)
## Battery Electric Vehicle (BEV) 2
## Plug-in Hybrid Electric Vehicle (PHEV) 11168
# Accuracy: correctly classified test rows (the diagonal) over all test rows.
accuracy_rf <- sum(diag(cm_rf)) / sum(cm_rf)

# The first class of the confusion matrix is treated as the positive class
# (BEV in the printed table). Rows are actual classes, columns are predictions.
TP <- cm_rf[1, 1] # actual positive, predicted positive
FP <- cm_rf[2, 1] # actual negative, predicted positive
FN <- cm_rf[1, 2] # actual positive, predicted negative

precision_rf <- TP / (TP + FP)
recall_rf <- TP / (TP + FN)
# F1 is the harmonic mean of precision and recall.
f1_rf <- 2 * (precision_rf * recall_rf) / (precision_rf + recall_rf)

accuracy_rf
## [1] 0.9999643
precision_rf
## [1] 1
recall_rf
## [1] 0.9999553
f1_rf
## [1] 0.9999777
# Plot the variable-importance measures stored in the fitted random forest.
varImpPlot(rf_model)
# Perbandingan Model
# Side-by-side comparison of the two classifiers: one row per model, built
# from the test-set metrics computed earlier, then printed.
hasil <- rbind(
  data.frame(
    Model = "Decision Tree",
    Accuracy = accuracy_tree,
    Precision = precision_tree,
    Recall = recall_tree,
    F1_Score = f1_tree
  ),
  data.frame(
    Model = "Random Forest",
    Accuracy = accuracy_rf,
    Precision = precision_rf,
    Recall = recall_rf,
    F1_Score = f1_rf
  )
)
hasil
## Model Accuracy Precision Recall F1_Score
## 1 Decision Tree 0.9966402 0.9971686 0.9986380 0.9979028
## 2 Random Forest 0.9999643 1.0000000 0.9999553 0.9999777
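Berdasarkan hasil analisis pada data uji, model Random Forest memberikan performa terbaik dengan nilai Accuracy sebesar 99,996%, Precision sebesar 100%, Recall sebesar 99,996%, dan F1-Score sebesar 99,998%, lebih tinggi daripada Decision Tree (Accuracy 99,664%). Precision, Recall, dan F1-Score dihitung dengan BEV sebagai kelas positif. Variabel yang paling berpengaruh dalam klasifikasi adalah CAFV Eligibility dan Electric Range.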