# ---- Packages ----
# Each library() call sits on its own line: a `#` comment runs to the end of
# the line, so stacking several calls on one line disables every call that
# follows the first comment.
library(readr)    # read CSV data
library(dplyr)    # data manipulation (data wrangling)
library(ggplot2)  # data visualisation
library(caret)    # classification / machine learning
library(cluster)  # clustering (k-means)
library(purrr)    # iteration (e.g. the elbow method)
library(scales)   # number formatting on plots
# ---- Load the raw data ----
# Location of the CSV file handed to read_csv().
# NOTE(review): this looks like an upload handle rather than a filesystem
# path -- confirm it resolves in the environment where the script runs.
file_id <- "uploaded:Car Sales.xlsx - car_data.csv.zip/Car Sales.xlsx - car_data.csv"
data_mentah <- read_csv(file_id)

# Trailing newline keeps the header from running into the str() output.
cat("— Struktur Data Awal —\n")
# str() prints as a side effect and returns NULL, so it is not wrapped in
# print(), which would append a stray "NULL" to the output.
str(data_mentah)
# ---- Clean the raw data ----
# Builds `data_bersih` from `data_mentah`: friendlier column names, unused
# columns dropped, `Date` parsed, incomplete rows removed, and character
# columns turned into factors.
data_bersih <- data_mentah %>%
  # Rename columns so they are easy to reference. The source names contain
  # spaces/symbols, so they must be wrapped in backticks.
  rename(
    Annual_Income = `Annual Income`,
    Price = `Price ($)`,
    Body_Style = `Body Style`
  ) %>%
  # Drop columns that are not relevant for the analysis/model.
  select(-Car_id, -Dealer_No, -Phone, -Dealer_Name) %>%
  # Convert the `Date` column to the Date type (month/day/4-digit year).
  mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
  # Handle missing values by removing every row that contains an NA.
  na.omit() %>%
  # Convert the remaining character variables to factors.
  mutate(across(where(is.character), as.factor))

# Trailing newline keeps the header from running into the glimpse() output.
cat("— Struktur Data Setelah Cleaning —\n")
# glimpse() prints its summary and invisibly returns its input, so wrapping it
# in print() would dump the whole table a second time.
glimpse(data_bersih)
# ---- Exploratory visualisation ----
cat("— Menghasilkan Visualisasi Data Eksploratif —\n")

# Histogram of car prices, using bins 5000 wide.
hist_price <- ggplot(data_bersih, aes(x = Price)) +
  geom_histogram(binwidth = 5000, fill = "#1F77B4", color = "white") +
  labs(
    title = "Distribusi Harga Mobil",
    x = "Harga ($)",
    y = "Frekuensi"
  ) +
  theme_minimal()
print(hist_price)

# Scatter plot of price against annual income, coloured by gender.
# Both axes use comma-separated labels instead of scientific notation.
scatter_income_price <- ggplot(
  data_bersih,
  aes(x = Annual_Income, y = Price, color = Gender)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Harga Mobil vs. Annual Income",
    x = "Annual Income ($)",
    y = "Harga ($)"
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma) +
  theme_bw()
print(scatter_income_price)

# Box plot of price per body style. The legend is hidden because the fill
# colour repeats what the x axis already shows; x labels are rotated 45 degrees.
box_style_price <- ggplot(
  data_bersih,
  aes(x = Body_Style, y = Price, fill = Body_Style)
) +
  geom_boxplot() +
  labs(
    title = "Perbandingan Harga berdasarkan Body Style",
    x = "Body Style",
    y = "Harga ($)"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
print(box_style_price)
# ---- K-means clustering on income and price ----
cat("— Melakukan K-Means Clustering —\n")

# Standardise the two clustering features. scale() returns a matrix and keeps
# the per-column centre and scale in the "scaled:center" / "scaled:scale"
# attributes, which are used further down to undo the scaling.
data_cluster <- data_bersih %>%
  select(Annual_Income, Price) %>%
  scale()

# Number of clusters for the final model (assumed k = 3). Defined once so the
# elbow-plot marker, the model and the plot title cannot drift apart.
k_optimum <- 3

# Elbow method: total within-cluster sum of squares for k = 1..10.
set.seed(42)
k_kandidat <- 1:10
wss <- map_dbl(k_kandidat, function(k) {
  kmeans(data_cluster, centers = k, nstart = 25)$tot.withinss
})

elbow_plot <- ggplot(
  data.frame(k = k_kandidat, wss = wss),
  aes(x = k, y = wss)
) +
  geom_line(color = "darkred") +
  geom_point(color = "darkred") +
  # Dashed marker at the chosen number of clusters.
  geom_vline(xintercept = k_optimum, linetype = "dashed", color = "blue") +
  labs(
    title = "Metode Elbow untuk K-Means",
    x = "Jumlah Cluster (k)",
    y = "WSS"
  ) +
  theme_minimal()
print(elbow_plot)

# Final model; attach each row's cluster label to the cleaned data.
kmeans_model <- kmeans(data_cluster, centers = k_optimum, nstart = 25)
data_bersih$Cluster <- as.factor(kmeans_model$cluster)

# The centroids live in standardised units; map them back to the original
# units (value * scale + centre) so they can be drawn over the raw data.
pusat <- attr(data_cluster, "scaled:center")
skala <- attr(data_cluster, "scaled:scale")
centroid <- as.data.frame(kmeans_model$centers) %>%
  mutate(
    Annual_Income = Annual_Income * skala[["Annual_Income"]] +
      pusat[["Annual_Income"]],
    Price = Price * skala[["Price"]] + pusat[["Price"]]
  )

cluster_plot <- ggplot(
  data_bersih,
  aes(x = Annual_Income, y = Price, color = Cluster)
) +
  geom_point(alpha = 0.7) +
  # Show the centroids as black stars. inherit.aes = FALSE because the
  # centroid table has no Cluster column.
  geom_point(
    data = centroid,
    aes(x = Annual_Income, y = Price),
    inherit.aes = FALSE,
    size = 4,
    shape = 8,
    color = "black"
  ) +
  labs(
    # paste0() avoids the padding paste() would insert ("(k= 3 )").
    title = paste0("Hasil K-Means Clustering (k=", k_optimum, ")"),
    x = "Annual Income ($)",
    y = "Harga ($)"
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma) +
  theme_bw()
print(cluster_plot)
# ---- Classification: prepare the train/test split ----
cat("— Melakukan Klasifikasi (Random Forest) untuk Body_Style —\n")

# Drop Date, Company and Model, plus Cluster: it was added last (by the
# clustering step) and is removed for classification.
data_klasifikasi <- data_bersih %>%
  select(-Date, -Company, -Model, -Cluster)

# 70/30 split made by createDataPartition() on the Body_Style outcome;
# the seed makes the partition reproducible.
set.seed(42)
indeks_training <- createDataPartition(
  data_klasifikasi$Body_Style,
  p = 0.7,
  list = FALSE
)
data_training <- data_klasifikasi[indeks_training, ]
data_testing <- data_klasifikasi[-indeks_training, ]
# [Image of Random Forest structure]
# ---- Random forest: train, evaluate, inspect variable importance ----
# Predict Body_Style from every other column of the training set.
model_rf <- train(
  Body_Style ~ .,
  data = data_training,
  method = "rf",
  # 5-fold cross-validation.
  trControl = trainControl(method = "cv", number = 5),
  # Pre-processing: centring and scaling only. "dummy" is not a preProcess
  # method and makes train() fail; the formula interface already expands
  # factor predictors into dummy variables.
  preProcess = c("center", "scale")
)

cat("— Ringkasan Model Random Forest —\n")
print(model_rf)

# Evaluate on the held-out rows.
prediksi_rf <- predict(model_rf, data_testing)
matriks_konfusi <- confusionMatrix(prediksi_rf, data_testing$Body_Style)

cat("— Confusion Matrix Hasil Klasifikasi —\n")
print(matriks_konfusi)

# Variable importance on the model's raw scale (scale = FALSE).
var_importance <- varImp(model_rf, scale = FALSE)
cat("— Variabel Importance —\n")
print(var_importance)

importance_plot <- ggplot(var_importance) +
  labs(title = "Variabel Importance untuk Prediksi Body Style") +
  theme_minimal()
print(importance_plot)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.