# Load the cars dataset and print its structure.
# The file is read through an explicit path instead of calling setwd(), so the
# session's working directory is not changed as a side effect of loading data.
cars_data <- read.csv(
  file.path("C:/Users/georg/Downloads", "Large Cars Dataset.csv"),
  stringsAsFactors = FALSE
)
str(cars_data)
## 'data.frame': 428 obs. of 15 variables:
## $ Brand : chr "Acura" "Acura" "Acura" "Acura" ...
## $ Model : chr "MDX" "RSX Type S 2dr" "TSX 4dr" "TL 4dr" ...
## $ VehicleClass: chr "SUV" "Sedan" "Sedan" "Sedan" ...
## $ Region : chr "Asia" "Asia" "Asia" "Asia" ...
## $ DriveTrain : chr "All" "Front" "Front" "Front" ...
## $ MSRP : chr "$36,945 " "$23,820 " "$26,990 " "$33,195 " ...
## $ DealerCost : chr "$33,337 " "$21,761 " "$24,647 " "$30,299 " ...
## $ EngineSize : num 3.5 2 2.4 3.2 3.5 3.5 3.2 1.8 1.8 3 ...
## $ Cylinders : int 6 4 4 6 6 6 6 4 4 6 ...
## $ HorsePower : int 265 200 200 270 225 225 290 170 170 220 ...
## $ MPG_City : int 17 24 22 20 18 18 17 22 23 20 ...
## $ MPG_Highway : int 23 31 29 28 24 24 24 31 30 28 ...
## $ Weight : int 4451 2778 3230 3575 3880 3893 3153 3252 3638 3462 ...
## $ Wheelbase : int 106 101 105 108 115 115 100 104 105 104 ...
## $ Length : int 189 172 183 186 197 197 174 179 180 179 ...
# Preview the first six rows of the raw data.
head(cars_data, n = 6L)
## Brand Model VehicleClass Region DriveTrain MSRP
## 1 Acura MDX SUV Asia All $36,945
## 2 Acura RSX Type S 2dr Sedan Asia Front $23,820
## 3 Acura TSX 4dr Sedan Asia Front $26,990
## 4 Acura TL 4dr Sedan Asia Front $33,195
## 5 Acura 3.5 RL 4dr Sedan Asia Front $43,755
## 6 Acura 3.5 RL w/Navigation 4dr Sedan Asia Front $46,100
## DealerCost EngineSize Cylinders HorsePower MPG_City MPG_Highway Weight
## 1 $33,337 3.5 6 265 17 23 4451
## 2 $21,761 2.0 4 200 24 31 2778
## 3 $24,647 2.4 4 200 22 29 3230
## 4 $30,299 3.2 6 270 20 28 3575
## 5 $39,014 3.5 6 225 18 24 3880
## 6 $41,100 3.5 6 225 18 24 3893
## Wheelbase Length
## 1 106 189
## 2 101 172
## 3 105 183
## 4 108 186
## 5 115 197
## 6 115 197
# Per-column summary of the raw data.
summary(object = cars_data)
## Brand Model VehicleClass Region
## Length:428 Length:428 Length:428 Length:428
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## DriveTrain MSRP DealerCost EngineSize
## Length:428 Length:428 Length:428 Min. :1.300
## Class :character Class :character Class :character 1st Qu.:2.375
## Mode :character Mode :character Mode :character Median :3.000
## Mean :3.197
## 3rd Qu.:3.900
## Max. :8.300
##
## Cylinders HorsePower MPG_City MPG_Highway
## Min. : 3.000 Min. : 73.0 Min. :10.00 Min. :12.00
## 1st Qu.: 4.000 1st Qu.:165.0 1st Qu.:17.00 1st Qu.:24.00
## Median : 6.000 Median :210.0 Median :19.00 Median :26.00
## Mean : 5.808 Mean :215.9 Mean :20.06 Mean :26.84
## 3rd Qu.: 6.000 3rd Qu.:255.0 3rd Qu.:21.25 3rd Qu.:29.00
## Max. :12.000 Max. :500.0 Max. :60.00 Max. :66.00
## NA's :2
## Weight Wheelbase Length
## Min. :1850 Min. : 89.0 Min. :143.0
## 1st Qu.:3104 1st Qu.:103.0 1st Qu.:178.0
## Median :3474 Median :107.0 Median :187.0
## Mean :3578 Mean :108.2 Mean :186.4
## 3rd Qu.:3978 3rd Qu.:112.0 3rd Qu.:194.0
## Max. :7190 Max. :144.0 Max. :238.0
##
Insight: Dataset ini berisi informasi tentang spesifikasi mobil seperti MSRP, dealer cost, efisiensi bahan bakar, horsepower, engine size, dan lainnya. Data perlu dibersihkan karena format MSRP dan DealerCost menggunakan simbol dolar dan koma.
# Convert the currency-formatted text columns to numbers and derive a price
# tier from MSRP.
cars_data <- cars_data %>%
  mutate(
    # Remove the dollar sign and thousands separators before converting.
    MSRP = as.numeric(gsub("[\\$,]", "", MSRP)),
    DealerCost = as.numeric(gsub("[\\$,]", "", DealerCost)),
    Price_Category = case_when(
      # A missing price must stay missing: without this branch both comparisons
      # below evaluate to NA and the row falls through to "Budget".
      is.na(MSRP) ~ NA_character_,
      MSRP > 50000 ~ "High-End",
      MSRP > 25000 ~ "Mid-Range",
      TRUE ~ "Budget"
    )
  )
Insight: MSRP dan DealerCost diubah menjadi numerik agar bisa dianalisis. Kategori harga dibuat berdasarkan MSRP.
# Plot-ready table: keep rows with a known price and city MPG, turn the price
# tier into an ordered factor, and bin city MPG into five labelled ranges.
cars_data_long <- cars_data %>%
  filter(!(is.na(MSRP) | is.na(MPG_City))) %>%
  mutate(
    Price_Category = ordered(
      Price_Category,
      levels = c("Budget", "Mid-Range", "High-End")
    ),
    MPG_Group = cut(
      MPG_City,
      breaks = c(0, 15, 25, 35, 45, Inf),
      labels = c("0-15", "15-25", "25-35", "35-45", "45+"),
      include.lowest = TRUE
    )
  )
Insight: Kita mengelompokkan efisiensi bahan bakar (MPG_City) ke dalam 5 kelompok. Data difilter untuk menghindari nilai NA yang mengganggu visualisasi.
# City MPG against price (in thousands of USD), one panel per price tier,
# with points coloured by MPG group and a loess trend line per panel.
ggplot(cars_data_long, aes(x = MSRP / 1000, y = MPG_City)) +
  geom_point(aes(color = MPG_Group), alpha = 0.8, size = 3) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1.5, color = "blue") +
  scale_color_manual(
    values = c("#1a1a1a", "#4d4d4d", "#800000", "#b30000", "#ff0000")
  ) +
  facet_wrap(~ Price_Category, nrow = 1) +
  scale_x_continuous(labels = dollar_format(prefix = "$", suffix = "k")) +
  # Anchor the axis at 0 but leave the upper limit open (NA). A fixed cap of 50
  # removed the cars above 50 MPG from both the points and the smoother.
  scale_y_continuous(limits = c(0, NA)) +
  labs(
    title = "Efisiensi vs Harga Mobil",
    x = "Harga (ribuan USD)",
    y = "MPG Kota"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Insight: Mobil di kategori harga tinggi cenderung memiliki efisiensi bahan bakar lebih rendah, kemungkinan karena mesin yang lebih besar dan performa tinggi.
# Box plots of city MPG for each drivetrain type.
ggplot(
  data = cars_data_long,
  mapping = aes(x = DriveTrain, y = MPG_City, fill = DriveTrain)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("MPG Berdasarkan Drive Type") +
  xlab("Drive Type") +
  ylab("MPG")
Insight: Mobil berpenggerak roda depan (Front/FWD) umumnya lebih efisien dibanding mobil berpenggerak semua roda (All/AWD).
# Box plots of horsepower for each drivetrain type.
ggplot(
  data = cars_data_long,
  mapping = aes(x = DriveTrain, y = HorsePower, fill = DriveTrain)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Horsepower Berdasarkan Drive Type") +
  xlab("Drive Type") +
  ylab("Horsepower")
Insight: Mobil berpenggerak semua roda (All/AWD) cenderung memiliki horsepower yang lebih tinggi.
# Scatter plot of horsepower against price, coloured by brand.
ggplot(
  data = cars_data_long,
  mapping = aes(x = MSRP, y = HorsePower, color = Brand)
) +
  geom_point(size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    y = "Horsepower",
    x = "Harga (USD)",
    title = "Harga vs Horsepower"
  )
Insight: Ada korelasi positif antara harga dan horsepower.
# Scatter plot of engine size against price, coloured by brand, with a linear
# fit drawn in black on top of the points.
ggplot(
  data = cars_data_long,
  mapping = aes(x = MSRP, y = EngineSize, color = Brand)
) +
  geom_point(size = 3, alpha = 0.6) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  theme_minimal() +
  ggtitle("Harga vs Ukuran Mesin") +
  xlab("Harga (USD)") +
  ylab("Engine Size (L)")
## `geom_smooth()` using formula = 'y ~ x'
Insight: Mobil dengan mesin lebih besar biasanya lebih mahal.
# Horizontal box plots of price per brand, with brands ordered by median MSRP.
# The fill legend is hidden.
ggplot(
  data = cars_data_long,
  mapping = aes(
    x = reorder(Brand, MSRP, FUN = median),
    y = MSRP,
    fill = Brand
  )
) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Distribusi Harga per Brand") +
  xlab("Brand") +
  ylab("Harga (USD)") +
  theme_minimal() +
  theme(legend.position = "none")
# Horizontal box plots of city MPG per brand, with brands ordered by median
# city MPG. The fill legend is hidden.
ggplot(
  data = cars_data_long,
  mapping = aes(
    x = reorder(Brand, MPG_City, FUN = median),
    y = MPG_City,
    fill = Brand
  )
) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Distribusi MPG per Brand") +
  xlab("Brand") +
  ylab("MPG Kota") +
  theme_minimal() +
  theme(legend.position = "none")
Insight: Beberapa brand fokus pada efisiensi, sedangkan yang lain fokus pada performa.