# Load required libraries
library(readr)
library(ggplot2)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(outliers)
library(viridis)
## Loading required package: viridisLite
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(Rtsne)
library(forcats)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:readr':
##
## col_factor
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(umap)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(cluster)
library(stats)
library(broom)
library(RColorBrewer)
# Download the dataset from GitHub
url <- "https://raw.githubusercontent.com/fongbubble/UoB_MGRCM0034_Car_Sales/main/car_sales.csv"
csv_file_path <- tempfile(fileext = ".csv")
download.file(url, destfile = csv_file_path)
print(paste("CSV File Path:", csv_file_path))
## [1] "CSV File Path: /var/folders/r3/_4zcy19s7x70_8k48tjkyqch0000gn/T//Rtmp0VIX8Y/filea6722527caaf.csv"
# Read the CSV file
df <- read_csv(csv_file_path)
## Rows: 23906 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): Car_id, Date, Customer Name, Gender, Dealer_Name, Company, Model, ...
## dbl (3): Annual Income, Price ($), Phone
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df, 5)
## # A tibble: 5 × 16
## Car_id Date `Customer Name` Gender `Annual Income` Dealer_Name Company Model
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 C_CND_… 1/2/… Geraldine Male 13500 Buddy Stor… Ford Expe…
## 2 C_CND_… 1/2/… Gia Male 1480000 C & M Moto… Dodge Dura…
## 3 C_CND_… 1/2/… Gianna Male 1035000 Capitol KIA Cadill… Eldo…
## 4 C_CND_… 1/2/… Giselle Male 13500 Chrysler o… Toyota Celi…
## 5 C_CND_… 1/2/… Grace Male 1465000 Chrysler P… Acura TL
## # ℹ 8 more variables: Engine <chr>, Transmission <chr>, Color <chr>,
## # `Price ($)` <dbl>, Dealer_No <chr>, `Body Style` <chr>, Phone <dbl>,
## # Dealer_Region <chr>
# Convert Date to proper Date type for m/d/y format
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
# Replace '5-Sep' with '9-5' and '3-Sep' with '9-3' in the 'Model' column using mutate
df <- df %>%
mutate(Model = gsub('5-Sep', '9-5', gsub('3-Sep', '9-3', Model)))
colnames(df)[colnames(df) == "Price ($)"] <- "Price"
colnames(df)[colnames(df) == "Annual Income"] <- "Annual_Income"
# Count the number of missing values in each column
missing_values_per_column <- colSums(is.na(df))
print(missing_values_per_column)
## Car_id Date Customer Name Gender Annual_Income
## 0 0 0 0 0
## Dealer_Name Company Model Engine Transmission
## 0 0 0 0 0
## Color Price Dealer_No Body Style Phone
## 0 0 0 0 0
## Dealer_Region
## 0
str(df)
## tibble [23,906 × 16] (S3: tbl_df/tbl/data.frame)
## $ Car_id : chr [1:23906] "C_CND_000001" "C_CND_000002" "C_CND_000003" "C_CND_000004" ...
## $ Date : Date[1:23906], format: "2022-01-02" "2022-01-02" ...
## $ Customer Name: chr [1:23906] "Geraldine" "Gia" "Gianna" "Giselle" ...
## $ Gender : chr [1:23906] "Male" "Male" "Male" "Male" ...
## $ Annual_Income: num [1:23906] 13500 1480000 1035000 13500 1465000 ...
## $ Dealer_Name : chr [1:23906] "Buddy Storbeck's Diesel Service Inc" "C & M Motors Inc" "Capitol KIA" "Chrysler of Tri-Cities" ...
## $ Company : chr [1:23906] "Ford" "Dodge" "Cadillac" "Toyota" ...
## $ Model : chr [1:23906] "Expedition" "Durango" "Eldorado" "Celica" ...
## $ Engine : chr [1:23906] "Double Overhead Camshaft" "Double Overhead Camshaft" "Overhead Camshaft" "Overhead Camshaft" ...
## $ Transmission : chr [1:23906] "Auto" "Auto" "Manual" "Manual" ...
## $ Color : chr [1:23906] "Black" "Black" "Red" "Pale White" ...
## $ Price : num [1:23906] 26000 19000 31500 14000 24500 12000 14000 42000 82000 15000 ...
## $ Dealer_No : chr [1:23906] "06457-3834" "60504-7114" "38701-8047" "99301-3882" ...
## $ Body Style : chr [1:23906] "SUV" "SUV" "Passenger" "SUV" ...
## $ Phone : num [1:23906] 8264678 6848189 7298798 6257557 7081483 ...
## $ Dealer_Region: chr [1:23906] "Middletown" "Aurora" "Greenville" "Pasco" ...
# Check for missing values in the dataset
sum(is.na(df))
## [1] 0
# Check for duplicate rows in the dataset
sum(duplicated(df))
## [1] 0
# Display unique summary for each column
summary(df)
## Car_id Date Customer Name Gender
## Length:23906 Min. :2022-01-02 Length:23906 Length:23906
## Class :character 1st Qu.:2022-09-20 Class :character Class :character
## Mode :character Median :2023-03-13 Mode :character Mode :character
## Mean :2023-03-01
## 3rd Qu.:2023-09-08
## Max. :2023-12-31
## Annual_Income Dealer_Name Company Model
## Min. : 10080 Length:23906 Length:23906 Length:23906
## 1st Qu.: 386000 Class :character Class :character Class :character
## Median : 735000 Mode :character Mode :character Mode :character
## Mean : 830840
## 3rd Qu.: 1175750
## Max. :11200000
## Engine Transmission Color Price
## Length:23906 Length:23906 Length:23906 Min. : 1200
## Class :character Class :character Class :character 1st Qu.:18001
## Mode :character Mode :character Mode :character Median :23000
## Mean :28090
## 3rd Qu.:34000
## Max. :85800
## Dealer_No Body Style Phone Dealer_Region
## Length:23906 Length:23906 Min. :6000101 Length:23906
## Class :character Class :character 1st Qu.:6746495 Class :character
## Mode :character Mode :character Median :7496198 Mode :character
## Mean :7497741
## 3rd Qu.:8248146
## Max. :8999579
# List of categorical columns to encode
categorical_columns <- c("Car_id", "Customer Name", "Gender", "Dealer_Name",
"Company", "Model", "Engine", "Transmission",
"Color", "Dealer_No", "Body Style", "Dealer_Region")
# Apply label encoding to each categorical column
df_encoded <- df %>%
mutate(across(all_of(categorical_columns), ~ as.integer(factor(.))))
# Exclude the 'Date' column from df_encoded using base R
df_numeric <- df_encoded[, !names(df_encoded) %in% "Date"]
# Use the preProcess function from the caret package to standardize the data
preProc <- preProcess(df_numeric, method = c("center", "scale"))
df_std <- predict(preProc, newdata = df_numeric)
df_selected <- df_std[, c("Annual_Income", "Price", "Company", "Model")]
df_selected <- unique(df_selected) # Remove duplicate rows
# Calculate the average price by month over years
df_monthly_avg <- df %>%
group_by(Month = floor_date(Date, "month")) %>%
summarise(Average_Price = mean(Price, na.rm = TRUE)) %>% filter(!is.na(Average_Price))
# Step 2: Plot the time series with color changes and a regression line
# Define segments for rising and falling lines
ggplot(df_monthly_avg, aes(x = Month, y = Average_Price)) +
geom_segment(data = df_monthly_avg %>% mutate(Next_Avg = lead(Average_Price), Change = ifelse(Average_Price < lead(Average_Price), 'Up', 'Down')),
aes(xend = lead(Month), yend = Next_Avg, color = Change),
size = 1) +
scale_color_manual(values = c("Up" = "green", "Down" = "red")) +
geom_smooth(method = "lm", color = "darkgoldenrod", linetype = "dashed", se = FALSE) + # Add regression line in yellow
theme_minimal() +
ggtitle("Average Monthly Price Over Years") +
xlab("Month") +
ylab("Average Price") +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_segment()`).

# Filter the top 5 popular car brands
top_5_brands <- names(sort(table(df$Company), decreasing = TRUE)[1:5])
df_top_5 <- df[df$Company %in% top_5_brands, ]
# Create a boxplot for the top 5 popular car brands to visualize price distribution
ggplot(df_top_5, aes(x = Company, y = Price, fill = Company)) +
scale_fill_brewer(palette = "Spectral") +
geom_boxplot() +
ggtitle("Price Distribution of Top 5 Popular Car Brands") +
xlab("Car Company") +
ylab("Price") +
theme_minimal() +
theme(
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
legend.position = "none"
)

# Summarize data by Company
company_summary <- df %>%
group_by(Company) %>%
summarise(
Avg_Price = mean(Price, na.rm = TRUE),
Avg_Annual_Income = mean(Annual_Income, na.rm = TRUE),
Count = n() # Frequency of each Company
)
# Plot Bubble Chart with Annual Income, Price, and Company frequency as bubble size
ggplot(company_summary, aes(x = Avg_Annual_Income, y = Avg_Price, size = Count, fill = Company)) +
geom_point(alpha = 0.6, shape = 21, color = "black") +
scale_size(range = c(3, 15)) +
scale_fill_viridis_d(option = "plasma") +
theme_minimal() +
ggtitle("Bubble Chart of Price vs Annual Income by Company") +
labs(x = "Average Annual Income ($)", y = "Average Price ($)", size = "Number of Cars", fill = "Company") +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 13, face = "bold")
)

# Calculate the correlation matrix for standardized data
cor_matrix <- cor(df_std, use = "pairwise.complete.obs")
cor_melted <- melt(cor_matrix)
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value, size = abs(value))) +
geom_point(shape = 21, color = "black", alpha = 0.8) +
scale_fill_viridis(option = "plasma", limits = c(-1, 1), guide = "colorbar") +
scale_size(range = c(1, 7)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "right"
) +
labs(
title = "Bubble Correlation Heatmap",
x = "",
y = "",
fill = "Correlation",
size = "Absolute Correlation"
)

# Rank Companies by Count (Frequency)
company_count <- df %>%
group_by(Company) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
mutate(Rank = row_number())
# Plot the ranking of companies with Plasma colors indicating rank
ggplot(company_count, aes(x = reorder(Company, -Count), y = Count, fill = Rank)) +
geom_bar(stat = "identity") +
scale_fill_viridis_c(option = "plasma", direction = -1) +
theme_minimal() +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
legend.position = "right"
) +
ggtitle("Ranking of Car Brand Popularity") +
labs(x = "Car Brand", y = "Amount", fill = "Rank")

body_transmission_summary <- df %>%
group_by(`Body Style`, Transmission) %>%
summarise(Count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Body Style'. You can override using the
## `.groups` argument.
# Plot the stacked bar chart using Spectral colors
ggplot(body_transmission_summary, aes(x = `Body Style`, y = Count, fill = Transmission)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_brewer(palette = "Spectral") +
theme_minimal() +
ggtitle("Stacked Bar Chart of Body Style by Transmission Type") +
labs(x = "Body Style", y = "Count", fill = "Transmission") +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
)

set.seed(24150)
# Run t-SNE on the Selected Variables
tsne_results <- Rtsne(
df_selected,
dims = 2,
perplexity = 30,
verbose = TRUE,
max_iter = 500
)
## Performing PCA
## Read the 19235 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## - point 10000 of 19235
## Done in 1.35 seconds (sparsity = 0.005858)!
## Learning embedding...
## Iteration 50: error is 106.677403 (50 iterations in 2.76 seconds)
## Iteration 100: error is 99.390975 (50 iterations in 4.32 seconds)
## Iteration 150: error is 83.737977 (50 iterations in 2.59 seconds)
## Iteration 200: error is 79.587865 (50 iterations in 2.54 seconds)
## Iteration 250: error is 77.274703 (50 iterations in 2.61 seconds)
## Iteration 300: error is 3.136290 (50 iterations in 2.53 seconds)
## Iteration 350: error is 2.689448 (50 iterations in 2.55 seconds)
## Iteration 400: error is 2.377166 (50 iterations in 2.62 seconds)
## Iteration 450: error is 2.143877 (50 iterations in 2.69 seconds)
## Iteration 500: error is 1.962691 (50 iterations in 2.71 seconds)
## Fitting performed in 27.91 seconds.
# Extract t-SNE Embeddings and Apply K-Means Clustering
embedding <- as.data.frame(tsne_results$Y)
colnames(embedding) <- c("Dim1", "Dim2")
# Apply K-Means clustering to the t-SNE results to identify clusters
kmeans_res <- kmeans(embedding, centers = 9)
embedding$Cluster <- factor(kmeans_res$cluster)
custom_cluster_names <- c(
"Mid-Income, Brand-Focused Buyers",
"High-Income, Luxury Seekers",
"High-Income, Cost-Conscious Buyers",
"Low-Income, Practical Buyers",
"Low-Income, Value-Driven Buyers",
"Mid-Income, Budget-Conscious Buyers",
"High-Income, Brand-Focused Buyers",
"Low-Income, Budget-Focused Buyers",
"Mid-Income, Practical Buyers"
)
cluster_order <- c(
# High-Income Groups
"High-Income, Luxury Seekers",
"High-Income, Brand-Focused Buyers",
"High-Income, Cost-Conscious Buyers",
# Mid-Income Groups
"Mid-Income, Brand-Focused Buyers",
"Mid-Income, Practical Buyers",
"Mid-Income, Budget-Conscious Buyers",
# Low-Income Groups
"Low-Income, Value-Driven Buyers",
"Low-Income, Practical Buyers",
"Low-Income, Budget-Focused Buyers"
)
embedding$Cluster <- factor(embedding$Cluster, levels = 1:9, labels = custom_cluster_names)
embedding$Cluster <- factor(embedding$Cluster, levels = cluster_order)
# Visualize the t-SNE Results Using ggplot2
ggplot(embedding, aes(x = Dim1, y = Dim2, color = Cluster)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_brewer(palette = "Spectral") +
ggtitle("t-SNE with K-means Clustering") +
theme_minimal() +
theme(
axis.title = element_text(size = 12),
legend.title = element_text(size = 12),
legend.position = "right",
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
) +
guides(
color = guide_legend(title = "Cluster Groups", override.aes = list(order = cluster_order))
)

set.seed(24150)
# Perform PCA on the selected variables matrix
pca_result <- prcomp(df_selected, center = TRUE, scale. = TRUE)
# Extract the first two principal components
pca_data <- as.data.frame(pca_result$x[, 1:2])
colnames(pca_data) <- c("PC1", "PC2")
# Flip the direction of PC1 or PC2
pca_data$PC1 <- -pca_data$PC1
pca_data$PC2 <- -pca_data$PC2
# Apply K-Means clustering on the PCA results
kmeans_res <- kmeans(pca_data, centers = 9)
pca_data$Cluster <- factor(kmeans_res$cluster)
custom_cluster_names <- c(
"High-Income, Cost-Conscious Buyers",
"Mid-Income, Practical Buyers",
"High-Income, Brand-Focused Buyers",
"Low-Income, Practical Buyers",
"Mid-Income, Budget-Conscious Buyers",
"Low-Income, Budget-Focused Buyers",
"High-Income, Luxury Seekers",
"Low-Income, Value-Driven Buyers",
"Mid-Income, Brand-Focused Buyers"
)
cluster_order <- c(
# High-Income Groups
"High-Income, Luxury Seekers",
"High-Income, Brand-Focused Buyers",
"High-Income, Cost-Conscious Buyers",
# Mid-Income Groups
"Mid-Income, Brand-Focused Buyers",
"Mid-Income, Practical Buyers",
"Mid-Income, Budget-Conscious Buyers",
# Low-Income Groups
"Low-Income, Value-Driven Buyers",
"Low-Income, Practical Buyers",
"Low-Income, Budget-Focused Buyers"
)
pca_data$Cluster <- factor(pca_data$Cluster, levels = 1:9, labels = custom_cluster_names)
pca_data$Cluster <- factor(pca_data$Cluster, levels = cluster_order)
# Add Centroids to the PCA Data
centroids <- as.data.frame(kmeans_res$centers)
colnames(centroids) <- c("PC1", "PC2")
centroids$Cluster <- factor(1:9, labels = custom_cluster_names)
centroids$Cluster <- factor(centroids$Cluster, levels = cluster_order)
# Create a Grid for Decision Boundaries
grid_size <- 0.05
x_min <- min(pca_data$PC1) - 1
x_max <- max(pca_data$PC1) + 1
y_min <- min(pca_data$PC2) - 1
y_max <- max(pca_data$PC2) + 1
# Create a grid of points covering the range of the data
grid <- expand.grid(PC1 = seq(x_min, x_max, by = grid_size),
PC2 = seq(y_min, y_max, by = grid_size))
# Custom function to predict clusters based on centroids using Euclidean distance
predict_kmeans <- function(new_data, centroids) {
distance_matrix <- matrix(NA, nrow = nrow(new_data), ncol = nrow(centroids))
for (i in 1:nrow(centroids)) {
distance_matrix[, i] <- sqrt((new_data$PC1 - centroids$PC1[i])^2 +
(new_data$PC2 - centroids$PC2[i])^2)
}
cluster_assignments <- max.col(-distance_matrix) # Negative for finding the minimum distance
return(cluster_assignments)
}
# Predict cluster assignment for each grid point
grid$Cluster <- factor(predict_kmeans(grid, centroids), levels = 1:9, labels = custom_cluster_names)
grid$Cluster <- factor(grid$Cluster, levels = cluster_order)
# Visualize the PCA with K-Means clustering using ggplot2
ggplot() +
geom_tile(data = grid, aes(x = PC1, y = PC2, fill = Cluster), alpha = 0.15) + # Use lighter fill for cluster areas
geom_point(data = pca_data, aes(x = PC1, y = PC2, color = Cluster), alpha = 0.8, size = 2.5) +
scale_fill_brewer(palette = "Spectral") +
scale_color_brewer(palette = "Spectral") +
# Plot centroids with red star markers to clearly denote them
geom_point(data = centroids, aes(x = PC1, y = PC2), size = 4, shape = 8, color = "red") +
ggtitle("PCA with K-Means Clustering") +
xlab("PC1: Car Preference & Affordability Component") +
ylab("PC2: Income Component") +
theme_minimal() +
theme(
axis.title = element_text(size = 14),
legend.title = element_text(size = 12),
legend.position = "right",
plot.title = element_text(hjust = 0.5, size = 18, face = "bold")
) +
guides(
fill = guide_legend(title = "Cluster Areas and Labels"),
color = guide_legend(title = "Cluster Areas and Labels")
)

loadings <- as.data.frame(pca_result$rotation[, 1:2]) # Extract loadings for PC1 and PC2
# Flipping the signs of PC1 and/or PC2
loadings$PC1 <- -loadings$PC1
loadings$PC2 <- -loadings$PC2
loadings$Variable <- rownames(loadings)
loadings_long <- loadings %>%
pivot_longer(cols = c(PC1, PC2), names_to = "Component", values_to = "Contribution")
# Define the color palette using the Spectral palette
spectral_palette <- brewer.pal(n = length(unique(loadings_long$Variable)), name = "Spectral")
# Plot bar chart to visualize the contribution of each variable to PC1 and PC2
ggplot(loadings_long, aes(x = Component, y = Contribution, fill = Variable)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7) +
scale_fill_manual(values = spectral_palette) + # Apply Spectral palette
ggtitle("Feature Contribution to Each Principal Component") +
theme_minimal() +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 0.5),
legend.title = element_text(size = 12),
legend.position = "right",
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
) +
labs(x = "Principal Components", y = "Contribution") +
guides(fill = guide_legend(title = "Features"))

# Combine original variables with PCA cluster information
selected_vars <- as.data.frame(df_selected)
selected_vars$Cluster <- pca_data$Cluster
# Calculate average value of each variable for each cluster group
cluster_summary <- selected_vars %>%
group_by(Cluster) %>%
summarise(across(c(Annual_Income, Price), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(c(Annual_Income, Price), mean, na.rm = TRUE)`.
## ℹ In group 1: `Cluster = "High-Income, Luxury Seekers"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# Reshape the data to long format for plotting
cluster_summary_long <- cluster_summary %>%
pivot_longer(cols = -Cluster, names_to = "Variable", values_to = "Average_Value")
# Plot the bar chart showing the average value of each variable for each cluster group
ggplot(cluster_summary_long, aes(x = Variable, y = Average_Value, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Spectral") +
theme_minimal() +
ggtitle("Annual Income and Price Across Cluster Groups") +
xlab("Variables") +
ylab("Value") +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
legend.title = element_text(size = 12),
legend.position = "right"
)
