library(cluster)
## Warning: package 'cluster' was built under R version 4.2.3
data <- data.frame(
Name = c("Kia Picanto 1.1. Start", "Suzuki Splash 1.0", "Renault Clio 1.0",
"Dacia Sandero 1.6", "Fiat Grande Punto 1.4", "Peugot 207 1.4",
"Renault Clio 1.6", "Porsche Cayman", "Nissan 350Z", "Mercedes c200 CDI",
"VW Passat Variant 2.0", "Skoda Octavia 2.0", "Mercedes E280", "Audi A6 2.4",
"BMW 525i"),
Displacement = c(1086, 996, 1149, 1598, 1598, 1360, 1368, 3386, 3498, 2148,
1968, 1968, 2996, 2393, 2497),
Moment = c(97, 90, 105, 128, 140, 133, 125, 340, 353, 270, 320, 320, 300, 230, 250),
Horsepower = c(65, 65, 75, 87, 88, 88, 95, 295, 301, 136, 140, 140, 231, 177, 218),
Length = c(3535, 3715, 3986, 4020, 3986, 4030, 4030, 4341, 4315, 4595, 4774, 4572, 4852, 4916, 4841),
Width = c(1595, 1680, 1719, 1746, 1719, 1748, 1687, 1801, 1815, 1770, 1820, 1769, 1822, 1855, 1846),
Weight = c(929, 1050, 1155, 1111, 1215, 1214, 1135, 1340, 1610, 1605, 1596, 1425, 1660, 1525, 1550),
Trunk = c(127, 178, 288, 320, 288, 270, 275, 410, 235, 485, 588, 580, 540, 546, 520),
Speed = c(154, 160, 167, 174, 177, 180, 178, 275, 250, 208, 201, 207, 250, 231, 245),
Acceleration = c(15.10, 14.70, 13.40, 11.50, 11.90, 12.70, 11.40, 5.40, 5.80, 10.80,
10.50, 9.70, 7.30, 8.90, 7.50)
)
print(data)
## Name Displacement Moment Horsepower Length Width Weight
## 1 Kia Picanto 1.1. Start 1086 97 65 3535 1595 929
## 2 Suzuki Splash 1.0 996 90 65 3715 1680 1050
## 3 Renault Clio 1.0 1149 105 75 3986 1719 1155
## 4 Dacia Sandero 1.6 1598 128 87 4020 1746 1111
## 5 Fiat Grande Punto 1.4 1598 140 88 3986 1719 1215
## 6 Peugot 207 1.4 1360 133 88 4030 1748 1214
## 7 Renault Clio 1.6 1368 125 95 4030 1687 1135
## 8 Porsche Cayman 3386 340 295 4341 1801 1340
## 9 Nissan 350Z 3498 353 301 4315 1815 1610
## 10 Mercedes c200 CDI 2148 270 136 4595 1770 1605
## 11 VW Passat Variant 2.0 1968 320 140 4774 1820 1596
## 12 Skoda Octavia 2.0 1968 320 140 4572 1769 1425
## 13 Mercedes E280 2996 300 231 4852 1822 1660
## 14 Audi A6 2.4 2393 230 177 4916 1855 1525
## 15 BMW 525i 2497 250 218 4841 1846 1550
## Trunk Speed Acceleration
## 1 127 154 15.1
## 2 178 160 14.7
## 3 288 167 13.4
## 4 320 174 11.5
## 5 288 177 11.9
## 6 270 180 12.7
## 7 275 178 11.4
## 8 410 275 5.4
## 9 235 250 5.8
## 10 485 208 10.8
## 11 588 201 10.5
## 12 580 207 9.7
## 13 540 250 7.3
## 14 546 231 8.9
## 15 520 245 7.5
str(data)
## 'data.frame': 15 obs. of 10 variables:
## $ Name : chr "Kia Picanto 1.1. Start" "Suzuki Splash 1.0" "Renault Clio 1.0" "Dacia Sandero 1.6" ...
## $ Displacement: num 1086 996 1149 1598 1598 ...
## $ Moment : num 97 90 105 128 140 133 125 340 353 270 ...
## $ Horsepower : num 65 65 75 87 88 88 95 295 301 136 ...
## $ Length : num 3535 3715 3986 4020 3986 ...
## $ Width : num 1595 1680 1719 1746 1719 ...
## $ Weight : num 929 1050 1155 1111 1215 ...
## $ Trunk : num 127 178 288 320 288 270 275 410 235 485 ...
## $ Speed : num 154 160 167 174 177 180 178 275 250 208 ...
## $ Acceleration: num 15.1 14.7 13.4 11.5 11.9 12.7 11.4 5.4 5.8 10.8 ...
correlation_matrix <- cor(data[, -1]) # Exclude the Name column
# Print correlation matrix
print(correlation_matrix)
## Displacement Moment Horsepower Length Width
## Displacement 1.0000000 0.8752790 0.9833009 0.6567364 0.7643470
## Moment 0.8752790 1.0000000 0.8466490 0.7665564 0.7657827
## Horsepower 0.9833009 0.8466490 1.0000000 0.6082133 0.7319137
## Length 0.6567364 0.7665564 0.6082133 1.0000000 0.9116335
## Width 0.7643470 0.7657827 0.7319137 0.9116335 1.0000000
## Weight 0.7678926 0.8618937 0.7143511 0.9213978 0.8837887
## Trunk 0.4698137 0.6908975 0.4079151 0.9342425 0.7833397
## Speed 0.9669150 0.8592533 0.9683902 0.7411032 0.8192538
## Acceleration -0.9685791 -0.8609295 -0.9614512 -0.7142153 -0.8182177
## Weight Trunk Speed Acceleration
## Displacement 0.7678926 0.4698137 0.9669150 -0.9685791
## Moment 0.8618937 0.6908975 0.8592533 -0.8609295
## Horsepower 0.7143511 0.4079151 0.9683902 -0.9614512
## Length 0.9213978 0.9342425 0.7411032 -0.7142153
## Width 0.8837887 0.7833397 0.8192538 -0.8182177
## Weight 1.0000000 0.7854973 0.7783739 -0.7627800
## Trunk 0.7854973 1.0000000 0.5789343 -0.5521259
## Speed 0.7783739 0.5789343 1.0000000 -0.9709323
## Acceleration -0.7627800 -0.5521259 -0.9709323 1.0000000
##Computed the correlation matrix for the variables in the dataset,
excluding the “Name” column. Printed the correlation matrix to examine
the relationships between different variables.Displayed the correlation
matrix as a table using the kable function from the
knitr package, formatting it for easy readability. Excluded
the 4th column (Horsepower) from the dataset using the line
data <- data[, -4]. This workflow suggests that you have
analyzed the correlation between various attributes of vehicles in the
dataset, observed strong correlations between some variables, and
removed the “Horsepower” variable from the dataset,due to
multicollinearity.
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.2.3
kable(round(correlation_matrix, 2), caption = "Correlation Matrix", format = "pipe", digits = 2)
| Displacement | Moment | Horsepower | Length | Width | Weight | Trunk | Speed | Acceleration | |
|---|---|---|---|---|---|---|---|---|---|
| Displacement | 1.00 | 0.88 | 0.98 | 0.66 | 0.76 | 0.77 | 0.47 | 0.97 | -0.97 |
| Moment | 0.88 | 1.00 | 0.85 | 0.77 | 0.77 | 0.86 | 0.69 | 0.86 | -0.86 |
| Horsepower | 0.98 | 0.85 | 1.00 | 0.61 | 0.73 | 0.71 | 0.41 | 0.97 | -0.96 |
| Length | 0.66 | 0.77 | 0.61 | 1.00 | 0.91 | 0.92 | 0.93 | 0.74 | -0.71 |
| Width | 0.76 | 0.77 | 0.73 | 0.91 | 1.00 | 0.88 | 0.78 | 0.82 | -0.82 |
| Weight | 0.77 | 0.86 | 0.71 | 0.92 | 0.88 | 1.00 | 0.79 | 0.78 | -0.76 |
| Trunk | 0.47 | 0.69 | 0.41 | 0.93 | 0.78 | 0.79 | 1.00 | 0.58 | -0.55 |
| Speed | 0.97 | 0.86 | 0.97 | 0.74 | 0.82 | 0.78 | 0.58 | 1.00 | -0.97 |
| Acceleration | -0.97 | -0.86 | -0.96 | -0.71 | -0.82 | -0.76 | -0.55 | -0.97 | 1.00 |
data <- data[, -4] # Exclude the 4th column (Horsepower)
data
## Name Displacement Moment Length Width Weight Trunk Speed
## 1 Kia Picanto 1.1. Start 1086 97 3535 1595 929 127 154
## 2 Suzuki Splash 1.0 996 90 3715 1680 1050 178 160
## 3 Renault Clio 1.0 1149 105 3986 1719 1155 288 167
## 4 Dacia Sandero 1.6 1598 128 4020 1746 1111 320 174
## 5 Fiat Grande Punto 1.4 1598 140 3986 1719 1215 288 177
## 6 Peugot 207 1.4 1360 133 4030 1748 1214 270 180
## 7 Renault Clio 1.6 1368 125 4030 1687 1135 275 178
## 8 Porsche Cayman 3386 340 4341 1801 1340 410 275
## 9 Nissan 350Z 3498 353 4315 1815 1610 235 250
## 10 Mercedes c200 CDI 2148 270 4595 1770 1605 485 208
## 11 VW Passat Variant 2.0 1968 320 4774 1820 1596 588 201
## 12 Skoda Octavia 2.0 1968 320 4572 1769 1425 580 207
## 13 Mercedes E280 2996 300 4852 1822 1660 540 250
## 14 Audi A6 2.4 2393 230 4916 1855 1525 546 231
## 15 BMW 525i 2497 250 4841 1846 1550 520 245
## Acceleration
## 1 15.1
## 2 14.7
## 3 13.4
## 4 11.5
## 5 11.9
## 6 12.7
## 7 11.4
## 8 5.4
## 9 5.8
## 10 10.8
## 11 10.5
## 12 9.7
## 13 7.3
## 14 8.9
## 15 7.5
# Standardize the variables
scaled_data <- scale(data[, -1]) # Exclude the Name column and standardize
hc <- hclust(dist(scaled_data), method = "ward.D2")
within_cluster_variation <- hc$height
print(within_cluster_variation)
## [1] 0.5965032 0.7015980 0.7301346 0.9293429 1.0138401 1.1507777
## [7] 1.1988690 1.3836543 1.4157920 1.7481772 3.0924637 3.7398755
## [13] 4.8404522 12.8066644
dist_matrix <- dist(scaled_data, method = 'euclidean')
# Plot the hierarchical clustering dendrogram
plot(hc, labels = data$Name, main = 'Hierarchical Clustering with Single Linkage', hang = -1, cex = 0.6)
# Add a horizontal line at the height of the tallest merge
abline(h = hc$height[length(hc$height) - 1], col = "red", lwd = 2, lty = 2)
# Add a legend for the line
legend("topright", legend = "Tallest Merge", col = "red", lty = 2, lwd = 2, bty = "n")
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
# Calculate PCA
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
# Calculate variance explained
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
# Plot the scree plot
plot(var_explained,
xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = 'b',
pch = 19,
col = "blue") # Change color to blue
# Add grid lines
grid(col = "gray")
# Add vertical line at the elbow point
abline(v = which.max(diff(diff(var_explained))),
col = "red",
lty = 2)
# Add title with enhanced formatting
title(main = "Scree Plot of PCA Results",
sub = "Showing Proportion of Variance Explained by Principal Components",
col.main = "darkblue",
col.sub = "gray",
font.main = 2,
font.sub = 3)
# Add legend
legend("topright",
legend = "Elbow Point",
col = "red",
lty = 2,
lwd = 2,
bty = "n",
cex = 0.8)
K <- 2
kM <- kmeans(scaled_data, centers = K)
print(kM)
## K-means clustering with 2 clusters of sizes 8, 7
##
## Cluster means:
## Displacement Moment Length Width Weight Trunk
## 1 0.7444185 0.8507027 0.7955259 0.7390952 0.8213619 0.7173000
## 2 -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
## Speed Acceleration
## 1 0.7747260 -0.7330157
## 2 -0.8854012 0.8377322
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 20.608874 9.385799
## (between_SS / total_SS = 73.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Add cluster membership to the original dataset
km_clusters <- kM$cluster
# Display cluster centers
rownames(scaled_data) <- data$Name
print(scaled_data)
## Displacement Moment Length Width
## Kia Picanto 1.1. Start -1.12322877 -1.1722023 -1.73892815 -2.3029338
## Suzuki Splash 1.0 -1.23375861 -1.2426956 -1.33005364 -1.1127268
## Renault Clio 1.0 -1.04585789 -1.0916386 -0.71447035 -0.5666319
## Dacia Sandero 1.6 -0.49443681 -0.8600178 -0.63723850 -0.1885661
## Fiat Grande Punto 1.4 -0.49443681 -0.7391722 -0.71447035 -0.5666319
## Peugot 207 1.4 -0.78672682 -0.8096655 -0.61452325 -0.1605613
## Renault Clio 1.6 -0.77690195 -0.8902292 -0.61452325 -1.0147098
## Porsche Cayman 1.70142263 1.2749211 0.09192105 0.5815678
## Nissan 350Z 1.83897088 1.4058371 0.03286140 0.7776019
## Mercedes c200 CDI 0.18102331 0.5699884 0.66888841 0.1474923
## VW Passat Variant 2.0 -0.04003636 1.0735117 1.07549140 0.8476141
## Skoda Octavia 2.0 -0.04003636 1.0735117 0.61664334 0.1334899
## Mercedes E280 1.22246000 0.8721024 1.25267035 0.8756189
## Audi A6 2.4 0.48191009 0.1671697 1.39804796 1.3376993
## BMW 525i 0.60963346 0.3685791 1.22768358 1.2116774
## Weight Trunk Speed Acceleration
## Kia Picanto 1.1. Start -1.71444790 -1.6085559 -1.30452598 1.55089812
## Suzuki Splash 1.0 -1.21133991 -1.2799724 -1.14735417 1.41777382
## Renault Clio 1.0 -0.77475859 -0.5712628 -0.96398707 0.98511984
## Dacia Sandero 1.6 -0.95770695 -0.3650928 -0.78061996 0.35277940
## Fiat Grande Punto 1.4 -0.52528355 -0.5712628 -0.70203406 0.48590370
## Peugot 207 1.4 -0.52944147 -0.6872335 -0.62344816 0.75215231
## Renault Clio 1.6 -0.85791694 -0.6550194 -0.67583876 0.31949833
## Porsche Cayman -0.00554389 0.2147605 1.86510541 -1.67736621
## Nissan 350Z 1.11709378 -0.9127320 1.21022289 -1.54424190
## Mercedes c200 CDI 1.09630419 0.6979715 0.11002026 0.11981187
## VW Passat Variant 2.0 1.05888294 1.3615813 -0.07334684 0.01996865
## Skoda Octavia 2.0 0.34787908 1.3100388 0.08382496 -0.24627996
## Mercedes E280 1.32498965 1.0523263 1.21022289 -1.04502577
## Audi A6 2.4 0.76367081 1.0909832 0.71251218 -0.51252856
## BMW 525i 0.86761874 0.9234700 1.07924639 -0.97846362
## attr(,"scaled:center")
## Displacement Moment Length Width Weight Trunk
## 2000.6000 213.4000 4300.5333 1759.4667 1341.3333 376.6667
## Speed Acceleration
## 203.8000 10.4400
## attr(,"scaled:scale")
## Displacement Moment Length Width Weight Trunk
## 814.259769 99.300266 440.232871 71.416151 240.505024 155.211683
## Speed Acceleration
## 38.174786 3.004711
#Table for initial cluster center
kable(kM$centers, caption = "Initial Cluster Centers", digits = 2)
| Displacement | Moment | Length | Width | Weight | Trunk | Speed | Acceleration |
|---|---|---|---|---|---|---|---|
| 0.74 | 0.85 | 0.80 | 0.74 | 0.82 | 0.72 | 0.77 | -0.73 |
| -0.85 | -0.97 | -0.91 | -0.84 | -0.94 | -0.82 | -0.89 | 0.84 |
#Validating
clust_2 <- cutree(hc,2)
print(clust_2)
## [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
clust_2 <- as.factor(clust_2)
anova_tb <- aov(cbind(data$Displacement,
data$Moment,
data$Length,
data$Width,
data$Weight,
data$Trunk,
data$Speed,
data$Acceleration) ~ clust_2, data = data)
# get a summary of anova results
summary(anova_tb)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 6298591 6298591 27.443 0.00016 ***
## Residuals 13 2983674 229513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 122332 122332 101.19 1.68e-07 ***
## Residuals 13 15716 1209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 2102601 2102601 44.76 1.493e-05 ***
## Residuals 13 610669 46975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 47761 47761 26.262 0.0001951 ***
## Residuals 13 23642 1819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 668961 668961 61.749 2.719e-06 ***
## Residuals 13 140837 10834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 212488 212488 22.137 0.0004115 ***
## Residuals 13 124782 9599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 14994.5 14994 36.045 4.419e-05 ***
## Residuals 13 5407.9 416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 83.160 83.160 25.004 0.0002428 ***
## Residuals 13 43.236 3.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary_df <- summary(anova_tb)
print(summary_df)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 6298591 6298591 27.443 0.00016 ***
## Residuals 13 2983674 229513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 122332 122332 101.19 1.68e-07 ***
## Residuals 13 15716 1209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 2102601 2102601 44.76 1.493e-05 ***
## Residuals 13 610669 46975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 47761 47761 26.262 0.0001951 ***
## Residuals 13 23642 1819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 668961 668961 61.749 2.719e-06 ***
## Residuals 13 140837 10834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 212488 212488 22.137 0.0004115 ***
## Residuals 13 124782 9599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 14994.5 14994 36.045 4.419e-05 ***
## Residuals 13 5407.9 416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 83.160 83.160 25.004 0.0002428 ***
## Residuals 13 43.236 3.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cluster_means <- aggregate(. ~ clust_2, data = data[, -1], FUN = mean, na.rm = TRUE)
print(cluster_means)
## clust_2 Displacement Moment Length Width Weight Trunk Speed
## 1 1 1307.857 116.8571 3900.286 1699.143 1115.571 249.4286 170.000
## 2 2 2606.750 297.8750 4650.750 1812.250 1538.875 488.0000 233.375
## Acceleration
## 1 12.95714
## 2 8.23750
##Cluster 1 represents vehicles with relatively smaller engine displacements, lengths, widths, and weights. They tend to have smaller trunks and lower speeds but higher acceleration compared to Cluster 2.
##Cluster 2 represents vehicles with larger engine displacements, lengths, widths, and weights compared to Cluster 1. These vehicles tend to have larger trunks and higher speeds but lower acceleration
##The analysis reveals two distinct segments within the automotive market based on the characteristics of the vehicles.Understanding the differences between the clusters can help in tailoring marketing strategies and product offerings to better target each segment’s preferences and needs.By comparing the characteristics of the clusters, we can gain insights into the performance trade-offs between different types of vehicles and optimize their product portfolios accordingly.