library(cluster)

## Warning: package 'cluster' was built under R version 4.2.3

Load the dataset

data <- data.frame(
  Name = c("Kia Picanto 1.1. Start", "Suzuki Splash 1.0", "Renault Clio 1.0", 
           "Dacia Sandero 1.6", "Fiat Grande Punto 1.4", "Peugot 207 1.4", 
           "Renault Clio 1.6", "Porsche Cayman", "Nissan 350Z", "Mercedes c200 CDI", 
           "VW Passat Variant 2.0", "Skoda Octavia 2.0", "Mercedes E280", "Audi A6 2.4", 
           "BMW 525i"),
  Displacement = c(1086, 996, 1149, 1598, 1598, 1360, 1368, 3386, 3498, 2148, 
                   1968, 1968, 2996, 2393, 2497),
  Moment = c(97, 90, 105, 128, 140, 133, 125, 340, 353, 270, 320, 320, 300, 230, 250),
  Horsepower = c(65, 65, 75, 87, 88, 88, 95, 295, 301, 136, 140, 140, 231, 177, 218),
  Length = c(3535, 3715, 3986, 4020, 3986, 4030, 4030, 4341, 4315, 4595, 4774, 4572, 4852, 4916, 4841),
  Width = c(1595, 1680, 1719, 1746, 1719, 1748, 1687, 1801, 1815, 1770, 1820, 1769, 1822, 1855, 1846),
  Weight = c(929, 1050, 1155, 1111, 1215, 1214, 1135, 1340, 1610, 1605, 1596, 1425, 1660, 1525, 1550),
  Trunk = c(127, 178, 288, 320, 288, 270, 275, 410, 235, 485, 588, 580, 540, 546, 520),
  Speed = c(154, 160, 167, 174, 177, 180, 178, 275, 250, 208, 201, 207, 250, 231, 245),
  Acceleration = c(15.10, 14.70, 13.40, 11.50, 11.90, 12.70, 11.40, 5.40, 5.80, 10.80, 
                   10.50, 9.70, 7.30, 8.90, 7.50)
)

Display the dataset

print(data)

##                      Name Displacement Moment Horsepower Length Width Weight
## 1  Kia Picanto 1.1. Start         1086     97         65   3535  1595    929
## 2       Suzuki Splash 1.0          996     90         65   3715  1680   1050
## 3        Renault Clio 1.0         1149    105         75   3986  1719   1155
## 4       Dacia Sandero 1.6         1598    128         87   4020  1746   1111
## 5   Fiat Grande Punto 1.4         1598    140         88   3986  1719   1215
## 6          Peugot 207 1.4         1360    133         88   4030  1748   1214
## 7        Renault Clio 1.6         1368    125         95   4030  1687   1135
## 8          Porsche Cayman         3386    340        295   4341  1801   1340
## 9             Nissan 350Z         3498    353        301   4315  1815   1610
## 10      Mercedes c200 CDI         2148    270        136   4595  1770   1605
## 11  VW Passat Variant 2.0         1968    320        140   4774  1820   1596
## 12      Skoda Octavia 2.0         1968    320        140   4572  1769   1425
## 13          Mercedes E280         2996    300        231   4852  1822   1660
## 14            Audi A6 2.4         2393    230        177   4916  1855   1525
## 15               BMW 525i         2497    250        218   4841  1846   1550
##    Trunk Speed Acceleration
## 1    127   154         15.1
## 2    178   160         14.7
## 3    288   167         13.4
## 4    320   174         11.5
## 5    288   177         11.9
## 6    270   180         12.7
## 7    275   178         11.4
## 8    410   275          5.4
## 9    235   250          5.8
## 10   485   208         10.8
## 11   588   201         10.5
## 12   580   207          9.7
## 13   540   250          7.3
## 14   546   231          8.9
## 15   520   245          7.5

str(data)

## 'data.frame':    15 obs. of  10 variables:
##  $ Name        : chr  "Kia Picanto 1.1. Start" "Suzuki Splash 1.0" "Renault Clio 1.0" "Dacia Sandero 1.6" ...
##  $ Displacement: num  1086 996 1149 1598 1598 ...
##  $ Moment      : num  97 90 105 128 140 133 125 340 353 270 ...
##  $ Horsepower  : num  65 65 75 87 88 88 95 295 301 136 ...
##  $ Length      : num  3535 3715 3986 4020 3986 ...
##  $ Width       : num  1595 1680 1719 1746 1719 ...
##  $ Weight      : num  929 1050 1155 1111 1215 ...
##  $ Trunk       : num  127 178 288 320 288 270 275 410 235 485 ...
##  $ Speed       : num  154 160 167 174 177 180 178 275 250 208 ...
##  $ Acceleration: num  15.1 14.7 13.4 11.5 11.9 12.7 11.4 5.4 5.8 10.8 ...

Compute the correlation matrix

correlation_matrix <- cor(data[, -1])  # Exclude the Name column

# Print correlation matrix
print(correlation_matrix)

##              Displacement     Moment Horsepower     Length      Width
## Displacement    1.0000000  0.8752790  0.9833009  0.6567364  0.7643470
## Moment          0.8752790  1.0000000  0.8466490  0.7665564  0.7657827
## Horsepower      0.9833009  0.8466490  1.0000000  0.6082133  0.7319137
## Length          0.6567364  0.7665564  0.6082133  1.0000000  0.9116335
## Width           0.7643470  0.7657827  0.7319137  0.9116335  1.0000000
## Weight          0.7678926  0.8618937  0.7143511  0.9213978  0.8837887
## Trunk           0.4698137  0.6908975  0.4079151  0.9342425  0.7833397
## Speed           0.9669150  0.8592533  0.9683902  0.7411032  0.8192538
## Acceleration   -0.9685791 -0.8609295 -0.9614512 -0.7142153 -0.8182177
##                  Weight      Trunk      Speed Acceleration
## Displacement  0.7678926  0.4698137  0.9669150   -0.9685791
## Moment        0.8618937  0.6908975  0.8592533   -0.8609295
## Horsepower    0.7143511  0.4079151  0.9683902   -0.9614512
## Length        0.9213978  0.9342425  0.7411032   -0.7142153
## Width         0.8837887  0.7833397  0.8192538   -0.8182177
## Weight        1.0000000  0.7854973  0.7783739   -0.7627800
## Trunk         0.7854973  1.0000000  0.5789343   -0.5521259
## Speed         0.7783739  0.5789343  1.0000000   -0.9709323
## Acceleration -0.7627800 -0.5521259 -0.9709323    1.0000000

Display the correlation matrix as a plain text table

##Computed the correlation matrix for the variables in the dataset, excluding the “Name” column. Printed the correlation matrix to examine the relationships between different variables.Displayed the correlation matrix as a table using the kable function from the knitr package, formatting it for easy readability. Excluded the 4th column (Horsepower) from the dataset using the line data <- data[, -4]. This workflow suggests that you have analyzed the correlation between various attributes of vehicles in the dataset, observed strong correlations between some variables, and removed the “Horsepower” variable from the dataset,due to multicollinearity.

library(knitr)

## Warning: package 'knitr' was built under R version 4.2.3

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.2.3

kable(round(correlation_matrix, 2), caption = "Correlation Matrix", format = "pipe", digits = 2)

Correlation Matrix
	Displacement	Moment	Horsepower	Length	Width	Weight	Trunk	Speed	Acceleration
Displacement	1.00	0.88	0.98	0.66	0.76	0.77	0.47	0.97	-0.97
Moment	0.88	1.00	0.85	0.77	0.77	0.86	0.69	0.86	-0.86
Horsepower	0.98	0.85	1.00	0.61	0.73	0.71	0.41	0.97	-0.96
Length	0.66	0.77	0.61	1.00	0.91	0.92	0.93	0.74	-0.71
Width	0.76	0.77	0.73	0.91	1.00	0.88	0.78	0.82	-0.82
Weight	0.77	0.86	0.71	0.92	0.88	1.00	0.79	0.78	-0.76
Trunk	0.47	0.69	0.41	0.93	0.78	0.79	1.00	0.58	-0.55
Speed	0.97	0.86	0.97	0.74	0.82	0.78	0.58	1.00	-0.97
Acceleration	-0.97	-0.86	-0.96	-0.71	-0.82	-0.76	-0.55	-0.97	1.00

data <- data[, -4]  # Exclude the 4th column (Horsepower)

data

##                      Name Displacement Moment Length Width Weight Trunk Speed
## 1  Kia Picanto 1.1. Start         1086     97   3535  1595    929   127   154
## 2       Suzuki Splash 1.0          996     90   3715  1680   1050   178   160
## 3        Renault Clio 1.0         1149    105   3986  1719   1155   288   167
## 4       Dacia Sandero 1.6         1598    128   4020  1746   1111   320   174
## 5   Fiat Grande Punto 1.4         1598    140   3986  1719   1215   288   177
## 6          Peugot 207 1.4         1360    133   4030  1748   1214   270   180
## 7        Renault Clio 1.6         1368    125   4030  1687   1135   275   178
## 8          Porsche Cayman         3386    340   4341  1801   1340   410   275
## 9             Nissan 350Z         3498    353   4315  1815   1610   235   250
## 10      Mercedes c200 CDI         2148    270   4595  1770   1605   485   208
## 11  VW Passat Variant 2.0         1968    320   4774  1820   1596   588   201
## 12      Skoda Octavia 2.0         1968    320   4572  1769   1425   580   207
## 13          Mercedes E280         2996    300   4852  1822   1660   540   250
## 14            Audi A6 2.4         2393    230   4916  1855   1525   546   231
## 15               BMW 525i         2497    250   4841  1846   1550   520   245
##    Acceleration
## 1          15.1
## 2          14.7
## 3          13.4
## 4          11.5
## 5          11.9
## 6          12.7
## 7          11.4
## 8           5.4
## 9           5.8
## 10         10.8
## 11         10.5
## 12          9.7
## 13          7.3
## 14          8.9
## 15          7.5

# Standardize the variables
scaled_data <- scale(data[, -1])  # Exclude the Name column and standardize

Compute hierarchical clustering

hc <- hclust(dist(scaled_data), method = "ward.D2")

within_cluster_variation <- hc$height
print(within_cluster_variation)

##  [1]  0.5965032  0.7015980  0.7301346  0.9293429  1.0138401  1.1507777
##  [7]  1.1988690  1.3836543  1.4157920  1.7481772  3.0924637  3.7398755
## [13]  4.8404522 12.8066644

dist_matrix <- dist(scaled_data, method = 'euclidean')

Plot the dendrogram

# Plot the hierarchical clustering dendrogram
plot(hc, labels = data$Name, main = 'Hierarchical Clustering with Single Linkage', hang = -1, cex = 0.6)


# Add a horizontal line at the height of the tallest merge
abline(h = hc$height[length(hc$height) - 1], col = "red", lwd = 2, lty = 2)

# Add a legend for the line
legend("topright", legend = "Tallest Merge", col = "red", lty = 2, lwd = 2, bty = "n")

Scree Plot

pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)

var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)

# Calculate PCA
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)

# Calculate variance explained
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)

# Plot the scree plot
plot(var_explained, 
     xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained", 
     type = 'b', 
     pch = 19, 
     col = "blue")  # Change color to blue

# Add grid lines
grid(col = "gray")

# Add vertical line at the elbow point
abline(v = which.max(diff(diff(var_explained))), 
       col = "red", 
       lty = 2)

# Add title with enhanced formatting
title(main = "Scree Plot of PCA Results", 
      sub = "Showing Proportion of Variance Explained by Principal Components", 
      col.main = "darkblue", 
      col.sub = "gray", 
      font.main = 2, 
      font.sub = 3)

# Add legend
legend("topright", 
       legend = "Elbow Point", 
       col = "red", 
       lty = 2, 
       lwd = 2, 
       bty = "n", 
       cex = 0.8)

Compute k-means clustering

K <- 2

kM <- kmeans(scaled_data, centers = K)
print(kM)

## K-means clustering with 2 clusters of sizes 8, 7
## 
## Cluster means:
##   Displacement     Moment     Length      Width     Weight      Trunk
## 1    0.7444185  0.8507027  0.7955259  0.7390952  0.8213619  0.7173000
## 2   -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
##        Speed Acceleration
## 1  0.7747260   -0.7330157
## 2 -0.8854012    0.8377322
## 
## Clustering vector:
##  [1] 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 20.608874  9.385799
##  (between_SS / total_SS =  73.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Add cluster membership to the original dataset

km_clusters <- kM$cluster

# Display cluster centers
rownames(scaled_data) <- data$Name
print(scaled_data)

##                        Displacement     Moment      Length      Width
## Kia Picanto 1.1. Start  -1.12322877 -1.1722023 -1.73892815 -2.3029338
## Suzuki Splash 1.0       -1.23375861 -1.2426956 -1.33005364 -1.1127268
## Renault Clio 1.0        -1.04585789 -1.0916386 -0.71447035 -0.5666319
## Dacia Sandero 1.6       -0.49443681 -0.8600178 -0.63723850 -0.1885661
## Fiat Grande Punto 1.4   -0.49443681 -0.7391722 -0.71447035 -0.5666319
## Peugot 207 1.4          -0.78672682 -0.8096655 -0.61452325 -0.1605613
## Renault Clio 1.6        -0.77690195 -0.8902292 -0.61452325 -1.0147098
## Porsche Cayman           1.70142263  1.2749211  0.09192105  0.5815678
## Nissan 350Z              1.83897088  1.4058371  0.03286140  0.7776019
## Mercedes c200 CDI        0.18102331  0.5699884  0.66888841  0.1474923
## VW Passat Variant 2.0   -0.04003636  1.0735117  1.07549140  0.8476141
## Skoda Octavia 2.0       -0.04003636  1.0735117  0.61664334  0.1334899
## Mercedes E280            1.22246000  0.8721024  1.25267035  0.8756189
## Audi A6 2.4              0.48191009  0.1671697  1.39804796  1.3376993
## BMW 525i                 0.60963346  0.3685791  1.22768358  1.2116774
##                             Weight      Trunk       Speed Acceleration
## Kia Picanto 1.1. Start -1.71444790 -1.6085559 -1.30452598   1.55089812
## Suzuki Splash 1.0      -1.21133991 -1.2799724 -1.14735417   1.41777382
## Renault Clio 1.0       -0.77475859 -0.5712628 -0.96398707   0.98511984
## Dacia Sandero 1.6      -0.95770695 -0.3650928 -0.78061996   0.35277940
## Fiat Grande Punto 1.4  -0.52528355 -0.5712628 -0.70203406   0.48590370
## Peugot 207 1.4         -0.52944147 -0.6872335 -0.62344816   0.75215231
## Renault Clio 1.6       -0.85791694 -0.6550194 -0.67583876   0.31949833
## Porsche Cayman         -0.00554389  0.2147605  1.86510541  -1.67736621
## Nissan 350Z             1.11709378 -0.9127320  1.21022289  -1.54424190
## Mercedes c200 CDI       1.09630419  0.6979715  0.11002026   0.11981187
## VW Passat Variant 2.0   1.05888294  1.3615813 -0.07334684   0.01996865
## Skoda Octavia 2.0       0.34787908  1.3100388  0.08382496  -0.24627996
## Mercedes E280           1.32498965  1.0523263  1.21022289  -1.04502577
## Audi A6 2.4             0.76367081  1.0909832  0.71251218  -0.51252856
## BMW 525i                0.86761874  0.9234700  1.07924639  -0.97846362
## attr(,"scaled:center")
## Displacement       Moment       Length        Width       Weight        Trunk 
##    2000.6000     213.4000    4300.5333    1759.4667    1341.3333     376.6667 
##        Speed Acceleration 
##     203.8000      10.4400 
## attr(,"scaled:scale")
## Displacement       Moment       Length        Width       Weight        Trunk 
##   814.259769    99.300266   440.232871    71.416151   240.505024   155.211683 
##        Speed Acceleration 
##    38.174786     3.004711

#Table for initial cluster center

kable(kM$centers, caption = "Initial Cluster Centers", digits = 2)

Initial Cluster Centers
Displacement	Moment	Length	Width	Weight	Trunk	Speed	Acceleration
0.74	0.85	0.80	0.74	0.82	0.72	0.77	-0.73
-0.85	-0.97	-0.91	-0.84	-0.94	-0.82	-0.89	0.84

#Validating 
clust_2 <- cutree(hc,2)
print(clust_2)

##  [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2

clust_2 <- as.factor(clust_2)

Running anova

anova_tb <- aov(cbind(data$Displacement,
                      data$Moment,
                      data$Length,
                      data$Width,
                      data$Weight,
                      data$Trunk,
                      data$Speed,
                      data$Acceleration) ~ clust_2, data = data)
# get a summary of anova results
summary(anova_tb)

##  Response 1 :
##             Df  Sum Sq Mean Sq F value  Pr(>F)    
## clust_2      1 6298591 6298591  27.443 0.00016 ***
## Residuals   13 2983674  229513                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## clust_2      1 122332  122332  101.19 1.68e-07 ***
## Residuals   13  15716    1209                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 2102601 2102601   44.76 1.493e-05 ***
## Residuals   13  610669   46975                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1  47761   47761  26.262 0.0001951 ***
## Residuals   13  23642    1819                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 5 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 668961  668961  61.749 2.719e-06 ***
## Residuals   13 140837   10834                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 6 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 212488  212488  22.137 0.0004115 ***
## Residuals   13 124782    9599                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 7 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 14994.5   14994  36.045 4.419e-05 ***
## Residuals   13  5407.9     416                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 8 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 83.160  83.160  25.004 0.0002428 ***
## Residuals   13 43.236   3.326                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary_df <- summary(anova_tb)

print(summary_df)

##  Response 1 :
##             Df  Sum Sq Mean Sq F value  Pr(>F)    
## clust_2      1 6298591 6298591  27.443 0.00016 ***
## Residuals   13 2983674  229513                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## clust_2      1 122332  122332  101.19 1.68e-07 ***
## Residuals   13  15716    1209                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 2102601 2102601   44.76 1.493e-05 ***
## Residuals   13  610669   46975                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1  47761   47761  26.262 0.0001951 ***
## Residuals   13  23642    1819                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 5 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 668961  668961  61.749 2.719e-06 ***
## Residuals   13 140837   10834                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 6 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 212488  212488  22.137 0.0004115 ***
## Residuals   13 124782    9599                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 7 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 14994.5   14994  36.045 4.419e-05 ***
## Residuals   13  5407.9     416                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 8 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 83.160  83.160  25.004 0.0002428 ***
## Residuals   13 43.236   3.326                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cluster_means <- aggregate(. ~ clust_2, data = data[, -1], FUN = mean, na.rm = TRUE)

print(cluster_means)

##   clust_2 Displacement   Moment   Length    Width   Weight    Trunk   Speed
## 1       1     1307.857 116.8571 3900.286 1699.143 1115.571 249.4286 170.000
## 2       2     2606.750 297.8750 4650.750 1812.250 1538.875 488.0000 233.375
##   Acceleration
## 1     12.95714
## 2      8.23750

##Cluster 1 represents vehicles with relatively smaller engine displacements, lengths, widths, and weights. They tend to have smaller trunks and lower speeds but higher acceleration compared to Cluster 2.

##Cluster 2 represents vehicles with larger engine displacements, lengths, widths, and weights compared to Cluster 1. These vehicles tend to have larger trunks and higher speeds but lower acceleration

##The analysis reveals two distinct segments within the automotive market based on the characteristics of the vehicles.Understanding the differences between the clusters can help in tailoring marketing strategies and product offerings to better target each segment’s preferences and needs.By comparing the characteristics of the clusters, we can gain insights into the performance trade-offs between different types of vehicles and optimize their product portfolios accordingly.

Clustering Assignment

Anuranjan Saxena

2024-03-22