Market Research

Install packages

install.packages("tidyverse", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dyplyr", repos = "https://cloud.r-project.org")
## Warning: package 'dyplyr' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
install.packages("psych", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
install.packages("factoextra", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
install.packages("NbClust", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(NbClust)
library(ggplot2)
install.packages("ggfortify", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(ggfortify)
library(stats)
library(corrplot)
## corrplot 0.92 loaded

Read the Data

dt <- read_csv("Theltegosdata1.csv")
## Rows: 15 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Name
## dbl (9): Displacement, Moment, Horsepower, Length, Width, Weight, Trunk, Spe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(dt)
## spc_tbl_ [15 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Name        : chr [1:15] "Kia Picanto 1.1. Start" "Suzuki Splash 1.0" "Renault Clio 1.0" "Dacia Sandero 1.6" ...
##  $ Displacement: num [1:15] 1086 996 1149 1598 1598 ...
##  $ Moment      : num [1:15] 97 90 105 128 140 133 125 340 353 270 ...
##  $ Horsepower  : num [1:15] 65 65 75 87 88 88 95 295 301 136 ...
##  $ Length      : num [1:15] 3535 3715 3986 4020 3986 ...
##  $ Width       : num [1:15] 1595 1680 1719 1746 1719 ...
##  $ Weight      : num [1:15] 929 1050 1155 1111 1215 ...
##  $ Trunk       : num [1:15] 127 178 288 320 288 270 275 410 235 485 ...
##  $ Speed       : num [1:15] 154 160 167 174 177 180 178 275 250 208 ...
##  $ Acceleration: num [1:15] 15.1 14.7 13.4 11.5 11.9 12.7 11.4 5.4 5.8 10.8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Name = col_character(),
##   ..   Displacement = col_double(),
##   ..   Moment = col_double(),
##   ..   Horsepower = col_double(),
##   ..   Length = col_double(),
##   ..   Width = col_double(),
##   ..   Weight = col_double(),
##   ..   Trunk = col_double(),
##   ..   Speed = col_double(),
##   ..   Acceleration = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Step 1

Collinearity Assesstment

Calculate Pearson correlation matrix

cor_matrix <- cor(dt[,sapply(dt, is.numeric)], method = "pearson")

Inspect the correlation matrix

print(cor_matrix)
##              Displacement     Moment Horsepower     Length      Width
## Displacement    1.0000000  0.8752790  0.9833009  0.6567364  0.7643470
## Moment          0.8752790  1.0000000  0.8466490  0.7665564  0.7657827
## Horsepower      0.9833009  0.8466490  1.0000000  0.6082133  0.7319137
## Length          0.6567364  0.7665564  0.6082133  1.0000000  0.9116335
## Width           0.7643470  0.7657827  0.7319137  0.9116335  1.0000000
## Weight          0.7678926  0.8618937  0.7143511  0.9213978  0.8837887
## Trunk           0.4698137  0.6908975  0.4079151  0.9342425  0.7833397
## Speed           0.9669150  0.8592533  0.9683902  0.7411032  0.8192538
## Acceleration   -0.9685791 -0.8609295 -0.9614512 -0.7142153 -0.8182177
##                  Weight      Trunk      Speed Acceleration
## Displacement  0.7678926  0.4698137  0.9669150   -0.9685791
## Moment        0.8618937  0.6908975  0.8592533   -0.8609295
## Horsepower    0.7143511  0.4079151  0.9683902   -0.9614512
## Length        0.9213978  0.9342425  0.7411032   -0.7142153
## Width         0.8837887  0.7833397  0.8192538   -0.8182177
## Weight        1.0000000  0.7854973  0.7783739   -0.7627800
## Trunk         0.7854973  1.0000000  0.5789343   -0.5521259
## Speed         0.7783739  0.5789343  1.0000000   -0.9709323
## Acceleration -0.7627800 -0.5521259 -0.9709323    1.0000000

Create a more readable table using kable

library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(round(cor_matrix, 2), caption = "Correlation Matrix", format = "html", digits = 2)
Correlation Matrix
Displacement Moment Horsepower Length Width Weight Trunk Speed Acceleration
Displacement 1.00 0.88 0.98 0.66 0.76 0.77 0.47 0.97 -0.97
Moment 0.88 1.00 0.85 0.77 0.77 0.86 0.69 0.86 -0.86
Horsepower 0.98 0.85 1.00 0.61 0.73 0.71 0.41 0.97 -0.96
Length 0.66 0.77 0.61 1.00 0.91 0.92 0.93 0.74 -0.71
Width 0.76 0.77 0.73 0.91 1.00 0.88 0.78 0.82 -0.82
Weight 0.77 0.86 0.71 0.92 0.88 1.00 0.79 0.78 -0.76
Trunk 0.47 0.69 0.41 0.93 0.78 0.79 1.00 0.58 -0.55
Speed 0.97 0.86 0.97 0.74 0.82 0.78 0.58 1.00 -0.97
Acceleration -0.97 -0.86 -0.96 -0.71 -0.82 -0.76 -0.55 -0.97 1.00
kable(round(cor_matrix, 2), caption = "Correlation Matrix", format = "html", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(1, border_right = TRUE, extra_css = "padding-right:10px;") %>%
  column_spec(2, border_right = TRUE, extra_css = "padding-right:10px;")
Correlation Matrix
Displacement Moment Horsepower Length Width Weight Trunk Speed Acceleration
Displacement 1.00 0.88 0.98 0.66 0.76 0.77 0.47 0.97 -0.97
Moment 0.88 1.00 0.85 0.77 0.77 0.86 0.69 0.86 -0.86
Horsepower 0.98 0.85 1.00 0.61 0.73 0.71 0.41 0.97 -0.96
Length 0.66 0.77 0.61 1.00 0.91 0.92 0.93 0.74 -0.71
Width 0.76 0.77 0.73 0.91 1.00 0.88 0.78 0.82 -0.82
Weight 0.77 0.86 0.71 0.92 0.88 1.00 0.79 0.78 -0.76
Trunk 0.47 0.69 0.41 0.93 0.78 0.79 1.00 0.58 -0.55
Speed 0.97 0.86 0.97 0.74 0.82 0.78 0.58 1.00 -0.97
Acceleration -0.97 -0.86 -0.96 -0.71 -0.82 -0.76 -0.55 -0.97 1.00

Removing the Horsepower variable due to Collinearity issues.

dt2 <- dt[,-4]

Step 2

Hierarchical Clustering

Standardize the data

Excluding the non-numeric variable to standardize the dataset

data_numeric2 <- dt2[, !colnames(dt2) %in% c('Name')]
data_scaled <- scale(data_numeric2)

Calculate the Euclidean distance (Dissimilarity Matrix)

dist_matrix <- dist(data_scaled, method = 'euclidean')

Creating the hierarchical clustering (single linkage)

hc <- hclust(dist_matrix, method = 'single')
print(hc)
## 
## Call:
## hclust(d = dist_matrix, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 15

Checking the height

within_cluster_variation <- hc$height
print(within_cluster_variation)
##  [1] 0.5965032 0.6456749 0.6777943 0.7015980 0.7519380 0.9950021 1.1507777
##  [8] 1.1725207 1.2841234 1.4157920 1.5810159 1.7481772 2.2640748 3.1028633

Step 3

Plot the dendogram

plot(hc, labels = dt2$Name,
     main = 'Hierarchical Clustering with Single Linkage', 
     hang = -1,
     cex = 0.6)

# Step 3

Creatig the Scree plot

pca_result <- prcomp(data_scaled, center = TRUE, scale. = TRUE)

Extracting the variance explained by each principal component

var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)

Plotting the Scree Plot

plot(var_explained, xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = 'b', pch = 19, main = "Scree Plot")
abline(v = which.max(diff(diff(var_explained))), col = "red", lty = 2)

# Adding a cumulative variance explained line
var_explained_cumulative <- cumsum(var_explained)
lines(var_explained_cumulative, col = "blue", lty = 2)

legend("topright", legend = c("Variance Explained", "Cumulative Variance"), col = c("black", "blue"), lty = 1:2)

fviz_eig(pca_result)

# Step 4

K-means Clustering

Number of cluster identified

K <- 2
kM <- kmeans(data_scaled, centers = K)
print(kM)
## K-means clustering with 2 clusters of sizes 7, 8
## 
## Cluster means:
##   Displacement     Moment     Length      Width     Weight      Trunk
## 1   -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
## 2    0.7444185  0.8507027  0.7955259  0.7390952  0.8213619  0.7173000
##        Speed Acceleration
## 1 -0.8854012    0.8377322
## 2  0.7747260   -0.7330157
## 
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1]  9.385799 20.608874
##  (between_SS / total_SS =  73.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
autoplot(kM, data_scaled, frame = TRUE)

km_clusters <- kM$cluster
rownames(data_scaled) <- dt2$Name
print(data_scaled)
##                        Displacement     Moment      Length      Width
## Kia Picanto 1.1. Start  -1.12322877 -1.1722023 -1.73892815 -2.3029338
## Suzuki Splash 1.0       -1.23375861 -1.2426956 -1.33005364 -1.1127268
## Renault Clio 1.0        -1.04585789 -1.0916386 -0.71447035 -0.5666319
## Dacia Sandero 1.6       -0.49443681 -0.8600178 -0.63723850 -0.1885661
## Fiat Grande Punto 1.4   -0.49443681 -0.7391722 -0.71447035 -0.5666319
## Peugot 207 1.4          -0.78672682 -0.8096655 -0.61452325 -0.1605613
## Renault Clio 1.6        -0.77690195 -0.8902292 -0.61452325 -1.0147098
## Porsche Cayman           1.70142263  1.2749211  0.09192105  0.5815678
## Nissan 350Z              1.83897088  1.4058371  0.03286140  0.7776019
## Mercedes c200 CDI        0.18102331  0.5699884  0.66888841  0.1474923
## VW Passat Variant 2.0   -0.04003636  1.0735117  1.07549140  0.8476141
## Skoda Octavia 2.0       -0.04003636  1.0735117  0.61664334  0.1334899
## Mercedes E280            1.22246000  0.8721024  1.25267035  0.8756189
## Audi A6 2.4              0.48191009  0.1671697  1.39804796  1.3376993
## BMW 525i                 0.60963346  0.3685791  1.22768358  1.2116774
##                             Weight      Trunk       Speed Acceleration
## Kia Picanto 1.1. Start -1.71444790 -1.6085559 -1.30452598   1.55089812
## Suzuki Splash 1.0      -1.21133991 -1.2799724 -1.14735417   1.41777382
## Renault Clio 1.0       -0.77475859 -0.5712628 -0.96398707   0.98511984
## Dacia Sandero 1.6      -0.95770695 -0.3650928 -0.78061996   0.35277940
## Fiat Grande Punto 1.4  -0.52528355 -0.5712628 -0.70203406   0.48590370
## Peugot 207 1.4         -0.52944147 -0.6872335 -0.62344816   0.75215231
## Renault Clio 1.6       -0.85791694 -0.6550194 -0.67583876   0.31949833
## Porsche Cayman         -0.00554389  0.2147605  1.86510541  -1.67736621
## Nissan 350Z             1.11709378 -0.9127320  1.21022289  -1.54424190
## Mercedes c200 CDI       1.09630419  0.6979715  0.11002026   0.11981187
## VW Passat Variant 2.0   1.05888294  1.3615813 -0.07334684   0.01996865
## Skoda Octavia 2.0       0.34787908  1.3100388  0.08382496  -0.24627996
## Mercedes E280           1.32498965  1.0523263  1.21022289  -1.04502577
## Audi A6 2.4             0.76367081  1.0909832  0.71251218  -0.51252856
## BMW 525i                0.86761874  0.9234700  1.07924639  -0.97846362
## attr(,"scaled:center")
## Displacement       Moment       Length        Width       Weight        Trunk 
##    2000.6000     213.4000    4300.5333    1759.4667    1341.3333     376.6667 
##        Speed Acceleration 
##     203.8000      10.4400 
## attr(,"scaled:scale")
## Displacement       Moment       Length        Width       Weight        Trunk 
##   814.259769    99.300266   440.232871    71.416151   240.505024   155.211683 
##        Speed Acceleration 
##    38.174786     3.004711
fviz_cluster(list(data = data_scaled, cluster = km_clusters))

kM$centers
##   Displacement     Moment     Length      Width     Weight      Trunk
## 1   -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
## 2    0.7444185  0.8507027  0.7955259  0.7390952  0.8213619  0.7173000
##        Speed Acceleration
## 1 -0.8854012    0.8377322
## 2  0.7747260   -0.7330157

Step 6

Create a table for the Initial cluster centers

kable(kM$centers, caption = "Initial Cluster Centers", digits = 2)
Initial Cluster Centers
Displacement Moment Length Width Weight Trunk Speed Acceleration
-0.85 -0.97 -0.91 -0.84 -0.94 -0.82 -0.89 0.84
0.74 0.85 0.80 0.74 0.82 0.72 0.77 -0.73
kable(round(kM$centers, 2), caption = "Initial Clusters Centers", format = "html", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(1, border_right = TRUE, extra_css = "padding-right:10px;") %>%
  column_spec(2, border_right = TRUE, extra_css = "padding-right:10px;")
Initial Clusters Centers
Displacement Moment Length Width Weight Trunk Speed Acceleration
-0.85 -0.97 -0.91 -0.84 -0.94 -0.82 -0.89 0.84
0.74 0.85 0.80 0.74 0.82 0.72 0.77 -0.73

Validating the Clustering

clust_2 <- cutree(hc,2)
print(clust_2)
##  [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2

Since Clust_2 is a discrete variable with 2 levels do as.factor

clust_2 <- as.factor(clust_2)
# run anova on all the variables
anova_tb <- aov(cbind(dt2$Displacement,
                      dt2$Moment,
                      dt2$Length,
                      dt2$Width,
                      dt2$Weight,
                      dt2$Trunk,
                      dt2$Speed,
                      dt2$Acceleration) ~ clust_2, data = dt2)
# get a summary of anova results
summary(anova_tb)
##  Response 1 :
##             Df  Sum Sq Mean Sq F value  Pr(>F)    
## clust_2      1 6298591 6298591  27.443 0.00016 ***
## Residuals   13 2983674  229513                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## clust_2      1 122332  122332  101.19 1.68e-07 ***
## Residuals   13  15716    1209                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 2102601 2102601   44.76 1.493e-05 ***
## Residuals   13  610669   46975                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1  47761   47761  26.262 0.0001951 ***
## Residuals   13  23642    1819                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 5 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 668961  668961  61.749 2.719e-06 ***
## Residuals   13 140837   10834                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 6 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 212488  212488  22.137 0.0004115 ***
## Residuals   13 124782    9599                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 7 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 14994.5   14994  36.045 4.419e-05 ***
## Residuals   13  5407.9     416                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 8 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 83.160  83.160  25.004 0.0002428 ***
## Residuals   13 43.236   3.326                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary_df <- summary(anova_tb)
print(summary_df)
##  Response 1 :
##             Df  Sum Sq Mean Sq F value  Pr(>F)    
## clust_2      1 6298591 6298591  27.443 0.00016 ***
## Residuals   13 2983674  229513                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## clust_2      1 122332  122332  101.19 1.68e-07 ***
## Residuals   13  15716    1209                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 2102601 2102601   44.76 1.493e-05 ***
## Residuals   13  610669   46975                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1  47761   47761  26.262 0.0001951 ***
## Residuals   13  23642    1819                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 5 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 668961  668961  61.749 2.719e-06 ***
## Residuals   13 140837   10834                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 6 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 212488  212488  22.137 0.0004115 ***
## Residuals   13 124782    9599                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 7 :
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 14994.5   14994  36.045 4.419e-05 ***
## Residuals   13  5407.9     416                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 8 :
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## clust_2      1 83.160  83.160  25.004 0.0002428 ***
## Residuals   13 43.236   3.326                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Calculate means of all variables by cluster

cluster_means <- aggregate(. ~ clust_2, data = data_numeric2, FUN = mean)