install.packages("tidyverse", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dyplyr", repos = "https://cloud.r-project.org")
## Warning: package 'dyplyr' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
install.packages("psych", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
install.packages("factoextra", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
install.packages("NbClust", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(NbClust)
library(ggplot2)
install.packages("ggfortify", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpGHvvkJ/downloaded_packages
library(ggfortify)
library(stats)
library(corrplot)
## corrplot 0.92 loaded
dt <- read_csv("Theltegosdata1.csv")
## Rows: 15 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Name
## dbl (9): Displacement, Moment, Horsepower, Length, Width, Weight, Trunk, Spe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(dt)
## spc_tbl_ [15 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Name : chr [1:15] "Kia Picanto 1.1. Start" "Suzuki Splash 1.0" "Renault Clio 1.0" "Dacia Sandero 1.6" ...
## $ Displacement: num [1:15] 1086 996 1149 1598 1598 ...
## $ Moment : num [1:15] 97 90 105 128 140 133 125 340 353 270 ...
## $ Horsepower : num [1:15] 65 65 75 87 88 88 95 295 301 136 ...
## $ Length : num [1:15] 3535 3715 3986 4020 3986 ...
## $ Width : num [1:15] 1595 1680 1719 1746 1719 ...
## $ Weight : num [1:15] 929 1050 1155 1111 1215 ...
## $ Trunk : num [1:15] 127 178 288 320 288 270 275 410 235 485 ...
## $ Speed : num [1:15] 154 160 167 174 177 180 178 275 250 208 ...
## $ Acceleration: num [1:15] 15.1 14.7 13.4 11.5 11.9 12.7 11.4 5.4 5.8 10.8 ...
## - attr(*, "spec")=
## .. cols(
## .. Name = col_character(),
## .. Displacement = col_double(),
## .. Moment = col_double(),
## .. Horsepower = col_double(),
## .. Length = col_double(),
## .. Width = col_double(),
## .. Weight = col_double(),
## .. Trunk = col_double(),
## .. Speed = col_double(),
## .. Acceleration = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
cor_matrix <- cor(dt[,sapply(dt, is.numeric)], method = "pearson")
print(cor_matrix)
## Displacement Moment Horsepower Length Width
## Displacement 1.0000000 0.8752790 0.9833009 0.6567364 0.7643470
## Moment 0.8752790 1.0000000 0.8466490 0.7665564 0.7657827
## Horsepower 0.9833009 0.8466490 1.0000000 0.6082133 0.7319137
## Length 0.6567364 0.7665564 0.6082133 1.0000000 0.9116335
## Width 0.7643470 0.7657827 0.7319137 0.9116335 1.0000000
## Weight 0.7678926 0.8618937 0.7143511 0.9213978 0.8837887
## Trunk 0.4698137 0.6908975 0.4079151 0.9342425 0.7833397
## Speed 0.9669150 0.8592533 0.9683902 0.7411032 0.8192538
## Acceleration -0.9685791 -0.8609295 -0.9614512 -0.7142153 -0.8182177
## Weight Trunk Speed Acceleration
## Displacement 0.7678926 0.4698137 0.9669150 -0.9685791
## Moment 0.8618937 0.6908975 0.8592533 -0.8609295
## Horsepower 0.7143511 0.4079151 0.9683902 -0.9614512
## Length 0.9213978 0.9342425 0.7411032 -0.7142153
## Width 0.8837887 0.7833397 0.8192538 -0.8182177
## Weight 1.0000000 0.7854973 0.7783739 -0.7627800
## Trunk 0.7854973 1.0000000 0.5789343 -0.5521259
## Speed 0.7783739 0.5789343 1.0000000 -0.9709323
## Acceleration -0.7627800 -0.5521259 -0.9709323 1.0000000
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(cor_matrix, 2), caption = "Correlation Matrix", format = "html", digits = 2)
| Displacement | Moment | Horsepower | Length | Width | Weight | Trunk | Speed | Acceleration | |
|---|---|---|---|---|---|---|---|---|---|
| Displacement | 1.00 | 0.88 | 0.98 | 0.66 | 0.76 | 0.77 | 0.47 | 0.97 | -0.97 |
| Moment | 0.88 | 1.00 | 0.85 | 0.77 | 0.77 | 0.86 | 0.69 | 0.86 | -0.86 |
| Horsepower | 0.98 | 0.85 | 1.00 | 0.61 | 0.73 | 0.71 | 0.41 | 0.97 | -0.96 |
| Length | 0.66 | 0.77 | 0.61 | 1.00 | 0.91 | 0.92 | 0.93 | 0.74 | -0.71 |
| Width | 0.76 | 0.77 | 0.73 | 0.91 | 1.00 | 0.88 | 0.78 | 0.82 | -0.82 |
| Weight | 0.77 | 0.86 | 0.71 | 0.92 | 0.88 | 1.00 | 0.79 | 0.78 | -0.76 |
| Trunk | 0.47 | 0.69 | 0.41 | 0.93 | 0.78 | 0.79 | 1.00 | 0.58 | -0.55 |
| Speed | 0.97 | 0.86 | 0.97 | 0.74 | 0.82 | 0.78 | 0.58 | 1.00 | -0.97 |
| Acceleration | -0.97 | -0.86 | -0.96 | -0.71 | -0.82 | -0.76 | -0.55 | -0.97 | 1.00 |
kable(round(cor_matrix, 2), caption = "Correlation Matrix", format = "html", digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
column_spec(1, border_right = TRUE, extra_css = "padding-right:10px;") %>%
column_spec(2, border_right = TRUE, extra_css = "padding-right:10px;")
| Displacement | Moment | Horsepower | Length | Width | Weight | Trunk | Speed | Acceleration | |
|---|---|---|---|---|---|---|---|---|---|
| Displacement | 1.00 | 0.88 | 0.98 | 0.66 | 0.76 | 0.77 | 0.47 | 0.97 | -0.97 |
| Moment | 0.88 | 1.00 | 0.85 | 0.77 | 0.77 | 0.86 | 0.69 | 0.86 | -0.86 |
| Horsepower | 0.98 | 0.85 | 1.00 | 0.61 | 0.73 | 0.71 | 0.41 | 0.97 | -0.96 |
| Length | 0.66 | 0.77 | 0.61 | 1.00 | 0.91 | 0.92 | 0.93 | 0.74 | -0.71 |
| Width | 0.76 | 0.77 | 0.73 | 0.91 | 1.00 | 0.88 | 0.78 | 0.82 | -0.82 |
| Weight | 0.77 | 0.86 | 0.71 | 0.92 | 0.88 | 1.00 | 0.79 | 0.78 | -0.76 |
| Trunk | 0.47 | 0.69 | 0.41 | 0.93 | 0.78 | 0.79 | 1.00 | 0.58 | -0.55 |
| Speed | 0.97 | 0.86 | 0.97 | 0.74 | 0.82 | 0.78 | 0.58 | 1.00 | -0.97 |
| Acceleration | -0.97 | -0.86 | -0.96 | -0.71 | -0.82 | -0.76 | -0.55 | -0.97 | 1.00 |
dt2 <- dt[,-4]
data_numeric2 <- dt2[, !colnames(dt2) %in% c('Name')]
data_scaled <- scale(data_numeric2)
dist_matrix <- dist(data_scaled, method = 'euclidean')
hc <- hclust(dist_matrix, method = 'single')
print(hc)
##
## Call:
## hclust(d = dist_matrix, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 15
within_cluster_variation <- hc$height
print(within_cluster_variation)
## [1] 0.5965032 0.6456749 0.6777943 0.7015980 0.7519380 0.9950021 1.1507777
## [8] 1.1725207 1.2841234 1.4157920 1.5810159 1.7481772 2.2640748 3.1028633
plot(hc, labels = dt2$Name,
main = 'Hierarchical Clustering with Single Linkage',
hang = -1,
cex = 0.6)
# Step 3
pca_result <- prcomp(data_scaled, center = TRUE, scale. = TRUE)
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
plot(var_explained, xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = 'b', pch = 19, main = "Scree Plot")
abline(v = which.max(diff(diff(var_explained))), col = "red", lty = 2)
# Adding a cumulative variance explained line
var_explained_cumulative <- cumsum(var_explained)
lines(var_explained_cumulative, col = "blue", lty = 2)
legend("topright", legend = c("Variance Explained", "Cumulative Variance"), col = c("black", "blue"), lty = 1:2)
fviz_eig(pca_result)
# Step 4
K <- 2
kM <- kmeans(data_scaled, centers = K)
print(kM)
## K-means clustering with 2 clusters of sizes 7, 8
##
## Cluster means:
## Displacement Moment Length Width Weight Trunk
## 1 -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
## 2 0.7444185 0.8507027 0.7955259 0.7390952 0.8213619 0.7173000
## Speed Acceleration
## 1 -0.8854012 0.8377322
## 2 0.7747260 -0.7330157
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 9.385799 20.608874
## (between_SS / total_SS = 73.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
autoplot(kM, data_scaled, frame = TRUE)
km_clusters <- kM$cluster
rownames(data_scaled) <- dt2$Name
print(data_scaled)
## Displacement Moment Length Width
## Kia Picanto 1.1. Start -1.12322877 -1.1722023 -1.73892815 -2.3029338
## Suzuki Splash 1.0 -1.23375861 -1.2426956 -1.33005364 -1.1127268
## Renault Clio 1.0 -1.04585789 -1.0916386 -0.71447035 -0.5666319
## Dacia Sandero 1.6 -0.49443681 -0.8600178 -0.63723850 -0.1885661
## Fiat Grande Punto 1.4 -0.49443681 -0.7391722 -0.71447035 -0.5666319
## Peugot 207 1.4 -0.78672682 -0.8096655 -0.61452325 -0.1605613
## Renault Clio 1.6 -0.77690195 -0.8902292 -0.61452325 -1.0147098
## Porsche Cayman 1.70142263 1.2749211 0.09192105 0.5815678
## Nissan 350Z 1.83897088 1.4058371 0.03286140 0.7776019
## Mercedes c200 CDI 0.18102331 0.5699884 0.66888841 0.1474923
## VW Passat Variant 2.0 -0.04003636 1.0735117 1.07549140 0.8476141
## Skoda Octavia 2.0 -0.04003636 1.0735117 0.61664334 0.1334899
## Mercedes E280 1.22246000 0.8721024 1.25267035 0.8756189
## Audi A6 2.4 0.48191009 0.1671697 1.39804796 1.3376993
## BMW 525i 0.60963346 0.3685791 1.22768358 1.2116774
## Weight Trunk Speed Acceleration
## Kia Picanto 1.1. Start -1.71444790 -1.6085559 -1.30452598 1.55089812
## Suzuki Splash 1.0 -1.21133991 -1.2799724 -1.14735417 1.41777382
## Renault Clio 1.0 -0.77475859 -0.5712628 -0.96398707 0.98511984
## Dacia Sandero 1.6 -0.95770695 -0.3650928 -0.78061996 0.35277940
## Fiat Grande Punto 1.4 -0.52528355 -0.5712628 -0.70203406 0.48590370
## Peugot 207 1.4 -0.52944147 -0.6872335 -0.62344816 0.75215231
## Renault Clio 1.6 -0.85791694 -0.6550194 -0.67583876 0.31949833
## Porsche Cayman -0.00554389 0.2147605 1.86510541 -1.67736621
## Nissan 350Z 1.11709378 -0.9127320 1.21022289 -1.54424190
## Mercedes c200 CDI 1.09630419 0.6979715 0.11002026 0.11981187
## VW Passat Variant 2.0 1.05888294 1.3615813 -0.07334684 0.01996865
## Skoda Octavia 2.0 0.34787908 1.3100388 0.08382496 -0.24627996
## Mercedes E280 1.32498965 1.0523263 1.21022289 -1.04502577
## Audi A6 2.4 0.76367081 1.0909832 0.71251218 -0.51252856
## BMW 525i 0.86761874 0.9234700 1.07924639 -0.97846362
## attr(,"scaled:center")
## Displacement Moment Length Width Weight Trunk
## 2000.6000 213.4000 4300.5333 1759.4667 1341.3333 376.6667
## Speed Acceleration
## 203.8000 10.4400
## attr(,"scaled:scale")
## Displacement Moment Length Width Weight Trunk
## 814.259769 99.300266 440.232871 71.416151 240.505024 155.211683
## Speed Acceleration
## 38.174786 3.004711
fviz_cluster(list(data = data_scaled, cluster = km_clusters))
kM$centers
## Displacement Moment Length Width Weight Trunk
## 1 -0.8507639 -0.9722316 -0.9091725 -0.8446802 -0.9386993 -0.8197714
## 2 0.7444185 0.8507027 0.7955259 0.7390952 0.8213619 0.7173000
## Speed Acceleration
## 1 -0.8854012 0.8377322
## 2 0.7747260 -0.7330157
kable(kM$centers, caption = "Initial Cluster Centers", digits = 2)
| Displacement | Moment | Length | Width | Weight | Trunk | Speed | Acceleration |
|---|---|---|---|---|---|---|---|
| -0.85 | -0.97 | -0.91 | -0.84 | -0.94 | -0.82 | -0.89 | 0.84 |
| 0.74 | 0.85 | 0.80 | 0.74 | 0.82 | 0.72 | 0.77 | -0.73 |
kable(round(kM$centers, 2), caption = "Initial Clusters Centers", format = "html", digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
column_spec(1, border_right = TRUE, extra_css = "padding-right:10px;") %>%
column_spec(2, border_right = TRUE, extra_css = "padding-right:10px;")
| Displacement | Moment | Length | Width | Weight | Trunk | Speed | Acceleration |
|---|---|---|---|---|---|---|---|
| -0.85 | -0.97 | -0.91 | -0.84 | -0.94 | -0.82 | -0.89 | 0.84 |
| 0.74 | 0.85 | 0.80 | 0.74 | 0.82 | 0.72 | 0.77 | -0.73 |
clust_2 <- cutree(hc,2)
print(clust_2)
## [1] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
clust_2 <- as.factor(clust_2)
# run anova on all the variables
anova_tb <- aov(cbind(dt2$Displacement,
dt2$Moment,
dt2$Length,
dt2$Width,
dt2$Weight,
dt2$Trunk,
dt2$Speed,
dt2$Acceleration) ~ clust_2, data = dt2)
# get a summary of anova results
summary(anova_tb)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 6298591 6298591 27.443 0.00016 ***
## Residuals 13 2983674 229513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 122332 122332 101.19 1.68e-07 ***
## Residuals 13 15716 1209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 2102601 2102601 44.76 1.493e-05 ***
## Residuals 13 610669 46975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 47761 47761 26.262 0.0001951 ***
## Residuals 13 23642 1819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 668961 668961 61.749 2.719e-06 ***
## Residuals 13 140837 10834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 212488 212488 22.137 0.0004115 ***
## Residuals 13 124782 9599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 14994.5 14994 36.045 4.419e-05 ***
## Residuals 13 5407.9 416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 83.160 83.160 25.004 0.0002428 ***
## Residuals 13 43.236 3.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary_df <- summary(anova_tb)
print(summary_df)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 6298591 6298591 27.443 0.00016 ***
## Residuals 13 2983674 229513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 122332 122332 101.19 1.68e-07 ***
## Residuals 13 15716 1209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 2102601 2102601 44.76 1.493e-05 ***
## Residuals 13 610669 46975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 47761 47761 26.262 0.0001951 ***
## Residuals 13 23642 1819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 668961 668961 61.749 2.719e-06 ***
## Residuals 13 140837 10834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 212488 212488 22.137 0.0004115 ***
## Residuals 13 124782 9599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 14994.5 14994 36.045 4.419e-05 ***
## Residuals 13 5407.9 416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## clust_2 1 83.160 83.160 25.004 0.0002428 ***
## Residuals 13 43.236 3.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cluster_means <- aggregate(. ~ clust_2, data = data_numeric2, FUN = mean)
print(cluster_means)
## clust_2 Displacement Moment Length Width Weight Trunk Speed
## 1 1 1307.857 116.8571 3900.286 1699.143 1115.571 249.4286 170.000
## 2 2 2606.750 297.8750 4650.750 1812.250 1538.875 488.0000 233.375
## Acceleration
## 1 12.95714
## 2 8.23750