Background

In this paper I apply Multidimensional Scaling (MDS) and Principal Component Analysis (PCA) to data web-scraped from Otomoto.pl (the scraping procedure is described in my other paper).

MDS

MDS is a means of visualizing the level of similarity between individual cases of a dataset. For my dataset I have prepared a distance matrix which shows the distances between the individual car makes (stored as "mark" in the data).

library(factoextra)
library(ggplot2)
library(MASS)
#### MDS ####
# Build the standardised features used for the distance computation.
# Each mean_* column is square-root transformed and then centred and scaled
# to unit variance with scale(). scale() returns a one-column matrix that
# carries "scaled:center"/"scaled:scale" attributes; as.numeric() strips them
# so that df receives plain numeric columns instead of embedded matrices.
df$price_norm <- as.numeric(scale(sqrt(df$mean_price)))
df$mileage_norm <- as.numeric(scale(sqrt(df$mean_mileage)))
df$capacity_norm <- as.numeric(scale(sqrt(df$mean_capacity)))
df$fuel_norm <- as.numeric(scale(sqrt(df$mean_fuel)))
df$age_norm <- as.numeric(scale(sqrt(df$mean_age)))

# Pairwise Manhattan (city-block) distances between the rows of df,
# computed on the five standardised features.
odleglosci <- dist(
  df[, c("price_norm", "mileage_norm", "capacity_norm", "fuel_norm", "age_norm")],
  method = "manhattan"
)
# Full symmetric distance matrix between observations (rows of df).
mds <- as.matrix(odleglosci)
##                     BMW Chevrolet  Citroën     Dacia     Fiat     Ford
## BMW            0.000000  9.616018 6.354678 12.373918 8.564652 6.244998
## Chevrolet      9.616018  0.000000 5.662551  4.495851 5.263566 5.137377
## Citroën        6.354678  5.662551 0.000000  6.653350 3.597366 1.058944
## Dacia         12.373918  4.495851 6.653350  0.000000 6.023492 6.320317
## Fiat           8.564652  5.263566 3.597366  6.023492 0.000000 3.503067
## Ford           6.244998  5.137377 1.058944  6.320317 3.503067 0.000000
## Honda          9.941816  8.126865 7.227153  5.023346 6.244411 7.688317
## Hyundai       10.261828  6.770747 5.329219  6.328049 1.731854 5.234921
## Kia            6.966314  4.845367 2.091764  6.120519 1.686009 1.997465
## Mazda          5.432980  6.748353 3.113083  6.940938 4.807182 3.477985
## Mercedes-Benz  6.354572  9.299320 4.259979  6.770785 5.352322 4.285372
## Mitsubishi     5.078434  9.115038 7.694965  9.151962 8.055290 8.059867
## Nissan        15.351477  7.760671 9.839297  4.321412 9.609835 9.906660
## Opel           7.620269  4.269609 1.899702  5.553262 2.709782 1.411602
## Peugeot        8.727797  4.604846 2.373119  5.646950 1.902832 2.482799
## Renault        6.263399  4.899024 1.662535  7.439761 3.670498 2.069753
## Seat           6.562388  6.199150 2.111976  5.811530 2.967488 2.573140
## Škoda          8.797237  6.191452 3.712385  4.141593 4.952049 3.779749
## Suzuki        10.019995  6.575657 5.166917  2.502773 3.948438 5.392344
## Toyota         7.979812  6.278586 3.130860  4.394105 2.331589 3.427566
## Volkswagen     3.883450  8.744842 3.620883  8.540610 4.797845 3.607465
## Volvo          4.865547  8.837771 4.326481 10.979832 7.170567 4.814581
##                  Honda   Hyundai      Kia    Mazda Mercedes-Benz
## BMW           9.941816 10.261828 6.966314 5.432980      6.354572
## Chevrolet     8.126865  6.770747 4.845367 6.748353      9.299320
## Citroën       7.227153  5.329219 2.091764 3.113083      4.259979
## Dacia         5.023346  6.328049 6.120519 6.940938      6.770785
## Fiat          6.244411  1.731854 1.686009 4.807182      5.352322
## Ford          7.688317  5.234921 1.997465 3.477985      4.285372
## Honda         0.000000  6.850029 6.248702 5.614747      6.269042
## Hyundai       6.850029  0.000000 3.295513 5.804852      5.415134
## Kia           6.248702  3.295513 0.000000 3.121173      4.453953
## Mazda         5.614747  5.804852 3.121173 0.000000      3.039675
## Mercedes-Benz 6.269042  5.415134 4.453953 3.039675      0.000000
## Mitsubishi    6.984513  9.052961 6.953505 5.366015      8.193517
## Nissan        5.667824  9.482653 9.706862 9.918497      9.465427
## Opel          7.528936  4.430992 1.435458 3.852155      5.029711
## Peugeot       7.291765  3.634686 1.857536 4.978709      5.702125
## Renault       7.955383  5.402352 2.164896 2.860116      5.379318
## Seat          5.115177  4.308932 2.021333 3.102105      4.276476
## Škoda         6.369734  5.049540 4.146416 3.862384      3.828201
## Suzuki        2.780555  4.069473 3.952729 4.587015      4.268020
## Toyota        4.317700  3.104931 1.987950 2.699921      3.020733
## Volkswagen    6.108509  6.428520 3.926205 3.611057      3.747583
## Volvo         8.788412  8.867742 5.572229 4.038894      5.354349
##               Mitsubishi    Nissan     Opel  Peugeot   Renault     Seat
## BMW             5.078434 15.351477 7.620269 8.727797  6.263399 6.562388
## Chevrolet       9.115038  7.760671 4.269609 4.604846  4.899024 6.199150
## Citroën         7.694965  9.839297 1.899702 2.373119  1.662535 2.111976
## Dacia           9.151962  4.321412 5.553262 5.646950  7.439761 5.811530
## Fiat            8.055290  9.609835 2.709782 1.902832  3.670498 2.967488
## Ford            8.059867  9.906660 1.411602 2.482799  2.069753 2.573140
## Honda           6.984513  5.667824 7.528936 7.291765  7.955383 5.115177
## Hyundai         9.052961  9.482653 4.430992 3.634686  5.402352 4.308932
## Kia             6.953505  9.706862 1.435458 1.857536  2.164896 2.021333
## Mazda           5.366015  9.918497 3.852155 4.978709  2.860116 3.102105
## Mercedes-Benz   8.193517  9.465427 5.029711 5.702125  5.379318 4.276476
## Mitsubishi      0.000000 12.129521 8.388963 8.605931  7.127958 6.467097
## Nissan         12.129521  0.000000 9.139605 9.233293 11.026104 8.789089
## Opel            8.388963  9.139605 0.000000 1.457413  2.431411 2.413759
## Peugeot         8.605931  9.233293 1.457413 0.000000  2.896858 2.601956
## Renault         7.127958 11.026104 2.431411 2.896858  0.000000 3.120316
## Seat            6.467097  8.789089 2.413759 2.601956  3.120316 0.000000
## Škoda           8.029556  6.554240 3.012694 3.877957  4.899192 2.581056
## Suzuki          6.798039  5.775606 5.232963 4.995792  5.659410 4.146630
## Toyota          5.977522  7.744510 3.268184 3.031013  3.694631 2.110572
## Volkswagen      6.485740 11.518170 4.475233 5.147647  4.824840 2.875161
## Volvo           8.369616 13.957391 6.226183 6.691630  3.938748 5.168302
##                  Škoda    Suzuki   Toyota Volkswagen     Volvo
## BMW           8.797237 10.019995 7.979812   3.883450  4.865547
## Chevrolet     6.191452  6.575657 6.278586   8.744842  8.837771
## Citroën       3.712385  5.166917 3.130860   3.620883  4.326481
## Dacia         4.141593  2.502773 4.394105   8.540610 10.979832
## Fiat          4.952049  3.948438 2.331589   4.797845  7.170567
## Ford          3.779749  5.392344 3.427566   3.607465  4.814581
## Honda         6.369734  2.780555 4.317700   6.108509  8.788412
## Hyundai       5.049540  4.069473 3.104931   6.428520  8.867742
## Kia           4.146416  3.952729 1.987950   3.926205  5.572229
## Mazda         3.862384  4.587015 2.699921   3.611057  4.038894
## Mercedes-Benz 3.828201  4.268020 3.020733   3.747583  5.354349
## Mitsubishi    8.029556  6.798039 5.977522   6.485740  8.369616
## Nissan        6.554240  5.775606 7.744510  11.518170 13.957391
## Opel          3.012694  5.232963 3.268184   4.475233  6.226183
## Peugeot       3.877957  4.995792 3.031013   5.147647  6.691630
## Renault       4.899192  5.659410 3.694631   4.824840  3.938748
## Seat          2.581056  4.146630 2.110572   2.875161  5.168302
## Škoda         0.000000  4.460631 2.852207   4.963930  7.403151
## Suzuki        4.460631  0.000000 2.040183   6.186688  8.625909
## Toyota        2.852207  2.040183 0.000000   4.146505  6.585726
## Volkswagen    4.963930  6.186688 4.146505   0.000000  4.065326
## Volvo         7.403151  8.625909 6.585726   4.065326  0.000000


After plotting the data we can easily spot the patterns that emerge. In the centre, within a small area, there are two groups: (Kia, Citroen, Fiat, Hyundai, Renault, Ford) and (Volvo, Mercedes, Mazda). There are also outlying makes that do not seem to belong to either of them.

# Classical (metric) MDS on the distance matrix, keeping two dimensions.
fit <- cmdscale(mds, eig = TRUE, k = 2)
x <- fit$points[, 1]
y <- fit$points[, 2]
# Point labels; assumes df rows are in the same order as the rows of mds.
mark <- df$mark

# Mirror both axes. This only changes the orientation of the picture;
# distances between points are unaffected.
x <- -x
y <- -y

# Labels are drawn to the right of each point (pos = 4), so the right-hand
# x limit is widened by 15% of the data range to keep the labels of the
# right-most points inside the plotting region.
plot(x, y, pch = 19, xlim = range(x) + c(0, 0.15 * diff(range(x))))
text(x, y, pos = 4, labels = mark)

PCA

Principal Component Analysis is used to reduce a large set of variables to a small set that still contains most of the information in the large set.


The graph generated from the data shows the individuals. The same rule applies here as in the MDS graph: similar makes are grouped together.

# PCA on the `pca` table (prepared elsewhere). `scale. = TRUE` standardises
# every variable to unit variance before the decomposition; the argument is
# spelled out in full instead of relying on partial matching of `scale`.
res.pca <- prcomp(pca, scale. = TRUE)

# Scree plot of the principal components.
fviz_eig(res.pca)

# Plot of individuals: observations that are similar to each other end up
# close together.
fviz_pca_ind(
  res.pca,
  col.ind = "cos2", # Color by the quality of representation
  gradient.cols = c("blue", "red", "green"),
  repel = TRUE # Avoid text overlapping
)


The next graph shows the variables that were used in the analysis. Variables that are positively correlated point to the same side of the plot. In this example we can see that mean_price and mean_capacity are positively correlated.

# Plot of variables for the fitted PCA. Arrows are colored by each
# variable's contribution to the principal components, and labels are
# repelled so they do not overlap.
fviz_pca_var(
  res.pca,
  repel = TRUE,
  gradient.cols = c("blue", "red", "green"),
  col.var = "contrib"
)

# Biplot of individuals and variables using hex colors.
fviz_pca_biplot(res.pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
)

# Biplot overlaying individuals (red) and variables (blue) for the fitted
# PCA; labels are repelled so they do not overlap.
fviz_pca_biplot(
  res.pca,
  col.ind = "red",
  col.var = "blue",
  repel = TRUE
)

The last plot is a biplot showing the car makes (red) together with the variables (blue). Interestingly, the makes regarded as medium-priced are negatively correlated with the variables.


Location

In this chapter I apply the methods described above to the location variable. We can see how the variables are correlated with the locations. Moreover, we can check which locations are similar to each other and form patterns.

library(factoextra)


# PCA for locations.
# NOTE(review): this reuses the object name `pca`; presumably it holds the
# location-level table at this point -- confirm where it is reassigned.
# `scale. = TRUE` standardises every variable to unit variance; the argument
# is spelled out in full instead of relying on partial matching of `scale`.
res.pca <- prcomp(pca, scale. = TRUE)

# Scree plot of the principal components.
fviz_eig(res.pca)

# Plot of individuals: observations that are similar to each other end up
# close together.
fviz_pca_ind(
  res.pca,
  col.ind = "cos2", # Color by the quality of representation
  gradient.cols = c("blue", "red", "green"),
  repel = TRUE # Avoid text overlapping
)


# Biplot overlaying individuals (red) and variables (blue).
fviz_pca_biplot(
  res.pca,
  repel = TRUE,
  col.var = "blue", # Variables color
  col.ind = "red" # Individuals color
)