Za avtomobile ste pridobili množico karakteristik. Ali jih lahko povzamete z manjšim številom novo ustvarjenih spremenljivk?
library(readxl)
# Import the car data from the Excel workbook and store it as a plain
# data.frame.
podatki <- as.data.frame(read_xlsx(path = "./Avtomobili.xlsx"))
# Preview the first six rows.
head(podatki, n = 6)
## Znamka Tip Sedezi Motor KW Navor Dolzina Sirina Medosna
## 1 Renault MEGANE1,9DT 5 1870 70 176 413 170 258
## 2 Kia PRIDE1,3 5 1324 47 135 356 160 230
## 3 Ford SCORPIO2,5TD 5 2500 92 293 483 176 277
## 4 Toyota LEUSLS400 5 3969 194 365 500 183 285
## 5 Peugeot 1061,6 5 1587 65 135 368 159 239
## 6 Opel FRONTERA2,2 5 2198 100 202 469 176 276
## Teza Pospesek Poraba
## 1 1130 12.3 6.3
## 2 795 13.7 6.7
## 3 1545 11.1 11.2
## 4 1680 7.5 9.6
## 5 895 12.2 6.7
## 6 1790 13.6 10.5
Opis spremenljivk:
# Drop the first two columns (Znamka, Tip) so that only the numeric
# characteristics remain for the principal component analysis.
podatki_MGK <- podatki[, -(1:2)]
library(pastecs)
# Descriptive statistics (basic block switched off), rounded to two decimals.
round(stat.desc(podatki_MGK, basic = FALSE), digits = 2)
## Sedezi Motor KW Navor Dolzina Sirina Medosna
## median 5.00 1926.50 84.00 171.50 445.00 172.00 259.00
## mean 4.95 2128.74 93.98 191.66 432.83 172.80 258.92
## SE.mean 0.08 85.53 5.46 8.97 4.09 0.85 1.68
## CI.mean.0.95 0.15 169.72 10.83 17.80 8.12 1.70 3.33
## var 0.59 731602.19 2977.82 8043.48 1675.94 72.99 280.94
## std.dev 0.77 855.34 54.57 89.69 40.94 8.54 16.76
## coef.var 0.16 0.40 0.58 0.47 0.09 0.05 0.06
## Teza Pospesek Poraba
## median 1242.50 11.30 8.75
## mean 1280.20 12.06 10.11
## SE.mean 30.92 0.34 0.42
## CI.mean.0.95 61.34 0.68 0.84
## var 95580.26 11.84 17.75
## std.dev 309.16 3.44 4.21
## coef.var 0.24 0.29 0.42
# Correlation matrix of all numeric characteristics.
R <- cor(x = podatki_MGK)
library(psych)
# Plot the correlation matrix.
corPlot(r = R)
#### Korelacijska matrika je dimenzije 10x10. Ali so podatki primerni za
izvedbo MGK? Analiziramo korelacijsko matriko, kjer si želimo korelacije
vsaj 0,3 po absolutni vrednosti (dovolj visoke korelacije). V tem primeru so
podatki primerni. Pri pospešku želimo čim manj sekund, zato so njegove
korelacije z ostalimi spremenljivkami negativne.
library(psych)
# Bartlett's test on the correlation matrix; the sample size is the number
# of rows in the analysed data.
cortest.bartlett(R = R, n = dim(podatki_MGK)[1])
## $chisq
## [1] 1192.79
##
## $p.value
## [1] 2.003247e-220
##
## $df
## [1] 45
# Number of observations (cars) in the analysed data.
dim(podatki_MGK)[1]
## [1] 100
# Determinant of the correlation matrix.
det(x = R)
## [1] 3.447879e-06
library(psych)
# Kaiser-Meyer-Olkin measure: overall and per variable.
KMO(r = R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.84
## MSA for each item =
## Sedezi Motor KW Navor Dolzina Sirina Medosna
## 0.61 0.86 0.80 0.88 0.82 0.87 0.77
## Teza Pospesek Poraba
## 0.88 0.71 0.95
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following object is masked from 'package:psychTools':
##
## recode
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Remove the Sedezi column from the analysed variables.
# NOTE(review): presumably dropped because it has the lowest MSA in the KMO
# output - confirm.
podatki_MGK <- select(podatki_MGK, -Sedezi)
library(FactoMineR)
# Principal component analysis on unit-scaled variables; the built-in plots
# are suppressed.
komponente <- PCA(
  X = podatki_MGK,
  graph = FALSE,
  scale.unit = TRUE
)
library(factoextra)
# Eigenvalues with individual and cumulative percentages of variance.
get_eigenvalue(X = komponente)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 5.96319357 66.2577063 66.25771
## Dim.2 1.47214086 16.3571207 82.61483
## Dim.3 0.75600371 8.4000412 91.01487
## Dim.4 0.30775458 3.4194953 94.43436
## Dim.5 0.17768306 1.9742562 96.40862
## Dim.6 0.15620416 1.7356018 98.14422
## Dim.7 0.09243133 1.0270147 99.17124
## Dim.8 0.03979512 0.4421680 99.61340
## Dim.9 0.03479361 0.3865957 100.00000
library(factoextra)
# Scree plot of the eigenvalues, with the value labelled on each bar.
fviz_eig(
  X = komponente,
  choice = "eigenvalue",
  addlabels = TRUE,
  main = "Scree plot",
  xlab = "Principal component",
  ylab = "Eigenvalue"
)
#### Narišemo diagram lastnih vrednosti in iščemo prelom. Najbolj očiten
prelom je pri 2. komponenti (obdržimo 1 GK), drugi prelom je pri 4.
komponenti (obdržimo 3 GK). Po tem kriteriju torej obdržimo 1 ali 3 glavne
komponente.
library(psych)
# Parallel analysis for principal components only (fa = "pc"), with
# sim = FALSE.
fa.parallel(
  x = podatki_MGK,
  fa = "pc",
  sim = FALSE
)
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
library(FactoMineR)
# Refit the PCA keeping two components, the number suggested by the
# parallel analysis.
komponente <- PCA(
  X = podatki_MGK,
  ncp = 2,
  graph = FALSE,
  scale.unit = TRUE
)
# Print the overview of the result object.
komponente
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 100 individuals, described by 9 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Correlations between the original variables and the retained components.
print(komponente[["var"]][["cor"]])
## Dim.1 Dim.2
## Motor 0.9178408 -0.2319550
## KW 0.8718298 -0.4285367
## Navor 0.9328963 -0.2476584
## Dolzina 0.8155532 0.4498017
## Sirina 0.8963112 0.2454027
## Medosna 0.6839111 0.6248435
## Teza 0.8462009 0.3525296
## Pospesek -0.5304962 0.5759235
## Poraba 0.7460978 -0.2538127
# Contribution of each variable to the retained components.
print(komponente[["var"]][["contrib"]])
## Dim.1 Dim.2
## Motor 14.127190 3.654752
## KW 12.746310 12.474604
## Navor 14.594453 4.166361
## Dolzina 11.153873 13.743356
## Sirina 13.472207 4.090812
## Medosna 7.843690 26.521200
## Teza 12.007927 8.441931
## Pospesek 4.719388 22.530986
## Poraba 9.334962 4.375999
library(factoextra)
# Plot of the variables on the first two components; repel = TRUE keeps the
# labels from overlapping.
fviz_pca_var(X = komponente, repel = TRUE)
#### 1. GK meri splošne lastnosti avtomobila. Vse spremenljivke so na desni,
razen pospeška, saj pri pospešku manjša vrednost pomeni boljši avto. 2. GK je
kontrast med lastnostmi motorja in dimenzijami avta.
library(factoextra)
# Biplot showing the individual cars and the variables together.
fviz_pca_biplot(X = komponente)
#### Če gledamo vodorovno: avtomobili, ki so daleč na desni, so zelo dobri,
tisti daleč na levi pa zelo slabi. Npr. avto 99 ima veliko boljši motor kot
zunanje karakteristike, avto 49 pa veliko boljše zunanje karakteristike kot
motor. Avto 44 je zelo dober avto brez izrazitega kontrasta, avto 35 pa zelo
slab avto, prav tako brez izrazitih kontrastov.
# Component scores of the first six cars.
head(komponente[["ind"]][["coord"]], n = 6)
## Dim.1 Dim.2
## 1 -1.099729 0.05573836
## 2 -3.474045 -1.34179843
## 3 1.876879 0.85114096
## 4 4.344504 -0.03056201
## 5 -2.825345 -1.26816527
## 6 1.341276 1.52978259
# Component scores of car number 99.
komponente[["ind"]][["coord"]][99, ]
## Dim.1 Dim.2
## 8.532164 -4.865681
# Standardise the analysed variables so car 99 can be inspected in units of
# standard deviation.
# NOTE(review): scale() divides by the sample SD (n - 1); FactoMineR's PCA
# may standardise slightly differently, so these z-scores might not match
# the values used inside PCA() exactly - confirm.
podatki_MGK_std <- scale(x = podatki_MGK)
# Standardised values of car number 99.
podatki_MGK_std[99, ]
## Motor KW Navor Dolzina Sirina Medosna
## 3.9110411 4.8199187 4.2073716 0.5415473 2.3643948 -0.5321765
## Teza Pospesek Poraba
## 1.3255248 -2.2264082 6.0504283
# Append the scores on the two retained components to the original data.
podatki[["PC1"]] <- komponente[["ind"]][["coord"]][, "Dim.1"]
podatki[["PC2"]] <- komponente[["ind"]][["coord"]][, "Dim.2"]
# Show the first three rows including the new score columns.
head(podatki, n = 3)
## Znamka Tip Sedezi Motor KW Navor Dolzina Sirina Medosna
## 1 Renault MEGANE1,9DT 5 1870 70 176 413 170 258
## 2 Kia PRIDE1,3 5 1324 47 135 356 160 230
## 3 Ford SCORPIO2,5TD 5 2500 92 293 483 176 277
## Teza Pospesek Poraba PC1 PC2
## 1 1130 12.3 6.3 -1.099729 0.05573836
## 2 795 13.7 6.7 -3.474045 -1.34179843
## 3 1545 11.1 11.2 1.876879 0.85114096
# Correlation between the two component scores.
with(podatki, cor(x = PC1, y = PC2))
## [1] 4.601682e-16