# CARE: in this dataset some variables are stored as num but should be
# factors, such as cyl...
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# A subset of the numerical variables, picked by name
# (the same columns as positions 1 and 3 to 7 of mtcars).
X <- mtcars[, c("mpg", "disp", "hp", "drat", "wt", "qsec")]
summary(X)
## mpg disp hp drat
## Min. :10.40 Min. : 71.1 Min. : 52.0 Min. :2.760
## 1st Qu.:15.43 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080
## Median :19.20 Median :196.3 Median :123.0 Median :3.695
## Mean :20.09 Mean :230.7 Mean :146.7 Mean :3.597
## 3rd Qu.:22.80 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920
## Max. :33.90 Max. :472.0 Max. :335.0 Max. :4.930
## wt qsec
## Min. :1.513 Min. :14.50
## 1st Qu.:2.581 1st Qu.:16.89
## Median :3.325 Median :17.71
## Mean :3.217 Mean :17.85
## 3rd Qu.:3.610 3rd Qu.:18.90
## Max. :5.424 Max. :22.90
Now look at which of your variables are correlated or anti-correlated.
# Scatterplot matrix of every pair of variables in X.
plot(X)

# Correlation matrix of X drawn as ellipses, with the diagonal left out.
library(corrplot)
## corrplot 0.95 loaded
corrplot(
  cor(as.matrix(X)),
  method = "ellipse",
  diag = FALSE,
  is.corr = TRUE
)

# Pairs panel of X from the psych package.
library(psych)
pairs.panels(X)
(Although nothing above looks truly collinear, check for inflated standard errors of the regression coefficients.)
# PCA on centered and scaled variables; the argument is spelled `scale.`
# in full rather than relying on partial matching of `scale`.
prcomp(X, center = TRUE, scale. = TRUE)
## Standard deviations (1, .., p=6):
## [1] 2.0463129 1.0714999 0.5773705 0.3928874 0.3532648 0.2279872
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5 PC6
## mpg -0.4586835 -0.05867609 0.19479235 -0.78205878 0.1111533 -0.35249327
## disp 0.4660354 0.06065296 -0.09688406 -0.60001871 -0.2946297 0.56825752
## hp 0.4258534 -0.36147576 -0.14613554 -0.12301873 0.8057408 -0.04771555
## drat -0.3670963 -0.43652537 -0.80049152 -0.02259258 -0.1437714 0.11277675
## wt 0.4386179 0.29953457 -0.41776208 -0.10438337 -0.2301541 -0.69246040
## qsec -0.2528320 0.76284877 -0.34059066 -0.04268124 0.4218755 0.24152663
# Store the centered and scaled PCA so it can be summarised and plotted.
pr <- prcomp(X, center = TRUE, scale. = TRUE)
summary(pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0463 1.0715 0.57737 0.39289 0.3533 0.22799
## Proportion of Variance 0.6979 0.1913 0.05556 0.02573 0.0208 0.00866
## Cumulative Proportion 0.6979 0.8892 0.94481 0.97054 0.9913 1.00000
# Two axes are enough: 88% of the variance is explained by the first
# two principal components.
screeplot(pr)
# Or approximately, reading the variances off the screeplot
# (presumably about 4 and 1 out of a total of 6 scaled variables).
(4 + 1) / 6
## [1] 0.8333333
# CARE: on a biplot, the correlation circle of the variables is not
# projected very reliably.
biplot(pr, cex = 0.7)

library(FactoMineR)

# princomp(): do not forget to scale, otherwise the covariance matrix is used.
# NOTE(review): princomp() divides by n rather than n - 1, which presumably
# explains why its standard deviations differ slightly from prcomp()'s.
Facto <- princomp(scale(X))
summary(Facto)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.0140855 1.0546249 0.56827746 0.38669985 0.34770121
## Proportion of Variance 0.6978994 0.1913520 0.05555944 0.02572676 0.02079933
## Cumulative Proportion 0.6978994 0.8892514 0.94481088 0.97053763 0.99133697
## Comp.6
## Standard deviation 0.224396671
## Proportion of Variance 0.008663031
## Cumulative Proportion 1.000000000
# PCA with FactoMineR ----
# PCA() on unit-scaled variables, keeping all 6 components; this is the
# easy way to get the PCA plots.
respca <- PCA(X, scale.unit = TRUE, ncp = 6)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
DISP, QSEC and DRAT are sufficient to build a predictive model. They are at roughly 90 degrees to each other (orthogonal on the PC axes), so they avoid collinearity, perform dimension reduction, and capture more than 88% of your variance. Note that drat is slightly less well projected on PC1–PC2: its arrow does not reach the edge of the correlation circle (+1; −1), so it may be worth digging into PC3….