Invoking required packages
library(readxl)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(foreign)
library(reshape2)
library(formatR)
library(UsingR)
## Warning: package 'UsingR' was built under R version 3.3.2
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.3.2
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
library(psych)
## Warning: package 'psych' was built under R version 3.3.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:UsingR':
##
## galton, headtail
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(rmarkdown)
# Structure and Summary for red wine
getwd()
## [1] "C:/Users/Amit/Desktop/MSDA/MSDA/IS6713"
wine_white <- read.csv("C:/Users/Amit/Desktop/MSDA/MSDA/IS6713/winequality-white.csv", sep= ";")
View(wine_white)
str(wine_white)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
summary(wine_white)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
White wine data has 4899 observation and quality variable has levels from 3 to 9.
# install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.2
# Scatter plot matrix
plot(wine_white)
wine_corr <- cor(wine_white)
summary(wine_corr)
## fixed.acidity volatile.acidity citric.acid
## Min. :-0.425858 Min. :-0.194723 Min. :-0.16375
## 1st Qu.:-0.065463 1st Qu.:-0.051049 1st Qu.:-0.02584
## Median : 0.002971 Median : 0.002208 Median : 0.09414
## Mean : 0.084004 Mean : 0.065612 Mean : 0.12722
## 3rd Qu.: 0.134635 3rd Qu.: 0.068416 3rd Qu.: 0.12822
## Max. : 1.000000 Max. : 1.000000 Max. : 1.00000
## residual.sugar chlorides free.sulfur.dioxide
## Min. :-0.45063 Min. :-0.36019 Min. :-0.25010
## 1st Qu.:-0.04439 1st Qu.:-0.01004 1st Qu.:-0.01281
## Median : 0.08885 Median : 0.07960 Median : 0.07665
## Mean : 0.17556 Mean : 0.10086 Mean : 0.17288
## 3rd Qu.: 0.32468 3rd Qu.: 0.13550 3rd Qu.: 0.29543
## Max. : 1.00000 Max. : 1.00000 Max. : 1.00000
## total.sulfur.dioxide density pH
## Min. :-0.44889 Min. :-0.780138 Min. :-0.42586
## 1st Qu.: 0.06753 1st Qu.:-0.003063 1st Qu.:-0.11113
## Median : 0.12785 Median : 0.203357 Median :-0.01627
## Mean : 0.21337 Mean : 0.187988 Mean : 0.03157
## 3rd Qu.: 0.43355 3rd Qu.: 0.353128 3rd Qu.: 0.10493
## Max. : 1.00000 Max. : 1.000000 Max. : 1.00000
## sulphates alcohol quality
## Min. :-0.03573 Min. :-0.78014 Min. :-0.30712
## 1st Qu.:-0.01722 1st Qu.:-0.38236 1st Qu.:-0.17973
## Median : 0.05645 Median :-0.09830 Median :-0.05339
## Mean : 0.12167 Mean :-0.07327 Mean : 0.04082
## 3rd Qu.: 0.08951 3rd Qu.: 0.08115 3rd Qu.: 0.06512
## Max. : 1.00000 Max. : 1.00000 Max. : 1.00000
corrplot(wine_corr,method="number")
Correlation table indicates that density and residual sugar has high correlation (0.84) and there is a negative correlation (-.78) between density and alcohol. The correlation between quality and alcohol is 0.44
# Skewness and kurtosis
# install.packages("psych")
library("psych")
describe(wine_white)
## vars n mean sd median trimmed mad min
## fixed.acidity 1 4898 6.85 0.84 6.80 6.82 0.74 3.80
## volatile.acidity 2 4898 0.28 0.10 0.26 0.27 0.09 0.08
## citric.acid 3 4898 0.33 0.12 0.32 0.33 0.09 0.00
## residual.sugar 4 4898 6.39 5.07 5.20 5.80 5.34 0.60
## chlorides 5 4898 0.05 0.02 0.04 0.04 0.01 0.01
## free.sulfur.dioxide 6 4898 35.31 17.01 34.00 34.36 16.31 2.00
## total.sulfur.dioxide 7 4898 138.36 42.50 134.00 136.96 43.00 9.00
## density 8 4898 0.99 0.00 0.99 0.99 0.00 0.99
## pH 9 4898 3.19 0.15 3.18 3.18 0.15 2.72
## sulphates 10 4898 0.49 0.11 0.47 0.48 0.10 0.22
## alcohol 11 4898 10.51 1.23 10.40 10.43 1.48 8.00
## quality 12 4898 5.88 0.89 6.00 5.85 1.48 3.00
## max range skew kurtosis se
## fixed.acidity 14.20 10.40 0.65 2.17 0.01
## volatile.acidity 1.10 1.02 1.58 5.08 0.00
## citric.acid 1.66 1.66 1.28 6.16 0.00
## residual.sugar 65.80 65.20 1.08 3.46 0.07
## chlorides 0.35 0.34 5.02 37.51 0.00
## free.sulfur.dioxide 289.00 287.00 1.41 11.45 0.24
## total.sulfur.dioxide 440.00 431.00 0.39 0.57 0.61
## density 1.04 0.05 0.98 9.78 0.00
## pH 3.82 1.10 0.46 0.53 0.00
## sulphates 1.08 0.86 0.98 1.59 0.00
## alcohol 14.20 6.20 0.49 -0.70 0.02
## quality 9.00 6.00 0.16 0.21 0.01
# Histogram and table to get frquency of different qualities
#hist_1 <- hist(winequality.red$X.quality.)
#hist_1
#print(hist_1$breaks)
# Change number of breaks and add labels
hist_2 <- hist(wine_white$quality, breaks = 5, col ="lightblue", xlab = "Quality ", main= " Histogram of Quality rating for white wine ")
table(wine_white$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
. From describe function output, we observe that skewness coefficient for quality is 0.21 and kurtosis is 0.01.
Let’s load a package called FactoMineR in R to run principal component analysis.
if (!require("FactoMineR")) install.packages("FactoMineR")
## Loading required package: FactoMineR
## Warning: package 'FactoMineR' was built under R version 3.3.2
# Unscaled data
pc_white <- prcomp(wine_white)
pc_white
## Standard deviations:
## [1] 43.949221756 12.979721330 4.643585542 1.147246458 0.828680478
## [6] 0.707396079 0.135607404 0.118811827 0.106986464 0.090589602
## [11] 0.019895607 0.000559343
##
## Rotation:
## PC1 PC2 PC3
## fixed.acidity -1.544525e-03 -9.166733e-03 -1.292446e-02
## volatile.acidity -1.690309e-04 -1.546248e-03 -9.343979e-04
## citric.acid -3.386468e-04 1.403673e-04 -1.257927e-03
## residual.sugar -4.732751e-02 1.493143e-02 -9.951321e-01
## chlorides -9.757940e-05 -7.203906e-05 -7.999827e-05
## free.sulfur.dioxide -2.618723e-01 9.646376e-01 2.628366e-02
## total.sulfur.dioxide -9.638533e-01 -2.626820e-01 4.285064e-02
## density -3.597064e-05 -1.839769e-05 -4.470891e-04
## pH -3.361997e-06 -4.080579e-05 7.022487e-03
## sulphates -3.408882e-04 -3.605330e-04 2.145496e-03
## alcohol 1.250436e-02 6.479656e-03 8.288867e-02
## quality 3.280412e-03 1.099334e-02 9.537000e-03
## PC4 PC5 PC6
## fixed.acidity 0.1244224095 -0.9845307068 -0.0849236772
## volatile.acidity -0.0050464160 0.0038497157 0.0409544532
## citric.acid 0.0029386326 -0.0416636471 -0.0082748751
## residual.sugar -0.0759758326 0.0005949154 0.0378225593
## chlorides 0.0058640653 0.0015129289 -0.0021355214
## free.sulfur.dioxide 0.0108349352 -0.0078862716 0.0049067954
## total.sulfur.dioxide -0.0119772833 0.0017391578 0.0010440067
## density 0.0009775556 -0.0003252625 -0.0005772027
## pH -0.0166565597 0.0754813438 -0.0001749859
## sulphates -0.0050451785 0.0035612543 -0.0104262703
## alcohol -0.8258312591 -0.1521893791 0.5357917699
## quality -0.5441656711 0.0029863201 -0.8380931116
## PC7 PC8 PC9
## fixed.acidity -0.0743389867 -0.0468562413 -4.969682e-03
## volatile.acidity 0.0719119916 -0.2832216699 1.624292e-01
## citric.acid 0.0403006772 0.8767959786 -3.520743e-01
## residual.sugar -0.0062948503 0.0003229402 -1.398574e-04
## chlorides 0.0137426810 0.0142757191 1.509765e-03
## free.sulfur.dioxide -0.0006309199 -0.0007834377 4.943424e-04
## total.sulfur.dioxide 0.0007473199 -0.0000595991 -3.552413e-04
## density -0.0036914690 0.0003273388 1.303056e-04
## pH -0.9356299441 -0.1119938401 -3.122144e-01
## sulphates -0.3343812305 0.3683952830 8.672667e-01
## alcohol 0.0002777270 0.0088509911 -1.928489e-03
## quality 0.0148623442 -0.0166769439 -4.762335e-05
## PC10 PC11 PC12
## fixed.acidity -0.0008578167 2.247396e-03 -7.666015e-04
## volatile.acidity -0.9412187532 -2.484141e-02 -4.697001e-04
## citric.acid -0.3214800154 -2.200332e-02 -3.371246e-04
## residual.sugar 0.0024096653 5.940522e-04 -3.736920e-04
## chlorides -0.0294761271 9.993366e-01 -4.616856e-03
## free.sulfur.dioxide -0.0011675075 -1.036815e-05 6.485294e-06
## total.sulfur.dioxide 0.0006933234 -2.819662e-05 -3.812818e-06
## density -0.0010104654 4.629403e-03 9.999811e-01
## pH -0.0916016835 1.220385e-02 -3.481757e-03
## sulphates 0.0128950512 -1.598781e-03 -1.446541e-03
## alcohol 0.0238683042 6.811356e-03 1.095668e-03
## quality -0.0274365169 6.245904e-04 8.347362e-05
plot(pc_white, type ="barplot")
summary(pc_white)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 43.9492 12.97972 4.64359 1.14725 0.82868 0.70740
## Proportion of Variance 0.9093 0.07931 0.01015 0.00062 0.00032 0.00024
## Cumulative Proportion 0.9093 0.98865 0.99880 0.99942 0.99974 0.99998
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.13561 0.11881 0.10699 0.09059 0.0199 0.0005593
## Proportion of Variance 0.00001 0.00001 0.00001 0.00000 0.0000 0.0000000
## Cumulative Proportion 0.99998 0.99999 1.00000 1.00000 1.0000 1.0000000
biplot(pc_white)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
# Scaled data
# PRINCIPAL COMPONENT ANALYSIS
pc_white_scaled <- prcomp(wine_white, scale = TRUE)
summary(pc_white_scaled)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8294 1.2594 1.1710 1.04157 0.98756 0.96890 0.8771
## Proportion of Variance 0.2789 0.1322 0.1143 0.09041 0.08127 0.07823 0.0641
## Cumulative Proportion 0.2789 0.4111 0.5253 0.61573 0.69701 0.77524 0.8393
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.85082 0.74599 0.58561 0.53302 0.14307
## Proportion of Variance 0.06032 0.04638 0.02858 0.02368 0.00171
## Cumulative Proportion 0.89967 0.94604 0.97462 0.99829 1.00000
names(pc_white_scaled)
## [1] "sdev" "rotation" "center" "scale" "x"
#outputs the mean of variables
pc_white_scaled$center
## fixed.acidity volatile.acidity citric.acid
## 6.85478767 0.27824112 0.33419151
## residual.sugar chlorides free.sulfur.dioxide
## 6.39141486 0.04577236 35.30808493
## total.sulfur.dioxide density pH
## 138.36065741 0.99402738 3.18826664
## sulphates alcohol quality
## 0.48984688 10.51426705 5.87790935
# output the standrad deviation of variables
pc_white_scaled$scale
## fixed.acidity volatile.acidity citric.acid
## 0.843868228 0.100794548 0.121019804
## residual.sugar chlorides free.sulfur.dioxide
## 5.072057784 0.021847968 17.007137325
## total.sulfur.dioxide density pH
## 42.498064554 0.002990907 0.151000600
## sulphates alcohol quality
## 0.114125834 1.230620568 0.885638575
# rotation
pc_white_scaled$rotation
## PC1 PC2 PC3 PC4
## fixed.acidity -0.15690447 0.56066866 -0.20738436 0.03373494
## volatile.acidity -0.02428722 0.01606694 0.52491466 -0.13119747
## citric.acid -0.13294430 0.28938115 -0.44635554 0.32953335
## residual.sugar -0.40605288 -0.03882402 -0.03384313 -0.41615630
## chlorides -0.21754400 0.03691144 0.21471269 0.50961203
## free.sulfur.dioxide -0.27471931 -0.34554881 -0.31297088 -0.14892788
## total.sulfur.dioxide -0.39044148 -0.27232605 -0.12479447 -0.02161841
## density -0.50129557 -0.01773344 0.03196758 -0.10386393
## pH 0.13003701 -0.56714503 0.06848384 0.20410995
## sulphates -0.03364168 -0.24826266 -0.22699505 0.51924489
## alcohol 0.44279498 0.01698188 -0.15887556 -0.13438871
## quality 0.22713722 -0.14603134 -0.48884718 -0.27820033
## PC5 PC6 PC7 PC8
## fixed.acidity -0.24413933 0.105856235 0.22355921 0.13041311
## volatile.acidity -0.70298193 -0.123704688 -0.22363601 -0.22960669
## citric.acid -0.06510579 -0.131958661 -0.12037133 -0.69141866
## residual.sugar 0.01610213 0.289918546 -0.33860858 -0.11329401
## chlorides 0.17829248 -0.409317266 -0.55225504 0.21139734
## free.sulfur.dioxide -0.11117214 -0.488085145 0.22407108 0.12883115
## total.sulfur.dioxide -0.27144774 -0.272493820 0.20375343 0.01290262
## density 0.07834373 0.326008106 -0.12313568 -0.08667076
## pH 0.11270171 0.192688838 0.07704001 -0.47796137
## sulphates -0.45623099 0.479811894 -0.04462167 0.33642752
## alcohol -0.30855451 -0.135443327 -0.09801169 -0.08899029
## quality -0.04112191 -0.005524396 -0.58434519 0.14444197
## PC9 PC10 PC11 PC12
## fixed.acidity -0.63145048 0.20087123 -0.10411772 0.170792295
## volatile.acidity -0.03159628 -0.14175876 -0.27002270 0.013376718
## citric.acid 0.24949503 -0.10632912 -0.05395597 0.009648802
## residual.sugar 0.17730336 0.37427490 0.17987291 0.493565139
## chlorides -0.17916182 0.23552782 0.09108849 0.025168952
## free.sulfur.dioxide 0.10184710 0.32733415 -0.49921348 -0.029475198
## total.sulfur.dioxide -0.17800832 -0.34735757 0.64355326 0.035060193
## density -0.12538636 0.04349161 -0.06686042 -0.761184485
## pH -0.52031593 0.18375599 -0.07911267 0.141842640
## sulphates 0.23662489 0.05519364 -0.04102077 0.042787387
## alcohol 0.01278298 0.57530003 0.41895440 -0.350156811
## quality -0.29970621 -0.36771605 -0.14620225 -0.016069252
dim(pc_white_scaled$x)
## [1] 4898 12
wine_white.quality <- wine_white[, 12]
# Scree plot and eigen vector
plot(pc_white_scaled, type ="line")
pc_white_scaled
## Standard deviations:
## [1] 1.8293903 1.2594008 1.1709706 1.0415668 0.9875644 0.9688978 0.8770680
## [8] 0.8508195 0.7459900 0.5856051 0.5330248 0.1430703
##
## Rotation:
## PC1 PC2 PC3 PC4
## fixed.acidity -0.15690447 0.56066866 -0.20738436 0.03373494
## volatile.acidity -0.02428722 0.01606694 0.52491466 -0.13119747
## citric.acid -0.13294430 0.28938115 -0.44635554 0.32953335
## residual.sugar -0.40605288 -0.03882402 -0.03384313 -0.41615630
## chlorides -0.21754400 0.03691144 0.21471269 0.50961203
## free.sulfur.dioxide -0.27471931 -0.34554881 -0.31297088 -0.14892788
## total.sulfur.dioxide -0.39044148 -0.27232605 -0.12479447 -0.02161841
## density -0.50129557 -0.01773344 0.03196758 -0.10386393
## pH 0.13003701 -0.56714503 0.06848384 0.20410995
## sulphates -0.03364168 -0.24826266 -0.22699505 0.51924489
## alcohol 0.44279498 0.01698188 -0.15887556 -0.13438871
## quality 0.22713722 -0.14603134 -0.48884718 -0.27820033
## PC5 PC6 PC7 PC8
## fixed.acidity -0.24413933 0.105856235 0.22355921 0.13041311
## volatile.acidity -0.70298193 -0.123704688 -0.22363601 -0.22960669
## citric.acid -0.06510579 -0.131958661 -0.12037133 -0.69141866
## residual.sugar 0.01610213 0.289918546 -0.33860858 -0.11329401
## chlorides 0.17829248 -0.409317266 -0.55225504 0.21139734
## free.sulfur.dioxide -0.11117214 -0.488085145 0.22407108 0.12883115
## total.sulfur.dioxide -0.27144774 -0.272493820 0.20375343 0.01290262
## density 0.07834373 0.326008106 -0.12313568 -0.08667076
## pH 0.11270171 0.192688838 0.07704001 -0.47796137
## sulphates -0.45623099 0.479811894 -0.04462167 0.33642752
## alcohol -0.30855451 -0.135443327 -0.09801169 -0.08899029
## quality -0.04112191 -0.005524396 -0.58434519 0.14444197
## PC9 PC10 PC11 PC12
## fixed.acidity -0.63145048 0.20087123 -0.10411772 0.170792295
## volatile.acidity -0.03159628 -0.14175876 -0.27002270 0.013376718
## citric.acid 0.24949503 -0.10632912 -0.05395597 0.009648802
## residual.sugar 0.17730336 0.37427490 0.17987291 0.493565139
## chlorides -0.17916182 0.23552782 0.09108849 0.025168952
## free.sulfur.dioxide 0.10184710 0.32733415 -0.49921348 -0.029475198
## total.sulfur.dioxide -0.17800832 -0.34735757 0.64355326 0.035060193
## density -0.12538636 0.04349161 -0.06686042 -0.761184485
## pH -0.52031593 0.18375599 -0.07911267 0.141842640
## sulphates 0.23662489 0.05519364 -0.04102077 0.042787387
## alcohol 0.01278298 0.57530003 0.41895440 -0.350156811
## quality -0.29970621 -0.36771605 -0.14620225 -0.016069252
#compute standard deviation of each principal component
std_dev <- pc_white_scaled$sdev
#compute variance
pr_var <- std_dev^2
#proportion of variance explained
prop_varex <- pr_var/sum(pr_var)
#scree plot
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
#cumulative scree plot
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
biplot(pc_white_scaled, scale = 0)
# pREDICTION
pc_white_scaled1 <- prcomp(wine_white[,1:11], scale = TRUE)
summary(pc_white_scaled1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7951 1.2551 1.1053 1.00922 0.98658 0.96889
## Proportion of Variance 0.2929 0.1432 0.1111 0.09259 0.08848 0.08534
## Cumulative Proportion 0.2929 0.4361 0.5472 0.63979 0.72827 0.81361
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 0.85241 0.77418 0.64354 0.53804 0.14370
## Proportion of Variance 0.06605 0.05449 0.03765 0.02632 0.00188
## Cumulative Proportion 0.87967 0.93416 0.97181 0.99812 1.00000
wine_white_pca <- predict(pc_white_scaled1,newdata = wine_white)
wine_white_pca <- as.data.frame(wine_white_pca)
head(wine_white_pca)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 3.6765681 -0.5451776 0.930326898 1.1352584 0.2928249 -0.8915294
## 2 -0.6445220 0.4307260 0.356294322 -0.9990146 0.7140048 -0.4836882
## 3 0.1552747 -1.1896785 0.017529733 -0.2701919 0.3676683 -0.5006305
## 4 1.4552255 0.0996700 0.001956186 0.4229793 0.4699326 0.7691822
## 5 1.4552255 0.0996700 0.001956186 0.4229793 0.4699326 0.7691822
## 6 0.1552747 -1.1896785 0.017529733 -0.2701919 0.3676683 -0.5006305
## PC7 PC8 PC9 PC10 PC11
## 1 -0.2046431 -1.0250789 -0.1505937 0.07589739 0.04218543
## 2 -0.4369532 0.5848705 1.0634409 0.25389286 0.12789128
## 3 -0.5542620 0.9748027 -0.5495982 -0.85548280 -0.18427651
## 4 0.1976817 0.6015911 -0.2932142 0.40365834 -0.04526931
## 5 0.1976817 0.6015911 -0.2932142 0.40365834 -0.04526931
## 6 -0.5542620 0.9748027 -0.5495982 -0.85548280 -0.18427651
wine_white_pca[ , 12] <- as.data.frame(wine_white$quality)
colnames(wine_white_pca)[12] <- "quality"
# Code to get graph -biplot and eigen values
pca_white1<- PCA(scale(wine_white))
pca_white1
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4898 individuals, described by 12 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
pca_white1$eig
## eigenvalue percentage of variance
## comp 1 3.34666886 27.8889072
## comp 2 1.58609030 13.2174191
## comp 3 1.37117206 11.4264338
## comp 4 1.08486150 9.0405125
## comp 5 0.97528337 8.1273614
## comp 6 0.93876295 7.8230246
## comp 7 0.76924825 6.4104021
## comp 8 0.72389378 6.0324481
## comp 9 0.55650103 4.6375086
## comp 10 0.34293339 2.8577782
## comp 11 0.28411542 2.3676285
## comp 12 0.02046911 0.1705759
## cumulative percentage of variance
## comp 1 27.88891
## comp 2 41.10633
## comp 3 52.53276
## comp 4 61.57327
## comp 5 69.70063
## comp 6 77.52366
## comp 7 83.93406
## comp 8 89.96651
## comp 9 94.60402
## comp 10 97.46180
## comp 11 99.82942
## comp 12 100.00000
# Predict PCs-white
predict(pc_white_scaled,newdata=tail(wine_white, 2))
## PC1 PC2 PC3 PC4 PC5 PC6
## 4897 3.622627 -0.9396813 0.002755897 -0.8458506 0.1861173 -0.4915531
## 4898 2.794760 0.1840536 -0.079200365 -0.5201373 1.0988470 -0.5375748
## PC7 PC8 PC9 PC10 PC11 PC12
## 4897 -0.3168015 -0.9089182 0.09385922 -0.3619748 0.5562302 -0.02555398
## 4898 0.6454322 -1.1429077 0.42697522 -0.2760451 0.2858102 0.04833857
summary(pc_white_scaled)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8294 1.2594 1.1710 1.04157 0.98756 0.96890 0.8771
## Proportion of Variance 0.2789 0.1322 0.1143 0.09041 0.08127 0.07823 0.0641
## Cumulative Proportion 0.2789 0.4111 0.5253 0.61573 0.69701 0.77524 0.8393
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.85082 0.74599 0.58561 0.53302 0.14307
## Proportion of Variance 0.06032 0.04638 0.02858 0.02368 0.00171
## Cumulative Proportion 0.89967 0.94604 0.97462 0.99829 1.00000
library(devtools)
## Warning: package 'devtools' was built under R version 3.3.2
# install_github("ggbiplot", "vqv/ggbiplot")
library(ggbiplot)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.3.2
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## Loading required package: grid
g <- ggbiplot(pc_white_scaled, obs.scale = 2, var.scale = 2, groups= wine_white.quality, ellipse = TRUE,
circle = FALSE)
g <- g + scale_color_continuous(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
require(ggplot2)
theta <- seq(0,2*pi,length.out = 100)
circle <- data.frame(x = cos(theta), y = sin(theta))
p <- ggplot(circle,aes(x,y)) + geom_path()
# loadings <- data.frame(ir.pca$rotation, names = row.names(ir.pca$rotation))
# p + geom_text(data=loadings,
# mapping=aes(x = PC1, y = PC2, label = .names, colour = .names)) +
# coord_fixed(ratio=2) +
# labs(x = "PC1", y = "PC2")
print(g)
pc_white$eig
## NULL
Correlation_Matrix<-as.data.frame(round(cor(wine_white,pca_white1$ind$coord)^2*100,0))
Correlation_Matrix
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## fixed.acidity 8 50 6 0 6
## volatile.acidity 0 0 38 2 48
## citric.acid 6 13 27 12 0
## residual.sugar 55 0 0 19 0
## chlorides 16 0 6 28 3
## free.sulfur.dioxide 25 19 13 2 1
## total.sulfur.dioxide 51 12 2 0 7
## density 84 0 0 1 1
## pH 6 51 1 5 1
## sulphates 0 10 7 29 20
## alcohol 66 0 3 2 9
## quality 17 3 33 8 0
Next, I use multinomial logistic regression model using “quality” as the target variable.to compare result using PCA predictors and actual predictors
library(nnet)
wine_white$quality <- as.factor(wine_white$quality)
mlogit_model <- multinom(quality ~. ,data =wine_white, maxit = 1000)
## # weights: 91 (72 variable)
## initial value 9531.067910
## iter 10 value 6414.647224
## iter 20 value 6052.157273
## iter 30 value 5815.209566
## iter 40 value 5382.001911
## iter 50 value 5347.161928
## iter 60 value 5339.165595
## iter 70 value 5336.314586
## iter 80 value 5334.784154
## iter 90 value 5332.550025
## iter 100 value 5330.291963
## iter 110 value 5320.982596
## iter 120 value 5315.854261
## iter 130 value 5313.649597
## iter 140 value 5311.812395
## final value 5311.740067
## converged
mlogit_model
## Call:
## multinom(formula = quality ~ ., data = wine_white, maxit = 1000)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 -146.17833 -0.9011549 1.909546 0.7646530 -0.120460944
## 5 -160.02570 -1.2470523 -1.962609 1.2057458 -0.075605321
## 6 -107.89694 -1.3270421 -7.658482 1.3035760 0.004296053
## 7 339.28622 -0.8738655 -9.623866 0.4033815 0.202491343
## 8 129.37097 -1.0690515 -8.865040 0.8106784 0.177462841
## 9 -27.16097 1.1956160 -9.445852 2.0895194 0.256108776
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## 4 -9.807789 -0.073995463 -4.514643e-03 171.16764
## 5 -11.087749 -0.037352677 7.114124e-05 193.19748
## 6 -9.826164 -0.029373926 -2.116957e-03 132.70043
## 7 -25.229760 -0.024135389 -2.239523e-03 -331.31340
## 8 -16.299088 -0.007545143 -5.604673e-03 -125.95295
## 9 -441.548528 -0.020825203 -7.039806e-04 -44.57506
## pH sulphates alcohol
## 4 -2.5851632 2.0532646 -0.4634899
## 5 -3.7521896 1.4230186 -0.4813939
## 6 -3.4541101 2.7440208 0.3430976
## 7 -0.8773179 4.5087587 0.4053614
## 8 -1.0427423 3.3377507 0.9456756
## 9 17.3613016 0.9662957 1.6300509
##
## Residual Deviance: 10623.48
## AIC: 10767.48
mlogit_output <- summary(mlogit_model)
mlogit_output
## Call:
## multinom(formula = quality ~ ., data = wine_white, maxit = 1000)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 -146.17833 -0.9011549 1.909546 0.7646530 -0.120460944
## 5 -160.02570 -1.2470523 -1.962609 1.2057458 -0.075605321
## 6 -107.89694 -1.3270421 -7.658482 1.3035760 0.004296053
## 7 339.28622 -0.8738655 -9.623866 0.4033815 0.202491343
## 8 129.37097 -1.0690515 -8.865040 0.8106784 0.177462841
## 9 -27.16097 1.1956160 -9.445852 2.0895194 0.256108776
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## 4 -9.807789 -0.073995463 -4.514643e-03 171.16764
## 5 -11.087749 -0.037352677 7.114124e-05 193.19748
## 6 -9.826164 -0.029373926 -2.116957e-03 132.70043
## 7 -25.229760 -0.024135389 -2.239523e-03 -331.31340
## 8 -16.299088 -0.007545143 -5.604673e-03 -125.95295
## 9 -441.548528 -0.020825203 -7.039806e-04 -44.57506
## pH sulphates alcohol
## 4 -2.5851632 2.0532646 -0.4634899
## 5 -3.7521896 1.4230186 -0.4813939
## 6 -3.4541101 2.7440208 0.3430976
## 7 -0.8773179 4.5087587 0.4053614
## 8 -1.0427423 3.3377507 0.9456756
## 9 17.3613016 0.9662957 1.6300509
##
## Std. Errors:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 0.07503227 0.1988060 0.563383375 0.5853718 0.05361692
## 5 0.42451826 0.1883643 0.368259938 0.3063269 0.04997279
## 6 0.35728005 0.1879169 0.347243801 0.2810207 0.04991745
## 7 0.46128442 0.1910399 0.463207285 0.3867977 0.05059846
## 8 0.05252902 0.2040204 0.778156873 0.6489123 0.05308092
## 9 0.01192238 0.2769569 0.004118554 0.0199239 0.15058624
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## 4 0.0614734959 0.01327570 0.006966082 0.07414261
## 5 0.8346830953 0.01080784 0.006568198 0.41689063
## 6 0.8271379501 0.01067142 0.006559115 0.35095023
## 7 0.0456715981 0.01085444 0.006656021 0.45478016
## 8 0.0096554725 0.01135180 0.007167097 0.05226580
## 9 0.0004420978 0.04115254 0.019924978 0.01174476
## pH sulphates alcohol
## 4 0.23808757 0.668467746 0.1448381
## 5 0.21881415 0.330642547 0.1284929
## 6 0.18164790 0.272048777 0.1260438
## 7 0.22468046 0.318407242 0.1288072
## 8 0.24114375 0.530465586 0.1398779
## 9 0.04752021 0.006232217 0.1818881
##
## Residual Deviance: 10623.48
## AIC: 10767.48
mlogit_output$coefficients
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 -146.17833 -0.9011549 1.909546 0.7646530 -0.120460944
## 5 -160.02570 -1.2470523 -1.962609 1.2057458 -0.075605321
## 6 -107.89694 -1.3270421 -7.658482 1.3035760 0.004296053
## 7 339.28622 -0.8738655 -9.623866 0.4033815 0.202491343
## 8 129.37097 -1.0690515 -8.865040 0.8106784 0.177462841
## 9 -27.16097 1.1956160 -9.445852 2.0895194 0.256108776
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## 4 -9.807789 -0.073995463 -4.514643e-03 171.16764
## 5 -11.087749 -0.037352677 7.114124e-05 193.19748
## 6 -9.826164 -0.029373926 -2.116957e-03 132.70043
## 7 -25.229760 -0.024135389 -2.239523e-03 -331.31340
## 8 -16.299088 -0.007545143 -5.604673e-03 -125.95295
## 9 -441.548528 -0.020825203 -7.039806e-04 -44.57506
## pH sulphates alcohol
## 4 -2.5851632 2.0532646 -0.4634899
## 5 -3.7521896 1.4230186 -0.4813939
## 6 -3.4541101 2.7440208 0.3430976
## 7 -0.8773179 4.5087587 0.4053614
## 8 -1.0427423 3.3377507 0.9456756
## 9 17.3613016 0.9662957 1.6300509
mlogit_output$standard.errors
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 0.07503227 0.1988060 0.563383375 0.5853718 0.05361692
## 5 0.42451826 0.1883643 0.368259938 0.3063269 0.04997279
## 6 0.35728005 0.1879169 0.347243801 0.2810207 0.04991745
## 7 0.46128442 0.1910399 0.463207285 0.3867977 0.05059846
## 8 0.05252902 0.2040204 0.778156873 0.6489123 0.05308092
## 9 0.01192238 0.2769569 0.004118554 0.0199239 0.15058624
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## 4 0.0614734959 0.01327570 0.006966082 0.07414261
## 5 0.8346830953 0.01080784 0.006568198 0.41689063
## 6 0.8271379501 0.01067142 0.006559115 0.35095023
## 7 0.0456715981 0.01085444 0.006656021 0.45478016
## 8 0.0096554725 0.01135180 0.007167097 0.05226580
## 9 0.0004420978 0.04115254 0.019924978 0.01174476
## pH sulphates alcohol
## 4 0.23808757 0.668467746 0.1448381
## 5 0.21881415 0.330642547 0.1284929
## 6 0.18164790 0.272048777 0.1260438
## 7 0.22468046 0.318407242 0.1288072
## 8 0.24114375 0.530465586 0.1398779
## 9 0.04752021 0.006232217 0.1818881
z <- mlogit_output$coefficients/mlogit_output$standard.errors
p <- (1-pnorm(abs(z),0,1))*2 # I am using two-tailed z test
print(z, digits =2)
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 -1948 -4.5 3.4 1.3 -2.247
## 5 -377 -6.6 -5.3 3.9 -1.513
## 6 -302 -7.1 -22.1 4.6 0.086
## 7 736 -4.6 -20.8 1.0 4.002
## 8 2463 -5.2 -11.4 1.2 3.343
## 9 -2278 4.3 -2293.5 104.9 1.701
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 -160 -5.57 -0.648 2309 -10.9
## 5 -13 -3.46 0.011 463 -17.1
## 6 -12 -2.75 -0.323 378 -19.0
## 7 -552 -2.22 -0.336 -729 -3.9
## 8 -1688 -0.66 -0.782 -2410 -4.3
## 9 -998758 -0.51 -0.035 -3795 365.3
## sulphates alcohol
## 4 3.1 -3.2
## 5 4.3 -3.7
## 6 10.1 2.7
## 7 14.2 3.1
## 8 6.3 6.8
## 9 155.0 9.0
print(p, digits =2)
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 0 5.8e-06 7.0e-04 1.9e-01 2.5e-02
## 5 0 3.6e-11 9.9e-08 8.3e-05 1.3e-01
## 6 0 1.6e-12 0.0e+00 3.5e-06 9.3e-01
## 7 0 4.8e-06 0.0e+00 3.0e-01 6.3e-05
## 8 0 1.6e-07 0.0e+00 2.1e-01 8.3e-04
## 9 0 1.6e-05 0.0e+00 0.0e+00 8.9e-02
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 0 2.5e-08 0.52 0 0.0e+00
## 5 0 5.5e-04 0.99 0 0.0e+00
## 6 0 5.9e-03 0.75 0 0.0e+00
## 7 0 2.6e-02 0.74 0 9.4e-05
## 8 0 5.1e-01 0.43 0 1.5e-05
## 9 0 6.1e-01 0.97 0 0.0e+00
## sulphates alcohol
## 4 2.1e-03 1.4e-03
## 5 1.7e-05 1.8e-04
## 6 0.0e+00 6.5e-03
## 7 0.0e+00 1.6e-03
## 8 3.1e-10 1.4e-11
## 9 0.0e+00 0.0e+00
Now I’ll explore the entire data set, and analyze if we can remove any variables which do not add to model performance.
Pquality5 <- rbind(mlogit_output$coefficients[2, ],mlogit_output$standard.errors[2, ],z[2, ],p[2, ])
Pquality5
## (Intercept) fixed.acidity volatile.acidity citric.acid
## [1,] -160.0256980 -1.247052e+00 -1.962609e+00 1.205746e+00
## [2,] 0.4245183 1.883643e-01 3.682599e-01 3.063269e-01
## [3,] -376.9583410 -6.620428e+00 -5.329413e+00 3.936141e+00
## [4,] 0.0000000 3.581602e-11 9.853093e-08 8.280224e-05
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## [1,] -0.07560532 -11.0877492 -0.0373526768 7.114124e-05
## [2,] 0.04997279 0.8346831 0.0108078450 6.568198e-03
## [3,] -1.51292978 -13.2837831 -3.4560707499 1.083117e-02
## [4,] 0.13029750 0.0000000 0.0005481114 9.913581e-01
## density pH sulphates alcohol
## [1,] 193.1974798 -3.7521896 1.4230186397 -0.481393883
## [2,] 0.4168906 0.2188142 0.3306425474 0.128492922
## [3,] 463.4248602 -17.1478377 4.3037977141 -3.746462259
## [4,] 0.0000000 0.0000000 0.0000167895 0.000179346
rownames(Pquality5) <- c("Coefficient","Std. Errors","z stat","p value")
knitr::kable(Pquality5)
| (Intercept) | fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coefficient | -160.0256980 | -1.2470523 | -1.9626091 | 1.2057458 | -0.0756053 | -11.0877492 | -0.0373527 | 0.0000711 | 193.1974798 | -3.7521896 | 1.4230186 | -0.4813939 |
| Std. Errors | 0.4245183 | 0.1883643 | 0.3682599 | 0.3063269 | 0.0499728 | 0.8346831 | 0.0108078 | 0.0065682 | 0.4168906 | 0.2188142 | 0.3306425 | 0.1284929 |
| z stat | -376.9583410 | -6.6204282 | -5.3294126 | 3.9361413 | -1.5129298 | -13.2837831 | -3.4560707 | 0.0108312 | 463.4248602 | -17.1478377 | 4.3037977 | -3.7464623 |
| p value | 0.0000000 | 0.0000000 | 0.0000001 | 0.0000828 | 0.1302975 | 0.0000000 | 0.0005481 | 0.9913581 | 0.0000000 | 0.0000000 | 0.0000168 | 0.0001793 |
I used coefficients magnitude, p-value and accuracy to select key variables for lean model
mlogit_model1 <- multinom(quality ~ density + chlorides + volatile.acidity + alcohol, data = wine_white, maxit = 1000)
## # weights: 42 (30 variable)
## initial value 9531.067910
## iter 10 value 6042.447019
## iter 20 value 5522.543872
## iter 30 value 5512.500391
## iter 40 value 5504.072357
## iter 50 value 5498.013204
## iter 60 value 5495.236496
## iter 70 value 5493.446161
## iter 80 value 5491.784822
## iter 90 value 5490.802324
## iter 100 value 5488.899222
## iter 110 value 5487.839488
## iter 120 value 5484.593955
## final value 5484.564810
## converged
mlogit_model1
## Call:
## multinom(formula = quality ~ density + chlorides + volatile.acidity +
## alcohol, data = wine_white, maxit = 1000)
##
## Coefficients:
## (Intercept) density chlorides volatile.acidity alcohol
## 4 182.942656 -176.24297 -9.137097 2.902623 -0.5992752
## 5 76.853209 -65.21239 -8.314696 -1.381076 -0.6769956
## 6 -37.614256 41.91572 -7.818278 -7.126923 0.2964201
## 7 -69.837795 68.52409 -27.209577 -9.050082 0.8595494
## 8 -122.536927 115.88794 -17.240567 -8.442490 1.1672243
## 9 -8.966465 3.78357 -153.639503 -8.386114 1.0319386
##
## Residual Deviance: 10969.13
## AIC: 11029.13
mlogit_output1 <- summary(mlogit_model1)
mlogit_output1
## Call:
## multinom(formula = quality ~ density + chlorides + volatile.acidity +
## alcohol, data = wine_white, maxit = 1000)
##
## Coefficients:
## (Intercept) density chlorides volatile.acidity alcohol
## 4 182.942656 -176.24297 -9.137097 2.902623 -0.5992752
## 5 76.853209 -65.21239 -8.314696 -1.381076 -0.6769956
## 6 -37.614256 41.91572 -7.818278 -7.126923 0.2964201
## 7 -69.837795 68.52409 -27.209577 -9.050082 0.8595494
## 8 -122.536927 115.88794 -17.240567 -8.442490 1.1672243
## 9 -8.966465 3.78357 -153.639503 -8.386114 1.0319386
##
## Std. Errors:
## (Intercept) density chlorides volatile.acidity alcohol
## 4 1.142151 1.158530 6.1280463 1.782877 0.2133311
## 5 1.101058 1.075444 5.4474523 1.728997 0.2020842
## 6 1.005195 1.223813 5.5163101 1.739039 0.2006937
## 7 1.133330 1.078595 6.7392197 1.780452 0.2034684
## 8 1.242276 1.136474 8.8620454 1.918194 0.2144794
## 9 3.180607 3.213563 0.3377814 5.098028 0.5693638
##
## Residual Deviance: 10969.13
## AIC: 11029.13
mlogit_output1$coefficients
## (Intercept) density chlorides volatile.acidity alcohol
## 4 182.942656 -176.24297 -9.137097 2.902623 -0.5992752
## 5 76.853209 -65.21239 -8.314696 -1.381076 -0.6769956
## 6 -37.614256 41.91572 -7.818278 -7.126923 0.2964201
## 7 -69.837795 68.52409 -27.209577 -9.050082 0.8595494
## 8 -122.536927 115.88794 -17.240567 -8.442490 1.1672243
## 9 -8.966465 3.78357 -153.639503 -8.386114 1.0319386
mlogit_output1$standard.errors
## (Intercept) density chlorides volatile.acidity alcohol
## 4 1.142151 1.158530 6.1280463 1.782877 0.2133311
## 5 1.101058 1.075444 5.4474523 1.728997 0.2020842
## 6 1.005195 1.223813 5.5163101 1.739039 0.2006937
## 7 1.133330 1.078595 6.7392197 1.780452 0.2034684
## 8 1.242276 1.136474 8.8620454 1.918194 0.2144794
## 9 3.180607 3.213563 0.3377814 5.098028 0.5693638
z <- mlogit_output1$coefficients/mlogit_output1$standard.errors
p <- (1-pnorm(abs(z),0,1))*2
Pquality5 <- rbind(mlogit_output1$coefficients[2, ],mlogit_output1$standard.errors[2, ],z[2, ],p[2, ])
Pquality5
## (Intercept) density chlorides volatile.acidity alcohol
## [1,] 76.853209 -65.212387 -8.3146964 -1.3810757 -0.6769956119
## [2,] 1.101058 1.075444 5.4474523 1.7289969 0.2020842416
## [3,] 69.799444 -60.637651 -1.5263459 -0.7987728 -3.3500663222
## [4,] 0.000000 0.000000 0.1269237 0.4244222 0.0008079221
rownames(Pquality5) <- c("Coefficient","Std. Errors","z stat","p value")
knitr::kable(Pquality5)
| (Intercept) | density | chlorides | volatile.acidity | alcohol | |
|---|---|---|---|---|---|
| Coefficient | 76.853208 | -65.212387 | -8.3146964 | -1.3810757 | -0.6769956 |
| Std. Errors | 1.101058 | 1.075444 | 5.4474523 | 1.7289969 | 0.2020842 |
| z stat | 69.799444 | -60.637651 | -1.5263459 | -0.7987728 | -3.3500663 |
| p value | 0.000000 | 0.000000 | 0.1269237 | 0.4244222 | 0.0008079 |
predictedML1 <- predict(mlogit_model1,wine_white,na.action =na.pass, type="probs")
predicted_classML1 <- predict(mlogit_model1,wine_white)
cm_ols <- table(predicted_classML1, wine_white$quality)
print(cm_ols)
##
## predicted_classML1 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 3 2 1 0 0 0
## 5 7 92 800 450 38 11 0
## 6 12 65 647 1622 688 122 2
## 7 1 3 8 124 154 42 3
## 8 0 0 0 1 0 0 0
## 9 0 0 0 0 0 0 0
Accuracy <- sum(diag(cm_ols))/sum(cm_ols)
Accuracy
## [1] 0.5265414
USING pca
library(nnet)
wine_white$quality <- as.factor(wine_white_pca$quality)
mlogit_model_pca <- multinom(quality ~. ,data =wine_white_pca, maxit = 1000)
## # weights: 91 (72 variable)
## initial value 9531.067910
## iter 10 value 6036.731016
## iter 20 value 5694.114383
## iter 30 value 5472.950913
## iter 40 value 5389.586477
## iter 50 value 5315.063391
## iter 60 value 5302.126745
## iter 70 value 5301.452356
## iter 80 value 5301.029649
## final value 5300.956377
## converged
mlogit_model_pca
## Call:
## multinom(formula = quality ~ ., data = wine_white_pca, maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
##
## Residual Deviance: 10601.91
## AIC: 10745.91
mlogit_output_pca <- summary(mlogit_model_pca)
mlogit_output_pca
## Call:
## multinom(formula = quality ~ ., data = wine_white_pca, maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## 4 0.3533404 0.1503791 0.1633764 0.2060539 0.1864103 0.2040837 0.2136086
## 5 0.3316006 0.1401905 0.1532896 0.1960060 0.1723007 0.1937683 0.1926941
## 6 0.3305255 0.1398330 0.1529030 0.1962249 0.1724303 0.1935652 0.1921264
## 7 0.3333543 0.1416353 0.1547544 0.1990413 0.1844522 0.1972320 0.1983140
## 8 0.3517707 0.1485140 0.1631778 0.2098844 0.2095402 0.2094765 0.2126973
## 9 2.2690103 0.5442530 0.3550829 0.5853719 1.3480274 0.8587051 1.0117091
## PC7 PC8 PC9 PC10 PC11
## 4 0.2796000 0.2989028 0.3419873 0.3941081 1.0080647
## 5 0.2652672 0.2835056 0.3220298 0.3671447 0.9782499
## 6 0.2649037 0.2833340 0.3216162 0.3667061 0.9858760
## 7 0.2679956 0.2864775 0.3285265 0.3719938 1.0253194
## 8 0.2799308 0.3033375 0.3528157 0.3957018 1.2347147
## 9 0.5725602 0.8965439 1.1226002 0.9963558 3.0200993
##
## Residual Deviance: 10601.91
## AIC: 10745.91
mlogit_output_pca$coefficients
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
mlogit_output_pca$standard.errors
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## 4 0.3533404 0.1503791 0.1633764 0.2060539 0.1864103 0.2040837 0.2136086
## 5 0.3316006 0.1401905 0.1532896 0.1960060 0.1723007 0.1937683 0.1926941
## 6 0.3305255 0.1398330 0.1529030 0.1962249 0.1724303 0.1935652 0.1921264
## 7 0.3333543 0.1416353 0.1547544 0.1990413 0.1844522 0.1972320 0.1983140
## 8 0.3517707 0.1485140 0.1631778 0.2098844 0.2095402 0.2094765 0.2126973
## 9 2.2690103 0.5442530 0.3550829 0.5853719 1.3480274 0.8587051 1.0117091
## PC7 PC8 PC9 PC10 PC11
## 4 0.2796000 0.2989028 0.3419873 0.3941081 1.0080647
## 5 0.2652672 0.2835056 0.3220298 0.3671447 0.9782499
## 6 0.2649037 0.2833340 0.3216162 0.3667061 0.9858760
## 7 0.2679956 0.2864775 0.3285265 0.3719938 1.0253194
## 8 0.2799308 0.3033375 0.3528157 0.3957018 1.2347147
## 9 0.5725602 0.8965439 1.1226002 0.9963558 3.0200993
z <- mlogit_output_pca$coefficients/mlogit_output_pca$standard.errors
p <- (1-pnorm(abs(z),0,1))*2 # I am using two-tailed z test
print(z, digits =2)
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 4 5.7 -1.92 -0.83 2.1 -3.2 1.91 -3.3 -0.359 -0.96 3.74 0.925
## 5 14.3 0.65 0.72 0.7 -0.9 2.77 -2.3 -0.079 -2.31 3.67 0.898
## 6 16.5 -1.96 1.05 -1.7 1.1 2.67 -2.3 -0.140 -4.00 0.91 1.740
## 7 12.3 -4.50 1.68 -3.1 3.5 1.25 -3.0 -0.024 -4.28 -0.61 1.964
## 8 6.4 -4.26 1.97 -2.8 3.3 0.88 -1.6 -0.593 -4.73 -1.90 1.629
## 9 -2.8 -3.37 1.36 -2.5 2.9 -1.85 -2.5 -2.446 0.93 -0.33 -0.048
## PC11
## 4 0.651
## 5 -0.186
## 6 -0.365
## 7 -2.426
## 8 -2.474
## 9 -0.073
print(p, digits =2)
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 4 9.6e-09 5.5e-02 0.409 0.0351 0.00156 0.0564 0.0009 0.719 3.4e-01
## 5 0.0e+00 5.1e-01 0.471 0.4853 0.36662 0.0056 0.0238 0.937 2.1e-02
## 6 0.0e+00 4.9e-02 0.295 0.0833 0.25543 0.0075 0.0221 0.889 6.4e-05
## 7 0.0e+00 6.8e-06 0.094 0.0017 0.00042 0.2101 0.0029 0.980 1.9e-05
## 8 1.8e-10 2.1e-05 0.048 0.0046 0.00088 0.3808 0.1040 0.553 2.3e-06
## 9 5.8e-03 7.6e-04 0.173 0.0123 0.00323 0.0650 0.0122 0.014 3.5e-01
## PC9 PC10 PC11
## 4 0.00019 0.355 0.515
## 5 0.00024 0.369 0.852
## 6 0.36283 0.082 0.715
## 7 0.54343 0.050 0.015
## 8 0.05727 0.103 0.013
## 9 0.74270 0.962 0.942
Now I’ll explore the entire data set, and analyze if we can remove any variables which do not add to model performance.
Pquality5_pca <- rbind(mlogit_output_pca$coefficients[2, ],mlogit_output_pca$standard.errors[2, ],z[2, ],p[2, ])
Pquality5_pca
## (Intercept) PC1 PC2 PC3 PC4 PC5
## [1,] 4.7396291 0.09133208 0.1104327 0.1367819 -0.1555566 0.536686929
## [2,] 0.3316006 0.14019051 0.1532896 0.1960060 0.1723007 0.193768274
## [3,] 14.2931876 0.65148549 0.7204187 0.6978452 -0.9028200 2.769735824
## [4,] 0.0000000 0.51473314 0.4712672 0.4852740 0.3666214 0.005610178
## PC6 PC7 PC8 PC9 PC10 PC11
## [1,] -0.43560578 -0.02095666 -0.6535577 1.1822973001 0.3298254 -0.1822998
## [2,] 0.19269406 0.26526720 0.2835056 0.3220297895 0.3671447 0.9782499
## [3,] -2.26060820 -0.07900209 -2.3052729 3.6713910909 0.8983526 -0.1863530
## [4,] 0.02378353 0.93703096 0.0211513 0.0002412339 0.3689976 0.8521679
rownames(Pquality5_pca) <- c("Coefficient","Std. Errors","z stat","p value")
knitr::kable(Pquality5_pca)
| (Intercept) | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coefficient | 4.7396291 | 0.0913321 | 0.1104327 | 0.1367819 | -0.1555566 | 0.5366869 | -0.4356058 | -0.0209567 | -0.6535577 | 1.1822973 | 0.3298254 | -0.1822998 |
| Std. Errors | 0.3316006 | 0.1401905 | 0.1532896 | 0.1960060 | 0.1723007 | 0.1937683 | 0.1926941 | 0.2652672 | 0.2835056 | 0.3220298 | 0.3671447 | 0.9782499 |
| z stat | 14.2931876 | 0.6514855 | 0.7204187 | 0.6978452 | -0.9028200 | 2.7697358 | -2.2606082 | -0.0790021 | -2.3052729 | 3.6713911 | 0.8983526 | -0.1863530 |
| p value | 0.0000000 | 0.5147331 | 0.4712672 | 0.4852740 | 0.3666214 | 0.0056102 | 0.0237835 | 0.9370310 | 0.0211513 | 0.0002412 | 0.3689976 | 0.8521679 |
I used PC1 to PC4 for lean model
mlogit_model_pca1 <- multinom(quality ~ ., data = wine_white_pca, maxit = 1000)
## # weights: 91 (72 variable)
## initial value 9531.067910
## iter 10 value 6036.731016
## iter 20 value 5694.114383
## iter 30 value 5472.950913
## iter 40 value 5389.586477
## iter 50 value 5315.063391
## iter 60 value 5302.126745
## iter 70 value 5301.452356
## iter 80 value 5301.029649
## final value 5300.956377
## converged
mlogit_model_pca1
## Call:
## multinom(formula = quality ~ ., data = wine_white_pca, maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
##
## Residual Deviance: 10601.91
## AIC: 10745.91
mlogit_output_pca1 <- summary(mlogit_model_pca1)
mlogit_output_pca1
## Call:
## multinom(formula = quality ~ ., data = wine_white_pca, maxit = 1000)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## 4 0.3533404 0.1503791 0.1633764 0.2060539 0.1864103 0.2040837 0.2136086
## 5 0.3316006 0.1401905 0.1532896 0.1960060 0.1723007 0.1937683 0.1926941
## 6 0.3305255 0.1398330 0.1529030 0.1962249 0.1724303 0.1935652 0.1921264
## 7 0.3333543 0.1416353 0.1547544 0.1990413 0.1844522 0.1972320 0.1983140
## 8 0.3517707 0.1485140 0.1631778 0.2098844 0.2095402 0.2094765 0.2126973
## 9 2.2690103 0.5442530 0.3550829 0.5853719 1.3480274 0.8587051 1.0117091
## PC7 PC8 PC9 PC10 PC11
## 4 0.2796000 0.2989028 0.3419873 0.3941081 1.0080647
## 5 0.2652672 0.2835056 0.3220298 0.3671447 0.9782499
## 6 0.2649037 0.2833340 0.3216162 0.3667061 0.9858760
## 7 0.2679956 0.2864775 0.3285265 0.3719938 1.0253194
## 8 0.2799308 0.3033375 0.3528157 0.3957018 1.2347147
## 9 0.5725602 0.8965439 1.1226002 0.9963558 3.0200993
##
## Residual Deviance: 10601.91
## AIC: 10745.91
mlogit_output_pca1$coefficients
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 4 2.027223 -0.28901446 -0.1348783 0.4341662 -0.5896798 0.3894355
## 5 4.739629 0.09133208 0.1104327 0.1367819 -0.1555566 0.5366869
## 6 5.446409 -0.27475013 0.1601901 -0.3398061 0.1960996 0.5177786
## 7 4.099425 -0.63715392 0.2594263 -0.6238430 0.6511807 0.2471959
## 8 2.242588 -0.63211224 0.3220157 -0.5947537 0.6968820 0.1836012
## 9 -6.256670 -1.83248810 0.4835077 -1.4652448 3.9704391 -1.5844819
## PC6 PC7 PC8 PC9 PC10 PC11
## 4 -0.7095140 -0.100504703 -0.2867423 1.2781645 0.36473454 0.6561609
## 5 -0.4356058 -0.020956662 -0.6535577 1.1822973 0.32982542 -0.1822998
## 6 -0.4398309 -0.037024060 -1.1331343 0.2926639 0.63792229 -0.3598283
## 7 -0.5900704 -0.006554536 -1.2252729 -0.1996221 0.73051233 -2.4871566
## 8 -0.3458034 -0.166084128 -1.4341226 -0.6708020 0.64443425 -3.0548513
## 9 -2.5368759 -1.400295780 0.8337027 -0.3685297 -0.04790321 -0.2212499
mlogit_output_pca1$standard.errors
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## 4 0.3533404 0.1503791 0.1633764 0.2060539 0.1864103 0.2040837 0.2136086
## 5 0.3316006 0.1401905 0.1532896 0.1960060 0.1723007 0.1937683 0.1926941
## 6 0.3305255 0.1398330 0.1529030 0.1962249 0.1724303 0.1935652 0.1921264
## 7 0.3333543 0.1416353 0.1547544 0.1990413 0.1844522 0.1972320 0.1983140
## 8 0.3517707 0.1485140 0.1631778 0.2098844 0.2095402 0.2094765 0.2126973
## 9 2.2690103 0.5442530 0.3550829 0.5853719 1.3480274 0.8587051 1.0117091
## PC7 PC8 PC9 PC10 PC11
## 4 0.2796000 0.2989028 0.3419873 0.3941081 1.0080647
## 5 0.2652672 0.2835056 0.3220298 0.3671447 0.9782499
## 6 0.2649037 0.2833340 0.3216162 0.3667061 0.9858760
## 7 0.2679956 0.2864775 0.3285265 0.3719938 1.0253194
## 8 0.2799308 0.3033375 0.3528157 0.3957018 1.2347147
## 9 0.5725602 0.8965439 1.1226002 0.9963558 3.0200993
z <- mlogit_output_pca1$coefficients/mlogit_output_pca1$standard.errors
p <- (1-pnorm(abs(z),0,1))*2
predictedML1 <- predict(mlogit_model_pca1,wine_white_pca,na.action =na.pass, type="probs")
predicted_class_pca_ML1 <- predict(mlogit_model_pca1,wine_white_pca)
cm_ols_pca <- table(predicted_class_pca_ML1, wine_white_pca$quality)
print(cm_ols_pca)
##
## predicted_class_pca_ML1 3 4 5 6 7 8 9
## 3 2 0 0 0 0 0 0
## 4 0 12 2 3 0 0 0
## 5 8 88 772 406 41 10 0
## 6 9 62 675 1644 626 117 1
## 7 1 1 8 144 213 48 4
## 8 0 0 0 0 0 0 0
## 9 0 0 0 1 0 0 0
Accuracy_pca <- sum(diag(cm_ols_pca))/sum(cm_ols_pca)
Accuracy_pca
## [1] 0.539608