# Choose the CSV interactively (the original run used wine.csv).
mydata <- read.csv(file = file.choose())
# Keep every column except the first one.
data <- mydata[, -1, drop = FALSE]
head(x = data)
## Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28
## 2 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26
## 3 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30
## 4 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24
## 5 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39
## 6 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34
## Proanthocyanins Color Hue Dilution Proline
## 1 2.29 5.64 1.04 3.92 1065
## 2 1.28 4.38 1.05 3.40 1050
## 3 2.81 5.68 1.03 3.17 1185
## 4 2.18 7.80 0.86 3.45 1480
## 5 1.82 4.32 1.04 2.93 735
## 6 1.97 6.75 1.05 2.85 1450
# Put the columns of `data` on the search path so they can be used by bare name.
attach(what = data)
# First six rows of the pairwise correlation matrix of all columns.
head(x = cor(x = data), n = 6L)
## Alcohol Malic Ash Alcalinity Magnesium
## Alcohol 1.00000000 0.09439694 0.2115446 -0.31023514 0.27079823
## Malic 0.09439694 1.00000000 0.1640455 0.28850040 -0.05457510
## Ash 0.21154460 0.16404547 1.0000000 0.44336719 0.28658669
## Alcalinity -0.31023514 0.28850040 0.4433672 1.00000000 -0.08333309
## Magnesium 0.27079823 -0.05457510 0.2865867 -0.08333309 1.00000000
## Phenols 0.28910112 -0.33516700 0.1289795 -0.32111332 0.21440123
## Phenols Flavanoids Nonflavanoids Proanthocyanins Color
## Alcohol 0.2891011 0.2368149 -0.1559295 0.136697912 0.54636420
## Malic -0.3351670 -0.4110066 0.2929771 -0.220746187 0.24898534
## Ash 0.1289795 0.1150773 0.1862304 0.009651935 0.25888726
## Alcalinity -0.3211133 -0.3513699 0.3619217 -0.197326836 0.01873198
## Magnesium 0.2144012 0.1957838 -0.2562940 0.236440610 0.19995001
## Phenols 1.0000000 0.8645635 -0.4499353 0.612413084 -0.05513642
## Hue Dilution Proline
## Alcohol -0.07174720 0.072343187 0.6437200
## Malic -0.56129569 -0.368710428 -0.1920106
## Ash -0.07466689 0.003911231 0.2236263
## Alcalinity -0.27395522 -0.276768549 -0.4405969
## Magnesium 0.05539820 0.066003936 0.3933508
## Phenols 0.43368134 0.699949365 0.4981149
# Per-column min, quartiles, median, mean and max of `data`.
summary(data)
## Alcohol Malic Ash Alcalinity
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Phenols Flavanoids Nonflavanoids
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color Hue Dilution
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
# Open a fresh graphics device for the scatterplot matrix. dev.new() is the
# portable call; windows() is only defined in Windows builds of R.
dev.new()
plot(data)
### normalize data
# Centre every column and scale it to unit standard deviation.
normdata <- scale(data)
# PCA on the correlation matrix. cor = TRUE already standardises the
# variables, so the scale() step above does not change the components; it is
# kept so that `normdata` remains available.
pcaObj <- princomp(normdata, cor = TRUE)
summary(pcaObj) # standard deviation, proportion and cumulative proportion of variance
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1692972 1.5801816 1.2025273 0.9586313 0.92370351
## Proportion of Variance 0.3619885 0.1920749 0.1112363 0.0706903 0.06563294
## Cumulative Proportion 0.3619885 0.5540634 0.6652997 0.7359900 0.80162293
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.80103498 0.74231281 0.59033665 0.53747553
## Proportion of Variance 0.04935823 0.04238679 0.02680749 0.02222153
## Cumulative Proportion 0.85098116 0.89336795 0.92017544 0.94239698
## Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.50090167 0.47517222 0.41081655 0.321524394
## Proportion of Variance 0.01930019 0.01736836 0.01298233 0.007952149
## Cumulative Proportion 0.96169717 0.97906553 0.99204785 1.000000000
# Show the first six rows of the loading matrix (variables in rows,
# components in columns). unclass() strips the "loadings" class so head()
# treats the object as an ordinary matrix; in the captured run, head() on the
# classed object returned only the first six values as a flat vector.
# NOTE: the captured output just below predates this change; re-run to refresh it.
head(unclass(pcaObj$loadings))
## [1] 0.144329395 -0.245187580 -0.002051061 -0.239320405 0.141992042
## [6] 0.394660845
head(pcaObj$scores) # first six rows of the component scores (one column per component)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## [1,] 3.316751 1.4434626 0.1657390 0.2156312 -0.6930428 0.2238801
## [2,] 2.209465 -0.3333929 2.0264574 0.2913583 0.2576546 0.9271202
## [3,] 2.516740 1.0311513 -0.9828187 -0.7249023 0.2510331 -0.5492760
## [4,] 3.757066 2.7563719 0.1761918 -0.5679833 0.3118416 -0.1144310
## [5,] 1.008908 0.8698308 -2.0266882 0.4097658 -0.2984575 0.4065196
## [6,] 3.050254 2.1224011 0.6293958 0.5156375 0.6320187 -0.1234306
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11
## [1,] -0.59642655 -0.06513909 0.64144271 1.02095585 0.4515634
## [2,] -0.05377561 -1.02441595 -0.30884675 0.15970137 0.1426573
## [3,] -0.42420545 0.34421613 -1.17783447 0.11336086 0.2866728
## [4,] 0.38333730 -0.64359350 0.05254442 0.23941260 -0.7595843
## [5,] -0.44407446 -0.41670047 0.32681916 -0.07836648 0.5259451
## [6,] -0.40165376 -0.39489342 -0.15214608 -0.10199582 -0.4055853
## Comp.12 Comp.13
## [1,] -0.5408104139 -0.066238631
## [2,] -0.3882377413 0.003636502
## [3,] -0.0005835732 0.021716510
## [4,] 0.2420195635 -0.369483531
## [5,] 0.2166641578 -0.079363566
## [6,] 0.3794326839 0.145155331
# Append the scores of the first six components to mydata as extra columns.
mydata <- cbind(mydata, pcaObj$scores[, seq_len(6)])
head(x = mydata)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline Comp.1
## 1 0.28 2.29 5.64 1.04 3.92 1065 3.316751
## 2 0.26 1.28 4.38 1.05 3.40 1050 2.209465
## 3 0.30 2.81 5.68 1.03 3.17 1185 2.516740
## 4 0.24 2.18 7.80 0.86 3.45 1480 3.757066
## 5 0.39 1.82 4.32 1.04 2.93 735 1.008908
## 6 0.34 1.97 6.75 1.05 2.85 1450 3.050254
## Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 1 1.4434626 0.1657390 0.2156312 -0.6930428 0.2238801
## 2 -0.3333929 2.0264574 0.2913583 0.2576546 0.9271202
## 3 1.0311513 -0.9828187 -0.7249023 0.2510331 -0.5492760
## 4 2.7563719 0.1761918 -0.5679833 0.3118416 -0.1144310
## 5 0.8698308 -2.0266882 0.4097658 -0.2984575 0.4065196
## 6 2.1224011 0.6293958 0.5156375 0.6320187 -0.1234306
dim(mydata)
## [1] 178 20
# Modelling frame: everything except the Type label, i.e. Alcohol, the twelve
# other measurements and Comp.1..Comp.6. Previously new_data held only the six
# component columns, so Alcohol and every predictor of M2 were silently picked
# up from the attach()ed `data` on the search path rather than from
# `data = new_data`.
new_data <- mydata[, names(mydata) != "Type", drop = FALSE]
View(new_data)
# attach(new_data) was dropped: both models take all their variables from the
# `data` argument, and nothing later refers to these columns by bare name.
# M1: Alcohol regressed on the first six principal-component scores.
M1 <- lm(Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + Comp.6, data = new_data)
# M2: Alcohol regressed on the twelve remaining original measurements.
M2 <- lm(Alcohol ~ Malic + Ash + Alcalinity + Magnesium + Phenols + Flavanoids +
  Nonflavanoids + Proanthocyanins + Color + Hue + Dilution + Proline,
data = new_data)
# Multiple R-squared 0.8341 for M1 versus 0.5936 for M2.
# NOTE(review): the components were computed from all columns of `data`,
# Alcohol included, so the response leaks into M1's predictors and the two
# R-squared values are not a like-for-like comparison.
summary(M1)
##
## Call:
## lm(formula = Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 +
## Comp.6, data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84510 -0.22936 0.00056 0.24190 1.14634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.00062 0.02522 515.551 < 2e-16 ***
## Comp.1 0.11684 0.01162 10.051 < 2e-16 ***
## Comp.2 0.39154 0.01596 24.535 < 2e-16 ***
## Comp.3 0.16789 0.02097 8.006 1.74e-13 ***
## Comp.4 0.01446 0.02631 0.550 0.583
## Comp.5 0.21507 0.02730 7.878 3.70e-13 ***
## Comp.6 0.17287 0.03148 5.491 1.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3364 on 171 degrees of freedom
## Multiple R-squared: 0.8341, Adjusted R-squared: 0.8283
## F-statistic: 143.3 on 6 and 171 DF, p-value: < 2.2e-16
summary(M2) # Multiple R-squared: 0.5936 (59.36 %)
##
## Call:
## lm(formula = Alcohol ~ Malic + Ash + Alcalinity + Magnesium +
## Phenols + Flavanoids + Nonflavanoids + Proanthocyanins +
## Color + Hue + Dilution + Proline, data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.45180 -0.30646 -0.02277 0.33195 1.54407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.107e+01 5.963e-01 18.567 < 2e-16 ***
## Malic 1.316e-01 4.528e-02 2.907 0.00415 **
## Ash 1.379e-01 2.169e-01 0.636 0.52585
## Alcalinity -3.779e-02 1.781e-02 -2.122 0.03537 *
## Magnesium 4.179e-06 3.359e-03 0.001 0.99901
## Phenols 5.208e-02 1.340e-01 0.389 0.69796
## Flavanoids 9.125e-03 1.069e-01 0.085 0.93211
## Nonflavanoids -2.078e-01 4.336e-01 -0.479 0.63242
## Proanthocyanins -1.525e-01 9.823e-02 -1.552 0.12249
## Color 1.630e-01 2.744e-02 5.941 1.63e-08 ***
## Hue 2.169e-01 2.811e-01 0.772 0.44144
## Dilution 1.608e-01 1.097e-01 1.466 0.14462
## Proline 1.016e-03 1.999e-04 5.081 1.01e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5361 on 165 degrees of freedom
## Multiple R-squared: 0.5936, Adjusted R-squared: 0.564
## F-statistic: 20.08 on 12 and 165 DF, p-value: < 2.2e-16
# Predictions from each model for the rows of new_data.
Predicted_PCA <- predict(object = M1, newdata = new_data)
Predicted_org <- predict(object = M2, newdata = new_data)
# Put both prediction columns in front of the full data for a side-by-side look.
final <- cbind(Predicted_PCA, Predicted_org, mydata)
head(x = final) # compare Predicted_PCA (six components) and Predicted_org against Alcohol
## Predicted_PCA Predicted_org Type Alcohol Malic Ash Alcalinity Magnesium
## 1 13.87391 13.66661 1 14.23 1.71 2.43 15.6 127
## 2 13.68834 13.64752 1 13.20 1.78 2.14 11.2 100
## 3 13.48197 13.59564 1 13.16 2.36 2.67 18.6 101
## 4 14.58747 14.40527 1 14.37 1.95 2.50 16.8 113
## 5 13.13083 12.97483 1 13.24 2.59 2.87 21.0 118
## 6 14.41572 14.15702 1 14.20 1.76 2.45 15.2 112
## Phenols Flavanoids Nonflavanoids Proanthocyanins Color Hue Dilution
## 1 2.80 3.06 0.28 2.29 5.64 1.04 3.92
## 2 2.65 2.76 0.26 1.28 4.38 1.05 3.40
## 3 2.80 3.24 0.30 2.81 5.68 1.03 3.17
## 4 3.85 3.49 0.24 2.18 7.80 0.86 3.45
## 5 2.80 2.69 0.39 1.82 4.32 1.04 2.93
## 6 3.27 3.39 0.34 1.97 6.75 1.05 2.85
## Proline Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 1 1065 3.316751 1.4434626 0.1657390 0.2156312 -0.6930428 0.2238801
## 2 1050 2.209465 -0.3333929 2.0264574 0.2913583 0.2576546 0.9271202
## 3 1185 2.516740 1.0311513 -0.9828187 -0.7249023 0.2510331 -0.5492760
## 4 1480 3.757066 2.7563719 0.1761918 -0.5679833 0.3118416 -0.1144310
## 5 735 1.008908 0.8698308 -2.0266882 0.4097658 -0.2984575 0.4065196
## 6 1450 3.050254 2.1224011 0.6293958 0.5156375 0.6320187 -0.1234306
# Cluster on the six appended component scores. Selecting them by name keeps
# this correct even if the column layout of mydata changes.
clus_data <- mydata[, paste0("Comp.", 1:6)]
#norm_clus <- scale(clus_data) # Scale function is used to normalize data
# Pairwise Euclidean distances between rows. The method name is now spelt
# canonically instead of relying on dist() tolerating "euclidian".
dist1 <- dist(clus_data, method = "euclidean")
head(dist1)
## [1] 3.041328 2.121993 1.912071 3.269353 1.647232 2.150326
# Hierarchical clustering of dist1 using complete linkage.
fit <- hclust(d = dist1, method = "complete")
str(object = fit)
## List of 7
## $ merge : int [1:177, 1:2] -35 -16 -165 -11 -141 -146 -64 -167 -132 -57 ...
## $ height : num [1:177] 0.392 0.404 0.482 0.563 0.637 ...
## $ order : int [1:178] 60 67 63 76 102 77 98 101 81 107 ...
## $ labels : NULL
## $ method : chr "complete"
## $ call : language hclust(d = dist1, method = "complete")
## $ dist.method: chr "euclidean"
## - attr(*, "class")= chr "hclust"
# Draw the dendrogram in its own graphics device. dev.new() is the portable
# call; windows() is only defined in Windows builds of R.
dev.new()
plot(fit, hang = -1)
# Cut the tree at height 8 to obtain one cluster label per row.
groups <- cutree(fit, h = 8)
groups <- as.data.frame(groups) # cluster numbering, in a column named `groups`
final1 <- cbind(groups, mydata) # cluster label first, then the original data
str(final1)
## 'data.frame': 178 obs. of 21 variables:
## $ groups : int 1 1 1 1 2 1 1 1 1 1 ...
## $ Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Comp.1 : num 3.32 2.21 2.52 3.76 1.01 ...
## $ Comp.2 : num 1.443 -0.333 1.031 2.756 0.87 ...
## $ Comp.3 : num 0.166 2.026 -0.983 0.176 -2.027 ...
## $ Comp.4 : num 0.216 0.291 -0.725 -0.568 0.41 ...
## $ Comp.5 : num -0.693 0.258 0.251 0.312 -0.298 ...
## $ Comp.6 : num 0.224 0.927 -0.549 -0.114 0.407 ...
View(final1)
# Per-cluster means of the original measurements for the hierarchical groups.
# Columns are dropped by name: final1 has `groups` prepended, so the earlier
# positional index c(2, 15:20) removed Proline and left Comp.6 in the table.
# NOTE: the captured output just below predates this fix (no Proline, stray
# Comp.6); re-run to refresh it.
head(aggregate(
  final1[, setdiff(names(final1), c("Type", paste0("Comp.", 1:6)))],
  by = list(final1$groups),
  FUN = mean
))
## Group.1 groups Alcohol Malic Ash Alcalinity Magnesium Phenols
## 1 1 1 13.64603 1.944921 2.423175 17.05397 107.92063 2.783492
## 2 2 2 12.20974 2.252564 2.397436 21.89231 93.53846 2.521538
## 3 3 3 12.38667 1.389333 1.868000 17.66667 87.06667 2.138000
## 4 4 4 12.99066 3.027049 2.410820 20.93279 98.37705 1.684590
## Flavanoids Nonflavanoids Proanthocyanins Color Hue Dilution
## 1 2.8966667 0.2822222 1.942381 5.358730 1.0712698 3.109683
## 2 2.3992308 0.3543590 1.783590 2.982051 1.0305128 2.984615
## 3 1.9106667 0.3066667 1.411333 3.222000 1.1100000 2.845333
## 4 0.9260656 0.4624590 1.148852 6.526393 0.7556721 1.801475
## Comp.6
## 1 0.17886105
## 2 -0.06062579
## 3 -0.17843048
## 4 -0.10208824
# k-means starts from randomly chosen centres, so fix the seed to make the
# cluster labels reproducible, and use several random starts so the result
# does not hinge on one unlucky initialisation.
# NOTE: the captured output just below came from an unseeded single-start run
# and will not match exactly; re-run to refresh it.
set.seed(123)
fit2 <- kmeans(clus_data, centers = 4, nstart = 25)
str(fit2)
## List of 9
## $ cluster : int [1:178] 1 1 4 1 4 1 1 4 1 1 ...
## $ centers : num [1:4, 1:6] 2.499 -0.138 -2.72 1.868 0.846 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:6] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ totss : num 1969
## $ withinss : num [1:4] 135 382 241 107
## $ tot.withinss: num 865
## $ betweenss : num 1104
## $ size : int [1:4] 41 62 51 24
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Cluster label that k-means assigned to every row.
groups <- fit2[["cluster"]]
groups <- as.data.frame(groups) # cluster numbering, in a column named `groups`
# Cluster label first, then the original data.
final2 <- cbind(groups, mydata)
str(object = final2)
## 'data.frame': 178 obs. of 21 variables:
## $ groups : int 1 1 4 1 4 1 1 4 1 1 ...
## $ Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Comp.1 : num 3.32 2.21 2.52 3.76 1.01 ...
## $ Comp.2 : num 1.443 -0.333 1.031 2.756 0.87 ...
## $ Comp.3 : num 0.166 2.026 -0.983 0.176 -2.027 ...
## $ Comp.4 : num 0.216 0.291 -0.725 -0.568 0.41 ...
## $ Comp.5 : num -0.693 0.258 0.251 0.312 -0.298 ...
## $ Comp.6 : num 0.224 0.927 -0.549 -0.114 0.407 ...
View(final2)
# Per-cluster means of the original measurements for the k-means groups.
# Two fixes:
#  * group by final2$groups (the k-means labels); the earlier call grouped by
#    final1$groups, i.e. the hierarchical labels, which is why the `groups`
#    means in the captured output are not whole numbers;
#  * drop Type and the component columns by name; the positional index
#    c(2, 15:20) removed Proline and left Comp.6 in the table.
# NOTE: the captured output just below predates these fixes; re-run to refresh it.
head(aggregate(
  final2[, setdiff(names(final2), c("Type", paste0("Comp.", 1:6)))],
  by = list(final2$groups),
  FUN = mean
))
## Group.1 groups Alcohol Malic Ash Alcalinity Magnesium
## 1 1 2.015873 13.64603 1.944921 2.423175 17.05397 107.92063
## 2 2 2.205128 12.20974 2.252564 2.397436 21.89231 93.53846
## 3 3 1.933333 12.38667 1.389333 1.868000 17.66667 87.06667
## 4 4 2.819672 12.99066 3.027049 2.410820 20.93279 98.37705
## Phenols Flavanoids Nonflavanoids Proanthocyanins Color Hue
## 1 2.783492 2.8966667 0.2822222 1.942381 5.358730 1.0712698
## 2 2.521538 2.3992308 0.3543590 1.783590 2.982051 1.0305128
## 3 2.138000 1.9106667 0.3066667 1.411333 3.222000 1.1100000
## 4 1.684590 0.9260656 0.4624590 1.148852 6.526393 0.7556721
## Dilution Comp.6
## 1 3.109683 0.17886105
## 2 2.984615 -0.06062579
## 3 2.845333 -0.17843048
## 4 1.801475 -0.10208824