# Implementing PCA (Principal Component Analysis) with a model of 85%

# Load the wine data set from a CSV file picked interactively, then inspect it.
wine <- read.csv(file.choose())
View(wine)
dim(wine)
summary(wine)
# Check whether any correlation in the data set is > .85
cor(wine)

# In this data set the first column is not needed, so it is removed from the data set.

# dataSet[-1] is used to remove the first column.

# Drop the first column of `wine` and inspect the remaining variables.
newWine <- wine[-1]
View(newWine)
# Correlation matrix of the remaining columns
cor(newWine)

# Here we face a scaling issue; to overcome it we use scale().

# Standardise every column of `newWine` with scale(); the result is a matrix.
normdata <- scale(newWine)
View(normdata)
# Convert the scaled matrix back to a data frame for the steps that follow.
normdata <- as.data.frame(normdata)

# Principal component algorithm using princomp(dataset, cor = TRUE). cor - whether the correlation matrix (TRUE) or the covariance matrix (FALSE) is used; it should always be TRUE.

# The PCA components are arranged by variance (descending order).

# Run the principal component analysis on `normdata`.
pcaObj <- princomp(normdata, cor = FALSE)
# PCA values with sd (variance arranged in descending order from comp1 to
# comp13)
# PCA formula: (ai1*x1 + ai2*x2 + ... + ain*xn)
summary(pcaObj)
loadings(pcaObj) # weights
# The graph shows the importance of the principal components;
# comp1 has the highest importance.
plot(pcaObj)
# pcaObj$scores holds the component scores that represent the whole data
pcaObj$scores
# Combine the first six component scores (columns 1:6) with the original data
# using cbind() - column bind
wine <- cbind(wine, pcaObj$scores[, 1:6])
View(wine)
dim(wine)
colnames(wine)

# Preparing for clustering (consider only the PCA values)

# Take columns 15:20 of `wine` as the clustering input.
# NOTE(review): presumably these are the six appended PCA score columns, which
# holds only if `wine` had 14 columns before the cbind() - confirm.
clusdata <- wine[, 15:20]
View(clusdata)
# Implement clustering using PC1, PC2, PC3, PC4, PC5, PC6 instead of the
# original columns
# Normalize the clustering data
normClus <- scale(clusdata)
View(normClus)
# Hierarchical clustering: find the distance between all the records
# Method used to find the distance
distance <- dist(normClus, method = "euclidean")
# Complete linkage for maximum distance, single for minimum distance
hirarachyClus <- hclust(distance, method = "complete")
# Open a new graphics device for the dendrogram. The original called
# `window()`, which is the time-series subsetting function and fails when
# called without arguments.
dev.new()
plot(hirarachyClus, hang = -1)
# Second column of `wine` alongside the component scores
newDataSet <- cbind(wine[2], clusdata)
View(newDataSet)

# Implementing k-means clustering

# Build the k-means input: normalised component scores plus the second column
# of `wine`.
wineData <- wine[2]
newData1 <- cbind(normClus, wineData)
View(newData1)
# NOTE(review): kmeans() picks its starting centres at random, so cluster
# numbering can differ between runs; call set.seed() beforehand if
# "cluster 3" must be reproducible.
kmean <- kmeans(newData1, 5)
# Structure of kmean
str(kmean)
kmean$centers
clusters <- as.data.frame(kmean$cluster)
View(clusters)
wineCluData <- cbind(newData1, clusters)
View(wineCluData)
# Select the rows that were assigned to cluster number 3
wineClus3 <- wineCluData[kmean$cluster == 3, ]
View(wineClus3)

# Applying linear regression on the cluster 3 data

# Keep the first seven columns of the cluster-3 rows as the modelling data.
View(wineClus3)
modeldata1 <- wineClus3[, 1:7]
View(modeldata1)
# NOTE(review): attach() is not needed by the lm() calls below, which pass
# `data =` explicitly; kept only in case later code relies on it.
attach(modeldata1)

# Model 1: regress Alcohol on the six principal component scores.
m1 <- lm(
  Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + Comp.6,
  data = modeldata1
)
summary(m1)
colnames(wine)

# Model 2: regress Alcohol on the other columns of `newWine`, for comparison.
m2 <- lm(
  Alcohol ~ Malic + Ash + Alcalinity + Magnesium + Phenols + Flavanoids +
    Nonflavanoids + Proanthocyanins + Color + Hue + Dilution + Proline,
  data = newWine
)
summary(m2)

# Correlation matrix of the modelling data
cor(modeldata1)