```{r}
library(plyr)
mydata <- read.csv("C:/Users/SHABANA A T/Desktop/Data Science/wine.csv")
View(mydata)
attach(mydata)
cor(mydata)
dim(mydata)
summary(mydata)

# Principal component analysis on the correlation matrix
pca1 <- princomp(mydata, cor = TRUE)
pca1
summary(pca1)
str(pca1)
loadings(pca1)
windows()
plot(pca1)
pca1$scores
pca1$scores[, 1:6]   # For 85% of the variance I have to select 6 columns, i.e., the 6 PC columns
data <- cbind(mydata, pca1$scores[, 1:6])
View(data)
dim(data)
# This completes the PCA
# new_data <- data[1:25, c(14:19)]
# View(new_data)
# normalized_data <- scale(new_data)
# View(normalized_data)
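# Sketch (not in the original script): confirm programmatically that 6 components
# reach the ~85% cumulative variance mentioned above.
cum_var <- cumsum(pca1$sdev^2) / sum(pca1$sdev^2)
cum_var
which(cum_var >= 0.85)[1]   # smallest number of components covering 85% of the variance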
# Implementing non-hierarchical (k-means) clustering
fit <- kmeans(data, 7)
str(fit)
fit$centers
final <- data.frame(data, fit$cluster)
View(final)
dim(final)
aggregate(final[, -c(14:19)], by = list(fit$cluster), FUN = mean)
View(aggregate(final[, -c(14:19)], by = list(fit$cluster), FUN = mean))
new_data <- final[, -c(2:10)]
View(new_data)
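# Sketch (assumption, not part of the original script): an elbow plot of the
# total within-cluster sum of squares to sanity-check the choice of k = 7.
wss <- sapply(2:12, function(k) kmeans(data, centers = k, nstart = 20)$tot.withinss)
plot(2:12, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Total within-cluster sum of squares")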
# Selecting all the records belonging to cluster 3
data_clustered <- kmeans(data, 7)
data_clustered
data_clustered$cluster
data$cluster <- data_clustered$cluster
data$cluster
# dat_clus1 <- data[data$cluster == 1, ]
# dat_clus1
dat_clus3 <- data[data$cluster == 3, ]
dat_clus3
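# Sketch (not in the original script): quick check of cluster sizes before
# focusing on cluster 3.
table(data$cluster)
nrow(dat_clus3)   # number of records assigned to cluster 3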
# IMPLEMENTING MULTIPLE REGRESSION
new_data1 <- new_data[, -c(2:4, 11)]
View(new_data1)
dim(new_data1)
head(new_data1)
tail(new_data1)
cor(new_data1)
windows()
pairs(new_data1)
summary(new_data1)
library(moments)   # assumption: skewness()/kurtosis() come from the 'moments' package; they are not in base R
skewness(Alcohol)
kurtosis(Alcohol)
barplot(Alcohol)
hist(Alcohol)
boxplot(Alcohol)
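# Sketch (assumption): the same distribution checks applied to every column of
# new_data1 at once instead of Alcohol only.
sapply(new_data1, skewness)
sapply(new_data1, kurtosis)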
model <- lm(Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + Comp.6, data = new_data1)
summary(model)   # Comp.4 is not significant
model2 <- lm(Alcohol ~ Comp.4, data = new_data1)
summary(model2)  # Comp.4 is still not significant on its own, so drop it (the 5th column of new_data1)
new_data2 <- new_data1[, -c(5)]
View(new_data2)
# Final model
model2 <- lm(Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.5 + Comp.6, data = new_data2)
summary(model2)  # all the p-values are significant and the model is an average model
pv <- predict(model2, new_data2)
pv
pv_1 <- as.data.frame(pv)
pv_1
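# Sketch (not in the original script): a quick check of fit quality by comparing
# the predictions with the observed Alcohol values.
rmse <- sqrt(mean((new_data2$Alcohol - pv)^2))
rmse
plot(new_data2$Alcohol, pv, xlab = "Observed Alcohol", ylab = "Predicted Alcohol")
abline(0, 1)   # points close to this line indicate a good fit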
```