a.utf8.md

library(plyr)
# Load the wine data set.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
mydata <- read.csv("C:\\Users\\SHABANA A T\\Desktop\\Data Science\\wine.csv")
# View() needs an interactive session, so only open the viewer when one exists.
if (interactive()) {
  View(mydata)
}
# attach() is deliberately not used: columns are referenced through their data
# frame so nothing is silently masked on the search path.
cor(mydata)
dim(mydata)
summary(mydata)

# Principal component analysis on the correlation matrix (cor = TRUE).
pca1 <- princomp(mydata, cor = TRUE)
pca1
summary(pca1)
str(pca1)
loadings(pca1)
# dev.new() opens a fresh graphics device on any platform; windows() exists
# only on Windows builds of R.
dev.new()
plot(pca1)
pca1$scores

# For 85% I have to select 6 columns, i.e. the first 6 PC score columns
# (author's reading of summary(pca1)).
pc_scores <- pca1$scores[, 1:6]
pc_scores

# Append the six retained component scores to the original variables.
data <- cbind(mydata, pc_scores)
if (interactive()) {
  View(data)
}
dim(data)
# This completes PCA
#new_data<-data[1:25,c(14:19)]
#View(new_data)
#normalized_data<-scale(new_data)
#View(normalized_data)

# Implementing non-hierarchical (k-means) clustering

# kmeans() chooses its starting centres at random; fixing the seed makes the
# cluster labels reproducible from run to run.
set.seed(123)
# NOTE(review): clustering uses every column of `data` without scaling; the
# commented-out code just above this section suggests scaled PC scores may have
# been intended -- confirm.
fit <- kmeans(data, centers = 7)
str(fit)
# The component is spelled `centers`; `fit$centres` does not exist and is NULL.
fit$centers

# Attach each row's cluster label (column is named `fit.cluster`).
final <- data.frame(data, fit$cluster)
if (interactive()) {
  View(final)
}
dim(final)

# Per-cluster means, computed once and reused for printing and viewing.
# Columns 14:19 are excluded -- presumably the six PC score columns; confirm
# against names(final).
cluster_means <- aggregate(final[, -c(14:19)], by = list(fit$cluster), FUN = mean)
cluster_means
if (interactive()) {
  View(cluster_means)
}

# Drop columns 2:10 to form the data set used by the regression section.
new_data <- final[, -c(2:10)]
if (interactive()) {
  View(new_data)
}

# Selecting all the records belonging to cluster 3

# Reuse the clustering already stored in `fit` instead of running kmeans()
# again: a second run starts from different random centres, so its cluster
# numbers would not correspond to those in `final` and the per-cluster means.
data_clustered <- fit
data_clustered
data_clustered$cluster

# Store the label next to each record so rows can be filtered by cluster.
data$cluster <- data_clustered$cluster
data$cluster

# Other clusters can be extracted the same way, e.g. data[data$cluster == 1, ].
dat_clus3 <- data[data$cluster == 3, ]
dat_clus3

# IMPLEMENTING MULTIPLE REGRESSION

# Drop columns 2:4 and 11 of new_data; the models below need Alcohol and
# Comp.1 .. Comp.6 to remain (presumably exactly those seven -- confirm with
# head(new_data1)).
new_data1 <- new_data[, -c(2:4, 11)]
if (interactive()) {
  View(new_data1)
}
dim(new_data1)
head(new_data1)
tail(new_data1)
cor(new_data1)
# dev.new() is the portable replacement for the Windows-only windows().
dev.new()
pairs(new_data1)
summary(new_data1)

# Explore the response. The column is referenced explicitly instead of via
# attach(), which would mask same-named objects on the search path.
alcohol <- new_data1$Alcohol
# skewness()/kurtosis() are not available from base R or any package loaded in
# this file, so the moment-based definitions are computed directly
# (a normal distribution has kurtosis 3 under this definition).
# NOTE(review): if a package variant reporting excess kurtosis was intended,
# subtract 3 from the second value.
alcohol_dev <- alcohol - mean(alcohol)
alcohol_m2 <- mean(alcohol_dev^2)
mean(alcohol_dev^3) / alcohol_m2^1.5 # skewness
mean(alcohol_dev^4) / alcohol_m2^2 # kurtosis
barplot(alcohol)
hist(alcohol, main = "Histogram of Alcohol", xlab = "Alcohol")
boxplot(alcohol)

# The linear model: Alcohol regressed on the six retained components.
model <- lm(Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + Comp.6,
            data = new_data1)
summary(model)

# Author's observation: Comp.4 is not significant. Check it on its own; this
# probe gets its own name so it is not confused with the final model.
model_comp4 <- lm(Alcohol ~ Comp.4, data = new_data1)
summary(model_comp4)

# Still not significant, so Comp.4 is removed. Dropping it by name rather than
# by position keeps this correct if the column order ever changes.
new_data2 <- new_data1[, names(new_data1) != "Comp.4", drop = FALSE]
if (interactive()) {
  View(new_data2)
}

# Final model
model2 <- lm(Alcohol ~ Comp.1 + Comp.2 + Comp.3 + Comp.5 + Comp.6,
             data = new_data2)
summary(model2)
# Author's observation: all the p-values are significant and the model is an
# average (moderate) fit.

# Fitted Alcohol values from the final model, also as a data frame.
pv <- predict(model2, new_data2)
pv
pv_1 <- as.data.frame(pv)
pv_1
{r}