#############
##Packages###
#############
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.6.3
library(nnet)
Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 3.6.3
install_github("vqv/ggbiplot")
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 3.5 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
library(ggbiplot)
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.6.3
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.6.3
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## Loading required package: grid
########################
####Install Dataset#####
########################
data_iris <- read.csv(file.choose())
str(data_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ ï..sepal_length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ petal_length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data_iris)
## ï..sepal_length sepal_width petal_length petal_width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
###########################################################
####Spliting the dataset into testing and training data####
###########################################################
smp_size <- floor(0.80 * nrow(data_iris))
set.seed(100)
train_ind<- sample(seq_len(nrow(data_iris)), size = smp_size)
train<- data_iris[train_ind, ]
test <- data_iris[-train_ind, ]
head(train)
## ï..sepal_length sepal_width petal_length petal_width species
## 102 5.8 2.7 5.1 1.9 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 4 4.6 3.1 1.5 0.2 setosa
## 55 6.5 2.8 4.6 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
head(test)
## ï..sepal_length sepal_width petal_length petal_width species
## 1 5.1 3.5 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 21 5.4 3.4 1.7 0.2 setosa
#################################################
####Scatter plot and correlation coefficients####
#################################################
pairs.panels(train[ , 1:4], gap= 0.2,
bg= c("red" , "green", "blue")[train$Species], pch = 25)

###################
#######PCA#########
###################
p_c <- prcomp(train[ , -5],
center = T,
scale. = T)
attributes(p_c)
## $names
## [1] "sdev" "rotation" "center" "scale" "x"
##
## $class
## [1] "prcomp"
p_c$center
## ï..sepal_length sepal_width petal_length petal_width
## 5.812500 3.044167 3.740833 1.214167
p_c$sdev
## [1] 1.6914566 0.9854915 0.3810962 0.1501558
#################################
##Print Principlal Components####
#################################
print(p_c)
## Standard deviations (1, .., p=4):
## [1] 1.6914566 0.9854915 0.3810962 0.1501558
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## ï..sepal_length 0.5275925 -0.359075112 0.7280867 -0.2502016
## sepal_width -0.2257929 -0.932445661 -0.2543189 0.1220024
## petal_length 0.5859947 0.003062244 -0.1494378 0.7964102
## petal_width 0.5720786 -0.040009693 -0.6187729 -0.5368849
summary(p_c)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.6915 0.9855 0.38110 0.15016
## Proportion of Variance 0.7153 0.2428 0.03631 0.00564
## Cumulative Proportion 0.7153 0.9580 0.99436 1.00000
plot(p_c , type = 'l', main = " ")

biplot(p_c, scale= 0)

##########################
###Orthogonality of PCs###
##########################
pairs.panels(p_c$x,
gap= 0.2,
bg = c("green" , "blue" , "violet")[train$Species],
pch = 25)

##########################
######create bi-plot######
##########################
b_p <- ggbiplot(p_c, obs.scale = 1, var.scale = 1,
groups = train$Species, ellipse = T,
circle = T, ellipse.prob = 0.68)# 0.68 can be replaced by 0.95
b_p <- b_p + theme(legend.direction = 'horizontal', legend.position = 'bottom')
print(b_p)

#######################
######Prediction######
######################
training <- predict(p_c , train)
head(training)
## PC1 PC2 PC3 PC4
## 102 1.1496488 0.7096525 -0.48824113 0.04552346
## 112 1.6008667 0.4489846 0.02376181 -0.04435101
## 4 -2.3297585 0.4574374 -0.07891796 0.07243465
## 55 1.0733055 0.2109474 0.44121628 -0.08462131
## 70 0.1142679 1.2657841 0.21052992 0.06599871
## 98 0.5767621 0.1370501 0.30764257 0.03837045
training <- data.frame(training, train[5])
head(training)
## PC1 PC2 PC3 PC4 species
## 102 1.1496488 0.7096525 -0.48824113 0.04552346 virginica
## 112 1.6008667 0.4489846 0.02376181 -0.04435101 virginica
## 4 -2.3297585 0.4574374 -0.07891796 0.07243465 setosa
## 55 1.0733055 0.2109474 0.44121628 -0.08462131 versicolor
## 70 0.1142679 1.2657841 0.21052992 0.06599871 versicolor
## 98 0.5767621 0.1370501 0.30764257 0.03837045 versicolor
testing<- predict(p_c,test)
testing<- data.frame(testing , test[5])
head(testing)
## PC1 PC2 PC3 PC4 species
## 1 -2.251732 -0.6180953 0.13679924 -0.012892254 setosa
## 6 -2.014846 -1.6164952 -0.02208957 0.004416717 setosa
## 10 -2.213651 0.3322176 0.26754364 0.052494605 setosa
## 11 -2.129986 -1.1773481 0.27581570 -0.001709197 setosa
## 13 -2.259488 0.5900038 0.24645685 0.008745513 setosa
## 21 -1.906495 -0.5336160 0.43402964 0.006114438 setosa
#########################################
####Multinomial logistic Regression#####
########################################
training$Species <- relevel(training$species, ref= "setosa")
model_1<- multinom(Species~PC1+PC2, data= training)
## # weights: 12 (6 variable)
## initial value 131.833475
## iter 10 value 20.393907
## iter 20 value 18.946567
## iter 30 value 18.811242
## iter 40 value 18.808754
## iter 50 value 18.808496
## iter 60 value 18.807802
## final value 18.807565
## converged
summary(model_1)
## Call:
## multinom(formula = Species ~ PC1 + PC2, data = training)
##
## Coefficients:
## (Intercept) PC1 PC2
## versicolor 9.975736 13.8673 3.897542
## virginica 3.607781 19.7804 4.999674
##
## Std. Errors:
## (Intercept) PC1 PC2
## versicolor 183.0945 105.5789 146.5342
## virginica 183.1012 105.5876 146.5355
##
## Residual Deviance: 37.61513
## AIC: 49.61513
##################################################
#######confusion matrix for training data #######
#################################################
p <- predict(model_1, training)
t <- table(p, training$Species)
t
##
## p setosa versicolor virginica
## setosa 39 0 0
## versicolor 0 35 4
## virginica 0 5 37
#################################
######## Misclassification ######
#################################
Misclassification_percentage <- (1-sum(diag(t))/sum(t))*100
Misclassification_percentage
## [1] 7.5
# confusion matrix for testing data
p1 <- predict(model_1, testing)
t1 <- table(p1, testing$species)
t1
##
## p1 setosa versicolor virginica
## setosa 11 0 0
## versicolor 0 7 0
## virginica 0 3 9
#Misclassification
Misclassification_percentage <- (1-sum(diag(t1))/sum(t1))*100
Misclassification_percentage
## [1] 10