Principal Component Analysis

#############
##Packages###
#############
library(ggplot2) 
## Warning: package 'ggplot2' was built under R version 3.6.3
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 3.6.3
library(nnet)

Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 3.6.3
install_github("vqv/ggbiplot")
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 3.5 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation
library(ggbiplot)
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.6.3
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.6.3
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## Loading required package: grid
##########################
####Import the dataset####
##########################
# fileEncoding avoids the UTF-8 byte-order mark ending up in the first column name
data_iris <- read.csv(file.choose(), fileEncoding = "UTF-8-BOM")
str(data_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ sepal_length   : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ sepal_width    : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ petal_length   : num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ petal_width    : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ species        : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data_iris)
##   sepal_length    sepal_width     petal_length    petal_width
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
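file.choose() opens an interactive dialog, which is convenient but not reproducible when the script is re-run. A minimal non-interactive sketch, assuming the CSV lives at a hypothetical path "data/iris.csv" (adjust to your own location):

# Non-interactive alternative; "data/iris.csv" is a placeholder path
# data_iris <- read.csv("data/iris.csv", fileEncoding = "UTF-8-BOM")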
############################################################
####Splitting the dataset into testing and training data####
############################################################

smp_size <- floor(0.80 * nrow(data_iris))

set.seed(100)

train_ind<- sample(seq_len(nrow(data_iris)), size = smp_size)
train<- data_iris[train_ind, ]
test <- data_iris[-train_ind, ]
head(train)
##        sepal_length sepal_width petal_length petal_width    species
## 102             5.8         2.7          5.1         1.9  virginica
## 112             6.4         2.7          5.3         1.9  virginica
## 4               4.6         3.1          1.5         0.2     setosa
## 55              6.5         2.8          4.6         1.5 versicolor
## 70              5.6         2.5          3.9         1.1 versicolor
## 98              6.2         2.9          4.3         1.3 versicolor
head(test)
##       sepal_length sepal_width petal_length petal_width species
## 1              5.1         3.5          1.4         0.2  setosa
## 6              5.4         3.9          1.7         0.4  setosa
## 10             4.9         3.1          1.5         0.1  setosa
## 11             5.4         3.7          1.5         0.2  setosa
## 13             4.8         3.0          1.4         0.1  setosa
## 21             5.4         3.4          1.7         0.2  setosa
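The split above is random rather than stratified, so it is worth confirming that all three species appear in both subsets. A quick check using base R only:

# Class counts per subset; with set.seed(100) the split is reproducible
table(train$species)
table(test$species)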
#################################################
####Scatter plot and correlation coefficients####
#################################################

# colour points by species (column name is lower-case "species")
pairs.panels(train[ , 1:4], gap = 0.2,
             bg = c("red", "green", "blue")[train$species], pch = 25)
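
The lower-triangle numbers in the panel plot are pairwise Pearson correlations. Printing them directly makes the motivation for PCA explicit: several predictors are strongly correlated, so much of their information is redundant.

# Correlation matrix of the four numeric predictors (training set only)
round(cor(train[ , 1:4]), 2)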

###################
#######PCA#########
###################

p_c <- prcomp(train[ , -5],
              center =  T,
              scale. = T)

attributes(p_c)
## $names
## [1] "sdev"     "rotation" "center"   "scale"    "x"       
## 
## $class
## [1] "prcomp"
p_c$center 
##    sepal_length     sepal_width    petal_length     petal_width
##        5.812500        3.044167        3.740833        1.214167
p_c$sdev 
## [1] 1.6914566 0.9854915 0.3810962 0.1501558
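Because the variables were centred and scaled, the squared standard deviations above are the eigenvalues of the correlation matrix of the training predictors. A short sketch to verify that equivalence:

# PC variances ...
p_c$sdev^2
# ... should equal the eigenvalues of the correlation matrix
eigen(cor(train[ , 1:4]))$values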
##################################
####Print Principal Components####
##################################

print(p_c)
## Standard deviations (1, .., p=4):
## [1] 1.6914566 0.9854915 0.3810962 0.1501558
## 
## Rotation (n x k) = (4 x 4):
##                        PC1          PC2        PC3        PC4
## sepal_length     0.5275925 -0.359075112  0.7280867 -0.2502016
## sepal_width     -0.2257929 -0.932445661 -0.2543189  0.1220024
## petal_length     0.5859947  0.003062244 -0.1494378  0.7964102
## petal_width      0.5720786 -0.040009693 -0.6187729 -0.5368849
summary(p_c)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.6915 0.9855 0.38110 0.15016
## Proportion of Variance 0.7153 0.2428 0.03631 0.00564
## Cumulative Proportion  0.7153 0.9580 0.99436 1.00000
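The "Proportion of Variance" row follows directly from the component standard deviations; reproducing it by hand makes the calculation explicit:

# Proportion and cumulative proportion of variance explained
prop_var <- p_c$sdev^2 / sum(p_c$sdev^2)
round(prop_var, 5)
round(cumsum(prop_var), 5)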
plot(p_c , type = 'l', main = " ")

biplot(p_c, scale= 0)

##########################
###Orthogonality of PCs###
##########################
pairs.panels(p_c$x,
             gap = 0.2,
             bg = c("green", "blue", "violet")[train$species],
             pch = 25)
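
The panel plot should show correlations of essentially zero between the components; the same check can be done numerically, since principal component scores are uncorrelated by construction:

# Off-diagonal correlations between the PC scores are numerically zero
round(cor(p_c$x), 3)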

##########################
######create bi-plot######
##########################

b_p <- ggbiplot(p_c, obs.scale = 1, var.scale = 1,
                groups = train$species, ellipse = T,
                circle = T, ellipse.prob = 0.68) # 0.68 can be replaced by 0.95

b_p <- b_p + theme(legend.direction = 'horizontal', legend.position = 'bottom') 
print(b_p)

######################
######Prediction######
######################


training <- predict(p_c , train)
head(training)
##            PC1       PC2         PC3         PC4
## 102  1.1496488 0.7096525 -0.48824113  0.04552346
## 112  1.6008667 0.4489846  0.02376181 -0.04435101
## 4   -2.3297585 0.4574374 -0.07891796  0.07243465
## 55   1.0733055 0.2109474  0.44121628 -0.08462131
## 70   0.1142679 1.2657841  0.21052992  0.06599871
## 98   0.5767621 0.1370501  0.30764257  0.03837045
training <- data.frame(training, train[5])
head(training)
##            PC1       PC2         PC3         PC4    species
## 102  1.1496488 0.7096525 -0.48824113  0.04552346  virginica
## 112  1.6008667 0.4489846  0.02376181 -0.04435101  virginica
## 4   -2.3297585 0.4574374 -0.07891796  0.07243465     setosa
## 55   1.0733055 0.2109474  0.44121628 -0.08462131 versicolor
## 70   0.1142679 1.2657841  0.21052992  0.06599871 versicolor
## 98   0.5767621 0.1370501  0.30764257  0.03837045 versicolor
testing<- predict(p_c,test)
testing<- data.frame(testing , test[5])
head(testing)
##          PC1        PC2         PC3          PC4 species
## 1  -2.251732 -0.6180953  0.13679924 -0.012892254  setosa
## 6  -2.014846 -1.6164952 -0.02208957  0.004416717  setosa
## 10 -2.213651  0.3322176  0.26754364  0.052494605  setosa
## 11 -2.129986 -1.1773481  0.27581570 -0.001709197  setosa
## 13 -2.259488  0.5900038  0.24645685  0.008745513  setosa
## 21 -1.906495 -0.5336160  0.43402964  0.006114438  setosa
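predict() on a prcomp object centres and scales the new data with the training parameters and multiplies by the rotation (loading) matrix. The sketch below reproduces the test-set scores by hand; it should match the head(testing) output above up to floating-point error:

# Manual projection of the test predictors onto the training PCs
manual_scores <- scale(test[ , 1:4],
                       center = p_c$center,
                       scale  = p_c$scale) %*% p_c$rotation
head(manual_scores)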
#########################################
#####Multinomial Logistic Regression#####
#########################################

training$Species <- relevel(training$species, ref= "setosa")

model_1<- multinom(Species~PC1+PC2, data= training)
## # weights:  12 (6 variable)
## initial  value 131.833475 
## iter  10 value 20.393907
## iter  20 value 18.946567
## iter  30 value 18.811242
## iter  40 value 18.808754
## iter  50 value 18.808496
## iter  60 value 18.807802
## final  value 18.807565 
## converged
summary(model_1)
## Call:
## multinom(formula = Species ~ PC1 + PC2, data = training)
## 
## Coefficients:
##            (Intercept)     PC1      PC2
## versicolor    9.975736 13.8673 3.897542
## virginica     3.607781 19.7804 4.999674
## 
## Std. Errors:
##            (Intercept)      PC1      PC2
## versicolor    183.0945 105.5789 146.5342
## virginica     183.1012 105.5876 146.5355
## 
## Residual Deviance: 37.61513 
## AIC: 49.61513
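summary() for a multinom fit reports coefficients and standard errors but no p-values. Wald z-statistics and two-sided p-values can be derived from them; note that the very large standard errors here suggest the classes are close to perfectly separated along PC1, so these p-values should be read with caution:

# Wald z-statistics and two-sided p-values for the multinom coefficients
z <- summary(model_1)$coefficients / summary(model_1)$standard.errors
p_val <- 2 * (1 - pnorm(abs(z)))
round(p_val, 4)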
##################################################
########Confusion matrix for training data########
##################################################

p <- predict(model_1, training)
t <- table(p, training$Species)
t
##             
## p            setosa versicolor virginica
##   setosa         39          0         0
##   versicolor      0         35         4
##   virginica       0          5        37
#################################
######## Misclassification ######
#################################

Misclassification_percentage <- (1-sum(diag(t))/sum(t))*100
Misclassification_percentage
## [1] 7.5
###############################################
#######Confusion matrix for testing data#######
###############################################
p1 <- predict(model_1, testing)
t1 <- table(p1, testing$species)
t1
##             
## p1           setosa versicolor virginica
##   setosa         11          0         0
##   versicolor      0          7         0
##   virginica       0          3         9
#################################
######## Misclassification ######
#################################
Misclassification_percentage <- (1-sum(diag(t1))/sum(t1))*100
Misclassification_percentage
## [1] 10
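The misclassification rate is computed the same way for both confusion matrices, so it can be wrapped in a small helper function (a sketch; misclass_pct is not from any package):

# Misclassification percentage from a square confusion matrix
misclass_pct <- function(conf_mat) {
  (1 - sum(diag(conf_mat)) / sum(conf_mat)) * 100
}
misclass_pct(t)   # training data
misclass_pct(t1)  # testing data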