#loading caret package for PCA analysis
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Dimensions of the data set: 50 rows (observations) by 4 columns (variables).
dim(USArrests)
## [1] 50 4
# Compact view of the structure: column names, column types, and the first
# few values of each column.
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Pairwise Pearson correlation matrix of the four USArrests variables.
cordata <- cor(x = USArrests, method = "pearson")
# Visualize the correlation matrix. In the UrbanPop row/column, Rape and
# Assault are drawn in blue (positive correlation with the urban population
# share), whereas Murder is almost white (correlation close to zero).
library(corrplot)
## corrplot 0.84 loaded
corrplot(corr = cordata, method = "circle")

# Scatter plots of each crime variable against urban population, each with
# the smooth curve that scatter.smooth() draws through the points.
#   Plot 1 - UrbanPop vs Murder: the points are scattered and the line is
#            nearly flat, i.e. little relationship.
#   Plot 2 - UrbanPop vs Assault: the line rises, i.e. more urban population
#            goes with more assaults.
#   Plot 3 - UrbanPop vs Rape: the line rises, i.e. more urban population
#            goes with more rapes.
# One loop replaces three copy-pasted calls. Axis labels and titles are set
# explicitly; otherwise the labels are derived from the argument expressions
# (e.g. "USArrests$UrbanPop").
for (crime in c("Murder", "Assault", "Rape")) {
  scatter.smooth(
    x = USArrests$UrbanPop,
    y = USArrests[[crime]],
    xlab = "UrbanPop",
    ylab = crime,
    main = paste("UrbanPop vs", crime)
  )
}

# Principal components of the three crime variables (Murder, Assault, Rape).
# The variables are on very different scales (Assault is in the hundreds,
# Murder mostly below 20), so the PCA is run on the correlation matrix
# (cor = TRUE). On the raw covariance matrix the component standard
# deviations were 82.8, 6.9 and 2.6, which correspond to roughly 99%, 0.7%
# and 0.1% of the variance (not 82%, 7% and 2.5%): the first component was
# driven almost entirely by the scale of Assault.
# Columns are referenced through `data =` rather than USArrests$... so the
# component loadings carry the plain variable names.
USpca <- princomp(~ Murder + Assault + Rape, data = USArrests, cor = TRUE)
# Printing the fit shows the standard deviation of each component. These are
# standard deviations, not percentages of variance.
USpca
# summary() reports the proportion of variance (sdev^2 / sum(sdev^2)) and the
# cumulative proportion per component; use these figures to judge how much
# more important component 1 is than components 2 and 3.
summary(USpca)
# Scree plot of the component variances.
screeplot(USpca, type = "lines")

# Biplot (scores and loadings drawn together) of a PCA on all four variables,
# computed on the correlation matrix.
# Author's reading of the plot:
#   Component 1 - states such as New York, Maryland and Georgia appear at the
#                 high-crime end; Connecticut, South Dakota and West Virginia
#                 at the low-crime end.
#   Component 2 - California at one end; Mississippi, North Carolina and
#                 South Carolina at the other. Virginia sits close to zero.
# NOTE(review): the sign of a principal component is arbitrary, so "high" and
# "low" must be read relative to the loading arrows. In the caret loadings
# further down, PC2 is dominated by UrbanPop, so component 2 presumably
# reflects urbanization rather than crime - confirm against the plot.
pca_scaled <- princomp(USArrests, cor = TRUE)
biplot(pca_scaled, scale = 0)

# PCA via the caret package: the requested preprocessing methods are Box-Cox,
# centering, scaling and PCA, with the PCA threshold (thresh) set to 0.95.
usarrest_pca <- preProcess(
  USArrests,
  method = c("BoxCox", "center", "scale", "pca"),
  thresh = 0.95
)
# Element names and class of the fitted preProcess object.
attributes(usarrest_pca)
## $names
## [1] "dim" "bc" "yj"
## [4] "et" "invHyperbolicSine" "mean"
## [7] "std" "ranges" "rotation"
## [10] "method" "thresh" "pcaComp"
## [13] "numComp" "ica" "wildcards"
## [16] "k" "knnSummary" "bagImp"
## [19] "median" "data" "rangeBounds"
##
## $class
## [1] "preProcess"
# Length, class and mode of each element of the object.
summary(usarrest_pca)
## Length Class Mode
## dim 2 -none- numeric
## bc 4 -none- list
## yj 0 -none- NULL
## et 0 -none- NULL
## invHyperbolicSine 0 -none- NULL
## mean 4 -none- numeric
## std 4 -none- numeric
## ranges 0 -none- NULL
## rotation 12 -none- numeric
## method 5 -none- list
## thresh 1 -none- numeric
## pcaComp 0 -none- NULL
## numComp 1 -none- numeric
## ica 0 -none- NULL
## wildcards 2 -none- list
## k 1 -none- numeric
## knnSummary 1 -none- function
## bagImp 0 -none- NULL
## median 0 -none- NULL
## data 0 -none- NULL
## rangeBounds 2 -none- numeric
# A preProcess object has no `score` element (it is not among the names
# listed by attributes() above), so `usarrest_pca$score` is always NULL.
# The number of retained components is stored in `numComp`; the scores
# themselves are obtained with predict().
usarrest_pca$numComp
## [1] 3
# Apply the fitted preprocessing object to the data; the result holds the
# principal-component scores of each state.
pca_data <- predict(object = usarrest_pca, newdata = USArrests)
# First six rows of the scores for the retained components.
head(pca_data, n = 6L)
## PC1 PC2 PC3
## Alabama -1.03453112 1.0848115 -0.33595844
## Alaska -1.67318523 1.3179300 1.37135111
## Arizona -1.75849483 -0.6756439 -0.03729575
## Arkansas -0.06792624 1.1731575 0.05737966
## California -2.38210391 -1.4475095 0.30790267
## Colorado -1.50855460 -0.7761294 0.83838786
# Loadings (rotation matrix): the weight of each original variable in each
# retained principal component.
#   PC1 - all four loadings are negative; the three crime variables are
#         weighted about equally (-0.54 to -0.57) and UrbanPop less (-0.29).
#         The sign of a component is arbitrary, so negative values do not
#         make PC1 less important.
#   PC2 - dominated by UrbanPop (-0.90).
#   PC3 - mainly Rape (0.82) against Assault (-0.43).
# In the scores printed by head(), Alabama, Alaska and Arkansas have positive
# PC2 values, and Alaska, Arkansas, California and Colorado have positive PC3
# values.
usarrest_pca[["rotation"]]
## PC1 PC2 PC3
## Murder -0.5409017 0.38528873 -0.2405880
## Assault -0.5671456 0.18708303 -0.4256220
## UrbanPop -0.2890186 -0.89808964 -0.2836722
## Rape -0.5497631 -0.09993764 0.8249202
#The loadings show the weight of each variable in each principal component
#(they are coefficients, not percentages). PC1 loads negatively on all four
#variables; in PC2, Murder (0.39) and Assault (0.19) load positively while
#UrbanPop (-0.90) dominates.
#Three components (PC1 to PC3) were retained to reach the 95% variance threshold.
#BoxCox, center, and scale were used in the second (caret) method, as described
#below:
#BoxCox transforms the data to make it more normally distributed
#center subtracts the mean from each value
#scale divides each value by the standard deviation