Intro

Principal Component Analysis (PCA) summarizes the variation across the features of a dataset using a smaller number of uncorrelated components, and it can be used in conjunction with cluster analysis.

PCA is also a popular machine learning algorithm used for feature selection. Imagine you have more than 100 features or factors: it is useful to select the most important ones for further analysis.

The basic idea when using PCA as a tool for feature selection is to select variables according to the magnitude (from largest to smallest in absolute values) of their coefficients (loadings).

#install.packages('dplyr')
library(dplyr) # sane data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) # sane data munging
library(ggplot2) # needs no introduction
library(ggfortify) # super-helpful for plotting non-"standard" stats objects
## Warning: namespace 'DBI' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'

## Warning: namespace 'DBI' is not available and has been replaced
## by .GlobalEnv when processing object '<unknown>'
# Show the current working directory so you can confirm it is the folder
# you expect before any files are read or written.
getwd()
## [1] "/cloud/project"
library(readr)

##  mydata <-read_csv('Segmentation.csv')

# Load the segmentation data set into `mydata`; the path is relative to the
# working directory.
mydata <- read_csv(file = "wonderfulcustomer_segmentation.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   gender = col_double(),
##   age = col_double(),
##   companies = col_double(),
##   Halo_oranges = col_double(),
##   experience = col_double(),
##   userfriendly = col_double(),
##   price = col_double(),
##   variety = col_double(),
##   ad_effectiveness = col_double(),
##   halo_organic = col_double(),
##   pistachio_flavor = col_double()
## )
# Read the csv file. The path is relative, so the file (available from my GitHub site) must be saved in your working directory.

#Open the data. Note that some students will see an Excel option in "Import Dataset";
#those that do not will need to save the original data as a csv and import that as a text file.
#rm(list = ls()) #used to clean your working environment
# Fix the random seed: kmeans() picks its starting centres at random, so
# without a seed the cluster labels and sizes can differ from run to run.
set.seed(123)
# Partition the observations into 3 clusters, allowing up to 1000 iterations.
# mydata[, -1] drops the first column, which the column specification lists
# as `gender`; the remaining ten columns are the variables being clustered.
fit <- kmeans(mydata[, -1], centers = 3, iter.max = 1000)
# Number of observations assigned to each cluster.
table(fit$cluster)
## 
## 1 2 3 
## 5 3 5
# Bar chart of the number of observations in each cluster.
barplot(
  height = table(fit$cluster),
  col = "#336699"
)

# Principal component analysis on the same columns that were clustered,
# with each variable scaled to unit variance first.
# `scale.` (with the trailing dot) is prcomp()'s actual argument name;
# spelling it out avoids relying on partial argument matching.
pca <- prcomp(mydata[, -1], scale. = TRUE)
# Convert the PCA result to a data frame and append each observation's
# cluster membership as the last column, `col`.
pca_data <- mutate(fortify(pca), col = fit$cluster)

# Scatter plot of the first two principal components: one filled circle per
# observation, with the fill colour showing its cluster membership.
ggplot(data = pca_data) +
  geom_point(
    mapping = aes(x = PC1, y = PC2, fill = factor(col)),
    shape = 21,
    colour = "#7f7f7f",
    size = 3
  ) +
  theme_bw(base_family = "Helvetica")

# ggfortify's autoplot() of the k-means fit, with a frame of type "norm"
# requested around each cluster.
autoplot(fit, data = mydata[, -1], frame = TRUE, frame.type = "norm")
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## Too few points to calculate an ellipse

# List the components stored in the prcomp result.
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
# Centring value used for each variable before the PCA (the column means,
# since prcomp() was called with its default centring).
pca$center
##              age        companies     Halo_oranges       experience 
##        1.3076923        3.0000000        0.7692308        4.3076923 
##     userfriendly            price          variety ad_effectiveness 
##        4.0000000        3.3846154        3.9230769        3.8461538 
##     halo_organic pistachio_flavor 
##        1.5384615        1.8461538
# Scaling value used for each variable (the column standard deviations,
# since scaling was requested in the prcomp() call).
pca$scale
##              age        companies     Halo_oranges       experience 
##        0.4803845        1.2909944        0.4385290        0.7510676 
##     userfriendly            price          variety ad_effectiveness 
##        0.9128709        1.1929279        0.7595545        0.9870962 
##     halo_organic pistachio_flavor 
##        0.5188745        0.6887372
# Loadings matrix: one row per variable, one column per principal component.
pca$rotation
##                         PC1         PC2         PC3          PC4         PC5
## age               0.1354156  0.62954146 -0.08030617  0.130636308 -0.44470909
## companies        -0.2138361  0.30532000 -0.32665538 -0.560914547  0.38158456
## Halo_oranges      0.1183531  0.07137337  0.69627241  0.085320365  0.53554915
## experience       -0.3288920  0.06849566 -0.32952288  0.001918435  0.24898344
## userfriendly     -0.4614568 -0.08328913  0.01627939  0.106249693  0.08002230
## price            -0.4368161 -0.16437607  0.15326075  0.213483797 -0.13445847
## variety          -0.3854090  0.23079792  0.27769092  0.067565270 -0.35060755
## ad_effectiveness -0.4429978 -0.20585705 -0.05564472  0.126437669 -0.02179738
## halo_organic      0.1837193 -0.60799746 -0.09577026 -0.269741066 -0.29492576
## pistachio_flavor -0.1773943  0.03753854  0.42465538 -0.714737476 -0.27010512
##                          PC6         PC7         PC8          PC9         PC10
## age               0.32850751 -0.32115063  0.37406796  0.114218454 -0.005608762
## companies        -0.28133801 -0.42635221 -0.02046313  0.150581607  0.100485543
## Halo_oranges      0.26800846 -0.35527896  0.04618089  0.020552369 -0.037469004
## experience        0.77058402  0.18304833 -0.29425844  0.007511035 -0.051046945
## userfriendly     -0.10868152  0.07090802  0.43388771  0.264342918 -0.697988267
## price            -0.01223187 -0.06126961  0.01350257  0.610347723  0.564285969
## variety          -0.15995307 -0.23693124 -0.64820621 -0.158212869 -0.258196166
## ad_effectiveness  0.03950093 -0.29155378  0.35189570 -0.681958330  0.257596971
## halo_organic      0.27260843 -0.53033467 -0.06760668  0.162430375 -0.202850729
## pistachio_flavor  0.18406932  0.35239104  0.18417467 -0.070120498  0.078008504
# Dimensions of the score matrix: observations by principal components.
dim(pca$x)
## [1] 13 10
# Biplot of the first two components. scale = 0 leaves the variable arrows
# unscaled, so they represent the loadings.
biplot(pca, scale = 0)

# Principal components are only determined up to sign. Negate the scores and
# the loadings together to get the mirror-image (equivalent) solution, then
# redraw the biplot.
pca$x <- -pca$x
pca$rotation <- -pca$rotation
biplot(pca, scale = 0)

# Standard deviation of each principal component.
pca$sdev
##  [1] 2.0888302 1.2911213 1.1599507 1.0120643 0.8622413 0.7284398 0.3884226
##  [8] 0.3000490 0.2335709 0.1746222
# Variance of each principal component: the squared standard deviations.
pca.var <- pca$sdev^2
pca.var
##  [1] 4.36321153 1.66699431 1.34548555 1.02427421 0.74346004 0.53062450
##  [7] 0.15087214 0.09002943 0.05455538 0.03049292
# Proportion of variance explained (PVE): each component's variance as a
# share of the total variance.
pve <- pca.var / sum(pca.var)
pve
##  [1] 0.436321153 0.166699431 0.134548555 0.102427421 0.074346004 0.053062450
##  [7] 0.015087214 0.009002943 0.005455538 0.003049292
# Scree plot: proportion of variance explained by each component.
plot(
  pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

# Running total of the proportion of variance explained.
plot(
  cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Save the PCA scores together with each observation's cluster membership
# (last column, `col`) to pca_data.csv in the working directory.
# Note: pca_data was built before the signs of pca$x were flipped, so the
# saved scores carry the original signs.
write.csv(x = pca_data, file = "pca_data.csv")

Discussion Questions for you (50 words per question)

  1. Think about at least one question you could answer using this result. Please make sure to cite the original source. Answer: Your answer here:

2. Interpret the PCA graphs according to the required reading (p.385-p.399) https://www.statlearning.com/ (page number required).

References

Cluster analysis - reading (p.385-p.399) https://www.statlearning.com/

Hint: you can download the free version of this book from this website.

Comparison of similarity coefficients used for cluster analysis with dominant markers in maize (Zea mays L) https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1415-47572004000100014&lng=en&nrm=iso

Principal Component Methods in R: Practical Guide http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/

Principal component analysis - reading (p.404-p.405) https://www.statlearning.com/

Hint: you can download the free version from this website.

Principal Component Methods in R: Practical Guide http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/

https://online.stat.psu.edu/stat505/lesson/11/11.4