library(readr)
# Load the cleaned coffee-survey responses; the path is relative to the
# current working directory.
mydata <- read_csv(file = "Coffee_Survey_Final_Clean.csv")
## Rows: 28 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (21): ID, Age, Occupation, Income, Zipcode, Buy_local_important, Drink_p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

### Building the distance matrix and plotting the tree (dendrogram)

# Standardize every survey variable (all columns except the ID in column 1)
# so that variables measured on larger scales do not dominate the distances.
use <- scale(mydata[, -1], center = TRUE, scale = TRUE)

# Euclidean distances between respondents. The result is stored in `d` only,
# so the stats::dist() function is no longer shadowed by a variable named
# `dist`, and it is computed exactly once: calling dist() again on
# as.matrix() of a distance object would measure distances between rows of
# the distance matrix instead of distances between respondents.
# NOTE(review): the cluster sizes, memberships and per-cluster summaries
# recorded as `##` output further down were produced with the earlier
# distance-of-distances computation; re-run the script to refresh them.
d <- dist(use)

# Apply hierarchical clustering (hclust() uses complete linkage by default).
seg.hclust <- hclust(d)
library(ggplot2) # not used by the base-graphics plot() call below
plot(seg.hclust) # draw the dendrogram

### Identifying the cluster membership of each respondent

# Cut the dendrogram into three clusters and count the respondents in each.
groups.3 <- cutree(seg.hclust, k = 3)
table(groups.3)
## groups.3
##  1  2  3 
##  8 18  2
# Survey IDs of the respondents assigned to cluster 1, 2 and 3 respectively.
mydata[["ID"]][groups.3 == 1]
## [1]  1  3 13 14 18 20 22 28
mydata[["ID"]][groups.3 == 2]
##  [1]  2  4  5  6  8  9 10 11 12 15 16 19 21 23 24 25 26 27
mydata[["ID"]][groups.3 == 3]
## [1]  7 17

Identifying common features of each cluster using the aggregate function

#?aggregate
# Per-cluster median of every column. The ID column is included here, so its
# median is just a by-product and carries no meaning.
aggregate(mydata, by = list(groups.3), FUN = median)
##   Group.1   ID Age Occupation Income Zipcode Buy_local_important
## 1       1 16.0   3          2    5.5    10.5                   2
## 2       2 13.5   3          2    5.0     8.0                   2
## 3       3 12.0   4          3    6.0     7.0                   1
##   Drink_preference Cups_per_day Coffee_from Coffeeshop_visit Buy_reason_1
## 1                2          1.0         1.0                4          5.0
## 2                1          2.0         3.0                3          4.5
## 3                1          4.5         4.5                2          5.0
##   Buy_reason_2 Buy_reason_3 Local_owned_important Organic_Important
## 1          7.0            5                     0                 2
## 2          5.0            5                     1                 2
## 3          5.5            6                     0                 0
##   Selection_Important Social_cause_important Enviroment_important
## 1                   0                      0                    0
## 2                   1                      1                    1
## 3                   0                      1                    0
##   Drive thru_important Fast Service_important Non-coffee_purchases
## 1                  0.0                    0.0                  3.0
## 2                  1.0                    1.0                  3.0
## 3                  1.5                    0.5                  2.5
# Per-cluster mean of every column (ID column still included).
aggregate(mydata, by = list(groups.3), FUN = mean)
##   Group.1       ID      Age Occupation   Income   Zipcode Buy_local_important
## 1       1 14.87500 3.125000   2.000000 4.875000 10.625000            2.375000
## 2       2 14.61111 2.944444   2.111111 4.944444  7.388889            2.277778
## 3       3 12.00000 4.000000   3.000000 6.000000  7.000000            1.000000
##   Drink_preference Cups_per_day Coffee_from Coffeeshop_visit Buy_reason_1
## 1         2.125000     1.500000         2.0         3.625000     5.000000
## 2         1.277778     2.388889         2.5         3.055556     3.944444
## 3         1.000000     4.500000         4.5         2.000000     5.000000
##   Buy_reason_2 Buy_reason_3 Local_owned_important Organic_Important
## 1     6.000000     4.875000              0.125000          1.500000
## 2     4.722222     5.333333              1.333333          1.611111
## 3     5.500000     6.000000              0.000000          0.000000
##   Selection_Important Social_cause_important Enviroment_important
## 1            0.500000               0.000000             0.250000
## 2            1.444444               1.333333             1.333333
## 3            0.000000               1.000000             0.000000
##   Drive thru_important Fast Service_important Non-coffee_purchases
## 1             0.375000               0.375000             2.750000
## 2             1.388889               1.222222             2.666667
## 3             1.500000               0.500000             2.500000
# Per-cluster median of the survey variables only (ID in column 1 dropped).
aggregate(mydata[, -1], by = list(groups.3), FUN = median)
##   Group.1 Age Occupation Income Zipcode Buy_local_important Drink_preference
## 1       1   3          2    5.5    10.5                   2                2
## 2       2   3          2    5.0     8.0                   2                1
## 3       3   4          3    6.0     7.0                   1                1
##   Cups_per_day Coffee_from Coffeeshop_visit Buy_reason_1 Buy_reason_2
## 1          1.0         1.0                4          5.0          7.0
## 2          2.0         3.0                3          4.5          5.0
## 3          4.5         4.5                2          5.0          5.5
##   Buy_reason_3 Local_owned_important Organic_Important Selection_Important
## 1            5                     0                 2                   0
## 2            5                     1                 2                   1
## 3            6                     0                 0                   0
##   Social_cause_important Enviroment_important Drive thru_important
## 1                      0                    0                  0.0
## 2                      1                    1                  1.0
## 3                      1                    0                  1.5
##   Fast Service_important Non-coffee_purchases
## 1                    0.0                  3.0
## 2                    1.0                  3.0
## 3                    0.5                  2.5
# Per-cluster mean of the survey variables only (ID in column 1 dropped).
aggregate(mydata[, -1], by = list(groups.3), FUN = mean)
##   Group.1      Age Occupation   Income   Zipcode Buy_local_important
## 1       1 3.125000   2.000000 4.875000 10.625000            2.375000
## 2       2 2.944444   2.111111 4.944444  7.388889            2.277778
## 3       3 4.000000   3.000000 6.000000  7.000000            1.000000
##   Drink_preference Cups_per_day Coffee_from Coffeeshop_visit Buy_reason_1
## 1         2.125000     1.500000         2.0         3.625000     5.000000
## 2         1.277778     2.388889         2.5         3.055556     3.944444
## 3         1.000000     4.500000         4.5         2.000000     5.000000
##   Buy_reason_2 Buy_reason_3 Local_owned_important Organic_Important
## 1     6.000000     4.875000              0.125000          1.500000
## 2     4.722222     5.333333              1.333333          1.611111
## 3     5.500000     6.000000              0.000000          0.000000
##   Selection_Important Social_cause_important Enviroment_important
## 1            0.500000               0.000000             0.250000
## 2            1.444444               1.333333             1.333333
## 3            0.000000               1.000000             0.000000
##   Drive thru_important Fast Service_important Non-coffee_purchases
## 1             0.375000               0.375000             2.750000
## 2             1.388889               1.222222             2.666667
## 3             1.500000               0.500000             2.500000
# Keep the per-cluster means (ID column dropped) so they can be exported.
cluster_means <- aggregate(mydata[, -1], by = list(groups.3), FUN = mean)

Exporting cluster analysis results from RStudio Cloud to CSV files (which can be opened in Excel)

# Write the cluster assignment of each respondent and the per-cluster means
# to CSV files in the working directory.
write.csv(groups.3, file = "clusterID.csv")
write.csv(cluster_means, file = "cluster_means.csv")

Principal Component Analysis (PCA)

Intro

Principal Component Analysis (PCA) involves the process of understanding different features in a dataset and can be used in conjunction with cluster analysis.

PCA is also a popular machine learning algorithm used for feature selection. Imagine you have more than 100 features or factors: it is useful to select the most important ones for further analysis.

The basic idea when using PCA as a tool for feature selection is to select variables according to the magnitude (from largest to smallest in absolute values) of their coefficients (loadings).

#install.packages('dplyr')
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) 
library(ggplot2) 
library(ggfortify) 


# Print the working directory; the relative CSV path below is resolved
# against it.
getwd()
## [1] "/cloud/project"
library(readr)

# Re-read the cleaned survey responses into `mydata`.
mydata <- read_csv(file = "Coffee_Survey_Final_Clean.csv")
## Rows: 28 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (21): ID, Age, Occupation, Income, Zipcode, Buy_local_important, Drink_p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the CSV file. NOTE: the path above is relative, so the file is loaded from the working directory rather than downloaded from GitHub.


# Partition the respondents into 3 clusters with k-means (ID column dropped).
# k-means chooses its initial centres at random, so the seed is fixed and
# several random starts are tried (the best one is kept); without this the
# cluster labels, sizes and plots change from run to run.
# NOTE(review): the cluster sizes recorded as `##` output below came from an
# unseeded single-start run and may differ after re-running.
# NOTE(review): the input is not standardized here, unlike the hierarchical
# clustering and the prcomp() call elsewhere in this script -- confirm that
# clustering on the raw scales is intended.
set.seed(123)
fit <- kmeans(mydata[, -1], centers = 3, iter.max = 1000, nstart = 25)

# Number of respondents in each k-means cluster, printed and then drawn as a
# bar chart.
table(fit[["cluster"]])
## 
##  1  2  3 
##  9  9 10
barplot(height = table(fit[["cluster"]]), col = "#336699")

# PCA on the standardized survey variables (ID column dropped). The argument
# is spelled `scale.` in full; the short form `scale` only works through
# partial argument matching.
pca <- prcomp(mydata[, -1], scale. = TRUE)

# Component scores as a data frame, tagged with each row's k-means cluster.
pca_data <- fortify(pca) %>%
  mutate(col = fit$cluster)

# Scatter of the first two principal components, filled by k-means cluster.
ggplot(pca_data) +
  geom_point(
    aes(x = PC1, y = PC2, fill = factor(col)),
    size = 3, col = "#7f7f7f", shape = 21
  ) +
  theme_bw(base_family = "Helvetica")

# ggfortify's plot of the k-means fit with a frame drawn around each cluster.
autoplot(fit, data = mydata[, -1], frame = TRUE, frame.type = "norm")

# Components stored in the prcomp result.
names(x = pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
# Variable means that were subtracted before the PCA.
pca[["center"]]
##                    Age             Occupation                 Income 
##              3.0714286              2.1428571              5.0000000 
##                Zipcode    Buy_local_important       Drink_preference 
##              8.2857143              2.2142857              1.5000000 
##           Cups_per_day            Coffee_from       Coffeeshop_visit 
##              2.2857143              2.5000000              3.1428571 
##           Buy_reason_1           Buy_reason_2           Buy_reason_3 
##              4.3214286              5.1428571              5.2500000 
##  Local_owned_important      Organic_Important    Selection_Important 
##              0.8928571              1.4642857              1.0714286 
## Social_cause_important   Enviroment_important   Drive thru_important 
##              0.9285714              0.9285714              1.1071429 
## Fast Service_important   Non-coffee_purchases 
##              0.9285714              2.6785714
# Variable standard deviations that each column was divided by before the PCA.
pca[["scale"]]
##                    Age             Occupation                 Income 
##              0.7663560              0.6506000              1.6996732 
##                Zipcode    Buy_local_important       Drink_preference 
##              3.1839545              0.6862212              0.7934920 
##           Cups_per_day            Coffee_from       Coffeeshop_visit 
##              1.4871584              1.6216133              1.6035675 
##           Buy_reason_1           Buy_reason_2           Buy_reason_3 
##              2.1266139              2.3208282              2.4888641 
##  Local_owned_important      Organic_Important    Selection_Important 
##              0.7859547              0.7444681              0.7663560 
## Social_cause_important   Enviroment_important   Drive thru_important 
##              0.7663560              0.7663560              0.6852567 
## Fast Service_important   Non-coffee_purchases 
##              0.6042180              0.5479640
# Loadings: one row per survey variable, one column per principal component.
pca[["rotation"]]
##                                 PC1         PC2         PC3          PC4
## Age                     0.037705866  0.36200898 -0.25492300  0.277720594
## Occupation              0.200337337  0.31411542 -0.30984707  0.018995509
## Income                  0.088140678  0.33361268  0.12170734  0.022700383
## Zipcode                -0.245378290 -0.07390904  0.01157851  0.083895243
## Buy_local_important    -0.026460888 -0.11675408  0.34865242 -0.170547295
## Drink_preference       -0.310902984 -0.31227903 -0.18312270  0.124427011
## Cups_per_day            0.193843848  0.30722977  0.08276182  0.046721259
## Coffee_from             0.039454241 -0.01453535 -0.24394395  0.525338263
## Coffeeshop_visit       -0.134835477 -0.25741700 -0.13332409  0.332938205
## Buy_reason_1           -0.246046352 -0.07242288  0.33037781  0.288070391
## Buy_reason_2           -0.160141513  0.27896225  0.10750443 -0.137031439
## Buy_reason_3            0.043037046 -0.08494933 -0.32998342 -0.399981137
## Local_owned_important   0.380602838 -0.23108077 -0.11396336  0.018568791
## Organic_Important       0.042944242 -0.31134255 -0.19266196 -0.092094952
## Selection_Important     0.244890127 -0.28224032  0.29697831 -0.089602168
## Social_cause_important  0.402790882 -0.11629244 -0.16800844  0.002491132
## Enviroment_important    0.361065745 -0.20281076 -0.04124596  0.058871040
## Drive thru_important    0.233340441 -0.01187485  0.27826266  0.339608416
## Fast Service_important  0.306523635  0.01683559  0.32822086  0.127844307
## Non-coffee_purchases   -0.008169555 -0.07843671 -0.04302496  0.260048637
##                                PC5         PC6          PC7          PC8
## Age                    -0.10026479  0.04587032 -0.354413036  0.013497404
## Occupation             -0.28449398  0.20092746  0.015387211 -0.042227295
## Income                  0.43364359  0.19707819  0.093696135 -0.052223560
## Zipcode                 0.49236492  0.06709525 -0.159193343  0.427339154
## Buy_local_important    -0.34920793  0.41135090 -0.239133857  0.060734360
## Drink_preference       -0.03869161  0.00841105  0.149242416  0.139436264
## Cups_per_day            0.24361892 -0.04470592  0.112006960  0.289751915
## Coffee_from             0.02137129 -0.11940973  0.240584200  0.168203235
## Coffeeshop_visit        0.08385195  0.15564659 -0.429920041 -0.091561674
## Buy_reason_1           -0.05919088 -0.18498475  0.183625380 -0.332285018
## Buy_reason_2           -0.25172810 -0.16534818  0.172780287  0.451467741
## Buy_reason_3            0.04515812 -0.04553451  0.233312957 -0.006057242
## Local_owned_important  -0.12920666 -0.05644509  0.028503482  0.017101234
## Organic_Important      -0.15131322 -0.28427682 -0.118538135  0.416527552
## Selection_Important     0.23178344  0.07678892  0.198231648  0.056868447
## Social_cause_important  0.13955443 -0.18289523  0.009112269 -0.227950246
## Enviroment_important    0.14248083  0.34103969 -0.067356021  0.113250383
## Drive thru_important   -0.14658824 -0.30313176  0.125091099 -0.002537500
## Fast Service_important -0.15550122 -0.07964847 -0.268048283  0.301539664
## Non-coffee_purchases   -0.20147072  0.55036249  0.490952126  0.148023845
##                                PC9         PC10         PC11         PC12
## Age                    -0.00818993  0.245370167  0.185110213 -0.156171178
## Occupation              0.02219942 -0.161741078 -0.103271884 -0.310455438
## Income                 -0.15440313  0.563342524 -0.007512520 -0.009963682
## Zipcode                 0.28865191 -0.166003308 -0.317856229 -0.160078815
## Buy_local_important     0.22464386  0.178055013  0.242307336 -0.122122585
## Drink_preference        0.12931668  0.076219044  0.046339274  0.003348516
## Cups_per_day           -0.17030342 -0.502381508  0.495013123  0.103776743
## Coffee_from             0.23370925  0.230238937  0.051923083 -0.106748629
## Coffeeshop_visit       -0.19466633 -0.004078856  0.276863994  0.494050063
## Buy_reason_1           -0.17527861  0.026570780  0.024293198 -0.072474243
## Buy_reason_2           -0.18110563  0.312331770 -0.175764318  0.458353697
## Buy_reason_3            0.44971160  0.147290282  0.387028739  0.237672332
## Local_owned_important  -0.09699525 -0.047011417 -0.390832062  0.126878622
## Organic_Important      -0.44289684  0.169577393  0.216738239 -0.369202691
## Selection_Important    -0.14757391  0.138250884  0.196331791 -0.247156220
## Social_cause_important -0.07938053  0.054716390 -0.063834313  0.195385685
## Enviroment_important    0.03163209  0.158222377 -0.097270273  0.068954133
## Drive thru_important    0.37238777  0.057381043  0.164406408 -0.021977914
## Fast Service_important  0.18538928 -0.058236195 -0.092795013  0.176425196
## Non-coffee_purchases   -0.17356306 -0.145089658  0.003117891  0.111647241
##                               PC13         PC14          PC15         PC16
## Age                     0.05893407 -0.350876459  0.2383626129 -0.142282435
## Occupation             -0.28234323  0.198198204  0.1256690699 -0.188306696
## Income                  0.19625774  0.335203520  0.2382062254  0.196182435
## Zipcode                 0.08861019 -0.030922765  0.1922491828 -0.338387362
## Buy_local_important    -0.07879331  0.365228920 -0.0001364952 -0.172562135
## Drink_preference       -0.37472890  0.237244225  0.4653164448  0.280230822
## Cups_per_day           -0.15997922  0.253950399  0.0832004198 -0.065943474
## Coffee_from            -0.04075050  0.288493823 -0.5794524206 -0.022198725
## Coffeeshop_visit       -0.06879855  0.051134472 -0.0197038575  0.005143447
## Buy_reason_1            0.07300437  0.148876031  0.1255075769 -0.628573490
## Buy_reason_2           -0.30672729 -0.104798964 -0.0155278631 -0.204808722
## Buy_reason_3            0.26985533 -0.006679436  0.0766677445 -0.284036389
## Local_owned_important   0.06664852  0.307540954  0.2368766529 -0.004930643
## Organic_Important       0.25688019  0.035019556  0.0749720675 -0.059363400
## Selection_Important    -0.28051652 -0.232249219 -0.0985888560  0.007283355
## Social_cause_important -0.04754710  0.085869406  0.1439263535 -0.249445902
## Enviroment_important   -0.30674223 -0.250082567 -0.0830926695 -0.252314334
## Drive thru_important   -0.06936985 -0.264451005  0.3685396705  0.161688910
## Fast Service_important  0.33407748  0.131324982 -0.0533857033  0.068523021
## Non-coffee_purchases    0.39711369 -0.210332846  0.1203348785  0.008612881
##                                PC17        PC18         PC19        PC20
## Age                     0.274356487 -0.13606971  0.300198788 -0.28457387
## Occupation             -0.079263490  0.48249132 -0.307678403  0.06166248
## Income                 -0.149522281  0.08810182 -0.076150659  0.06193634
## Zipcode                 0.135157817  0.10550146  0.014001949  0.21181452
## Buy_local_important     0.182199182 -0.21401806  0.119188840  0.25419338
## Drink_preference        0.071545005 -0.14938324 -0.162517812 -0.37845678
## Cups_per_day           -0.057011778 -0.18459811  0.153089685 -0.04283930
## Coffee_from             0.089193548 -0.04172606  0.086517081  0.04247881
## Coffeeshop_visit       -0.036558721  0.39909903 -0.026442561  0.18636450
## Buy_reason_1           -0.163248010  0.03763563 -0.003691161 -0.23775154
## Buy_reason_2            0.089682274  0.08026788  0.008909056  0.09310075
## Buy_reason_3           -0.089533424  0.21194787  0.057261614 -0.13274297
## Local_owned_important  -0.009671652  0.11642900  0.645886392 -0.03134568
## Organic_Important      -0.184747893 -0.02693577 -0.140793209  0.15851543
## Selection_Important     0.385932343  0.42869891  0.102838111 -0.18912081
## Social_cause_important  0.498209014 -0.31961928 -0.391558958  0.20791125
## Enviroment_important   -0.539155725 -0.28926403 -0.075967570 -0.14757108
## Drive thru_important   -0.194092336  0.08584015  0.001137816  0.41519315
## Fast Service_important  0.056044403  0.14132392 -0.346667512 -0.48083004
## Non-coffee_purchases    0.146190235 -0.05061233 -0.082293095  0.08379117
# Dimensions of the score matrix (rows = respondents, columns = components).
dim(pca[["x"]])
## [1] 28 20
# Biplot of the first two components.
biplot(pca, scale = 0)

# Negate loadings and scores together, then redraw: the components are the
# same, only the orientation of the axes is mirrored.
pca$rotation <- -pca$rotation
pca$x <- -pca$x
biplot(pca, scale = 0)

# Standard deviation of each principal component.
pca[["sdev"]]
##  [1] 2.0529879 1.7193528 1.5565201 1.4222525 1.1574714 1.1394458 1.0415987
##  [8] 0.9746165 0.8477479 0.7977981 0.7702227 0.7035045 0.5621017 0.5431589
## [15] 0.4719661 0.4049420 0.3188104 0.2941911 0.2296571 0.1693139
# Variance of each component is the square of its standard deviation.
pca.var <- pca[["sdev"]]^2
pca.var
##  [1] 4.21475925 2.95617418 2.42275494 2.02280223 1.33973996 1.29833662
##  [7] 1.08492777 0.94987733 0.71867654 0.63648174 0.59324295 0.49491851
## [13] 0.31595829 0.29502164 0.22275196 0.16397804 0.10164008 0.08654839
## [19] 0.05274241 0.02866719
# Proportion of the total variance explained by each component.
pve <- prop.table(pca.var)
pve
##  [1] 0.210737962 0.147808709 0.121137747 0.101140112 0.066986998 0.064916831
##  [7] 0.054246388 0.047493866 0.035933827 0.031824087 0.029662148 0.024745926
## [13] 0.015797914 0.014751082 0.011137598 0.008198902 0.005082004 0.004327419
## [19] 0.002637120 0.001433359
# Scree plot: proportion of variance explained per component.
plot(
  pve,
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)

# Running total of the variance explained by the first k components.
plot(
  cumsum(pve),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)

# Export the component scores together with the k-means cluster labels.
# NOTE(review): pca_data was built before the sign flip applied to `pca`, so
# the exported scores keep the original orientation.
write.csv(pca_data, file = "pca_data.csv")

References

Cluster analysis - reading (p.385-p.399) https://www.statlearning.com/

Hint: you can download the free version of this book from this website.

Comparison of similarity coefficients used for cluster analysis with dominant markers in maize (Zea mays L) https://www.scielo.br/scielo.php?script=sci_arttext&pid=S1415-47572004000100014&lng=en&nrm=iso

Principal Component Methods in R: Practical Guide http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/

Principal component analysis - reading (p.404-p.405) https://www.statlearning.com/

Hint: you can download the free version from this website.

Principal Component Methods in R: Practical Guide http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/

https://online.stat.psu.edu/stat505/lesson/11/11.4