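# We changed echo=TRUE to error=TRUE in this step.
# The purpose is to skip error checking when the R Markdown document is
# compiled, so knitting continues even if a chunk raises an error.
# Reference:
# https://stackoverflow.com/questions/34280043/how-to-skip-error-checking-at-rmarkdown-compiling
# Note: keep the URL above on a single comment line. When it was wrapped, the
# fragment `rmarkdown-compiling` was evaluated as R code and produced:
## Error: object 'rmarkdown' not found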
#install.packages('dplyr')
library(dplyr) # sane data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # sane data munging
library(ggplot2) # needs no introduction
library(ggfortify) # super-helpful for plotting non-"standard" stats objects
# Show the working directory; the relative CSV path below is resolved
# against it, so confirm it is the folder that holds the data file.
getwd()
## [1] "/cloud/project"
library(readr)
# Read the survey data from the csv file in the working directory into
# `mydata`.
mydata <- read_csv(file = "Survey Data Set.csv")
## Rows: 20 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): Invest_stocks, Risky_invest, Comfortable_volatility, Own_money, Com...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# NOTE(review): the original note says the data can be read from the author's
# GitHub site, but this call reads a local file - confirm the file has been
# downloaded into the working directory first.
# Opening the data: some students will see an Excel option under
# "Import Dataset"; those who do not will need to save the original data as a
# csv and import that as a text file.
# rm(list = ls()) # optional: clears your working environment
# ---- k-means clustering, PCA projection, plots and export ----

# Exclude the first column since it is "id" rather than a factor/variable.
survey_vars <- mydata[, -1]

# k-means starts from randomly chosen centres, so without a fixed seed the
# cluster labels (and possibly the memberships) change on every run.
# set.seed() makes the result reproducible; nstart = 25 keeps the best of 25
# random starts instead of trusting a single, possibly poor, one.
set.seed(123)
# centers = 3 means you want to have 3 clusters.
fit <- kmeans(survey_vars, centers = 3, iter.max = 1000, nstart = 25)

# Number of observations assigned to each cluster.
table(fit$cluster)
##
## 1 2 3
## 8 8 4
# (The output above was recorded from an earlier, unseeded run; cluster
# numbering and sizes may differ from the seeded run.)
barplot(table(fit$cluster), col = "#336699") # plot the cluster sizes

pca <- prcomp(survey_vars) # principal component analysis
# Convert the PCA result to a data frame and append each observation's
# cluster membership as the last column, `col`.
pca_data <- mutate(fortify(pca), col = fit$cluster)

# Scatter plot of the first two principal components, filled by cluster.
# print() is explicit so the plot is also drawn when the script is run with
# source(), where top-level ggplot objects are not auto-printed.
print(
  ggplot(pca_data) +
    geom_point(
      aes(x = PC1, y = PC2, fill = factor(col)),
      size = 3, colour = "#7f7f7f", shape = 21
    ) +
    theme_bw(base_family = "Helvetica")
)

# Cluster plot with a normal-ellipse frame drawn around each cluster.
print(autoplot(fit, data = survey_vars, frame = TRUE, frame.type = "norm"))

# Save your cluster solutions in the working directory.
# The result is written to its own file: the original code wrote to
# "Survey Data Set.csv", which overwrote the input data read at the top of
# the script and corrupted every later run.
write.csv(pca_data, "Survey Data Set Clusters.csv")
# To examine the cluster memberships for each observation, see the last
# column (`col`) of pca_data.