library(tidytext)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
library(textdata)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tm)
library(dplyr)
library(SentimentAnalysis)
library(gutenbergr)
library(stringr)
library(plyr)
library(ggcorrplot)
library(factoextra)
library(cluster)
library(ggplot2)
# Mean and standard deviation of income within each employment group.
#
# The dplyr verbs are namespaced on purpose: plyr is attached after dplyr at
# the top of this script, so a bare summarize() resolves to plyr::summarize(),
# which ignores group_by() and collapses the data into one overall row
# instead of one row per `employ` value.
#
# The result keeps its original name `summary`; base::summary() can still be
# called as a function while this object exists.
summary <- data %>%
  dplyr::group_by(employ) %>%
  dplyr::summarise(
    mean = mean(income, na.rm = TRUE),
    std_dev = sd(income, na.rm = TRUE)
  )
summary
# Histograms of income, size and age, drawn in that order.
# The title and x-axis label are spelled out so that each plot carries the
# same "data$<column>" text that hist() derives on its own from a
# `data$<column>` argument.
for (column in c("income", "size", "age")) {
  expr_label <- paste0("data$", column)
  hist(
    data[[column]],
    main = paste("Histogram of", expr_label),
    xlab = expr_label
  )
}

# Correlation heat map over every numeric column of `data`.
#
# vapply() guarantees a logical vector (sapply() can silently change its
# return type), and drop = FALSE keeps the selection a data frame even when
# only one numeric column exists, so cor() always receives 2-d input.
store_numeric <- names(data)[vapply(data, is.numeric, logical(1))]

# Pairwise deletion: each coefficient uses all rows where both columns are
# observed, rather than discarding every row that has any NA.
corr <- cor(data[, store_numeric, drop = FALSE], use = "pairwise.complete.obs")

# lab = TRUE prints the coefficient inside each tile.
ggcorrplot(corr, lab = TRUE)

# Number of observations per location, each bar labelled with its count.
#
# The label must come from the statistic computed by stat = "count".
# The previous label, sum(income), was a single value computed over the whole
# data set and evaluates to NA as soon as income contains a missing value, so
# every label was dropped with a "Removed rows containing missing values
# (geom_text)" warning and the bars were left unlabelled.
# NOTE(review): if total income per location was the intended label, it has
# to be aggregated per location before plotting -- confirm.
ggplot(data, aes(x = factor(location), fill = factor(location))) +
  geom_bar() +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    vjust = 1.5,      # push the text just below the top edge of the bar
    colour = "white"
  )

# Average size per location and average income per employment group.
#
# Both means drop missing values, matching the other income aggregates in
# this script; without na.rm = TRUE a single NA in `size` would turn that
# location's average into NA.
# summarize is pinned to plyr so the result does not depend on whether plyr
# or dplyr was attached last.
ddply(
  data, .(location), plyr::summarize,
  Average_Size = mean(size, na.rm = TRUE)
)
ddply(
  data, .(employ), plyr::summarize,
  Average_Income = mean(income, na.rm = TRUE)
)
# Income against age, coloured first by employment label, then by location
# label. `data_01` is not created in this part of the script; it is expected
# to already exist with columns age, income, employ_txt and location_txt.
#
# Columns are referenced by bare name inside aes(). Writing data_01$age there
# bypasses the `data` argument (ggplot2 warns that this use is discouraged),
# breaks as soon as the plot is faceted or given different data, and produces
# axis/legend titles such as "data_01$age".
ggplot(data_01, aes(x = age, y = income, colour = employ_txt)) +
  geom_point()

ggplot(data_01, aes(x = age, y = income, colour = location_txt)) +
  geom_point()

# Clustering input: drop every row of `data` that contains a missing value,
# then centre and scale each column (scale() defaults). The result is a
# numeric matrix, not a data frame.
df <- scale(na.omit(data))

# Show the first six standardised rows.
head(df, n = 6L)
## homeshop storeshop employ income location size
## 2 -0.06071799 0.36367948 1.2858059 0.07248169 -0.2998623 -0.2390601
## 3 -0.43912469 0.45293994 1.5632869 1.55888013 -0.2998623 1.3419836
## 4 -0.12652785 0.02448973 1.2858059 0.36387987 2.3547180 -0.2390601
## 5 -0.76817400 -1.31441715 -0.9340418 0.52957264 2.3547180 2.9230272
## 6 -0.75172153 -1.46615993 -0.9340418 0.28196192 2.3547180 -0.2390601
## 9 -1.08077084 -1.08233996 1.2858059 0.37519379 2.3547180 -0.2390601
## age
## 2 0.6101992
## 3 -1.7123288
## 4 0.7801402
## 5 -0.1262121
## 6 0.5535521
## 9 0.6101992
#fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# Two-cluster k-means on the standardised matrix `df`.
#
# kmeans() picks its starting centres at random (25 random starts here), so
# the seed is fixed to make the cluster numbering and membership reproducible
# from run to run.
# NOTE(review): any output recorded before the seed was added may show the
# two cluster labels swapped relative to a fresh run.
set.seed(123)
k2 <- kmeans(df, centers = 2, nstart = 25)

# Inspect the fitted object (assignments, centres, sums of squares, sizes).
str(k2)
## List of 9
## $ cluster : Named int [1:1412] 1 2 1 2 2 1 1 2 2 2 ...
## ..- attr(*, "names")= chr [1:1412] "2" "3" "4" "5" ...
## $ centers : num [1:2, 1:7] 0.1719 -0.0981 0.2258 -0.1288 1.0273 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:7] "homeshop" "storeshop" "employ" "income" ...
## $ totss : num 9877
## $ withinss : num [1:2] 2369 5257
## $ tot.withinss: num 7626
## $ betweenss : num 2251
## $ size : int [1:2] 513 899
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Plot the two-cluster solution `k2` over the observations in `df`.
fviz_cluster(
  object = k2,
  data = df
)
