# Descriptive Analysis and K-means Clustering

library(tidytext)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
library(textdata)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tm)
library(dplyr)
library(SentimentAnalysis)
library(gutenbergr)
library(stringr)
library(plyr)
library(ggcorrplot)
library(factoextra)
library(cluster)
library(ggplot2)

# Mean and standard deviation of income for each `employ` group.
# The dplyr verbs are namespace-qualified on purpose: plyr is attached after
# dplyr at the top of the script, so a bare `summarize()` resolves to
# plyr::summarize(), which ignores the grouping and returns a single
# overall row instead of one row per group.
summary <- data %>%
  dplyr::group_by(employ) %>%
  dplyr::summarise(
    mean = mean(income, na.rm = TRUE),
    std_dev = sd(income, na.rm = TRUE)
  )
summary

# Histograms of income, size and age, one plot per column.
# The title and x-axis label are spelled out so they read exactly as
# hist() would generate them for a `data$<column>` argument.
for (column in c("income", "size", "age")) {
  column_label <- paste0("data$", column)
  hist(
    data[[column]],
    main = paste("Histogram of", column_label),
    xlab = column_label
  )
}

# Pairwise correlations between the numeric columns of `data`, plotted with
# ggcorrplot with the coefficients printed on the plot (lab = TRUE).
# vapply() always yields a logical vector regardless of what `data` holds,
# and drop = FALSE keeps a data frame even when only one numeric column
# exists (otherwise `[` collapses it to a vector and cor() fails).
store_numeric <- names(which(vapply(data, is.numeric, logical(1))))
corr <- cor(
  data[, store_numeric, drop = FALSE],
  use = "pairwise.complete.obs" # use every complete pair of observations
)
ggcorrplot(corr, lab = TRUE)

# Bar chart of the number of rows per location, each bar labelled with its
# own count.
# The label is mapped to the count computed by stat = "count". The earlier
# mapping `label = sum(income)` was evaluated once on the raw data rather
# than per bar and came out as NA (presumably because income contains
# missing values), which is what triggered the warning
# "Removed 4 rows containing missing values (geom_text)".
ggplot(data, aes(x = factor(location), fill = factor(location))) +
  geom_bar() +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count", vjust = 1.5, colour = "white"
  )

# Average `size` per location.
# na.rm = TRUE matches the other aggregates in this script, so a single
# missing value does not turn a whole group's mean into NA.
plyr::ddply(
  data, .(location), plyr::summarize,
  Average_Size = mean(size, na.rm = TRUE)
)

# Average income per `employ` group, ignoring missing incomes.
plyr::ddply(
  data, .(employ), plyr::summarize,
  Average_Income = mean(income, na.rm = TRUE)
)

# Scatter plot of income against age, coloured by `employ_txt`.
# Columns are referenced by bare name inside aes() so ggplot2 looks them up
# in `data_01`; writing `data_01$age` bypasses the data argument, is
# discouraged by ggplot2 and puts "data_01$..." in the axis/legend titles.
# NOTE(review): `data_01` is not created in the visible part of this script.
ggplot(data_01, aes(x = age, y = income, colour = employ_txt)) +
  geom_point()

# Scatter plot of income against age, coloured by `location_txt`.
# Bare column names are used inside aes() instead of `data_01$...`, which
# ggplot2 discourages and which would leak "data_01$..." into the
# axis/legend titles.
ggplot(data_01, aes(x = age, y = income, colour = location_txt)) +
  geom_point()

# Keep only complete rows, then centre and scale every column
# (scale() defaults); the result is a numeric matrix.
df <- data %>%
  na.omit() %>%
  scale()

# Preview the first rows of the standardised matrix.
head(df)

##      homeshop   storeshop     employ     income   location       size
## 2 -0.06071799  0.36367948  1.2858059 0.07248169 -0.2998623 -0.2390601
## 3 -0.43912469  0.45293994  1.5632869 1.55888013 -0.2998623  1.3419836
## 4 -0.12652785  0.02448973  1.2858059 0.36387987  2.3547180 -0.2390601
## 5 -0.76817400 -1.31441715 -0.9340418 0.52957264  2.3547180  2.9230272
## 6 -0.75172153 -1.46615993 -0.9340418 0.28196192  2.3547180 -0.2390601
## 9 -1.08077084 -1.08233996  1.2858059 0.37519379  2.3547180 -0.2390601
##          age
## 2  0.6101992
## 3 -1.7123288
## 4  0.7801402
## 5 -0.1262121
## 6  0.5535521
## 9  0.6101992

#fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Partition the standardised data into two clusters with k-means.
# kmeans() draws its starting centres at random (25 restarts via nstart),
# so the RNG is seeded to make the fit, and which cluster ends up numbered
# 1 or 2, repeatable from run to run.
set.seed(123)
k2 <- kmeans(df, centers = 2, nstart = 25)

# Show the components of the fitted object.
str(k2)

## List of 9
##  $ cluster     : Named int [1:1412] 1 2 1 2 2 1 1 2 2 2 ...
##   ..- attr(*, "names")= chr [1:1412] "2" "3" "4" "5" ...
##  $ centers     : num [1:2, 1:7] 0.1719 -0.0981 0.2258 -0.1288 1.0273 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:7] "homeshop" "storeshop" "employ" "income" ...
##  $ totss       : num 9877
##  $ withinss    : num [1:2] 2369 5257
##  $ tot.withinss: num 7626
##  $ betweenss   : num 2251
##  $ size        : int [1:2] 513 899
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

# Plot the two-cluster solution for the observations in `df`.
factoextra::fviz_cluster(object = k2, data = df)

# Descriptive Analysis and K-means Clustering

# Javed Iqbal

# 3/9/2022