Data Cleaning

Adding descriptions for coded variables

I am adding in descriptions so it will be easier to read the output of the model

#cleaning variables
#define weekday
fbook_clean = 
  fbook %>%
  mutate(
    #Category
    dsc_category = 
      case_when(
        Category == 1 ~ "Action",
        Category == 2 ~ "Product", 
        Category == 3 ~ "Inspiration",
        TRUE ~ as.character(Category)
      ),
    dsc_month = 
      case_when( 
        Post.Month == 1 ~ "January",
        Post.Month == 2 ~ "February", 
        Post.Month == 3 ~ "March",
        Post.Month == 4 ~ "April",
        Post.Month == 5 ~ "May",
        Post.Month == 6 ~ "June",
        Post.Month == 7 ~ "July",
        Post.Month == 8 ~ "August",
        Post.Month == 9 ~ "September",
        Post.Month == 10 ~ "October",
        Post.Month == 11 ~ "November",
        Post.Month == 12 ~ "December",
        TRUE ~ as.character(Post.Month)
      ),
        dsc_weekday = 
      case_when(
        Post.Weekday == 1 ~ "Sunday",
        Post.Weekday == 2 ~ "Monday",
        Post.Weekday == 3 ~ "Tuesday",
        Post.Weekday == 4 ~ "Wednesday",
        Post.Weekday == 5 ~ "Thursday",
        Post.Weekday == 6 ~ "Friday",
        Post.Weekday == 7 ~ "Saturday",
        TRUE ~ as.character(Post.Weekday)
      )

  )

# going to check to make sure the descriptions I created are correct
fbook_clean %>%
  dplyr::select(Category, dsc_category) %>%
  distinct() %>%
  arrange(Category)
##   Category dsc_category
## 1        1       Action
## 2        2      Product
## 3        3  Inspiration
fbook_clean %>% 
  dplyr::select(Post.Month, dsc_month) %>%
  distinct() %>% #only the unique pairs selected
  arrange(Post.Month)
##    Post.Month dsc_month
## 1           1   January
## 2           2  February
## 3           3     March
## 4           4     April
## 5           5       May
## 6           6      June
## 7           7      July
## 8           8    August
## 9           9 September
## 10         10   October
## 11         11  November
## 12         12  December
fbook_clean %>%
  dplyr::select(Post.Weekday, dsc_weekday) %>%
  distinct() %>%
  arrange(Post.Weekday)
##   Post.Weekday dsc_weekday
## 1            1      Sunday
## 2            2      Monday
## 3            3     Tuesday
## 4            4   Wednesday
## 5            5    Thursday
## 6            6      Friday
## 7            7    Saturday

Changing data types

Change Type to factor for linear regression

head(fbook)
##   Page.total.likes   Type Category Post.Month Post.Weekday Post.Hour Paid
## 1           139441  Photo        2         12            4         3    0
## 2           139441 Status        2         12            3        10    0
## 3           139441  Photo        3         12            3         3    0
## 4           139441  Photo        2         12            2        10    1
## 5           139441  Photo        2         12            2         3    0
## 6           139441 Status        2         12            1         9    0
##   Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1                      2752                            5091
## 2                     10460                           19057
## 3                      2413                            4373
## 4                     50128                           87991
## 5                      7244                           13594
## 6                     10472                           20849
##   Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1                    178                     109                        159
## 2                   1457                    1361                       1674
## 3                    177                     113                        154
## 4                   2211                     790                       1119
## 5                    671                     410                        580
## 6                   1191                    1073                       1389
##   Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1                                                         3078
## 2                                                        11710
## 3                                                         2812
## 4                                                        61027
## 5                                                         6228
## 6                                                        16034
##   Lifetime.Post.reach.by.people.who.like.your.Page
## 1                                             1640
## 2                                             6112
## 3                                             1503
## 4                                            32048
## 5                                             3200
## 6                                             7852
##   Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1                                                                 119       4
## 2                                                                1108       5
## 3                                                                 132       0
## 4                                                                1386      58
## 5                                                                 396      19
## 6                                                                1016       1
##   like share Total.Interactions
## 1   79    17                100
## 2  130    29                164
## 3   66    14                 80
## 4 1572   147               1777
## 5  325    49                393
## 6  152    33                186
#using str(fbook_clean) in the terminal to see what needs to be changed and its the features I just made above
fbook_clean$Type = as.factor(fbook_clean$Type)
fbook_clean$dsc_weekday = factor(fbook_clean$dsc_weekday, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
fbook_clean$dsc_month = factor(fbook_clean$dsc_month,
                                  levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))
fbook_clean$dsc_category = as.factor(fbook_clean$dsc_category)
# making post hour and paid factors too
fbook_clean$Post.Hour = as.factor(fbook_clean$Post.Hour)
fbook_clean$Paid = as.factor(fbook_clean$Paid)


str(fbook_clean) #success!
## 'data.frame':    490 obs. of  22 variables:
##  $ Page.total.likes                                                   : int  139441 139441 139441 139441 139441 139441 139441 139441 139441 139441 ...
##  $ Type                                                               : Factor w/ 4 levels "Link","Photo",..: 2 3 2 2 2 3 2 2 3 2 ...
##  $ Category                                                           : int  2 2 3 2 2 2 3 3 2 3 ...
##  $ Post.Month                                                         : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ Post.Weekday                                                       : int  4 3 3 2 2 1 1 7 7 6 ...
##  $ Post.Hour                                                          : Factor w/ 22 levels "1","2","3","4",..: 3 10 3 10 3 9 3 9 3 10 ...
##  $ Paid                                                               : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 2 1 1 ...
##  $ Lifetime.Post.Total.Reach                                          : int  2752 10460 2413 50128 7244 10472 11692 13720 11844 4694 ...
##  $ Lifetime.Post.Total.Impressions                                    : int  5091 19057 4373 87991 13594 20849 19479 24137 22538 8668 ...
##  $ Lifetime.Engaged.Users                                             : int  178 1457 177 2211 671 1191 481 537 1530 280 ...
##  $ Lifetime.Post.Consumers                                            : int  109 1361 113 790 410 1073 265 232 1407 183 ...
##  $ Lifetime.Post.Consumptions                                         : int  159 1674 154 1119 580 1389 364 305 1692 250 ...
##  $ Lifetime.Post.Impressions.by.people.who.have.liked.your.Page       : int  3078 11710 2812 61027 6228 16034 15432 19728 15220 4309 ...
##  $ Lifetime.Post.reach.by.people.who.like.your.Page                   : int  1640 6112 1503 32048 3200 7852 9328 11056 7912 2324 ...
##  $ Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post: int  119 1108 132 1386 396 1016 379 422 1250 199 ...
##  $ comment                                                            : int  4 5 0 58 19 1 3 0 0 3 ...
##  $ like                                                               : int  79 130 66 1572 325 152 249 325 161 113 ...
##  $ share                                                              : int  17 29 14 147 49 33 27 14 31 26 ...
##  $ Total.Interactions                                                 : int  100 164 80 1777 393 186 279 339 192 142 ...
##  $ dsc_category                                                       : Factor w/ 3 levels "Action","Inspiration",..: 3 3 2 3 3 3 2 2 3 2 ...
##  $ dsc_month                                                          : Factor w/ 12 levels "January","February",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ dsc_weekday                                                        : Factor w/ 7 levels "Sunday","Monday",..: 4 3 3 2 2 1 1 7 7 6 ...

Exploratory Data Analysis

Missing Values

Checking for missing values in the 7 input features

sum(is.na(fbook_clean))
## [1] 6
sapply(fbook_clean, function(x) sum(is.na(x)))
##                                                    Page.total.likes 
##                                                                   0 
##                                                                Type 
##                                                                   0 
##                                                            Category 
##                                                                   0 
##                                                          Post.Month 
##                                                                   0 
##                                                        Post.Weekday 
##                                                                   0 
##                                                           Post.Hour 
##                                                                   0 
##                                                                Paid 
##                                                                   1 
##                                           Lifetime.Post.Total.Reach 
##                                                                   0 
##                                     Lifetime.Post.Total.Impressions 
##                                                                   0 
##                                              Lifetime.Engaged.Users 
##                                                                   0 
##                                             Lifetime.Post.Consumers 
##                                                                   0 
##                                          Lifetime.Post.Consumptions 
##                                                                   0 
##        Lifetime.Post.Impressions.by.people.who.have.liked.your.Page 
##                                                                   0 
##                    Lifetime.Post.reach.by.people.who.like.your.Page 
##                                                                   0 
## Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post 
##                                                                   0 
##                                                             comment 
##                                                                   0 
##                                                                like 
##                                                                   1 
##                                                               share 
##                                                                   4 
##                                                  Total.Interactions 
##                                                                   0 
##                                                        dsc_category 
##                                                                   0 
##                                                           dsc_month 
##                                                                   0 
##                                                         dsc_weekday 
##                                                                   0
fbook_clean %>%
  filter(is.na(share)) #assume this is a NA b/c its 0, no share so plan to change this to 0.... 
##   Page.total.likes  Type Category Post.Month Post.Weekday Post.Hour Paid
## 1           136736 Photo        1         10            6         8    0
## 2           136393 Photo        1         10            7         9    0
## 3           136393 Photo        1         10            7         6    0
## 4           135428 Photo        1          9            5        10    0
##   Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1                      1261                            2158
## 2                       584                            1029
## 3                       677                            1285
## 4                      1060                            2004
##   Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1                     37                      37                         49
## 2                    273                     271                        308
## 3                    251                     246                        297
## 4                    266                     251                        337
##   Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1                                                         1911
## 2                                                          943
## 3                                                         1210
## 4                                                         1705
##   Lifetime.Post.reach.by.people.who.like.your.Page
## 1                                             1077
## 2                                              511
## 3                                              615
## 4                                              870
##   Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1                                                                  33       0
## 2                                                                 232       0
## 3                                                                 211       0
## 4                                                                 204       0
##   like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1   NA    NA                  0       Action   October      Friday
## 2    2    NA                  2       Action   October    Saturday
## 3    7    NA                  7       Action   October    Saturday
## 4   18    NA                 18       Action September    Thursday
fbook_clean %>%
  filter(is.na(like)) #assume this is a NA b/c its 0, no like so plan to change this to 0.... 
##   Page.total.likes  Type Category Post.Month Post.Weekday Post.Hour Paid
## 1           136736 Photo        1         10            6         8    0
##   Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1                      1261                            2158
##   Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1                     37                      37                         49
##   Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1                                                         1911
##   Lifetime.Post.reach.by.people.who.like.your.Page
## 1                                             1077
##   Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1                                                                  33       0
##   like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1   NA    NA                  0       Action   October      Friday
fbook_clean %>%
  filter(is.na(Paid)) #we can drop this one... but need to figure out how,, maybe w/ index
##   Page.total.likes  Type Category Post.Month Post.Weekday Post.Hour Paid
## 1            81370 Photo        2          1            4         4 <NA>
##   Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1                      4188                            7292
##   Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1                    564                     524                        743
##   Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1                                                         3861
##   Lifetime.Post.reach.by.people.who.like.your.Page
## 1                                             2200
##   Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post comment
## 1                                                                 316       0
##   like share Total.Interactions dsc_category dsc_month dsc_weekday
## 1   91    28                119      Product   January   Wednesday

Here I’m going to create a new dataset, fbook_clean2, with 0’s replacing the NAs in share and like, and removing the record that has NA in Paid

fbook_clean2 = 
  fbook_clean %>%
  filter(!is.na(Paid))

fbook_clean2[is.na(fbook_clean2)] = 0

# this is checking that there are no more nulls
sum(is.na(fbook_clean2))
## [1] 0

Exploratory Data Analysis: Total Reach

To start I’m going to create a dataset with only the input variables and the output variable of interest, Total Reach

fbook_clean2_reach = 
  fbook_clean2 %>%
  dplyr::select(Lifetime.Post.Total.Reach, Page.total.likes, Type, dsc_category, dsc_month, dsc_weekday, Post.Hour, Paid)

Outliers

Since there’s only one continuous variable (Page.total.likes) in the input variables, I am only going to look for outliers in this variable.

Starting by looking visually

fbook_clean2_reach %>%
  ggplot(aes(Page.total.likes)) +
  geom_boxplot() +
  theme_classic()

There doesn’t appear to be any outliers in Page.total.likes based on the boxplot above.

Scatterplot

Using a scatterplot to examine the relationship between total reach and page total likes

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = Page.total.likes)) +
  geom_point() +
  theme_classic()

Interesting to see that in general, posts don’t have a high reach depending on how many total likes a page has.

Box plots

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = Type, fill  = Type)) +
  geom_boxplot() +
  theme_classic()

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_category, fill = dsc_category)) +
  geom_boxplot() +
  theme_classic()

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_month, fill = dsc_month)) +
  geom_boxplot() +
  theme_classic()

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = dsc_weekday, fill = dsc_weekday)) +
  geom_boxplot() +
  theme_classic()

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = Post.Hour, fill= Post.Hour)) +
  geom_boxplot() +
  theme_classic()

fbook_clean2_reach %>%
  ggplot(aes(y = Lifetime.Post.Total.Reach, x = Paid, fill = Paid)) +
  geom_boxplot() + 
  theme_classic()

Type of post shows that “Photo” has the larger spread when it comes to reach and “Video” is likely to have a high reach. The category that the post is classified as does not seem to largely influence its reach but generally speaking Action does better than Product. The time features like month, day, and hour may not have high importance in the reach due to the likely hood of a post being created during waking hours and various shopping season, basically Life.time.post.total.reach covers so much time, the orginal posting time shouldnt influence the post’s reach down the road. Last graph shows whether the post has been paid for or not, and shows that paid post do in fact have a larger reach.

After exploring total reach, it appears that total reach has an interation with whether its a paid post or not. Besides this, there was not a whole lot to futher explore therefore I am going to look at Engaged Users below.

Exploratory Data Analysis: Engaged Users

To start, I’m going to create a dataset with only the input variables and the output variable of interest, Total Reach

fbook_clean2_engaged= 
  fbook_clean2 %>%
  dplyr::select(Lifetime.Engaged.Users, Page.total.likes, Type, dsc_category, dsc_month, dsc_weekday, Post.Hour, Paid)

Outliers

Since there’s only one continuous variable (Page.total.likes) in the input variables, I am only going to look for outliers in this variable.

Starting by looking visually

fbook_clean2_engaged %>%
  ggplot(aes(Lifetime.Engaged.Users)) +
  geom_boxplot() +
  theme_classic()

The above boxplot appears to be right skewed

Scatterplot

Using a scatterplot to examine the relationship between engaged users and page total likes

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = Page.total.likes)) +
  geom_point() +
  theme_classic()

Interesting to see that in general, posts don’t have a high reach depending on how many total likes a page has.

Box plots

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = Type, fill = Type)) +
  geom_boxplot() +
  theme_classic()

Engaged Users and Type of post show that Status has the greatest engaged users while link is the least.

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_category, fill = dsc_category)) +
  geom_boxplot() +
  theme_classic()

Engeged Users and Category of post show that product does well. Interesting because product and lifetime.post.reach did not show a strong interation suggesting that reach may not be a great metirc for investment if the goal is to increase engagement.

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_month, fill = dsc_month)) +
  geom_boxplot() + theme_classic()

Engagment by month show February as quite the standout month with overall high engagment.

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = dsc_weekday, fill = dsc_weekday)) +
  geom_boxplot() +
  theme_classic()

Tuesday and Thursday is the best day to post if engaged users is your goal however the lifetime engament seem to be heavily dependent on the day of the week. This could be due to more people working and living non tradional schedules even at the time of this data collection.

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = Post.Hour, fill = Post.Hour)) +
  geom_boxplot() +
  theme_classic()

Post with good engagment is seen during the early AM, mid day, and late afternoon. Late night has very little spread in the data.

fbook_clean2_engaged %>%
  ggplot(aes(y = Lifetime.Engaged.Users, x = Paid, fill = Paid)) +
  geom_boxplot() +
  theme_classic()

Paying to promote a post does not seem to influence whether people will engage with the post or not. Interesting finding due to previous exploration that showed a paid post having greater reach. This suggest that if the goal is to reach people, pay to promote the post while on the other hand, if the goal of the social media content is to have engagment (think influencer) then the content quality should be the main focus. Another intersting finding previously was that the category action did best with reach and product was the least. if the post’s goal is to show a product and get reach, the post should be a picture of something pretty rad and action packed with the product possibly as a subject or be in use by the subject.

K means

setup for K means

## install packages if necessary
# install.packages("cluster")
# install.packages("factoextra")
# install.packages("purrr")
# install.packages("NbClust")

library(cluster)
## Warning: package 'cluster' was built under R version 4.0.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(purrr)
## Warning: package 'purrr' was built under R version 4.0.3
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.0.3
fbook_clean2_output_var = 
  fbook_clean2 %>%
  dplyr::select(
    Lifetime.Post.Total.Reach, 
    Lifetime.Post.Total.Impressions, 
    Lifetime.Engaged.Users, 
    Lifetime.Post.Consumers, 
    Lifetime.Post.Consumptions,
    Lifetime.Post.Impressions.by.people.who.have.liked.your.Page,
    Lifetime.Post.reach.by.people.who.like.your.Page,
    Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post, 
    comment, like, share, Total.Interactions)

Choosing the K

set.seed(123)

fviz_nbclust(fbook_clean2_output_var, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

Going with 4 clusters

K means

km_results = 
  kmeans(fbook_clean2_output_var, 4)

print(km_results)
## K-means clustering with 4 clusters of sizes 80, 386, 21, 2
## 
## Cluster means:
##   Lifetime.Post.Total.Reach Lifetime.Post.Total.Impressions
## 1                 28044.550                       47547.562
## 2                  5289.347                        8864.459
## 3                 60027.429                      135393.952
## 4                 52576.000                      888037.000
##   Lifetime.Engaged.Users Lifetime.Post.Consumers Lifetime.Post.Consumptions
## 1               1690.688                1473.500                   2319.200
## 2                630.987                 564.544                   1069.142
## 3               1788.000                1330.810                   2454.667
## 4               3098.500                2533.000                   4595.500
##   Lifetime.Post.Impressions.by.people.who.have.liked.your.Page
## 1                                                    26516.250
## 2                                                     6320.744
## 3                                                    67814.524
## 4                                                   878222.000
##   Lifetime.Post.reach.by.people.who.like.your.Page
## 1                                        13718.000
## 2                                         3508.518
## 3                                        23778.286
## 4                                        47928.000
##   Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post   comment
## 1                                                           1191.5250 13.525000
## 2                                                            435.6684  4.310881
## 3                                                           1109.6667 16.047619
## 4                                                           2471.5000 76.500000
##       like    share Total.Interactions
## 1 314.5125 40.60000           368.6375
## 2 105.9301 20.16580           130.4067
## 3 570.9048 50.52381           637.4762
## 4 886.5000 98.00000          1061.0000
## 
## Clustering vector:
##   [1] 2 2 2 3 2 2 2 2 2 2 1 2 2 2 1 2 2 3 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2
##  [38] 2 1 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2 3 1 1 2 2 2 1 2 2 3 2 1 1
##  [75] 1 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 1 2 2
## [112] 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 1 1 3 2 1 2 2
## [149] 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 2 2 2 1 2 1 2 2
## [186] 2 2 2 2 2 1 2 2 1 2 2 2 2 1 2 2 2 3 2 2 3 2 2 2 1 2 2 2 2 1 2 1 1 2 2 2 2
## [223] 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 2 2 2 2 2 1 2 1 1 2 1 1 2 2 2
## [260] 1 2 2 1 2 2 2 2 2 1 3 3 2 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 3 2 1 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 3 2 2 2 2 1 2 1 2 2 2 2 2 2
## [334] 2 2 1 2 2 2 2 2 2 1 2 2 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 1 2
## [371] 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 3 2 2 2 1 1 3 2 2 2 2 1 2 2 2 2
## [408] 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 1 2
## [445] 2 2 2 2 2 1 2 2 4 2 2 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2
## [482] 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1]  53391958944  30457556711  89059189719 204264499799
##  (between_SS / total_SS =  90.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
fviz_cluster(km_results, data = fbook_clean2_output_var, 
             geom = "point",
             ellipse.type = "convex", 
             ggtheme = theme_bw()
             )

there is a lot of overlap in this cluster.