Visualizing Heirarchical and Latent Class Clustering of Singapore’s 2018 media survey

Installing the packages

Loading the dataset

df<-read_csv("data/survey_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Area of Residence` = col_character(),
##   `Resident Status` = col_character(),
##   `Age Group` = col_character(),
##   Gender = col_character(),
##   Race = col_character(),
##   `Employment Status` = col_character(),
##   Occupation = col_character(),
##   `Occupation Group` = col_character(),
##   `Monthly Income from Work Group` = col_character(),
##   `Highest Qualification` = col_character(),
##   `Housing Type` = col_character(),
##   `Marital Status` = col_character(),
##   `Have Children` = col_character(),
##   `A1_1 of Residents` = col_character(),
##   `A1_2 of Residents` = col_character(),
##   `A1_3 of Residents` = col_character(),
##   `A1_4 of Residents` = col_character(),
##   `A1_5 of Residents` = col_character(),
##   `A1_6 of Residents` = col_character(),
##   `A1_7 of Residents` = col_character()
##   # ... with 341 more columns
## )
## See spec(...) for full column specifications.

1. Data Wrangling

1.1.Segmenting the media categories to groups

tv_col <- c("A1_1 of Residents","A1_2 of Residents")
radio_col <- c("A1_3 of Residents", "A1_4 of Residents")
newspapers_magazines_print <- c("A1_5 of Residents", "A1_7 of Residents", 
                                "A1_9 of Residents", "A1_10 of Residents")
newspapers_magazines_digital <- c("A1_6 of Residents","A1_8 of Residents","A1_11 of Residents")
video_music_col <- c("A1_12 of Residents", "A1_13 of Residents")
socialmedia_col <- c("A1_14 of Residents")
messaging_col <- c("A1_15 of Residents")

1.2 Converting Categorical responses to Binary numeric values

df[df=="Mentioned"]<-1
df[df=="Not mentioned"]<-0

1.3 Subseting dataset for first set of responses

sub_1<-c("Age Group","Monthly Income from Work Group","A1_1 of Residents","A1_2 of Residents","A1_3 of Residents", "A1_4 of Residents","A1_5 of Residents", "A1_7 of Residents", 
                                "A1_9 of Residents", "A1_10 of Residents","A1_6 of Residents","A1_8 of Residents","A1_11 of Residents","A1_12 of Residents", "A1_13 of Residents","A1_14 of Residents","A1_15 of Residents")
df_1<-df[,sub_1]
head(df_1)
## # A tibble: 6 x 17
##   `Age Group` `Monthly Income~ `A1_1 of Reside~ `A1_2 of Reside~
##   <chr>       <chr>            <chr>            <chr>           
## 1 40-44       Refused to Answ~ 1                0               
## 2 60-64       S$1,500 - S$1,9~ 1                0               
## 3 35-39       S$5,000 - S$5,9~ 1                1               
## 4 35-39       S$11,000 - S$11~ 1                1               
## 5 50-54       S$4,500 - S$4,9~ 1                0               
## 6 25-29       S$10,000 - S$10~ 1                1               
## # ... with 13 more variables: `A1_3 of Residents` <chr>, `A1_4 of
## #   Residents` <chr>, `A1_5 of Residents` <chr>, `A1_7 of Residents` <chr>,
## #   `A1_9 of Residents` <chr>, `A1_10 of Residents` <chr>, `A1_6 of
## #   Residents` <chr>, `A1_8 of Residents` <chr>, `A1_11 of Residents` <chr>,
## #   `A1_12 of Residents` <chr>, `A1_13 of Residents` <chr>, `A1_14 of
## #   Residents` <chr>, `A1_15 of Residents` <chr>

1.3 Converting all the fields to

df_1$`A1_1 of Residents`<-as.integer(df_1$`A1_1 of Residents`)
df_1$`A1_2 of Residents`<-as.integer(df_1$`A1_2 of Residents`)
df_1$`A1_3 of Residents`<-as.integer(df_1$`A1_3 of Residents`)
df_1$`A1_4 of Residents`<-as.integer(df_1$`A1_4 of Residents`)
df_1$`A1_5 of Residents`<-as.integer(df_1$`A1_5 of Residents`)
df_1$`A1_6 of Residents`<-as.integer(df_1$`A1_6 of Residents`)
df_1$`A1_7 of Residents`<-as.integer(df_1$`A1_7 of Residents`)
df_1$`A1_8 of Residents`<-as.integer(df_1$`A1_8 of Residents`)
df_1$`A1_9 of Residents`<-as.integer(df_1$`A1_9 of Residents`)
df_1$`A1_10 of Residents`<-as.integer(df_1$`A1_10 of Residents`)
df_1$`A1_11 of Residents`<-as.integer(df_1$`A1_11 of Residents`)
df_1$`A1_12 of Residents`<-as.integer(df_1$`A1_12 of Residents`)
df_1$`A1_13 of Residents`<-as.integer(df_1$`A1_13 of Residents`)
df_1$`A1_14 of Residents`<-as.integer(df_1$`A1_14 of Residents`)
df_1$`A1_15 of Residents`<-as.integer(df_1$`A1_14 of Residents`)
df_1 %>%
   replace(is.na(.), 0)
## # A tibble: 1,047 x 17
##    `Age Group` `Monthly Income~ `A1_1 of Reside~ `A1_2 of Reside~
##    <chr>       <chr>                       <int>            <int>
##  1 40-44       Refused to Answ~                1                0
##  2 60-64       S$1,500 - S$1,9~                1                0
##  3 35-39       S$5,000 - S$5,9~                1                1
##  4 35-39       S$11,000 - S$11~                1                1
##  5 50-54       S$4,500 - S$4,9~                1                0
##  6 25-29       S$10,000 - S$10~                1                1
##  7 25-29       Refused to Answ~                0                0
##  8 70 and abo~ S$2,500 - S$2,9~                0                0
##  9 60-64       S$3,000 - S$3,4~                1                1
## 10 25-29       S$3,500 - S$3,9~                0                0
## # ... with 1,037 more rows, and 13 more variables: `A1_3 of Residents` <int>,
## #   `A1_4 of Residents` <int>, `A1_5 of Residents` <int>, `A1_7 of
## #   Residents` <int>, `A1_9 of Residents` <int>, `A1_10 of Residents` <int>,
## #   `A1_6 of Residents` <int>, `A1_8 of Residents` <int>, `A1_11 of
## #   Residents` <int>, `A1_12 of Residents` <int>, `A1_13 of Residents` <int>,
## #   `A1_14 of Residents` <int>, `A1_15 of Residents` <int>
head(df_1)
## # A tibble: 6 x 17
##   `Age Group` `Monthly Income~ `A1_1 of Reside~ `A1_2 of Reside~
##   <chr>       <chr>                       <int>            <int>
## 1 40-44       Refused to Answ~                1                0
## 2 60-64       S$1,500 - S$1,9~                1                0
## 3 35-39       S$5,000 - S$5,9~                1                1
## 4 35-39       S$11,000 - S$11~                1                1
## 5 50-54       S$4,500 - S$4,9~                1                0
## 6 25-29       S$10,000 - S$10~                1                1
## # ... with 13 more variables: `A1_3 of Residents` <int>, `A1_4 of
## #   Residents` <int>, `A1_5 of Residents` <int>, `A1_7 of Residents` <int>,
## #   `A1_9 of Residents` <int>, `A1_10 of Residents` <int>, `A1_6 of
## #   Residents` <int>, `A1_8 of Residents` <int>, `A1_11 of Residents` <int>,
## #   `A1_12 of Residents` <int>, `A1_13 of Residents` <int>, `A1_14 of
## #   Residents` <int>, `A1_15 of Residents` <int>

1.4 Changing the column names

df_age <- df_1 %>% mutate(TV = rowSums(df_1[,tv_col]),
                     Radio = rowSums(df_1[,radio_col]),
                    `Newspapers & Magazines (Print)` = rowSums(df_1[,newspapers_magazines_print]),
                    `Newspapers & Magazines (Digital)` = rowSums(df_1[,newspapers_magazines_digital]),
                    `Online Video & Music` = rowSums(df_1[,video_music_col]),
                    `Social Media` = rowSums(df_1[,socialmedia_col]),
                    `Messaging Apps` = rowSums(df_1[,messaging_col])) %>%
              dplyr::select(`Age Group`, 
                     TV, Radio, `Newspapers & Magazines (Print)`, 
                    `Newspapers & Magazines (Digital)`,`Online Video & Music`, 
                    `Social Media`, `Messaging Apps`)


head(df_age)
## # A tibble: 6 x 8
##   `Age Group`    TV Radio `Newspapers & M~ `Newspapers & M~ `Online Video &~
##   <chr>       <dbl> <dbl>            <dbl>            <dbl>            <dbl>
## 1 40-44           1     1                0                3                1
## 2 60-64           1     0                1                0                1
## 3 35-39           2     0                0                0                1
## 4 35-39           2     0                0                2                2
## 5 50-54           1     1                2                1                2
## 6 25-29           2     0                0                2                2
## # ... with 2 more variables: `Social Media` <dbl>, `Messaging Apps` <dbl>

Reconverting values to binary

df_age$TV[df_age$TV>1]<-1
df_age$Radio[df_age$Radio>1]<-1
df_age$`Newspapers & Magazines (Print)`[df_age$`Newspapers & Magazines (Print)`>1]<-1
df_age$`Newspapers & Magazines (Digital)`[df_age$`Newspapers & Magazines (Digital)`>1]<-1
df_age$`Online Video & Music`[df_age$`Online Video & Music`>1]<-1
df_age$`Social Media`[df_age$`Social Media`>1]<-1
df_age$`Messaging Apps`[df_age$`Messaging Apps`>1]<-1
head(df_age)
## # A tibble: 6 x 8
##   `Age Group`    TV Radio `Newspapers & M~ `Newspapers & M~ `Online Video &~
##   <chr>       <dbl> <dbl>            <dbl>            <dbl>            <dbl>
## 1 40-44           1     1                0                1                1
## 2 60-64           1     0                1                0                1
## 3 35-39           1     0                0                0                1
## 4 35-39           1     0                0                1                1
## 5 50-54           1     1                1                1                1
## 6 25-29           1     0                0                1                1
## # ... with 2 more variables: `Social Media` <dbl>, `Messaging Apps` <dbl>

2. Latent Class Analysis

Identification of groups based on their media habits is a natural business question because of its relevance to the targeting of public communications. LCA is a statistical method used to group individuals into classes of an unobserved (i.e. latent) variable on the basis of their responses to a set of nominal, ordinal or continuous observed variables. LCA was selected as the appropriate analytics technique because it can be used with data with non-normal distributions that show heteroskedasticity or have heterogeneity of variance

2.1 Converting to integers

df_lca<-df_age
df_lca$TV<-as.integer(as.numeric(df_lca$TV))
df_lca$Radio<-as.integer(as.numeric(df_lca$Radio))
df_lca$`Newspapers & Magazines (Print)`<-as.integer(as.numeric(df_lca$`Newspapers & Magazines (Print)`))
df_lca$`Newspapers & Magazines (Digital)`<-as.integer(as.numeric(df_lca$`Newspapers & Magazines (Digital)`))
df_lca$`Online Video & Music`<-as.integer(as.numeric(df_lca$`Online Video & Music`))
df_lca$`Social Media`<-as.integer(as.numeric(df_lca$`Social Media`))
df_lca$`Messaging Apps`<-as.integer(as.numeric(df_lca$`Messaging Apps`))

2.2 poLCA is a package to conduct LCA in R. It does not accept 0 as a value and hence we add 1 to our binary values to convert them to 2 & 1 instead of 1 & 0

df_lca <- df_lca[2:8]+1
head(df_lca)
##   TV Radio Newspapers & Magazines (Print) Newspapers & Magazines (Digital)
## 1  2     2                              1                                2
## 2  2     1                              2                                1
## 3  2     1                              1                                1
## 4  2     1                              1                                2
## 5  2     2                              2                                2
## 6  2     1                              1                                2
##   Online Video & Music Social Media Messaging Apps
## 1                    2            2              2
## 2                    2            2              2
## 3                    2            2              2
## 4                    2            2              2
## 5                    2            2              2
## 6                    2            2              2
lca.polca<-poLCA(cbind(TV,Radio,`Newspapers & Magazines (Print)`, `Newspapers & Magazines (Digital)`,`Online Video & Music`,`Social Media`,`Messaging Apps`)~1,nclass=3,data=df_lca,nrep=1,na.rm = TRUE, graphs = F)
## Conditional item response (column) probabilities,
##  by outcome variable, for each class (row) 
##  
## $TV
##            Pr(1)  Pr(2)
## class 1:  0.4307 0.5693
## class 2:  0.0743 0.9257
## class 3:  0.1071 0.8929
## 
## $Radio
##            Pr(1)  Pr(2)
## class 1:  0.5528 0.4472
## class 2:  0.3952 0.6048
## class 3:  0.6327 0.3673
## 
## $`Newspapers & Magazines (Print)`
##            Pr(1)  Pr(2)
## class 1:  0.9075 0.0925
## class 2:  0.2969 0.7031
## class 3:  0.5000 0.5000
## 
## $`Newspapers & Magazines (Digital)`
##            Pr(1)  Pr(2)
## class 1:  0.3634 0.6366
## class 2:  0.5567 0.4433
## class 3:  0.7500 0.2500
## 
## $`Online Video & Music`
##            Pr(1)  Pr(2)
## class 1:  0.1103 0.8897
## class 2:  0.1433 0.8567
## class 3:  0.6224 0.3776
## 
## $`Social Media`
##           Pr(1) Pr(2)
## class 1:      0     1
## class 2:      0     1
## class 3:      1     0
## 
## $`Messaging Apps`
##           Pr(1) Pr(2)
## class 1:      0     1
## class 2:      0     1
## class 3:      1     0
## 
## Estimated class population shares 
##  0.4041 0.4087 0.1872 
##  
## Predicted class memberships (by modal posterior prob.) 
##  0.4269 0.3859 0.1872 
##  
## ========================================================= 
## Fit for 3 latent classes: 
## ========================================================= 
## number of observations: 1047 
## number of estimated parameters: 23 
## residual degrees of freedom: 104 
## maximum log-likelihood: -3585.772 
##  
## AIC(3): 7217.544
## BIC(3): 7331.478
## G^2(3): 115.0451 (Likelihood ratio/deviance statistic) 
## X^2(3): 121.3545 (Chi-square goodness of fit) 
##  
## ALERT: iterations finished, MAXIMUM LIKELIHOOD NOT FOUND 
## 
lca_plot<-melt(lca.polca$probs)
revalue(lca_plot$Var2, c("Pr(1)"="Not Mentioned"))->lca_plot$Var2
revalue(lca_plot$Var2, c("Pr(2)"="Mentioned"))->lca_plot$Var2
lca_g_1<-ggplot(lca_plot, aes(x=L1, y=value, fill=Var2))+geom_bar(stat="identity",position="stack")+facet_wrap(~Var1)
names(lca_plot)[names(lca_plot) == "Var1"] <- "Classes"

2.3 Plotting LCA as a stacked bar chart

lca_g_2<-ggplot(lca_plot, aes(x=L1, y=value, fill=Var2))+geom_bar(stat="identity",position="stack")+facet_wrap(~Classes)+scale_x_discrete("Class")+scale_y_continuous("Proportion")+scale_fill_discrete("Factor Level")+theme_bw()+scale_fill_manual(values=c("#D53233", "#268AE6"), 
        name="Response")+coord_flip()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
ggplotly(lca_g_2)

2.4 Plotting the classes as a mmultiple line chart to view each class and its probability

lca_plot_2<-subset(lca_plot, Var2=="Mentioned")
head(lca_plot_2)
##      Classes      Var2     value    L1
## 4  class 1:  Mentioned 0.5693216    TV
## 5  class 2:  Mentioned 0.9257190    TV
## 6  class 3:  Mentioned 0.8928571    TV
## 10 class 1:  Mentioned 0.4471878 Radio
## 11 class 2:  Mentioned 0.6047971 Radio
## 12 class 3:  Mentioned 0.3673469 Radio
lca_plot_2<-lca_plot_2 %>%
  pivot_wider(names_from = L1, values_from = value)
head(lca_plot_2)
## # A tibble: 3 x 9
##   Classes Var2     TV Radio `Newspapers & M~ `Newspapers & M~ `Online Video &~
##   <fct>   <fct> <dbl> <dbl>            <dbl>            <dbl>            <dbl>
## 1 "class~ Ment~ 0.569 0.447           0.0925            0.637            0.890
## 2 "class~ Ment~ 0.926 0.605           0.703             0.443            0.857
## 3 "class~ Ment~ 0.893 0.367           0.5               0.25             0.378
## # ... with 2 more variables: `Social Media` <dbl>, `Messaging Apps` <dbl>
ggparcoord(lca_plot_2,columns=3:9, groupColumn="Classes", alphaLines = 1, scale="uniminmax")+ theme_minimal() + theme(axis.text.x = element_text(angle=45, hjust=1), 
                                axis.title.x = element_blank(),
                                axis.title = element_text(family = 'Helvetica', size =11),
                                axis.text = element_text(family = 'Helvetica', size =10),
                                legend.text= element_text(family = 'Helvetica', size =10),
                                legend.title = element_text(family = 'Helvetica', size =11))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

3. Heatmap

Heatmaps are good for showing variance across multiple variables, revealing any patterns, displaying whether any variables are similar to each other, and for detecting if any correlations exist in-between them. The shades of blue on the heatmap show the variations in the number of respondents who mentioned that they made use of that media platform in the past 4 weeks.

3.1 Data munging to convert the dataframe into a matrix

df_age_p<-df_age%>%
  pivot_longer(-`Age Group`,names_to = "Category", values_to = "Outcome",values_ptypes = list(Outcome = 'character'))
head(df_age_p)
## # A tibble: 6 x 3
##   `Age Group` Category                         Outcome
##   <chr>       <chr>                            <chr>  
## 1 40-44       TV                               1      
## 2 40-44       Radio                            1      
## 3 40-44       Newspapers & Magazines (Print)   0      
## 4 40-44       Newspapers & Magazines (Digital) 1      
## 5 40-44       Online Video & Music             1      
## 6 40-44       Social Media                     1
df_age_p$Outcome<-as.integer(df_age_p$Outcome)
df_age_p<-aggregate(Outcome~`Age Group`+Category, data=df_age_p, FUN = sum)
head(df_age_p)
##   Age Group       Category Outcome
## 1     25-29 Messaging Apps     164
## 2     30-34 Messaging Apps     102
## 3     35-39 Messaging Apps      99
## 4     40-44 Messaging Apps     116
## 5     45-49 Messaging Apps     132
## 6     50-54 Messaging Apps      95
df_age<-df_age_p %>%
  pivot_wider(names_from = Category, values_from = Outcome)

row.names(df_age)<-df_age$`Age Group`
## Warning: Setting row names on a tibble is deprecated.
head(df_age)
## # A tibble: 6 x 8
##   `Age Group` `Messaging Apps` `Newspapers & M~ `Newspapers & M~
##   <chr>                  <int>            <int>            <int>
## 1 25-29                    164              108               36
## 2 30-34                    102               68               22
## 3 35-39                     99               59               27
## 4 40-44                    116               73               59
## 5 45-49                    132               81               60
## 6 50-54                     95               40               73
## # ... with 4 more variables: `Online Video & Music` <int>, Radio <int>, `Social
## #   Media` <int>, TV <int>

3.2 Converting dataframe to matrix

df_mat_age<-dplyr::select(df_age, c(2:8))
age_mat<-data.matrix(df_mat_age)

3.3 Statistical Method for assesing optimal number of clusters with Euclidean method

mat_d <- dist(age_mat, method = "euclidean")
dend_expend(mat_d)[[3]]
##   dist_methods hclust_methods     optim
## 1      unknown         ward.D 0.7401989
## 2      unknown        ward.D2 0.7447490
## 3      unknown         single 0.6435877
## 4      unknown       complete 0.7417255
## 5      unknown        average 0.7694610
## 6      unknown       mcquitty 0.7661193
## 7      unknown         median 0.7511730
## 8      unknown       centroid 0.7576126
mat_clust <- hclust(mat_d, method = "average")
num_k <- find_k(mat_clust)
plot(num_k)

3.4 Plotting the heatmap

p <- heatmaply(age_mat, 
        xlab = "Media Categories", ylab = "Age Groups", 
        main = "How Singaporeans View Media\n across all Age Groups\n (Euclidean  method)",
        dist_method="euclidean",
        hclust_method="mcquitty",
        k_row=2,
        seriate="GW",
        margins = c(0,150,125,70),
        grid_color = "white",
        grid_width = 0.001,
        titleX = FALSE,
        branches_lwd = 0.3,
        label_names = c("Age Group", "Category","Outcome"),
        colors = Reds,
        fontsize_row = 8, fontsize_col = 8,
        labCol = colnames(age_mat),
        labRow = rownames(age_mat),
        heatmap_layers = theme(axis.line=element_blank())
        )

p

3.5 Statistical Method for assesing optimal number of clusters with Complete method

mat_d <- dist(mat_d, method = "maximum")
dend_expend(mat_d)[[3]]
##   dist_methods hclust_methods     optim
## 1      unknown         ward.D 0.7401989
## 2      unknown        ward.D2 0.7447490
## 3      unknown         single 0.6435877
## 4      unknown       complete 0.7417255
## 5      unknown        average 0.7694610
## 6      unknown       mcquitty 0.7661193
## 7      unknown         median 0.7511730
## 8      unknown       centroid 0.7576126
mat_clust <- hclust(mat_d, method = "complete")
num_k <- find_k(mat_clust)
plot(num_k)

p <- heatmaply(age_mat, 
        xlab = "Media Categories", ylab = "Age Groups", 
        main = "How Singaporeans View Media\n across all Age Groups\n (Maximum  method)",
        dist_methid="maximum",
        hclust_method="complete",
        k_row=5,
        seriate="GW",
        margins = c(0,150,125,70),
        grid_color = "white",
        grid_width = 0.001,
        titleX = FALSE,
        branches_lwd = 0.3,
        label_names = c("Age Group", "Category","Outcome"),
        colors = Reds,
        fontsize_row = 8, fontsize_col = 8,
        labCol = colnames(age_mat),
        labRow = rownames(age_mat),
        heatmap_layers = theme(axis.line=element_blank())
        )

p

3.6 Statistical Method for assesing optimal number of clusters with Average method

mat_d <- dist(mat_d, method = "manhattan")
dend_expend(mat_d)[[3]]
##   dist_methods hclust_methods     optim
## 1      unknown         ward.D 0.8193890
## 2      unknown        ward.D2 0.8491837
## 3      unknown         single 0.8438174
## 4      unknown       complete 0.8667901
## 5      unknown        average 0.8714375
## 6      unknown       mcquitty 0.8651621
## 7      unknown         median 0.8687502
## 8      unknown       centroid 0.8447406
mat_clust <- hclust(mat_d, method = "average")
num_k <- find_k(mat_clust)
plot(num_k)

p <- heatmaply(age_mat, 
        xlab = "Media Categories", ylab = "Age Groups", 
        main = "How Singaporeans View Media\n across all Age Groups\n (Mahattan  method)",
        dist_methid="Manhattan",
        hclust_method="average",
        k_row=2,
        seriate="GW",
        margins = c(0,150,125,70),
        grid_color = "white",
        grid_width = 0.001,
        titleX = FALSE,
        branches_lwd = 0.3,
        label_names = c("Age Group", "Category","Outcome"),
        colors = Reds,
        fontsize_row = 8, fontsize_col = 8,
        labCol = colnames(age_mat),
        labRow = rownames(age_mat),
        heatmap_layers = theme(axis.line=element_blank())
        )

p