1. Setting up the working environment
2. Import data
3. Sample structure understanding
4. Data cleaning
5. Data exploration
- 5.1 Key influencing factors
- 5.2 Important factors

1. Setting up the working environment

library(tidyverse)
library(knitr)
library(kableExtra)

2. Import data

# import CSV ------------------------------------------------------------
df_input <- read_csv("survey_input.csv")

# import excel ------------------------------------------------------------
library(readxl)

df_names <- read_excel("df_names.xlsx", sheet = "df_names")
df_names_a <- read_excel("df_names.xlsx", sheet = "df_names_section_A")
df_names_b <- read_excel("df_names.xlsx", sheet = "df_names_section_B")

3. Sample structure understanding

3.1 Categorical variables

Majority of respondents are female in their 20s (20-24 yo), obtained University degree, having full-time jobs and originally from the South.

3.1.1 Sample structure table summary

# sample structure --------------------------------------------------------
# extract demographic section column names
v_d <- df_names %>% filter(section == "D") %>% .[["col_name"]]

df_input <- df_input %>% 
  mutate(native.village = factor(native.village, 
                                 levels = c("North", "Middle", "South"),
                                 labels = c("North", "Central", "South")),
         gender = factor(gender, 
                         levels = c("male", "female"),
                         labels = c("Male", "Female")),
         job = factor(job, 
                      levels = c("student", "part.time", "full.time", "housework"), 
                      labels = c("Student", "Part-time", "Full-time", "Housework")),
         educationlevel = factor(educationlevel, 
                                 levels = c("below.hs", "highschool", "university"),
                                 labels = c("Below high school", "High school", "University")),
         age.range = factor(age.range, 
                            levels = c("20_24t", "25_35t", ">35t"),
                            labels = c("20-24 yo", "25-35 yo", "Above 35 yo")))

df_demo <- df_input %>% select(ID, v_d)

df_sample_structure <-
  df_demo %>% 
  select_if(negate(is.double)) %>%
  gather(variable, value, 2:ncol(.)) %>%
  group_by(variable, value) %>%
  summarise(no_of_respondent = n()) %>%
  mutate(proportion = round(no_of_respondent / 300 * 100, 2)) %>%
  ungroup() %>%
  mutate(variable = factor(variable, 
                           levels = c("age.range", "educationlevel", "gender", "job", "native.village"),
                           labels = c("Age range", "Education level", "Gender", "Job", "Native village"))) %>% 
  arrange(variable, desc(proportion))

df_sample_structure %>% kable_adj(names = c("Variable", "Level", "No. of respondents", "% Respondents"))

Variable	Level	No. of respondents	% Respondents
Age range	20-24 yo	173	57.67
Age range	Above 35 yo	71	23.67
Age range	25-35 yo	56	18.67
Education level	University	267	89.00
Education level	High school	31	10.33
Education level	Below high school	2	0.67
Gender	Female	199	66.33
Gender	Male	101	33.67
Job	Full-time	155	51.67
Job	Student	103	34.33
Job	Part-time	33	11.00
Job	Housework	9	3.00
Native village	South	195	65.00
Native village	Central	66	22.00
Native village	North	39	13.00

3.1.2 Sample structure plot

df_sample_structure %>% 
  ggplot(aes(reorder(value, no_of_respondent), no_of_respondent))+
  geom_col(fill = v_palette_pander[1])+
  geom_text(aes(y = no_of_respondent + 25, label = proportion))+
  facet_wrap(~variable, scales = "free_y")+
  labs(x="", y = "Number of respondents", title = "Number of respondents by age groups")+
  theme_bw()+
  geom_hline(yintercept = 30)+
  coord_flip()

3.2 Continuous variables

Respondent age is not normally distributed. 1/3 respondents are 22 years old.

ggplot(df_demo, aes(age))+
  geom_histogram(binwidth = 1,
                 fill = v_palette_pander[1])+
  scale_x_continuous(breaks = 20:70)+
  geom_vline(xintercept = 22, col = "red")+
  labs(title = "Respondents' age distribution", x = "Age", y = "Number of respondents")+
  theme_bw()

3.3 Continuous vs. categorical variables

Female respondents age is extremely right skewed.

df_demo %>% 
  ggplot(aes(age))+
  geom_density(aes(fill = gender, col = gender), alpha = 0.4)+
  scale_fill_manual(values = v_palette_pander[c(4,5)])+
  scale_color_manual(values = v_palette_pander[c(4,5)])+
  labs(title = "Respondents' age distribution by gender", 
      x = "Age", 
      y = "Density")+
  theme_bw()

Age distributions of those who are from the South and Central are wider spread than the North group.

df_demo %>% 
  ggplot(aes(native.village, age, col = native.village))+
  geom_boxplot(alpha = 0)+
  geom_jitter()+
  scale_color_manual(values = v_palette_pander[c(4,5,7)])+
  labs(title = "Respondents' age distribution by native village",
       x = "Native village",
       y = "Age")+
  theme_bw()

4. Data cleaning

Coding error is spotted! Each variable in brand section should have two levels only. Khai hoan brand has 4 levels.

# extract C-1 section column names 
v_c_1 <- df_names %>% filter(section == "C", question_no == 1) %>% .[["col_name"]]

df_input %>% 
  select(ID, v_c_1) %>% 
  mutate_at(vars(v_c_1), factor) %>%
  summary()

      ID                phu.quoc        phan.thiet     nam.ngu   
 Length:300         kh.pq   :256   kh.pt     :282   kh.nn  :150  
 Class :character   phu.quoc: 44   phan.thiet: 18   nam.ngu:150  
 Mode  :character                                                
                                                                 
     hanh.phuc        thanh.phong      de.nhi       chinsu       co.hau   
 hanh.phuc: 19   kh.tp      :299   de.nhi : 25   chinsu: 56   co.hau:  1  
 kh.hp    :281   thanh.phong:  1   kh.dnhi:275   kh.cs :244   kh.ch :299  
                                                                          
                                                                          
      hung.thinh     ba.mien        cholimex       nha.trang  
 hung.thinh: 39   ba.mien:  2   cholimex:  3   kh.nt    :292  
 kh.ht     :261   kh.bm  :298   kh.cho  :297   nha.trang:  8  
                                                              
                                                              
      hong.thanh      khai.hoan        thien.huong      tu.tuyet  
 hong.thanh:  4   k        :  7   kh.th      :298   kh.tt   :298  
 kh.hthanh :296   kh.kh    :259   thien.huong:  2   tu.tuyet:  2  
                  khai.hoan:  5                                   
                  NA's     : 29                                   
      lien.thanh 
 kh.lt     :283  
 lien.thanh: 17

Nam Ngu is the most popular brand and almost triple the second player - Chinsu in terms of the number of responses.

# recode multiple answers
df_brand <- df_input %>% 
  select(ID, v_c_1) %>%
  gather(brand, value, 2:ncol(.)) %>%
  mutate(recoded_value = ifelse(brand == value, "Y", "N")) %>%
  mutate(recoded_value = replace_na(recoded_value, "N")) %>%
  filter(recoded_value == "Y") %>%
  select(-value,-recoded_value)


# brand summary -----------------------------------------------------------
df_brand_summary <- df_brand %>% 
  group_by(brand) %>%
  summarise(no_of_response = n()) %>%
  arrange(desc(no_of_response)) %>%
  mutate(proportion = round(no_of_response / 300 * 100, 1))

# absolute
df_brand_summary %>%  
  ggplot(aes(reorder(brand, no_of_response), no_of_response))+
  geom_col(fill = v_palette_pander[1])+
  geom_text(aes(y = 170, label = proportion))+
  coord_flip()+
  labs(x = "Brands", y = "Number of Respondents",
       caption = "Numbers represent % respondents")+
  theme_bw()

ggsave(filename = "brand.png", width = 10, height = 6)

There are few respondents gave more than the required number of responses (maximum is 2).

df_brand_count <- df_brand %>% 
  group_by(ID) %>% 
  summarise(brand_count = n_distinct(brand)) %>% 
  ungroup() 

df_brand %>% 
  group_by(ID) %>% 
  mutate(brandID = row_number()) %>% 
  ungroup() %>% 
  select(ID, brand, brandID) %>% 
  spread(brandID, brand) %>% 
  left_join(df_brand_count, by = "ID") %>% 
  arrange(desc(brand_count)) %>% 
  head(n = 10) %>% 
  kable_adj(names = c("Respondent ID", "Brand 1", "Brand 2", "Brand 3", "Brand 4", "Number of brands declared"))

Respondent ID	Brand 1	Brand 2	Brand 3	Brand 4	Number of brands declared
KS2_29	phu.quoc	phan.thiet	nam.ngu	hung.thinh	4
KS2_111	phu.quoc	nam.ngu	nha.trang	NA	3
KS2_130	nam.ngu	chinsu	cholimex	NA	3
KS2_181	hanh.phuc	chinsu	hung.thinh	NA	3
KS2_224	phu.quoc	nam.ngu	lien.thanh	NA	3
KS2_255	hung.thinh	tu.tuyet	lien.thanh	NA	3
KS2_109	phu.quoc	hung.thinh	NA	NA	2
KS2_110	hung.thinh	cholimex	NA	NA	2
KS2_113	de.nhi	hung.thinh	NA	NA	2
KS2_115	nam.ngu	de.nhi	NA	NA	2

5. Data exploration

5.1 Key influencing factors

Average rating score of 6 key factors:

# section A ---------------------------------------------------------------
# extract A section column names 
v_a <- df_names %>% filter(section == "A") %>% .[["col_name"]]

df_a <- df_input %>% select(v_a) %>% select(-ID)
names(df_a) <- df_names_a[["col_label"]]

df_a %>% 
  summarise_all(~round(mean(.), 2)) %>% 
  kable_adj()

Health	Sensory	Tradition	Quality	Price	Convenience
3.95	4.09	3.98	4.16	4	4.08

Quality seems to be the most influencing factor.

The challenge of visualing ordinal categorical variables:

ggplot(df_a, aes(Health, Sensory))+
  geom_point()+
  theme_bw()

Using ‘jitter’ trick…

library(GGally)

df_a %>% 
  ggpairs(lower = list(continuous=wrap("points", position="jitter", alpha = 0.2)))

… or using dimension reduction technique?

library(psych)
l_res_pca <- principal(df_a, nfactors=2, rotate="none")
df_res_pca_score <- l_res_pca[["scores"]] %>% tbl_df()

df_res_pca_loading <- 
  tibble("item" = names(df_a),
         "PC1" = l_res_pca[["loadings"]][1:6],
         "PC2" = l_res_pca[["loadings"]][7:12])

ggplot(df_res_pca_loading, aes(PC1, PC2))+
  geom_text(aes(label = item))+
  scale_x_continuous(breaks = seq(0,1, 0.1), limits = c(0,1))+
  scale_y_continuous(breaks = seq(-1, 1, 0.1), limits = c(-1,1))+
  theme_bw()

respondent_labels <- 1:nrow(df_input)
# respondent_col <- rainbow(nrow(df_input))

bind_cols(df_res_pca_score, df_demo) %>% 
  ggplot(aes(PC1, PC2))+
  geom_text(aes(label = respondent_labels, col = gender), 
            position = position_jitter(height = 0.4))+
  theme_bw()+
  theme(legend.position = "bottom")

Revealing special cases:

Respondent 19:

df_a %>% filter(row_number() == 19) %>% kable_adj()

Health	Sensory	Tradition	Quality	Price	Convenience
1	2	2	3	4	4

Respondent 133:

df_a %>% filter(row_number() == 133) %>% kable_adj()

Health	Sensory	Tradition	Quality	Price	Convenience
5	5	5	5	1	1

5.2 Important factors

Looking into sub-factors, the product origin seems to be the most concerning factor when choosing fish sauce products.

# section B ---------------------------------------------------------------
# extract B section column names 
v_b <- df_names %>% filter(section == "B") %>% .[["col_name"]]

df_input %>% 
  summarise_at(vars(v_b), ~round(mean(.),2)) %>% 
  gather(variable, value, 1:ncol(.)) %>% 
  left_join(df_names_b, by = c("variable" = "col_name")) %>%
  arrange(col_group, col_des) %>% 
  select(col_des, value) %>% 
  mutate(value = ifelse(value > 4,
                  cell_spec(value, color = "#1e90ff", bold = T),
                  cell_spec(value, color = "black"))) %>% 
  kable_adj(names = c("Factors", "Value")) %>% 
  pack_rows("Convenience", 1, 4) %>%
  pack_rows("Health", 5, 7) %>%
  pack_rows("Price", 8, 10) %>%
  pack_rows("Quality and safety", 11, 14) %>%
  pack_rows("Sensory appeal", 15, 19) %>%
  pack_rows("Traditional value", 20, 24)

Factors	Value
Convenience
Can be bought in shops close to where I live or work	3.85
Is easily available in shops and supermarkets	4.04
Is easy to prepare	3.83
Takes no time to prepare	3.91
Health
Contains lots of protein, vitamins, omega-3, iron and lysine	3.9
Eating with fish sauce will help us eat more, thus providing more active energy	3.68
Keeps me healthy	3.82
Price
Is cheap	3.25
Is good value for money	4.02
Is not expensive	3.65
Quality and safety
Conform to Vietnam or global quality standard	3.8
Get certification of Vietnam high quality goods by consumers	3.8
Has a clear origin of production	4.18
Has Eco-friendly packaging	3.57
Sensory appeal
Distinctive brownish color	3.7
Eye-catching packaging	3.58
Has a pleasant texture	3.7
Smells nice	3.91
Tastes good	4.03
Traditional value
Be processed/prepared in a special way	3.9
Has origin of materials (geographical indications)	4.23
Is associated to specific celebrations of my country/family	3.55
Is familiar with me and my family	3.74
Is special product that reflects the nation or local culture	4.06

df_b_long <- df_input %>% 
  select(ID, v_b) %>% 
  gather(variable, value, 2:ncol(.)) %>% 
  left_join(df_names_b, by = c("variable" = "col_name"))

df_b_long_summary <- df_b_long %>% 
  group_by(variable, col_des, col_des_vn, col_group) %>% 
  summarise(value = round(mean(value), 1)) %>% 
  ungroup() %>% 
  arrange(value)

df_b_long %>% 
  group_by(variable, col_des, col_des_vn, col_group, value) %>% 
  summarise(no_of_response = n()) %>%  
  ungroup() %>% 
  ggplot(aes(col_des, no_of_response))+
  geom_col(aes(fill = value))+
  geom_text(aes(y = 320, label = value, col = value, size = value), data = df_b_long_summary, fontface = "bold")+
  coord_flip()+
  scale_fill_gradient(high = "#132B43", low = "#56B1F7")+
  scale_color_gradient(high = "#132B43", low = "#56B1F7")+
  facet_grid(col_group ~ ., scales = "free_y")+
  theme(axis.text = element_text(size = 12),
        legend.position = "bottom")+
  guides(col = F,
         size = F)+
  labs(x = "", y = "Number of respondents", fill = "Rating score")+
  theme_bw()

The origin factor is highly important for both male and female groups. Male respondents seem to give higher scores in most of the factors compared to their counterparts. However, female respondents seem to care more about whether the products are available in shops, keeping them healthy and reflecting the local culture than male.

df_input %>% 
  group_by(gender) %>% 
  summarise_at(vars(v_b), ~round(mean(.),2)) %>% 
  gather(variable, value, 2:ncol(.)) %>% 
  left_join(df_names_b, by = c("variable" = "col_name")) %>% 
  select(col_group, col_des, gender, value) %>% 
  ggplot(aes(value, reorder(col_des, value), col = gender, group = gender, shape = gender))+
  geom_point(size = 2)+
  scale_color_manual(values = v_palette_pander[c(4,5)])+
  facet_grid(col_group ~., scales = "free_y")+
  labs(x = "Rating score", y = "Factors", fill = "Gender")+
  theme_bw()

df_input %>% 
  group_by(gender) %>% 
  summarise_at(vars(v_b), ~round(mean(.),2)) %>% 
  gather(variable, value, 2:ncol(.)) %>% 
  left_join(df_names_b, by = c("variable" = "col_name")) %>% 
  select(col_group, col_des, gender, value) %>% 
  spread(gender, value) %>% 
  arrange(col_group, col_des) %>% 
  select(col_des, Male, Female) %>% 
  mutate_if(is.numeric, function(x) {
    cell_spec(x, bold = T, 
              color = spec_color(x, end = 0.9),
              font_size = spec_font_size(x))
  }) %>% 
  kable_adj(names = c("Factors", "Male", "Female")) %>% 
  pack_rows("Convenience", 1, 4) %>%
  pack_rows("Health", 5, 7) %>% 
   pack_rows("Price", 8, 10) %>%
  pack_rows("Quality and safety", 11, 14) %>%
  pack_rows("Sensory appeal", 15, 19) %>%
  pack_rows("Traditional value", 20, 24)

Factors	Male	Female
Convenience
Can be bought in shops close to where I live or work	3.89	3.83
Is easily available in shops and supermarkets	3.99	4.07
Is easy to prepare	3.84	3.82
Takes no time to prepare	3.94	3.9
Health
Contains lots of protein, vitamins, omega-3, iron and lysine	4.06	3.82
Eating with fish sauce will help us eat more, thus providing more active energy	3.74	3.65
Keeps me healthy	3.76	3.85
Price
Is cheap	3.18	3.28
Is good value for money	4.05	4.01
Is not expensive	3.66	3.64
Quality and safety
Conform to Vietnam or global quality standard	3.9	3.74
Get certification of Vietnam high quality goods by consumers	3.9	3.75
Has a clear origin of production	4.2	4.17
Has Eco-friendly packaging	3.58	3.56
Sensory appeal
Distinctive brownish color	3.66	3.71
Eye-catching packaging	3.65	3.55
Has a pleasant texture	3.76	3.67
Smells nice	4.03	3.85
Tastes good	4.12	3.98
Traditional value
Be processed/prepared in a special way	3.91	3.89
Has origin of materials (geographical indications)	4.24	4.23
Is associated to specific celebrations of my country/family	3.53	3.56
Is familiar with me and my family	3.71	3.75
Is special product that reflects the nation or local culture	3.99	4.09

Consumer attitude towards fish sauce products

Xuan Pham

12/17/2019