library(tidyverse)
library(knitr)
library(kableExtra)
# import CSV ------------------------------------------------------------
df_input <- read_csv("survey_input.csv")
# import excel ------------------------------------------------------------
library(readxl)
df_names <- read_excel("df_names.xlsx", sheet = "df_names")
df_names_a <- read_excel("df_names.xlsx", sheet = "df_names_section_A")
df_names_b <- read_excel("df_names.xlsx", sheet = "df_names_section_B")
Majority of respondents are female in their 20s (20-24 yo), obtained University degree, having full-time jobs and originally from the South.
# sample structure --------------------------------------------------------
# extract demographic section column names
v_d <- df_names %>% filter(section == "D") %>% .[["col_name"]]
df_input <- df_input %>%
mutate(native.village = factor(native.village,
levels = c("North", "Middle", "South"),
labels = c("North", "Central", "South")),
gender = factor(gender,
levels = c("male", "female"),
labels = c("Male", "Female")),
job = factor(job,
levels = c("student", "part.time", "full.time", "housework"),
labels = c("Student", "Part-time", "Full-time", "Housework")),
educationlevel = factor(educationlevel,
levels = c("below.hs", "highschool", "university"),
labels = c("Below high school", "High school", "University")),
age.range = factor(age.range,
levels = c("20_24t", "25_35t", ">35t"),
labels = c("20-24 yo", "25-35 yo", "Above 35 yo")))
df_demo <- df_input %>% select(ID, v_d)
df_sample_structure <-
df_demo %>%
select_if(negate(is.double)) %>%
gather(variable, value, 2:ncol(.)) %>%
group_by(variable, value) %>%
summarise(no_of_respondent = n()) %>%
mutate(proportion = round(no_of_respondent / 300 * 100, 2)) %>%
ungroup() %>%
mutate(variable = factor(variable,
levels = c("age.range", "educationlevel", "gender", "job", "native.village"),
labels = c("Age range", "Education level", "Gender", "Job", "Native village"))) %>%
arrange(variable, desc(proportion))
df_sample_structure %>% kable_adj(names = c("Variable", "Level", "No. of respondents", "% Respondents"))
| Variable | Level | No. of respondents | % Respondents |
|---|---|---|---|
| Age range | 20-24 yo | 173 | 57.67 |
| Age range | Above 35 yo | 71 | 23.67 |
| Age range | 25-35 yo | 56 | 18.67 |
| Education level | University | 267 | 89.00 |
| Education level | High school | 31 | 10.33 |
| Education level | Below high school | 2 | 0.67 |
| Gender | Female | 199 | 66.33 |
| Gender | Male | 101 | 33.67 |
| Job | Full-time | 155 | 51.67 |
| Job | Student | 103 | 34.33 |
| Job | Part-time | 33 | 11.00 |
| Job | Housework | 9 | 3.00 |
| Native village | South | 195 | 65.00 |
| Native village | Central | 66 | 22.00 |
| Native village | North | 39 | 13.00 |
df_sample_structure %>%
ggplot(aes(reorder(value, no_of_respondent), no_of_respondent))+
geom_col(fill = v_palette_pander[1])+
geom_text(aes(y = no_of_respondent + 25, label = proportion))+
facet_wrap(~variable, scales = "free_y")+
labs(x="", y = "Number of respondents", title = "Number of respondents by age groups")+
theme_bw()+
geom_hline(yintercept = 30)+
coord_flip()
Respondent age is not normally distributed. 1/3 respondents are 22 years old.
ggplot(df_demo, aes(age))+
geom_histogram(binwidth = 1,
fill = v_palette_pander[1])+
scale_x_continuous(breaks = 20:70)+
geom_vline(xintercept = 22, col = "red")+
labs(title = "Respondents' age distribution", x = "Age", y = "Number of respondents")+
theme_bw()
Female respondents age is extremely right skewed.
df_demo %>%
ggplot(aes(age))+
geom_density(aes(fill = gender, col = gender), alpha = 0.4)+
scale_fill_manual(values = v_palette_pander[c(4,5)])+
scale_color_manual(values = v_palette_pander[c(4,5)])+
labs(title = "Respondents' age distribution by gender",
x = "Age",
y = "Density")+
theme_bw()
Age distributions of those who are from the South and Central are wider spread than the North group.
df_demo %>%
ggplot(aes(native.village, age, col = native.village))+
geom_boxplot(alpha = 0)+
geom_jitter()+
scale_color_manual(values = v_palette_pander[c(4,5,7)])+
labs(title = "Respondents' age distribution by native village",
x = "Native village",
y = "Age")+
theme_bw()
Coding error is spotted! Each variable in brand section should have two levels only. Khai hoan brand has 4 levels.
# extract C-1 section column names
v_c_1 <- df_names %>% filter(section == "C", question_no == 1) %>% .[["col_name"]]
df_input %>%
select(ID, v_c_1) %>%
mutate_at(vars(v_c_1), factor) %>%
summary()
ID phu.quoc phan.thiet nam.ngu
Length:300 kh.pq :256 kh.pt :282 kh.nn :150
Class :character phu.quoc: 44 phan.thiet: 18 nam.ngu:150
Mode :character
hanh.phuc thanh.phong de.nhi chinsu co.hau
hanh.phuc: 19 kh.tp :299 de.nhi : 25 chinsu: 56 co.hau: 1
kh.hp :281 thanh.phong: 1 kh.dnhi:275 kh.cs :244 kh.ch :299
hung.thinh ba.mien cholimex nha.trang
hung.thinh: 39 ba.mien: 2 cholimex: 3 kh.nt :292
kh.ht :261 kh.bm :298 kh.cho :297 nha.trang: 8
hong.thanh khai.hoan thien.huong tu.tuyet
hong.thanh: 4 k : 7 kh.th :298 kh.tt :298
kh.hthanh :296 kh.kh :259 thien.huong: 2 tu.tuyet: 2
khai.hoan: 5
NA's : 29
lien.thanh
kh.lt :283
lien.thanh: 17
Nam Ngu is the most popular brand and almost triple the second player - Chinsu in terms of the number of responses.
# recode multiple answers
df_brand <- df_input %>%
select(ID, v_c_1) %>%
gather(brand, value, 2:ncol(.)) %>%
mutate(recoded_value = ifelse(brand == value, "Y", "N")) %>%
mutate(recoded_value = replace_na(recoded_value, "N")) %>%
filter(recoded_value == "Y") %>%
select(-value,-recoded_value)
# brand summary -----------------------------------------------------------
df_brand_summary <- df_brand %>%
group_by(brand) %>%
summarise(no_of_response = n()) %>%
arrange(desc(no_of_response)) %>%
mutate(proportion = round(no_of_response / 300 * 100, 1))
# absolute
df_brand_summary %>%
ggplot(aes(reorder(brand, no_of_response), no_of_response))+
geom_col(fill = v_palette_pander[1])+
geom_text(aes(y = 170, label = proportion))+
coord_flip()+
labs(x = "Brands", y = "Number of Respondents",
caption = "Numbers represent % respondents")+
theme_bw()
ggsave(filename = "brand.png", width = 10, height = 6)
There are few respondents gave more than the required number of responses (maximum is 2).
df_brand_count <- df_brand %>%
group_by(ID) %>%
summarise(brand_count = n_distinct(brand)) %>%
ungroup()
df_brand %>%
group_by(ID) %>%
mutate(brandID = row_number()) %>%
ungroup() %>%
select(ID, brand, brandID) %>%
spread(brandID, brand) %>%
left_join(df_brand_count, by = "ID") %>%
arrange(desc(brand_count)) %>%
head(n = 10) %>%
kable_adj(names = c("Respondent ID", "Brand 1", "Brand 2", "Brand 3", "Brand 4", "Number of brands declared"))
| Respondent ID | Brand 1 | Brand 2 | Brand 3 | Brand 4 | Number of brands declared |
|---|---|---|---|---|---|
| KS2_29 | phu.quoc | phan.thiet | nam.ngu | hung.thinh | 4 |
| KS2_111 | phu.quoc | nam.ngu | nha.trang | NA | 3 |
| KS2_130 | nam.ngu | chinsu | cholimex | NA | 3 |
| KS2_181 | hanh.phuc | chinsu | hung.thinh | NA | 3 |
| KS2_224 | phu.quoc | nam.ngu | lien.thanh | NA | 3 |
| KS2_255 | hung.thinh | tu.tuyet | lien.thanh | NA | 3 |
| KS2_109 | phu.quoc | hung.thinh | NA | NA | 2 |
| KS2_110 | hung.thinh | cholimex | NA | NA | 2 |
| KS2_113 | de.nhi | hung.thinh | NA | NA | 2 |
| KS2_115 | nam.ngu | de.nhi | NA | NA | 2 |
Average rating score of 6 key factors:
# section A ---------------------------------------------------------------
# extract A section column names
v_a <- df_names %>% filter(section == "A") %>% .[["col_name"]]
df_a <- df_input %>% select(v_a) %>% select(-ID)
names(df_a) <- df_names_a[["col_label"]]
df_a %>%
summarise_all(~round(mean(.), 2)) %>%
kable_adj()
| Health | Sensory | Tradition | Quality | Price | Convenience |
|---|---|---|---|---|---|
| 3.95 | 4.09 | 3.98 | 4.16 | 4 | 4.08 |
Quality seems to be the most influencing factor.
The challenge of visualing ordinal categorical variables:
ggplot(df_a, aes(Health, Sensory))+
geom_point()+
theme_bw()
Using ‘jitter’ trick…
library(GGally)
df_a %>%
ggpairs(lower = list(continuous=wrap("points", position="jitter", alpha = 0.2)))
… or using dimension reduction technique?
library(psych)
l_res_pca <- principal(df_a, nfactors=2, rotate="none")
df_res_pca_score <- l_res_pca[["scores"]] %>% tbl_df()
df_res_pca_loading <-
tibble("item" = names(df_a),
"PC1" = l_res_pca[["loadings"]][1:6],
"PC2" = l_res_pca[["loadings"]][7:12])
ggplot(df_res_pca_loading, aes(PC1, PC2))+
geom_text(aes(label = item))+
scale_x_continuous(breaks = seq(0,1, 0.1), limits = c(0,1))+
scale_y_continuous(breaks = seq(-1, 1, 0.1), limits = c(-1,1))+
theme_bw()
respondent_labels <- 1:nrow(df_input)
# respondent_col <- rainbow(nrow(df_input))
bind_cols(df_res_pca_score, df_demo) %>%
ggplot(aes(PC1, PC2))+
geom_text(aes(label = respondent_labels, col = gender),
position = position_jitter(height = 0.4))+
theme_bw()+
theme(legend.position = "bottom")
Revealing special cases:
Respondent 19:
df_a %>% filter(row_number() == 19) %>% kable_adj()
| Health | Sensory | Tradition | Quality | Price | Convenience |
|---|---|---|---|---|---|
| 1 | 2 | 2 | 3 | 4 | 4 |
Respondent 133:
df_a %>% filter(row_number() == 133) %>% kable_adj()
| Health | Sensory | Tradition | Quality | Price | Convenience |
|---|---|---|---|---|---|
| 5 | 5 | 5 | 5 | 1 | 1 |
Looking into sub-factors, the product origin seems to be the most concerning factor when choosing fish sauce products.
# section B ---------------------------------------------------------------
# extract B section column names
v_b <- df_names %>% filter(section == "B") %>% .[["col_name"]]
df_input %>%
summarise_at(vars(v_b), ~round(mean(.),2)) %>%
gather(variable, value, 1:ncol(.)) %>%
left_join(df_names_b, by = c("variable" = "col_name")) %>%
arrange(col_group, col_des) %>%
select(col_des, value) %>%
mutate(value = ifelse(value > 4,
cell_spec(value, color = "#1e90ff", bold = T),
cell_spec(value, color = "black"))) %>%
kable_adj(names = c("Factors", "Value")) %>%
pack_rows("Convenience", 1, 4) %>%
pack_rows("Health", 5, 7) %>%
pack_rows("Price", 8, 10) %>%
pack_rows("Quality and safety", 11, 14) %>%
pack_rows("Sensory appeal", 15, 19) %>%
pack_rows("Traditional value", 20, 24)
| Factors | Value |
|---|---|
| Convenience | |
| Can be bought in shops close to where I live or work | 3.85 |
| Is easily available in shops and supermarkets | 4.04 |
| Is easy to prepare | 3.83 |
| Takes no time to prepare | 3.91 |
| Health | |
| Contains lots of protein, vitamins, omega-3, iron and lysine | 3.9 |
| Eating with fish sauce will help us eat more, thus providing more active energy | 3.68 |
| Keeps me healthy | 3.82 |
| Price | |
| Is cheap | 3.25 |
| Is good value for money | 4.02 |
| Is not expensive | 3.65 |
| Quality and safety | |
| Conform to Vietnam or global quality standard | 3.8 |
| Get certification of Vietnam high quality goods by consumers | 3.8 |
| Has a clear origin of production | 4.18 |
| Has Eco-friendly packaging | 3.57 |
| Sensory appeal | |
| Distinctive brownish color | 3.7 |
| Eye-catching packaging | 3.58 |
| Has a pleasant texture | 3.7 |
| Smells nice | 3.91 |
| Tastes good | 4.03 |
| Traditional value | |
| Be processed/prepared in a special way | 3.9 |
| Has origin of materials (geographical indications) | 4.23 |
| Is associated to specific celebrations of my country/family | 3.55 |
| Is familiar with me and my family | 3.74 |
| Is special product that reflects the nation or local culture | 4.06 |
df_b_long <- df_input %>%
select(ID, v_b) %>%
gather(variable, value, 2:ncol(.)) %>%
left_join(df_names_b, by = c("variable" = "col_name"))
df_b_long_summary <- df_b_long %>%
group_by(variable, col_des, col_des_vn, col_group) %>%
summarise(value = round(mean(value), 1)) %>%
ungroup() %>%
arrange(value)
df_b_long %>%
group_by(variable, col_des, col_des_vn, col_group, value) %>%
summarise(no_of_response = n()) %>%
ungroup() %>%
ggplot(aes(col_des, no_of_response))+
geom_col(aes(fill = value))+
geom_text(aes(y = 320, label = value, col = value, size = value), data = df_b_long_summary, fontface = "bold")+
coord_flip()+
scale_fill_gradient(high = "#132B43", low = "#56B1F7")+
scale_color_gradient(high = "#132B43", low = "#56B1F7")+
facet_grid(col_group ~ ., scales = "free_y")+
theme(axis.text = element_text(size = 12),
legend.position = "bottom")+
guides(col = F,
size = F)+
labs(x = "", y = "Number of respondents", fill = "Rating score")+
theme_bw()
The origin factor is highly important for both male and female groups. Male respondents seem to give higher scores in most of the factors compared to their counterparts. However, female respondents seem to care more about whether the products are available in shops, keeping them healthy and reflecting the local culture than male.
df_input %>%
group_by(gender) %>%
summarise_at(vars(v_b), ~round(mean(.),2)) %>%
gather(variable, value, 2:ncol(.)) %>%
left_join(df_names_b, by = c("variable" = "col_name")) %>%
select(col_group, col_des, gender, value) %>%
ggplot(aes(value, reorder(col_des, value), col = gender, group = gender, shape = gender))+
geom_point(size = 2)+
scale_color_manual(values = v_palette_pander[c(4,5)])+
facet_grid(col_group ~., scales = "free_y")+
labs(x = "Rating score", y = "Factors", fill = "Gender")+
theme_bw()
df_input %>%
group_by(gender) %>%
summarise_at(vars(v_b), ~round(mean(.),2)) %>%
gather(variable, value, 2:ncol(.)) %>%
left_join(df_names_b, by = c("variable" = "col_name")) %>%
select(col_group, col_des, gender, value) %>%
spread(gender, value) %>%
arrange(col_group, col_des) %>%
select(col_des, Male, Female) %>%
mutate_if(is.numeric, function(x) {
cell_spec(x, bold = T,
color = spec_color(x, end = 0.9),
font_size = spec_font_size(x))
}) %>%
kable_adj(names = c("Factors", "Male", "Female")) %>%
pack_rows("Convenience", 1, 4) %>%
pack_rows("Health", 5, 7) %>%
pack_rows("Price", 8, 10) %>%
pack_rows("Quality and safety", 11, 14) %>%
pack_rows("Sensory appeal", 15, 19) %>%
pack_rows("Traditional value", 20, 24)
| Factors | Male | Female |
|---|---|---|
| Convenience | ||
| Can be bought in shops close to where I live or work | 3.89 | 3.83 |
| Is easily available in shops and supermarkets | 3.99 | 4.07 |
| Is easy to prepare | 3.84 | 3.82 |
| Takes no time to prepare | 3.94 | 3.9 |
| Health | ||
| Contains lots of protein, vitamins, omega-3, iron and lysine | 4.06 | 3.82 |
| Eating with fish sauce will help us eat more, thus providing more active energy | 3.74 | 3.65 |
| Keeps me healthy | 3.76 | 3.85 |
| Price | ||
| Is cheap | 3.18 | 3.28 |
| Is good value for money | 4.05 | 4.01 |
| Is not expensive | 3.66 | 3.64 |
| Quality and safety | ||
| Conform to Vietnam or global quality standard | 3.9 | 3.74 |
| Get certification of Vietnam high quality goods by consumers | 3.9 | 3.75 |
| Has a clear origin of production | 4.2 | 4.17 |
| Has Eco-friendly packaging | 3.58 | 3.56 |
| Sensory appeal | ||
| Distinctive brownish color | 3.66 | 3.71 |
| Eye-catching packaging | 3.65 | 3.55 |
| Has a pleasant texture | 3.76 | 3.67 |
| Smells nice | 4.03 | 3.85 |
| Tastes good | 4.12 | 3.98 |
| Traditional value | ||
| Be processed/prepared in a special way | 3.91 | 3.89 |
| Has origin of materials (geographical indications) | 4.24 | 4.23 |
| Is associated to specific celebrations of my country/family | 3.53 | 3.56 |
| Is familiar with me and my family | 3.71 | 3.75 |
| Is special product that reflects the nation or local culture | 3.99 | 4.09 |