library(readr)
library(openxlsx)
library(textreadr)
library(topicmodels)
library(tidyverse)
library(stringr)
library(quanteda, pos = "package:base")
library(tidytext)
source("functions_all.R")
source("functions_survey.R")
TODAY <- params$TODAY
SOURCE.TYPE <- params$SOURCE.TYPE
TERMS <- params$TERMS
COUNTRIES <- c("USA", "India", "Germany", "Poland", "Mexico","China")
QUESTIONS <<- c("Q4a", "Q5a", "Q16a","Q16b")
survey_folder <- file.path('data/surveys')
file <- file.path(survey_folder,"J3289 Microsoft Segmentation verbatims.xlsm")
df.all <- lapply(QUESTIONS, function(x){
read.xlsx(file, sheet = x, startRow = 8) %>%
mutate(cleanText = cleanr(Verbatim), Question = x) %>%
select(cleanText, Segment, Question, respid)
}) %>% bind_rows() %>%
mutate(Segment = as.factor(Segment),
Question = as.factor(Question)) %>%
select(respid, Question, Segment, cleanText)
df.questions <- read_csv("data/Survey Question Info.csv")
| Number | Title | Question |
|---|---|---|
| Q4a | Microsoft brand perceptions | We’d like to understand more about your brand perceptions of Microsoft. Could you please tell us what you were thinking about when you gave your score of … |
| Q5a | HP brand perceptions | We’d like to understand more about your brand perceptions of HP. Could you please tell us what you were thinking about when you gave your score of … |
| Q16a | Future purchases - most exciting | Thinking about future technology purchases, what are you most excited about? |
| Q16b | Future purchases - concerned or apprehensive | Again, thinking about your future technology purchases, what, if anything are you concerned or apprehensive about? |
segment.stats <- df.all %>%
distinct(respid, Segment) %>%
group_by(Segment) %>%
mutate(Seg.People = n()) %>%
ungroup()
file <- file.path(survey_folder,
'P52_Tech consumer_persona_Universal Codeframe_13-07-2017-CLEAN.xlsx')
codeframe <- lapply(QUESTIONS, function(x) {
SHEET <- paste("Codeframe","-",x)
read.xlsx(file, sheet = SHEET) %>%
mutate(Question = paste(x))
}) %>% bind_rows() %>%
rename(Code = Codes)
| Code | Label | Question |
|---|---|---|
| 1 | General like | Q4a |
| 2 | User friendly/easy to use | Q4a |
| 3 | Innovative | Q4a |
| 4 | Features | Q4a |
| 5 | Latest technology/quality technology/updates | Q4a |
| 6 | Good services/security | Q4a |
| 7 | High quality products/good products/satisfied | Q4a |
| 8 | Reliable/safe products | Q4a |
| 9 | Scope for improvement | Q4a |
| 10 | Popularity/reputation/professional | Q4a |
data.coded <- lapply(COUNTRIES, function(y) {
lapply(QUESTIONS, function(x) {
file <- paste("Tech consumer persona_",y,"_coded_data_13-07-2017.xlsx",
sep = "")
path <- file.path(survey_folder,'coded data',file)
read.xlsx(path, sheet = x, check.names = TRUE) %>%
rename_(Text = paste(x)) %>%
mutate(Question = paste(x))
}) %>%
bind_rows()
}) %>%
bind_rows() %>%
gather(CodeName, Code, starts_with("Code"), -respid, -Text, -Question) %>%
select(-CodeName) %>%
filter(!is.na(Code)) %>%
left_join(codeframe, by = c("Question","Code")) %>%
left_join(df.all, by = c("Question","respid")) %>%
select(respid, Segment, Question, Label, Text)
Below I make a dataset that make the label counts relative for each group and then subtracts the proportional mean for each label.
data.test <- data.coded %>%
group_by(Question, Segment, Label) %>%
summarize(n = n()) %>%
filter(!is.na(Segment)) %>%
mutate(freq = n / sum(n, na.rm = TRUE)) %>%
ungroup() %>%
group_by(Question, Label) %>%
mutate(average = mean(freq, na.rm = TRUE),
center = freq-mean(freq, na.rm = TRUE)) %>%
mutate(check = freq - average) %>%
arrange(Label,-freq)
Below are plots that show the the proportional distribution of each label by segment. This is not the best plot so I subset the labels in the following set of charts.
results <- lapply(QUESTIONS, function (x) {
QUESTION <- x
TITLE <- df.questions %>%
filter(Number == QUESTION) %>% select(Title) %>% as.character()
output <- data.test %>%
filter(Question == QUESTION) %>%
ggplot(aes(x = Label, fill = factor(Segment))) +
geom_bar(aes(y = center), stat = "identity", show.legend = FALSE) +
coord_flip() +
facet_grid(. ~ Segment) +
labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
"\nCode Distribution by Segment", sep=" - "),
subtitle = paste("Date:", TODAY),
y = "Distribution")
})
The above was difficult to read so just selected the top 8 codes for each segment.
TOP <- TERMS * 6
data.test <- data.coded %>%
group_by(Question, Segment, Label) %>%
summarize(n = n()) %>%
filter(!is.na(Segment)) %>%
mutate(freq = n / sum(n, na.rm = TRUE)) %>%
ungroup() %>%
group_by(Question, Label) %>%
mutate(center = freq-mean(freq, na.rm = TRUE)) %>%
group_by(Question, Segment) %>%
top_n(TERMS, center) %>%
ungroup()
results <- lapply(QUESTIONS, function (x) {
QUESTION <- x
TITLE <- df.questions %>%
filter(Number == QUESTION) %>% select(Title) %>% as.character()
data.test <- data.test %>%
filter(Question == QUESTION) %>%
arrange(Segment, center) %>%
mutate(order = row_number())
output <- data.test %>%
ggplot(aes(order,center, fill = factor(Segment))) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap( ~ Segment, scales = "free") +
coord_flip() +
scale_x_continuous(
breaks = data.test$order,
labels = data.test$Label,
expand = c(0,0)) +
labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
paste("\nTop", TERMS, "Codes by Segment", sep = " "),
sep=" - "),
subtitle = paste("Date:", TODAY),
y = "Distribution")
})