Set Up

library(readr)
library(openxlsx)
library(textreadr)
library(topicmodels)
library(tidyverse)
library(stringr)
library(quanteda, pos = "package:base")
library(tidytext)
source("functions_all.R")
source("functions_survey.R")
TODAY <- params$TODAY
SOURCE.TYPE <- params$SOURCE.TYPE
TERMS <- params$TERMS

COUNTRIES <- c("USA", "India", "Germany", "Poland", "Mexico","China")
QUESTIONS <<- c("Q4a", "Q5a", "Q16a","Q16b")
survey_folder <- file.path('data/surveys')

Load Data

All Survey Data

file <- file.path(survey_folder,"J3289 Microsoft Segmentation verbatims.xlsm")

df.all <- lapply(QUESTIONS, function(x){
    read.xlsx(file, sheet = x, startRow = 8) %>%
    mutate(cleanText = cleanr(Verbatim), Question = x) %>%
    select(cleanText, Segment, Question, respid)
}) %>% bind_rows() %>%
    mutate(Segment = as.factor(Segment),
           Question = as.factor(Question)) %>%
    select(respid, Question, Segment, cleanText)

Questions

df.questions <- read_csv("data/Survey Question Info.csv") 
Number Title Question
Q4a Microsoft brand perceptions We’d like to understand more about your brand perceptions of Microsoft. Could you please tell us what you were thinking about when you gave your score of …
Q5a HP brand perceptions We’d like to understand more about your brand perceptions of HP. Could you please tell us what you were thinking about when you gave your score of …
Q16a Future purchases - most exciting Thinking about future technology purchases, what are you most excited about?
Q16b Future purchases - concerned or apprehensive Again, thinking about your future technology purchases, what, if anything are you concerned or apprehensive about?

Segmented Respondents

segment.stats <- df.all %>%
    distinct(respid, Segment) %>%
    group_by(Segment) %>%
    mutate(Seg.People = n()) %>%
    ungroup()

Codeframe

file <- file.path(survey_folder,
                  'P52_Tech consumer_persona_Universal Codeframe_13-07-2017-CLEAN.xlsx')

codeframe <- lapply(QUESTIONS, function(x) {
    SHEET <- paste("Codeframe","-",x)
    
    read.xlsx(file, sheet = SHEET) %>%
        mutate(Question = paste(x)) 
}) %>% bind_rows() %>%
    rename(Code = Codes)
Code Label Question
1 General like Q4a
2 User friendly/easy to use Q4a
3 Innovative Q4a
4 Features Q4a
5 Latest technology/quality technology/updates Q4a
6 Good services/security Q4a
7 High quality products/good products/satisfied Q4a
8 Reliable/safe products Q4a
9 Scope for improvement Q4a
10 Popularity/reputation/professional Q4a

Combine Data

data.coded <- lapply(COUNTRIES, function(y) {
    lapply(QUESTIONS, function(x) {
        file <- paste("Tech consumer persona_",y,"_coded_data_13-07-2017.xlsx",
                  sep = "")
        path <- file.path(survey_folder,'coded data',file)
        read.xlsx(path, sheet = x, check.names = TRUE) %>%
            rename_(Text = paste(x)) %>%
            mutate(Question = paste(x))
    }) %>%
        bind_rows() 
}) %>%
    bind_rows() %>%
    gather(CodeName, Code, starts_with("Code"), -respid, -Text, -Question) %>%
    select(-CodeName) %>%
    filter(!is.na(Code)) %>%
    left_join(codeframe, by = c("Question","Code")) %>%
    left_join(df.all, by = c("Question","respid")) %>%
    select(respid, Segment, Question, Label, Text)

Code Distribution By Segment (relative)

Below I make a dataset that make the label counts relative for each group and then subtracts the proportional mean for each label.

data.test <- data.coded %>%
    group_by(Question, Segment, Label) %>%
    summarize(n = n()) %>%
    filter(!is.na(Segment)) %>%
    mutate(freq = n / sum(n, na.rm = TRUE)) %>%
    ungroup() %>%
    group_by(Question, Label) %>%
    mutate(average = mean(freq, na.rm = TRUE),
           center = freq-mean(freq, na.rm = TRUE)) %>%
    mutate(check = freq - average) %>%
    arrange(Label,-freq)

Below are plots that show the the proportional distribution of each label by segment. This is not the best plot so I subset the labels in the following set of charts.

results <- lapply(QUESTIONS, function (x) {
    QUESTION <- x
    TITLE <- df.questions %>%
        filter(Number == QUESTION) %>% select(Title) %>% as.character()
    output <- data.test %>%
        filter(Question == QUESTION) %>%
        ggplot(aes(x = Label, fill = factor(Segment))) +
        geom_bar(aes(y = center), stat = "identity", show.legend = FALSE) +
        coord_flip() +
        facet_grid(. ~ Segment) +
        labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
                           "\nCode Distribution by Segment", sep=" - "),
        subtitle = paste("Date:", TODAY),
        y = "Distribution")
})

Q4a

Q5a

Q16a

Q16b

Top Codes by Segment

The above was difficult to read so just selected the top 8 codes for each segment.

TOP <- TERMS * 6

data.test <- data.coded %>%
    group_by(Question, Segment, Label) %>%
    summarize(n = n()) %>%
    filter(!is.na(Segment)) %>%
    mutate(freq = n / sum(n, na.rm = TRUE)) %>%
    ungroup() %>%
    
    group_by(Question, Label) %>%
    mutate(center = freq-mean(freq, na.rm = TRUE)) %>%

    group_by(Question, Segment) %>%
    top_n(TERMS, center) %>%
    ungroup()
results <- lapply(QUESTIONS, function (x) {
    QUESTION <- x
    TITLE <- df.questions %>%
        filter(Number == QUESTION) %>% select(Title) %>% as.character()
    

    data.test <- data.test %>%
        filter(Question == QUESTION) %>%
        
        arrange(Segment, center) %>%
        mutate(order = row_number())
    
    output <- data.test %>%
        ggplot(aes(order,center, fill = factor(Segment))) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        facet_wrap( ~ Segment, scales = "free") +
        coord_flip() +
        scale_x_continuous(
            breaks = data.test$order,
            labels = data.test$Label,
            expand = c(0,0)) +
        labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
                           paste("\nTop", TERMS, "Codes by Segment", sep = " "),
                           sep=" - "),
        subtitle = paste("Date:", TODAY),
        y = "Distribution")
})

Q4a

Q5a

Q16a

Q16b