Set Up
Load Data
Combine Data
Code Distribution By Segment (relative)
- Q4a
- Q5a
- Q16a
- Q16b
Top Codes by Segment
- Q4a
- Q5a
- Q16a
- Q16b

Set Up

library(readr)
library(openxlsx)
library(textreadr)
library(topicmodels)
library(tidyverse)
library(stringr)
library(quanteda, pos = "package:base")
library(tidytext)
source("functions_all.R")
source("functions_survey.R")

TODAY <- params$TODAY
SOURCE.TYPE <- params$SOURCE.TYPE
TERMS <- params$TERMS

COUNTRIES <- c("USA", "India", "Germany", "Poland", "Mexico","China")
QUESTIONS <<- c("Q4a", "Q5a", "Q16a","Q16b")
survey_folder <- file.path('data/surveys')

Load Data

All Survey Data

file <- file.path(survey_folder,"J3289 Microsoft Segmentation verbatims.xlsm")

df.all <- lapply(QUESTIONS, function(x){
    read.xlsx(file, sheet = x, startRow = 8) %>%
    mutate(cleanText = cleanr(Verbatim), Question = x) %>%
    select(cleanText, Segment, Question, respid)
}) %>% bind_rows() %>%
    mutate(Segment = as.factor(Segment),
           Question = as.factor(Question)) %>%
    select(respid, Question, Segment, cleanText)

Questions

df.questions <- read_csv("data/Survey Question Info.csv")

Number	Title	Question
Q4a	Microsoft brand perceptions	We’d like to understand more about your brand perceptions of Microsoft. Could you please tell us what you were thinking about when you gave your score of …
Q5a	HP brand perceptions	We’d like to understand more about your brand perceptions of HP. Could you please tell us what you were thinking about when you gave your score of …
Q16a	Future purchases - most exciting	Thinking about future technology purchases, what are you most excited about?
Q16b	Future purchases - concerned or apprehensive	Again, thinking about your future technology purchases, what, if anything are you concerned or apprehensive about?

Segmented Respondents

segment.stats <- df.all %>%
    distinct(respid, Segment) %>%
    group_by(Segment) %>%
    mutate(Seg.People = n()) %>%
    ungroup()

Codeframe

file <- file.path(survey_folder,
                  'P52_Tech consumer_persona_Universal Codeframe_13-07-2017-CLEAN.xlsx')

codeframe <- lapply(QUESTIONS, function(x) {
    SHEET <- paste("Codeframe","-",x)
    
    read.xlsx(file, sheet = SHEET) %>%
        mutate(Question = paste(x)) 
}) %>% bind_rows() %>%
    rename(Code = Codes)

Code	Label	Question
1	General like	Q4a
2	User friendly/easy to use	Q4a
3	Innovative	Q4a
4	Features	Q4a
5	Latest technology/quality technology/updates	Q4a
6	Good services/security	Q4a
7	High quality products/good products/satisfied	Q4a
8	Reliable/safe products	Q4a
9	Scope for improvement	Q4a
10	Popularity/reputation/professional	Q4a

Combine Data

data.coded <- lapply(COUNTRIES, function(y) {
    lapply(QUESTIONS, function(x) {
        file <- paste("Tech consumer persona_",y,"_coded_data_13-07-2017.xlsx",
                  sep = "")
        path <- file.path(survey_folder,'coded data',file)
        read.xlsx(path, sheet = x, check.names = TRUE) %>%
            rename_(Text = paste(x)) %>%
            mutate(Question = paste(x))
    }) %>%
        bind_rows() 
}) %>%
    bind_rows() %>%
    gather(CodeName, Code, starts_with("Code"), -respid, -Text, -Question) %>%
    select(-CodeName) %>%
    filter(!is.na(Code)) %>%
    left_join(codeframe, by = c("Question","Code")) %>%
    left_join(df.all, by = c("Question","respid")) %>%
    select(respid, Segment, Question, Label, Text)

Code Distribution By Segment (relative)

Below I make a dataset that make the label counts relative for each group and then subtracts the proportional mean for each label.

data.test <- data.coded %>%
    group_by(Question, Segment, Label) %>%
    summarize(n = n()) %>%
    filter(!is.na(Segment)) %>%
    mutate(freq = n / sum(n, na.rm = TRUE)) %>%
    ungroup() %>%
    group_by(Question, Label) %>%
    mutate(average = mean(freq, na.rm = TRUE),
           center = freq-mean(freq, na.rm = TRUE)) %>%
    mutate(check = freq - average) %>%
    arrange(Label,-freq)

Below are plots that show the the proportional distribution of each label by segment. This is not the best plot so I subset the labels in the following set of charts.

results <- lapply(QUESTIONS, function (x) {
    QUESTION <- x
    TITLE <- df.questions %>%
        filter(Number == QUESTION) %>% select(Title) %>% as.character()
    output <- data.test %>%
        filter(Question == QUESTION) %>%
        ggplot(aes(x = Label, fill = factor(Segment))) +
        geom_bar(aes(y = center), stat = "identity", show.legend = FALSE) +
        coord_flip() +
        facet_grid(. ~ Segment) +
        labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
                           "\nCode Distribution by Segment", sep=" - "),
        subtitle = paste("Date:", TODAY),
        y = "Distribution")
})

Q4a

Q5a

Q16a

Q16b

Top Codes by Segment

The above was difficult to read so just selected the top 8 codes for each segment.

TOP <- TERMS * 6

data.test <- data.coded %>%
    group_by(Question, Segment, Label) %>%
    summarize(n = n()) %>%
    filter(!is.na(Segment)) %>%
    mutate(freq = n / sum(n, na.rm = TRUE)) %>%
    ungroup() %>%
    
    group_by(Question, Label) %>%
    mutate(center = freq-mean(freq, na.rm = TRUE)) %>%

    group_by(Question, Segment) %>%
    top_n(TERMS, center) %>%
    ungroup()

results <- lapply(QUESTIONS, function (x) {
    QUESTION <- x
    TITLE <- df.questions %>%
        filter(Number == QUESTION) %>% select(Title) %>% as.character()
    

    data.test <- data.test %>%
        filter(Question == QUESTION) %>%
        
        arrange(Segment, center) %>%
        mutate(order = row_number())
    
    output <- data.test %>%
        ggplot(aes(order,center, fill = factor(Segment))) +
        geom_bar(stat = "identity", show.legend = FALSE) +
        facet_wrap( ~ Segment, scales = "free") +
        coord_flip() +
        scale_x_continuous(
            breaks = data.test$order,
            labels = data.test$Label,
            expand = c(0,0)) +
        labs(title = paste(SOURCE.TYPE, QUESTION,TITLE,
                           paste("\nTop", TERMS, "Codes by Segment", sep = " "),
                           sep=" - "),
        subtitle = paste("Date:", TODAY),
        y = "Distribution")
})

HP OEM - Mapping Segments to Open-End Coding

Set Up

Load Data

All Survey Data

Questions

Segmented Respondents

Codeframe

Combine Data

Code Distribution By Segment (relative)

Q4a

Q5a

Q16a

Q16b

Top Codes by Segment

Q4a

Q5a

Q16a

Q16b