Acquire and familiarize yourself with the data

A. Identify two surveys that you want to compare.

I am using the two most recent waves of the World Values Survey to examine the responses to the following survey question:

How frequently do you think about meaning and purpose in life?

B. Describe the graph.

The graph that I will be making will attempt to convey the following:

  • A comparison of the response breakdown between countries within the same wave
  • A comparison of the response breakdown within countries across different waves.

This information can be represented by a stacked horizontal bar plot that has the proportion of survey responses on the horizontal axis and the countries on the vertical axis.

Moreover, this particular survey question has five responses of interest:

  1. Often
  2. Sometimes
  3. Rarely
  4. Never
  5. Don’t know

Applying the appropriate color palette could help the viewer to more easily interpret the breakdown of each country’s response. For example, we could use similar, positive colors for responses indicating a greater frequency of “thinking about meaning and purpose”.

C. Write down the target shape.

{wave: "", country: "", meaning: "", proportion: ""}

  • wave: “2005-2009”, “2010-2014”
  • country: “Afghanistan…”
  • meaning: “Often”, “Sometimes”, “Rarely”, “Never”, “Don’t know”
  • proportion: “numeric”

Since the objective is to compare all countries that have values in both waves, we do not yet know exactly how many distinct countries the country variable will contain (and therefore how many rows the final data set will have).

Write code to analyze and visualize it

library(ggplot2)
library(dplyr)
library(reshape2)
library(scales)
library(RColorBrewer)
## labelValues() looks for its "<variable>.txt" codebooks via a relative
## path, so the working directory has to be the folder that holds them
setwd("~/mailman/qmssviz/hw3/_posts/")

########################
# FUNCTIONS
########################
selectVars <- function(df, code, name) {
#     select variables to include in the subsetted
#     data frame and rename them.
#       df:   data frame holding the raw variables
#       code: character vector of column names (variable codes) to keep
#       name: character vector of new column names, one per code
#     returns a data.frame with the columns in `code`, renamed to `name`.
#     stops with an error if `code` and `name` differ in length.
    if (length(code) != length(name)) {
        stop("Length of code and name vectors do not match.")
    }
    ## df[code] already yields the selected columns, so nothing has to be
    ## pre-allocated; data.frame() keeps the plain data.frame return type
    dfNames <- data.frame(df[code])
    names(dfNames) <- name
    return(dfNames)
}

factorizeVars <- function(df) {
#     convert all variables into factors.
#       df: data frame (or list of equal-length columns)
#     returns a data.frame in which every column is a factor.
#     the conversion is done column by column: apply() would first coerce
#     the whole data frame to a matrix, which pads numbers with blanks when
#     columns are of mixed type and hands back character data instead of
#     factors.
    data.frame(lapply(df, as.factor))
}

labelValues <- function(var, df, sep = ":") {
#     label the values of one column using its codebook.
#       var: name of the column; the codebook is "<var>.txt" in the
#            current directory
#       df:  data frame containing the column
#       sep: field separator used inside the codebook file
#     the codebook has no header; its first field holds the raw codes and
#     its second field the labels. returns the column as a labeled factor,
#     or the column untouched (plus a note on the console) when no
#     codebook file is found.
    codebookFile <- paste0(var, ".txt")

    ## no codebook: say so and hand the column back as it is
    if (!file.exists(codebookFile)) {
        cat(sprintf("%s does not exist in the current directory. \n", codebookFile))
        cat(sprintf("No labeling done for %s.", var))
        return(df[, var])
    }

    codebook <- read.delim(codebookFile, header = FALSE, sep = sep)
    factor(df[, var], levels = codebook$V1, labels = codebook$V2)
}

labelDataFrame <- function(df, sep = ":") {
#     read in codebooks from the current directory and
#     use them to label values in the supplied data.frame.
#       df:  data frame whose column names identify the codebooks
#       sep: field separator passed on to labelValues()
#     returns a data.frame with one column per column of df, each one
#     being whatever labelValues() returns for it.
#     lapply() is used instead of sapply() so the per-column results are
#     never simplified into a matrix: simplification throws away the
#     factor levels and breaks on single-row input.
    listLabeled <- lapply(colnames(df), function(var) {
        labelValues(var, df, sep = sep)
    })
    names(listLabeled) <- colnames(df)
    dfLabeled <- data.frame(listLabeled)
    return(dfLabeled)
}

reorderResponses <- function(df, var, newLevels) {
#     rebuild one column as a factor with the given level order.
#       df:        data frame containing the column
#       var:       name of the column to re-level
#       newLevels: character vector with the levels in the wanted order;
#                  values not listed here become NA
#     returns df with column `var` replaced.
    releveled <- factor(df[, var], levels = newLevels)
    df[, var] <- releveled
    df
}

handleMissingValues <- function(df, missingLabels, var = "meaning") {
#     remove missing responses from the dataset.
#       df:            data frame with a response column
#       missingLabels: vector of response values that count as missing
#       var:           name of the response column (defaults to "meaning",
#                      the column this script has always filtered on)
#     returns df without the rows whose response is in missingLabels.
#     rows with an NA response are kept unless NA is listed itself.
    if (!(var %in% names(df))) {
        stop(sprintf("Column '%s' not found in data frame.", var))
    }
    keep <- !(df[[var]] %in% missingLabels)
    ## drop = FALSE keeps a data frame even when df has a single column
    df[keep, , drop = FALSE]
}

subsetByCountriesInBothWaves <- function(df, wave1, wave2) {
#     restrict the data to countries that appear in both waves.
#       df:    data frame with `wave` and `country` columns
#       wave1: value of `wave` identifying the first wave
#       wave2: value of `wave` identifying the second wave
#     returns the rows of wave1 followed by the rows of wave2, limited to
#     the countries present in both; rows from other waves are dropped.
    rowsWave1 <- filter(df, wave == wave1)
    rowsWave2 <- filter(df, wave == wave2)

    ## trim wave 1 to the countries seen in wave 2, then trim wave 2
    ## against the already-trimmed wave 1
    rowsWave1 <- rowsWave1[rowsWave1$country %in% rowsWave2$country, ]
    rowsWave2 <- rowsWave2[rowsWave2$country %in% rowsWave1$country, ]

    rbind(rowsWave1, rowsWave2)
}

reorderCountriesByProportion <- function(df, refWave, refResponse = "Often") {
#     reorders the levels of the country factor variable by
#     the proportion of responses from the reference wave.
#       df:          data frame with `wave`, `country`, `meaning` and
#                    `proportion` columns
#       refWave:     value of `wave` used as the reference
#       refResponse: value of `meaning` whose proportion drives the
#                    ordering (defaults to "Often", as before)
#     returns df with `country` turned into a factor whose levels run from
#     the highest to the lowest reference proportion. countries without a
#     reference row are not among the levels and therefore become NA.
    isRef <- which(df$wave == refWave & df$meaning == refResponse)
    refCountries <- as.character(df$country[isRef])
    refProportions <- df$proportion[isRef]

    ## ties keep their original row order
    ranked <- refCountries[order(refProportions, decreasing = TRUE)]

    ## unique() guards against repeated reference rows, which factor()
    ## would reject as duplicated levels
    df$country <- factor(df$country, levels = unique(ranked))
    return(df)
}

########################
# BEGIN SCRIPT
########################

## Globals
## order in which the response levels are arranged for plotting
newLevels <- c("Often", "Sometimes", "Rarely", "Never", "No answer",
               "Not asked in survey", "Missing: Unknown", "Don´t know")
## responses that are dropped before proportions are computed
missingLabels <- c("No answer", "Missing: Unknown", "Not asked in survey")

## reading in the data
## load() restores the saved object under its original name and only
## returns that name, so its result is not assigned to df
load("~/Downloads/WVS_Longitudinal.rdata")
df <- `WVS_Longitudinal_1981-2014_spss_v_2014_06_17_(beta)`

## full pipeline: select and label the survey variables, compute the
## share of each response per wave and country, then draw the plot
df %>%
    # pre-processing
    selectVars(code = c("S002", "S003", "S020", "F001"),
               name = c("wave", "country", "year", "meaning")) %>%
    factorizeVars() %>%
    labelDataFrame() %>%
    reorderResponses(var = "meaning", newLevels = newLevels) %>%
    handleMissingValues(missingLabels = missingLabels) %>%
    ## get proportion of responses for each country by wave
    ## (summarise() peels off the last grouping level, so sum(count)
    ## below is taken within each wave/country pair)
    group_by(wave, country, meaning) %>%
        summarise(count = n()) %>%
        mutate(proportion = count/sum(count)) %>%
        arrange(meaning, desc(proportion)) %>%
    ## prep data for plotting
    subsetByCountriesInBothWaves(wave1 = "2005-2009",
                                 wave2 = "2010-2014") %>%
    reorderCountriesByProportion(refWave = "2005-2009") %>%
    
    ## generate the plot
    ggplot(aes(country, proportion, fill = meaning)) +
    geom_bar(stat = "identity", alpha = 7/8) + facet_grid(~wave) +
    ggtitle("How frequently do you think about meaning and purpose in life?") +
    xlab("Country") + ylab("Proportion of Survey Responses") +
    ## display proportion as percent
    ## (argument spelled out as `labels` rather than relying on partial
    ## matching of `label`)
    scale_y_continuous(labels = percent) +
    scale_fill_manual(values = rev(brewer.pal(n = 5, "RdYlBu"))) +
    ## countries on the vertical axis, proportions on the horizontal one
    coord_flip() + 
    ## minimalist theme
    theme_bw() +
    theme(legend.position = "bottom",
          legend.title = element_blank(),
          ## make reference lines darker
          panel.grid.major.x = element_line(color = "black"),
          panel.grid.major.y = element_blank(),
          panel.border = element_blank(),
          strip.background = element_blank(),
          axis.ticks = element_blank())
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## year.txt does not exist in the current directory. 
## No labeling done for year.

########################
# END SCRIPT
########################