Library loading

pacman::p_load(
  here,       # relative file pathways
  rio,        # importing data 
  
  # libraries for spss data
  haven,      
  sjlabelled,
  labelled,
  sjPlot,
  
  # libraries for data processing 
  dplyr,      # data cleaning 
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  surveytoolbox,
  pbkrtest,
  foreign,
  gmodels,
  datawizard, # freq
  AMR,        # age  
  skimr,      # quick stat
  apyramid
)
## Installing package into 'C:/Users/linhp/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

Data import

mydata %>% sjPlot::view_df() # display SPSS variable view 
df <- mydata %>% janitor::clean_names()

Data Anonymous

A list of variables will be anonymised due to privacy concerns. Threshold: 1) df <- haven::as_factor(mydata), 2) df1$a3 <- haven::as_factor(df1$a3)

Step 1: Using haven::as_factor to read SPSS data with Value labels

# without the comma is a *list* subset
df[cols] <- lapply(df[cols], haven::as_factor)
sapply(df[cols], class) # check the class

# with the comma is a *matrix* subset 
df2 <- df
df2[, cols] <- lapply(df2[, cols], haven::as_factor) 

Step 2: Threshold

df[cols] <- lapply(df[cols], function(x) as.numeric(levels(x)[x]))
sapply(df[cols], class)
# Basic stats 
summaries <- vector(mode = "list", ncol(threshold))
for (i in 1:ncol(threshold)) {
  sm <- summary(threshold[[i]])
  summaries[[i]] <- sm
}
# histogram
outcome <- vector("list", 7)         
for (i in seq_along(threshold)) {
  print(i)
  var_name     <- names(threshold[i])
  title        <- paste0("Histogram of ", var_name, " values:") 
  x_lab        <- var_name
  outcome[[i]] <- hist(threshold[[i]], breaks = 60,
                     main = title, xlab = x_lab)
}
# frequency with datawizard
out <- vector("list", 7) 
for (i in seq_along(threshold)) {
  freq <- data_tabulate(threshold[[i]]) 
  out[[i]] <- freq
}
out

# lapply(threshold[cols], data_tabulate)

Step 3: Anonymise data

df <- df %>%   
  mutate(
      a3_1 = case_when(
      a3 >= 3 & a3 <= 6    ~ "3-6",
      a3 >= 7 & a3 <= 10   ~ "7-10",
      .default             = "11-12",    # TRUE ~ "11-12"
      is.na(a3)            ~ "No answer"
      ), .after = a3
  )

df <- df %>% select(-c(a3)) %>% rename('a3' = 'a3_1')

data_tabulate(df$a3)

Annual earning

df_h5 <- haven::as_factor(df_h5) 

Solution 1

# combine three income values into one value of annual_income 
df_h5 <- df_h5 %>% 
  unite(
    col = 'annual_income',
    c('h5a', 'h5b', 'h5c'),
    sep = " ",
    remove = FALSE,
    na.rm = TRUE
  )

Solution 2:

Average weekly income

Step 1: remove string, text

df_h5 <- df_h5 %>% 
  separate(h5a, into = c("w_from", "w_to"), sep = "-", extra = "merge")     

df_h5$w_from <- gsub('and more', '', as.character(df_h5$w_from))  # df$w_from <- str_replace(string = df$w_from, pattern = 'and more', replacement = '') 
df_h5$w_from <- as.numeric(gsub(",","", df_h5$w_from))             # remove comma (,) and turn to numeric
df_h5$w_to <- as.numeric(gsub(",","", df_h5$w_to))                 # remove comma (,) and turn to numeric

Step 2: annual income calculation

df_h5 <- df_h5 %>% 
  mutate(w_from_annual = (df_h5$w_from * 52), .after = w_to) %>%        # 52 week per year    
  mutate(w_to_annual = (df_h5$w_to * 52), .after = w_from_annual)       # 52 week per year 

Step 3: Unite columns

df_h5$w_from_annual <- as.character(df_h5$w_from_annual)
df_h5$w_to_annual <- as.character(df_h5$w_to_annual)

df_h5 <- df_h5 %>% 
  unite(
    col = 'annual_income (w)',
    c('w_from_annual', 'w_to_annual'),
    sep = '-',
    remove = TRUE,
    na.rm = FALSE
  )
df_h5$w_from <- NULL
df_h5$w_to <- NULL 
df_h5$`annual_income (w)` <- gsub('NA-NA', '', as.character(df_h5$`annual_income (w)`)) 
df_h5$`annual_income (w)` <- str_replace(df_h5$`annual_income (w)`, '54236-NA', '54236 and more')

Average monthly income

Step 1: remove string, text

df_h5 <- df_h5 %>% 
  separate(h5b, into = c("m_from", "m_to"), sep = "-", extra = "merge")     

df_h5$m_from <- gsub('and more', '', as.character(df_h5$m_from))           
df_h5$m_from <- as.numeric(gsub(",","",df_h5$m_from))                     
df_h5$m_to <- as.numeric(gsub(",","",df_h5$m_to))                          

Step 2: Calculate annual income

df_h5 <- df_h5 %>% 
  mutate(m_from_annual = (df_h5$m_from * 12), .after = m_to) %>%        # 12 month per year    
  mutate(m_to_annual = (df_h5$m_to * 12), .after = m_from_annual)       # 12 month per year 

Step 3: Unite columns

df_h5$m_from_annual <- as.character(df_h5$m_from_annual) 
df_h5$m_to_annual <- as.character(df_h5$m_to_annual) 

df_h5 <- df_h5 %>% 
  unite(
    col = 'annual_income (m)',
    c('m_from_annual', 'm_to_annual'),
    sep = '-',
    remove = TRUE,
    na.rm = FALSE
  )
df_h5$m_from <- NULL
df_h5$m_to <- NULL 

df_h5$`annual_income (m)` <- gsub('NA-NA', '', as.character(df_h5$`annual_income (m)`)) 
df_h5$`annual_income (m)` <- str_replace(df_h5$`annual_income (m)`, '55596-NA', '55596 and more')
str(df_h5$h5c)
df_h5$h5c <- as.character(df_h5$h5c)

df_h5 <- df_h5 %>% 
  unite(
    col = 'annual income (£)',
    c('annual_income (w)', 'annual_income (m)', 'h5c'),
    sep = ' ',
    remove = TRUE,
    na.rm = FALSE
  )
df_h5$`annual income (£)` <- gsub('NA','', df_h5$`annual income (£)`)
df_h5$`annual income (£)` <- gsub(',','', df_h5$`annual income (£)`)
df_h5$`annual income (£)` <- str_trim(df_h5$`annual income (£)`, side = c("both"))
df_h5 %>% 
  mutate(earnings = (fct_relevel(`annual income (£)`)), .after = `annual income (£)`) %>% 
  tabyl(earnings)
df_h5 <- df_h5 %>% select(c(h5, `annual income (£)`))
df_h5 %>% group_by(h5, `annual income (£)`) %>% summarise(count = n()) 
## `summarise()` has grouped output by 'h5'. You can override using the `.groups`
## argument.

Label missing data

Data preparation

names(which(colSums(is.na(df)) > 0)) # List cols having NA 
df[cols] <- lapply(df[cols], as.factor) # Convert data from numeric to factor
sapply(df[cols], class)
df <- haven::as_factor(df)

Data label

# Label missing value by 'No answer'
df <- df %>% mutate(across(where(is.factor), ~fct_na_value_to_level(.,'No answer')))

df <- df %>% 
  mutate(region_fb    = fct_recode(region_fb, 'Not applicable'    = 'No answer'),
         age_fb       = fct_recode(age_fb, 'Not applicable'       = 'No answer'),
         distribution = fct_recode(distribution, 'Not applicable' = 'No answer'),
         adset        = fct_recode(adset, 'Not applicable'        = 'No answer')
         )

# Check missing value
names(which(colSums(is.na(df)) > 0))

# display SPSS variable view
df %>% sjPlot::view_df()  
class(df)
# For multiple choice questions, labelling '0' by 'Not marked - Marked' 
df <- df %>% mutate(across(c(b1a_1:b2_12, e5_1:e6_5), 
                            ~case_when(. == 0           ~ "Not marked",
                                       . == "No answer" ~ "No answer",    # .default = 
                                       TRUE             ~ "Marked")))
lapply(df[cols], data_tabulate)

Export data

df <- haven::as_factor(df) # display SPSS variable view
class(df)
df %>% sjPlot::view_df()
write_sav(df, "Annonymised data.sav")

Data visualisation

Likert-scale questions

Step 1: Data preparation

valid_questions <- df %>% select(c(c5a_1:c5a_5))                                      
valid_questions <- valid_questions %>% gather(key = "Question_num", value = "Answer") # wide to long format 
# valid_questions <- na.omit(valid_questions) 
valid_questions_summary <- valid_questions %>%
 group_by(Question_num, Answer) %>%
    dplyr::summarize(freq = n()) %>%
      ungroup %>% group_by(Question_num) %>% 
        mutate(proportion = round(freq / sum(freq),1))
## `summarise()` has grouped output by 'Question_num'. You can override using the
## `.groups` argument.

Step 2: Data visualisation

ggplot(valid_questions, aes(x=Question_num)) +
  geom_bar(aes(fill=Answer), position = "fill") +
  geom_text(
    data = valid_questions_summary,
    aes(y = freq, label = scales::percent(proportion), group = Answer),
    position = position_fill(vjust = 0.5),
    color = 'gray25', size = 3
  ) +
  scale_fill_brewer(palette = 'Spectral', direction = -1) +
  scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
  labs(
    title = "Likert Plot of Question c5", subtitle = "Valid responses",
    x = "", 
    y = ""
  ) +
  theme(legend.text  = element_text(size= 8.5),
        legend.title = element_text(size= 9)) +
  theme_classic()

ggplot(valid_questions, aes(x=Question_num)) +
  geom_bar(aes(fill=Answer), position = "fill") +
  geom_text(
    data = valid_questions_summary,
    aes(y = freq, label = scales::percent(proportion), group = Answer),
    position = position_fill(vjust = 0.5),
    color = 'gray25', size = 3
  ) +
  scale_fill_brewer(palette = 'Spectral', direction = -1) +
  scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
  labs(
    title = "Likert Plot of Question c5", subtitle = "Valid responses",
    x = "", 
    y = ""
  ) +
  theme_classic() +
  theme(legend.position = "top") +
  theme(legend.text  = element_text(size= 8.5),
        legend.title = element_text(size= 9)) +
  coord_flip()

In case we don’t want to showcase the answer No answer:

Step 1: Data preparation

valid_questions12 <- valid_questions %>% filter(Answer != "No answer")

valid_questions_summary <- valid_questions12 %>%
 group_by(Question_num, Answer) %>%
    dplyr::summarize(freq = n()) %>%
      ungroup %>% group_by(Question_num) %>% 
        mutate(proportion = round(freq / sum(freq),1))
## `summarise()` has grouped output by 'Question_num'. You can override using the
## `.groups` argument.

Step 2: Data visualisation

ggplot(valid_questions12, aes(x=Question_num)) +
  geom_bar(aes(fill=Answer), position = "fill") +
  geom_text(
    data = valid_questions_summary,
    aes(y = freq, label = scales::percent(proportion), group = Answer),
    position = position_fill(vjust = 0.5),
    color = 'gray25', size = 3
  ) +
  scale_fill_brewer(palette = 'Spectral', direction = -1) +
  scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
  labs(
    title = "Likert Plot of Question c5", 
    subtitle = "Valid responses",
    x = "",
    y = "") +
  theme_classic() +
  theme(legend.position = "top") +
  theme(legend.text  = element_text(size= 8.5),
        legend.title = element_blank()) +        # remove legend title 
  coord_flip()

Age-group questions

age <- df %>% select(c(h1_1, h2)) %>% 
  filter(h1_1 != "No answer") %>% 
  filter(!h2 %in% c("Prefer not to say", "No answer", "Non-binary", "Any other gender identity")) %>% 
  group_by(h1_1, h2) %>% 
  summarise(freq = n()) %>% ungroup() %>% 
  mutate(age = factor(h1_1, ordered = TRUE),
         freq = ifelse(h2 == 'Male', (freq*-1), freq)) %>% 
  rename("gender" = "h2", "ageGroup" = "age") %>% 
  select(-c(h1_1))
## `summarise()` has grouped output by 'h1_1'. You can override using the
## `.groups` argument.
ggplot(age, aes(y = ageGroup, x = freq, fill = gender)) +
  geom_bar(data=subset(age,gender=="Male"), stat = "identity") +
  geom_bar(data=subset(age,gender=="Female"), stat = "identity") +
  scale_x_continuous(breaks=c(-15, -50, -75, -100, 0, 25, 75, 100, 150, 180)) +
  labs(title = "Age group ~ Gender",
       subtitle = "Valid responses",
       x = "",
       y = "") +
  theme_classic()+
  scale_fill_manual(values = c('darkred', '#254061')) +
  theme(legend.text  = element_text(size= 8.5),
        legend.title = element_blank()) 

Reference

  1. Using Skimr;
  2. R for Data Science;
  3. Working with SPSS labels in R;
  4. datawizard: Easy Data Wrangling and Statistical Transformations;
  5. easystats: An R Framework for Easy Statistical Modeling, Visualization, and Reporting;
  6. Some good practices for research with R;
  7. Split Ages into Age Groups;
  8. Apply function across colums;
  9. Data Cleaning Challenge: Outliers (R);
  10. SPSS - Variable measurement level;
  11. SPSS - Leveraging labelled data in R;
  12. SPSS - Introduction to labelled data