Objective

  1. Labeling
  2. Export file from R to SPSS (sav) format

Library

pacman::p_load(
  here,       # relative file pathways  
  haven,      # reading sav file
  sjlabelled,
  labelled,
  dplyr,      # data cleaning 
  rio,        # importing data  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  surveytoolbox,
  pbkrtest,
  sjPlot,
  foreign,
  gmodels,
  datawizard, # freq
  AMR,        # age  
  skimr 
)
## Installing package into 'C:/Users/linhp/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

Data import

# Show the SPSS-style variable view of the imported data.
# NOTE(review): `mydata1` is not created anywhere in the visible code; it must
# already exist in the session (e.g. read from the .sav file) - confirm.
sjPlot::view_df(mydata1)
# Keep the imported object untouched; later steps work on the copy `df`.
df <- mydata1

Data labelling

  1. df <- haven::as_factor(mydata1) : Do NOT convert the whole dataset; it breaks the SPSS format on export (coded values such as -99 and -97 are replaced by factor levels 1, 2, …);
  2. df1$Q3 <- haven::as_factor(df1$Q3) : Apply as_factor only to the variables you actually want to convert.
## Option 1: Using haven::as_factor()
# Convert one column at a time: fine for a couple of variables,
# but time-consuming when many columns need the same treatment.
df1 <- df
df1[["Q3"]] <- haven::as_factor(df1[["Q3"]])
df1[["Q_age"]] <- haven::as_factor(df1[["Q_age"]])

## Option 2: Using mutate(across())
df3 <- df
cols <- c('Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q_age')

# The older form
#   df3 %>% mutate_each_(funs(haven::as_factor(.)), cols)
# was deprecated in dplyr 0.7.0 and only raises a warning now, so it is no
# longer run; mutate(across()) below is its replacement.
# `all_of()` tells tidyselect that `cols` is an external character vector of
# column names (passing the bare vector is itself deprecated).
df3 <- df3 %>% mutate(across(all_of(cols), haven::as_factor))

sapply(df3[cols], class) # check the class
## Option 3: Using lapply()  <---- the best 
### `df[cols]` (no comma) subsets the data frame as a *list* of columns
cols <- c(paste0("Q", 3:8), "Q_age")
df[cols] <- lapply(X = df[cols], FUN = haven::as_factor)
# check the class: every listed column should now report "factor"
sapply(X = df[cols], FUN = class)
##       Q3       Q4       Q5       Q6       Q7       Q8    Q_age 
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
### `df2[, cols]` (with the comma) is a *matrix*-style subset
df2 <- df
cols <- c(paste0("Q", 3:8), "Q_age")
df2[, cols] <- lapply(X = df2[, cols], FUN = haven::as_factor)
sapply(X = df2[, cols], FUN = class)
##       Q3       Q4       Q5       Q6       Q7       Q8    Q_age 
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
# lapply() performs the same class check but returns one list element per column
lapply(X = df2[cols], FUN = class)
## $Q3
## [1] "factor"
## 
## $Q4
## [1] "factor"
## 
## $Q5
## [1] "factor"
## 
## $Q6
## [1] "factor"
## 
## $Q7
## [1] "factor"
## 
## $Q8
## [1] "factor"
## 
## $Q_age
## [1] "factor"

Convert from factor to numeric for case_when labelling

## Option 1 <-- good for 1 variable only
# Look up each row's level text via the factor, then parse that text as a
# number (level text that is not numeric becomes NA).
df1[["Q3"]] <- as.numeric(levels(df1[["Q3"]])[df1[["Q3"]]])

## Option 3 <-- acceptable
cols <- c(paste0("Q", 3:8), "Q_age")
df2[cols] <- lapply(
  X = df2[cols],
  FUN = function(column) {
    # go through character first so the level text, not the integer code,
    # is what gets parsed as a number
    as.numeric(as.character(column))
  }
)
## Option 2 <--- the best, fast
cols <- c(paste0("Q", 3:8), "Q_age")
df[cols] <- lapply(
  X = df[cols],
  FUN = function(column) {
    # index the level text by the factor itself, then parse it as numbers
    level_text <- levels(column)[column]
    as.numeric(level_text)
  }
)
# check the class: every listed column should now report "numeric"
sapply(X = df[cols], FUN = class)
##        Q3        Q4        Q5        Q6        Q7        Q8     Q_age 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"

Labelling

# Bin Q3-Q6 into labelled ranges. Each new `<name>_1` column is placed right
# after its source column. Missing values get a single blank (" "); anything
# that matches neither the NA test nor an explicit range gets `.default`.
# NOTE(review): `.default` also catches values below the lowest explicit bound
# (e.g. Q3 < 3 is labelled "11-12") - confirm that such values cannot occur.
df <- df %>%
  mutate(
    Q3_1 = case_when(
      is.na(Q3)          ~ " ",
      Q3 >= 3 & Q3 <= 6  ~ "3-6",
      Q3 >= 7 & Q3 <= 10 ~ "7-10",
      .default = "11-12"
    ),
    .after = Q3
  ) %>%
  mutate(
    Q4_1 = case_when(
      is.na(Q4)         ~ " ",
      Q4 >= 1 & Q4 <= 3 ~ "1-3",
      Q4 >= 4 & Q4 <= 5 ~ "4-5",
      .default = "6+"
    ),
    .after = Q4
  ) %>%
  mutate(
    Q5_1 = case_when(
      is.na(Q5)         ~ " ",
      Q5 == 0           ~ "None",
      Q5 >= 1 & Q5 <= 2 ~ "1-2",
      .default = "3+"
    ),
    .after = Q5
  ) %>%
  mutate(
    Q6_1 = case_when(
      is.na(Q6)         ~ " ",
      Q6 == 0           ~ "None",
      Q6 == 1           ~ "1",
      Q6 >= 2 & Q6 <= 4 ~ "2-4",
      .default = "5+"
    ),
    .after = Q6
  )
# Subtract Q7, Q8 and Q_age from a fixed reference year and insert each result
# right after its source column. The year is named once so it can be changed
# in a single place.
# NOTE(review): presumably Q7, Q8 and Q_age hold calendar years - confirm.
reference_year <- 2020
df <- df %>% mutate(Q71 = reference_year - Q7, .after = Q7)
df <- df %>% mutate(Q81 = reference_year - Q8, .after = Q8)
df <- df %>% mutate(Qx  = reference_year - Q_age, .after = Q_age)

# Bin Q71 into labelled ranges; Q71_X is inserted right after Q71.
# Missing values get a single blank (" "). Values matching no clause
# (e.g. negative ones) stay NA.
df <- df %>%
  mutate(
    Q71_X = case_when(
      is.na(Q71)            ~ " ",
      Q71 == 0              ~ "Less than 1",
      Q71 >= 1 & Q71 <= 5   ~ "1-5",
      Q71 >= 6 & Q71 <= 10  ~ "6-10",
      Q71 >= 11 & Q71 <= 15 ~ "11-15",
      Q71 >= 16 & Q71 <= 20 ~ "16-20",
      # strictly above 20: the previous `>= 20` bound overlapped the 16-20 bin
      # and only gave the right label because of clause order
      Q71 > 20              ~ "21+"
    ), .after = Q71
  )

# Bin Q81 into labelled ranges; the result is inserted right after Q81.
# The column keeps the name Q82_X because the export step selects it by
# that name.
# Missing values get a single blank (" ").
df <- df %>%
  mutate(
    Q82_X = case_when(
      is.na(Q81)            ~ " ",
      Q81 == 0              ~ "Less than 1",
      Q81 >= 1 & Q81 <= 5   ~ "1-5",
      Q81 >= 6 & Q81 <= 10  ~ "6-10",
      Q81 >= 11 & Q81 <= 15 ~ "11-15",
      # Without these two bins any Q81 above 15 silently became NA; they mirror
      # the upper bins used for Q71_X.
      # NOTE(review): confirm these labels are the intended ones.
      Q81 >= 16 & Q81 <= 20 ~ "16-20",
      Q81 > 20              ~ "21+"
    ), .after = Q81
  )


# Bin Qx into five-year bands (18-24 up to 65+); Qx1 is inserted right after Qx.
# Missing values get a single blank (" ").
# NOTE(review): values below 18 match no clause and stay NA - confirm that
# this is intended.
df <- mutate(
  df,
  Qx1 = case_when(
    is.na(Qx)           ~ " ",
    Qx >= 18 & Qx <= 24 ~ "18-24",
    Qx >= 25 & Qx <= 29 ~ "25-29",
    Qx >= 30 & Qx <= 34 ~ "30-34",
    Qx >= 35 & Qx <= 39 ~ "35-39",
    Qx >= 40 & Qx <= 44 ~ "40-44",
    Qx >= 45 & Qx <= 49 ~ "45-49",
    Qx >= 50 & Qx <= 54 ~ "50-54",
    Qx >= 55 & Qx <= 59 ~ "55-59",
    Qx >= 60 & Qx <= 64 ~ "60-64",
    Qx >= 65            ~ "65+"
  ),
  .after = Qx
)

Export from R to SPSS

Convert again from numeric to factor

library(haven)
# # Option 1: transform each variable <- not recommended 
# df2$Q3_1 <- as.factor(df2$Q3_1) 

# Option 2: convert all derived label columns to factors in one pass
cols <- c("Q3_1", "Q4_1", "Q5_1", "Q6_1", "Q71_X", "Q82_X", "Qx1")
df[cols] <- lapply(X = df[cols], FUN = as.factor)
sapply(X = df[cols], FUN = class)
##     Q3_1     Q4_1     Q5_1     Q6_1    Q71_X    Q82_X      Qx1 
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
# Write the result as an SPSS .sav file; the relative path resolves against
# the current working directory.
haven::write_sav(data = df, path = "SPSS_new_datasample.sav")

References

  1. Coerce multiple columns to factors at once;
  2. How to convert data.frame column from Factor to numeric.