Survey data with R

Objective

Working with SPSS files in R;
Understanding variables, variable labels, value labels, and data types.

Library

pacman::p_load(
  here,       # relative file pathways  
  haven,      # reading sav file
  sjlabelled,
  labelled,
  dplyr,      # data cleaning 
  rio,        # importing data  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  pbkrtest,
  sjPlot,
  foreign,
  gmodels,
  datawizard, # freq
  AMR,        # age  
  skimr 
)

Data

# Work on a copy of the imported data set.
# NOTE(review): `df` is not created in the visible part of this file --
# presumably it is the imported SPSS (.sav) data; confirm it is loaded first.
dt1 <- df
sjPlot::view_df(dt1)  # SPSS-style "variable view" of the data

# Inspect the structure of the data set.
glimpse(dt1)
str(dt1)

# Attributes of a single item (variable label and value labels).
attributes(dt1[["Questionx3"]])

# Number of rows and columns.
dim(dt1)

Exploratory Analysis

Time

# Derive timing variables for every response and return a plain data.frame.
#   duration_in_min : `Timeresponse` / 60, rounded to 2 decimals
#                     (assumes `Timeresponse` is in seconds -- TODO confirm).
#   diff.dates      : `EndDate` - `RecordedDate` in days, inserted right
#                     after `RecordedDate`.
# `diff.dates` is computed from the pipeline's own columns *before* the
# date-time columns are truncated to Date. This gives the same values as
# reading `dt1$EndDate` / `dt1$RecordedDate` directly, but no longer depends
# on an object outside the pipe (which would silently misalign if rows were
# ever filtered or reordered upstream).
data <- dt1 %>%
  mutate(duration_in_min = round(Timeresponse / 60, 2)) %>%
  mutate(
    diff.dates = difftime(EndDate, RecordedDate, units = "days"),
    .after = RecordedDate
  ) %>%
  # Keep only the calendar date of every POSIXct column.
  mutate(across(.cols = where(is.POSIXct), .fns = as.Date)) %>%
  as.data.frame()

Data types

Convert from <dbl+lbl> to fct [nominal]

# Columns that should be stored as factors.
col.names <- c("Question1", "Question2", "Question4")

# opt1 (base R): convert each selected column with as.factor() and write the
# results back into the same columns.
data[col.names] <- lapply(data[col.names], as.factor)

# opt2 (dplyr): same conversion with across(). all_of() makes it explicit
# that `col.names` is an external character vector of column names. The
# result is assigned (previously it was only printed and then discarded);
# after opt1 the columns are already factors, so this step is idempotent.
data <- data %>%
  mutate(across(all_of(col.names), as.factor))

Convert from <dbl+lbl> to numeric [nominal]

# Columns that should be stored as plain numeric vectors.
col.names <- c("Questionx3", "Questionx1")

# Convert each selected column with as.numeric() and write it back in place.
data[col.names] <- lapply(data[col.names], as.numeric)

Basic stats

# Frequency table for a categorical item.
data_tabulate(data$Question4)

# Quick summary for an ordinal item.
summary(data$Questionx6)

# "Common" set of summary statistics for the selected columns.
rstatix::get_summary_stats(data, Questionx3, Questionx1, type = "common")

Deal with missing, not-applicable, or prefer-not-to-say values in the survey (replace negative codes such as -97 and -99 with NA)

# Work on a copy and blank out the negative codes of one selected column.
data_1 <- data

# Variant A: logical-mask assignment.
is_negative <- data_1$Questionx6 < 0
data_1$Questionx6[is_negative] <- NA

# Variant B: the same replacement written with replace() and which().
data_1$Questionx6 <- replace(
  data_1$Questionx6,
  which(data_1$Questionx6 < 0),
  NA
)

# Check the resulting frequencies.
data_tabulate(data_1$Questionx6)

# opt 1: subset the numeric columns, then blank out negatives with case_when()
# (a bit long).
#   1. keep the numeric columns only (select(where()) replaces the superseded
#      select_if());
#   2. drop columns 1-5, then columns 25-64 of what remains;
#      NOTE(review): these positions are hard-coded and depend on the current
#      column layout of `data` -- confirm them against the data set;
#   3. keep values >= 0; everything else falls through case_when() and
#      becomes NA.
# The cleaned result is stored in `data_3` (previously step 3 was computed
# and printed but never assigned).
data_3 <- data %>%
  select(where(is.numeric)) %>%
  select(-(1:5)) %>%
  select(-(25:64)) %>%
  mutate(across(everything(), ~ case_when(.x >= 0 ~ .x)))

# opt 2: replace() -- applied to the whole data frame. In every numeric
# column, values below zero are set to NA.
data <- mutate(
  data,
  across(
    .cols = where(is.numeric),
    .fns = ~ replace(.x, which(.x < 0), NA)
  )
)

Data anonymisation

Thresholds

## Thresholds
# Scale-level items that need to be anonymised.
subset <- select(
  data,
  Question3, Questionx7, Questionx8, Questionx9,
  Questionx10, Questionx11, Questionx12
)

# Replace value codes with value labels. Per the author's note, the last
# three columns then show the value label (a year) rather than the value
# code (1, 2, 3, ...).
dta <- haven::as_factor(subset)

### Change data from dbl+lbl to numeric for analysis
# The conversion goes factor -> character -> numeric so that the numbers come
# from the level text, not from the internal factor codes. across() replaces
# the superseded mutate_at() / mutate_if().
# NOTE(review): any level whose text is not a number becomes NA (with a
# warning) in step 3 -- confirm this is acceptable for every selected item.

# step 1: make sure every selected item is a factor
dta_1 <- dta %>%
  mutate(across(
    c(Question3, Questionx7, Questionx8, Questionx9,
      Questionx10, Questionx11, Questionx12),
    as.factor
  ))

# step 2: factor -> character
dta_2 <- dta_1 %>%
  mutate(across(where(is.factor), as.character))

# step 3: character -> numeric
labelled_data <- dta_2 %>%
  mutate(across(where(is.character), as.numeric))

glimpse(labelled_data)          # columns should now be <dbl>
class(labelled_data$Question3)  # "numeric"

# For the three year items, add a `<name>_new` column holding 2020 minus the
# recorded value; the new columns are appended after the existing ones.
labelled_data <- mutate(
  labelled_data,
  across(
    c(Questionx10, Questionx11, Questionx12),
    ~ 2020 - .x,
    .names = "{.col}_new"
  )
)

# Drop the original year items now that their `*_new` counterparts exist.
# The columns are removed by name rather than by position (they sit at
# positions 5-7), so the step keeps working if the column order changes.
data_new_label <- labelled_data %>%
  select(-c(Questionx10, Questionx11, Questionx12))