Objective

  1. Working with SPSS files in R;
  2. Understanding variables, variable labels, value labels, and data types.

Library

pacman::p_load(
  here,       # relative file pathways  
  haven,      # reading sav file
  sjlabelled,
  labelled,
  dplyr,      # data cleaning 
  rio,        # importing data  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  pbkrtest,
  sjPlot,
  foreign,
  gmodels,
  datawizard, # freq
  AMR,        # age  
  skimr 
)

Data

# Take a working copy of the survey data and inspect its structure.
# NOTE(review): `df` is not created anywhere in the visible script; it is
# presumably the SPSS data set imported beforehand -- confirm.
dt1 <- df
sjPlot::view_df(dt1)              # SPSS-style variable view
glimpse(dt1)                      # one line per column
str(dt1)                          # full structure, including attributes
attributes(dt1[["Questionx3"]])   # attributes (label, value labels) of one item
dim(dt1)                          # rows x columns

Exploratory Analysis

Time

# Derive timing variables and return a plain data.frame.
#   duration_in_min : Timeresponse / 60, rounded to 2 decimals
#                     (presumably seconds -> minutes; confirm the unit)
#   POSIXct columns : converted to Date
#   diff.dates      : EndDate - RecordedDate in days, placed after RecordedDate
# NOTE(review): diff.dates reads the columns from `dt1`, not from the piped
# data, so it is computed on the values as they were before the Date conversion.
data <- dt1 %>%
  mutate(duration_in_min = round(Timeresponse / 60, digits = 2)) %>%
  mutate(across(where(is.POSIXct), as.Date)) %>%
  mutate(
    diff.dates = difftime(dt1$EndDate, dt1$RecordedDate, units = "days"),
    .after = RecordedDate
  ) %>%
  as.data.frame()

Data types

Convert from <dbl+lbl> to fct [nominal]

# Columns to be converted to factors.
col.names <- c("Question1", "Question2", "Question4")

# opt1 (base R): convert each selected column and write it back in place.
data[col.names] <- lapply(data[col.names], as.factor)

# opt2 (dplyr): same conversion with across(); all_of() makes the use of an
# external character vector explicit. The result is assigned so the option has
# an effect (it is idempotent after opt1).
data <- data %>%
  mutate(across(all_of(col.names), as.factor))

Convert from <dbl+lbl> to numeric [nominal]

# Columns to be converted to plain numeric vectors.
col.names <- c("Questionx3", "Questionx1")
data[col.names] <- lapply(data[col.names], as.numeric)

Basic stats

# Frequency table for one categorical item.
data_tabulate(data$Question4)
# Five-number summary (plus mean) for one ordinal item.
summary(data$Questionx6)
# "common" set of summary statistics for the two selected columns.
rstatix::get_summary_stats(data, Questionx3, Questionx1, type = "common")

Handling missing, not-applicable and prefer-not-to-say values in the survey (replace negative codes such as -97 and -99 with NA)

# Single-column demo: recode negative codes of Questionx6 to NA on a copy,
# so `data` itself is not modified by this block.
data_1 <- data 
# Two equivalent ways of doing the replacement are shown; once the first line
# has run, the second one finds no negative values left and changes nothing.
data_1$Questionx6[data_1$Questionx6 < 0] <- NA       # replace for 1 selected column
data_1$Questionx6 <- replace(data_1$Questionx6, which(data_1$Questionx6 < 0), NA)
# Frequency table of the recoded column.
data_tabulate(data_1$Questionx6)

# opt 1: subset the numeric columns, then recode with case_when (a bit long).
# select(where()) replaces the superseded select_if().
data_3 <- data %>% select(where(is.numeric))
# Drop column ranges by position.
# NOTE(review): the positions 1:5 and 25:64 are specific to this data set and
# the second range is evaluated after the first five columns are removed.
data_3[, 1:5] <- NULL
data_3[, 25:64] <- NULL
# Keep values >= 0; anything else (negative codes, NA) becomes NA because no
# case_when branch matches. The result is stored instead of being discarded.
data_3 <- data_3 %>%
  mutate(across(everything(), ~ case_when(.x >= 0 ~ .x)))

# opt 2: recode every negative value in every numeric column of the whole
# data frame to NA using replace().
data <- data %>%
  mutate(across(where(is.numeric), ~ replace(.x, which(.x < 0), NA)))

Data anonymisation

Thresholds

## Thresholds
# Subset of scale-level items that need to be anonymised.
# NOTE(review): the object name `subset` masks base::subset() in this session;
# it is kept unchanged here in case other code refers to it.
subset <- data %>%
  select(
    Question3, Questionx7, Questionx8, Questionx9,
    Questionx10, Questionx11, Questionx12
  )
# Labelled columns become factors; per the original note, the last three
# columns then show the value label (a year) rather than the value code (1, 2, 3...).
dta <- haven::as_factor(subset)

### change data from dbl-labelled to numeric for analysis
# across() replaces the superseded mutate_at()/mutate_if() verbs.
# step 1: make sure every selected column is a factor
dta_1 <- dta %>% mutate(across(everything(), as.factor))
# step 2: factor -> character (the level text, not the integer code)
dta_2 <- dta_1 %>% mutate(across(where(is.factor), as.character))
# step 3: character -> numeric; non-numeric text becomes NA with a warning
labelled_data <- dta_2 %>% mutate(across(where(is.character), as.numeric))
glimpse(labelled_data)           # dbl ~ numeric
class(labelled_data$Question3)   # numeric

# Turn the year-valued items into "years before the reference year".
# NOTE(review): 2020 is presumably the survey year -- confirm.
reference_year <- 2020
labelled_data <- labelled_data %>%
  mutate(
    Questionx10_new = reference_year - Questionx10,
    Questionx11_new = reference_year - Questionx11,
    Questionx12_new = reference_year - Questionx12
  )

# Drop the raw year columns by name (previously dropped by position 5:7).
data_new_label <- labelled_data %>%
  select(-c(Questionx10, Questionx11, Questionx12))

Basic stats

# Overall summary of the numeric working data.
skim(labelled_data)

# summary() of every column, stored by column position.
# seq_along() is used instead of 1:ncol() so an empty data frame is safe.
summaries <- vector(mode = "list", length = ncol(data_new_label))
for (i in seq_along(data_new_label)) {
  summaries[[i]] <- summary(data_new_label[[i]])
}
summaries

# One histogram per column; the hist() return objects are kept in `outcome`.
# The list length follows the data instead of a hard-coded 7.
outcome <- vector("list", ncol(data_new_label))
for (i in seq_along(data_new_label)) {
  print(i)                                   # progress indicator
  var_name     <- names(data_new_label)[i]
  title        <- paste0("Histogram of ", var_name, " values:")
  outcome[[i]] <- hist(data_new_label[[i]], breaks = 60,
                       main = title, xlab = var_name)
}

# Frequency table (datawizard) for every column.
out <- vector("list", ncol(data_new_label))
for (i in seq_along(data_new_label)) {
  out[[i]] <- data_tabulate(data_new_label[[i]])
}

Data labelling

# Band the four count-type items into labelled groups.
# In every recode, missing values get the placeholder " " and every value not
# covered by an explicit range falls into the top band via `.default`.
# NOTE(review): values below the first explicit range therefore also land in
# the top band -- confirm that such values cannot occur.
data_new_label <- data_new_label %>%
  mutate(
    Question3_tempo = case_when(
      is.na(Question3)                 ~ " ",
      Question3 >= 3 & Question3 <= 6  ~ "3-6",
      Question3 >= 7 & Question3 <= 10 ~ "7-10",
      .default = "11-12"
    ),
    Questionx7_senior = case_when(
      is.na(Questionx7)                 ~ " ",
      Questionx7 >= 1 & Questionx7 <= 3 ~ "1-3",
      Questionx7 >= 4 & Questionx7 <= 5 ~ "4-5",
      .default = "+6"
    ),
    Questionx8_junior = case_when(
      is.na(Questionx8)                 ~ " ",
      Questionx8 == 0                   ~ "None",
      Questionx8 >= 1 & Questionx8 <= 2 ~ "1-2",
      .default = "+3"
    ),
    Questionx9_space = case_when(
      is.na(Questionx9)                 ~ " ",
      Questionx9 == 0                   ~ "None",
      Questionx9 == 1                   ~ "1",
      Questionx9 >= 2 & Questionx9 <= 4 ~ "2-4",
      .default = "+5"
    )
  )
# Group the three derived "years since" columns with AMR::age_groups(),
# using its "fives" split.
data_new_label <- mutate(
  data_new_label,
  Questionx10_long  = AMR::age_groups(Questionx10_new, split_at = "fives"),
  Questionx11_short = AMR::age_groups(Questionx11_new, split_at = "fives"),
  Questionx12_group = AMR::age_groups(Questionx12_new, split_at = "fives")
)
# Code missing values in all derived group columns as "-99" and store the
# columns as factors. A value counts as missing when it is NA or the blank
# placeholder " ". across() replaces the superseded mutate_at(), and the
# recode and the factor conversion are done in a single pass.
data_new_label <- data_new_label %>%
  mutate(across(
    Question3_tempo:Questionx12_group,
    ~ as.factor(case_when(
      .x == " " | is.na(.x) ~ "-99",
      .default = as.character(.x)
    ))
  ))

# Keep only the derived group columns. They are selected by name rather than
# by dropping the first seven positions, so the result does not depend on how
# many columns precede them.
dt_label <- data_new_label %>%
  select(Question3_tempo:Questionx12_group)

# Frequency table (datawizard) for every group column; the list length follows
# the data instead of a hard-coded 7.
output <- vector("list", ncol(dt_label))
for (i in seq_along(dt_label)) {
  output[[i]] <- data_tabulate(dt_label[[i]])
}

Outlier detection

# Columns screened for outliers.
labelled_data_outlier <- select(data_new_label, Questionx9, Questionx7, Questionx8)
str(labelled_data_outlier)   # check that all three are numeric

# Score estimation (z scores per the original note); rows with NA are removed
# before the scores are computed.
labelled_data_outlier <- drop_na(labelled_data_outlier)
outlier_scores <- outliers::scores(labelled_data_outlier)
is_outlier <- abs(outlier_scores) > 3     # flag scores beyond the -3 / 3 threshold

# Give the flag columns a distinct "<item>_outlier" name.
colnames(is_outlier) <- paste0(colnames(is_outlier), "_outlier")

# Basic stats: append the flags, tabulate them and draw a boxplot for one item.
labelled_data_outlier <- cbind(labelled_data_outlier, is_outlier)
data_tabulate(labelled_data_outlier[, 4:6])
ggplot(labelled_data_outlier, aes(x = "Questionx9", y = Questionx9)) +
  geom_boxplot()

# Replace outliers with NA.
# For each item a "<item>_new" character column is added: NA where the item's
# outlier flag is TRUE, otherwise the item's own value as text.
# Fix: Questionx8_new is now built from Questionx8 (it previously copied
# Questionx7). The logical flag is used directly instead of being compared
# with the string "TRUE"; rows whose flag is NA still keep their value.
labelled_data_outlier <- labelled_data_outlier %>%
  mutate(
    Questionx9_new = case_when(
      Questionx9_outlier ~ NA_character_,
      .default = as.character(Questionx9)
    ),
    Questionx7_new = case_when(
      Questionx7_outlier ~ NA_character_,
      .default = as.character(Questionx7)
    ),
    Questionx8_new = case_when(
      Questionx8_outlier ~ NA_character_,
      .default = as.character(Questionx8)
    )
  )

References

  1. Using Skimr;
  2. R for Data Science;
  3. Working with SPSS labels in R;
  4. datawizard: Easy Data Wrangling and Statistical Transformations;
  5. easystats: An R Framework for Easy Statistical Modeling, Visualization, and Reporting;
  6. Some good practices for research with R;
  7. Split Ages into Age Groups;
  8. Apply a function across columns;
  9. Data Cleaning Challenge: Outliers (R);
  10. SPSS - Variable measurement level;
  11. SPSS - Leveraging labelled data in R;
  12. SPSS - Introduction to labelled data