Objective
- Labeling
- Export file from R to SPSS (sav) format
Library
pacman::p_load(
here, # relative file pathways
haven, # reading sav file
sjlabelled,
labelled,
dplyr, # data cleaning
rio, # importing data
janitor, # data cleaning and tables
lubridate, # working with dates
matchmaker, # dictionary-based cleaning
epikit, # age_categories() function
tidyverse, # data management and visualization
surveytoolbox,
pbkrtest,
sjPlot,
foreign,
gmodels,
datawizard, # freq
AMR, # age
skimr
)
## Installing package into 'C:/Users/linhp/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
Data import
# NOTE(review): `mydata1` is not created anywhere in the visible script;
# presumably it is imported from a .sav file beforehand -- confirm.
sjPlot::view_df(mydata1) # display SPSS variable view
# All later steps operate on `df`.
df <- mydata1
Data labelling
df <- haven::as_factor(mydata) : do NOT convert the whole dataset.
It breaks the SPSS format, because coded values such as -99 and -97
are replaced by factor levels 1, 2, ….
df1$Q3 <- haven::as_factor(df1$Q3) : apply as_factor() only to the
variables you actually want to convert.
## Option 1: Using haven::as_factor()
# Convert one column at a time on a copy of `df`; fine for a couple of
# columns, but time-consuming when there are many.
df1 <- df
df1[["Q3"]] <- haven::as_factor(df1[["Q3"]])
df1[["Q_age"]] <- haven::as_factor(df1[["Q_age"]])
## Option 2: Using mutate(across())
# `mutate_each_()` and `funs()` are deprecated (dplyr warns:
# "`mutate_each_()` was deprecated in dplyr 0.7.0"), so the conversion is
# done with mutate(across()) only.
df3 <- df
cols <- c("Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Q_age")
# all_of() is required when selecting with an external character vector;
# passing the bare vector to across() is deprecated in tidyselect.
# as_factor() is namespaced because other attached packages
# (e.g. sjlabelled) also export a function with that name.
df3 <- df3 %>%
  mutate(across(all_of(cols), haven::as_factor))
sapply(df3[cols], class) # check the class
## Option 3: Using lapply() <---- the best
### without the comma is a *list* subset
cols <- c(paste0("Q", 3:8), "Q_age")
# Note: this converts the columns of `df` itself, not of a copy.
df[cols] <- lapply(df[cols], function(column) haven::as_factor(column))
sapply(df[cols], class) # check the class
## Q3 Q4 Q5 Q6 Q7 Q8 Q_age
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
### with the comma is a *matrix* subset
# Same conversion as the list-subset form, applied to a copy (`df2`) and
# selecting the columns with `[, cols]`.
df2 <- df
cols <- c(paste0("Q", 3:8), "Q_age")
df2[, cols] <- lapply(df2[, cols], function(column) haven::as_factor(column))
sapply(df2[, cols], class)
## Q3 Q4 Q5 Q6 Q7 Q8 Q_age
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
# List the class of every converted column, one list element per column.
lapply(X = df2[cols], FUN = class) # check the class
## $Q3
## [1] "factor"
##
## $Q4
## [1] "factor"
##
## $Q5
## [1] "factor"
##
## $Q6
## [1] "factor"
##
## $Q7
## [1] "factor"
##
## $Q8
## [1] "factor"
##
## $Q_age
## [1] "factor"
Convert from factor to numeric for case_when labelling
## Option 1 <-- good for 1 variable only
# Index the level labels by the factor, then parse them as numbers.
# Levels that are not numeric text become NA (with a coercion warning).
df1[["Q3"]] <- as.numeric(levels(df1[["Q3"]])[df1[["Q3"]]])
## Option 3 <-- acceptable
# Goes through character first: converts every element, not just the levels.
cols <- c(paste0("Q", 3:8), "Q_age")
df2[cols] <- lapply(df2[cols], function(f) as.numeric(as.character(f)))
## Option 2 <--- the best, fast
# Same level-lookup idea as Option 1, applied to all columns at once.
cols <- c(paste0("Q", 3:8), "Q_age")
df[cols] <- lapply(df[cols], function(f) {
  level_text <- levels(f)
  as.numeric(level_text[f])
})
sapply(df[cols], class)
## Q3 Q4 Q5 Q6 Q7 Q8 Q_age
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
Labelling
# Band Q3 into the character column Q3_1, placed right after Q3.
# case_when() checks the conditions top to bottom and uses `.default` only
# for rows that match none of them, so `.default` is written last.
# NOTE(review): `.default` also labels every non-missing value outside the
# 3-6 and 7-10 bands (e.g. below 3 or above 12) as "11-12" -- confirm.
df <- mutate(
  df,
  Q3_1 = case_when(
    is.na(Q3) ~ " ",
    dplyr::between(Q3, 3, 6) ~ "3-6",
    dplyr::between(Q3, 7, 10) ~ "7-10",
    .default = "11-12"
  ),
  .after = Q3
)
# Band Q4 into the character column Q4_1, placed right after Q4.
# `.default` applies only to rows matching none of the conditions, so it is
# written last.
# NOTE(review): `.default` also labels non-missing values below 1 as "6+"
# -- confirm such values cannot occur.
df <- mutate(
  df,
  Q4_1 = case_when(
    is.na(Q4) ~ " ",
    dplyr::between(Q4, 1, 3) ~ "1-3",
    dplyr::between(Q4, 4, 5) ~ "4-5",
    .default = "6+"
  ),
  .after = Q4
)
# Band Q5 into the character column Q5_1, placed right after Q5.
# `.default` applies only to rows matching none of the conditions, so it is
# written last.
# NOTE(review): `.default` also labels non-missing negative values as "3+"
# -- confirm such values cannot occur.
df <- mutate(
  df,
  Q5_1 = case_when(
    is.na(Q5) ~ " ",
    Q5 == 0 ~ "None",
    dplyr::between(Q5, 1, 2) ~ "1-2",
    .default = "3+"
  ),
  .after = Q5
)
# Band Q6 into the character column Q6_1, placed right after Q6.
# `.default` applies only to rows matching none of the conditions, so it is
# written last.
# NOTE(review): `.default` also labels non-missing negative values as "5+"
# -- confirm such values cannot occur.
df <- mutate(
  df,
  Q6_1 = case_when(
    is.na(Q6) ~ " ",
    Q6 == 0 ~ "None",
    Q6 == 1 ~ "1",
    dplyr::between(Q6, 2, 4) ~ "2-4",
    .default = "5+"
  ),
  .after = Q6
)
# Derive "2020 minus answer" columns, each inserted right after its source.
# NOTE(review): 2020 is presumably the survey/reference year -- confirm.
# One mutate() per column because `.after` applies to the whole call.
df <- mutate(df, Q71 = 2020 - Q7, .after = Q7)
df <- mutate(df, Q81 = 2020 - Q8, .after = Q8)
df <- mutate(df, Qx = 2020 - Q_age, .after = Q_age)
# Band Q71 into the character column Q71_X, placed right after Q71.
# Conditions are checked top to bottom; the first match wins.
# Rows matching no condition (e.g. negative values) get NA, because no
# `.default` is given.
df <- df %>%
  mutate(
    Q71_X = case_when(
      is.na(Q71) ~ " ",
      Q71 == 0 ~ "Less than 1",
      Q71 >= 1 & Q71 <= 5 ~ "1-5",
      Q71 >= 6 & Q71 <= 10 ~ "6-10",
      Q71 >= 11 & Q71 <= 15 ~ "11-15",
      Q71 >= 16 & Q71 <= 20 ~ "16-20",
      # `> 20` (not `>= 20`) so the top band matches its "21+" label and
      # does not overlap the "16-20" band above.
      Q71 > 20 ~ "21+"
    ),
    .after = Q71
  )
# Band Q81 into the character column Q82_X, placed right after Q81.
# NOTE(review): there is no band above 15 and no `.default`, so values
# greater than 15 (and negative values) become NA -- confirm this is
# intended and not a missing "16+" band.
df <- mutate(
  df,
  Q82_X = case_when(
    is.na(Q81) ~ " ",
    Q81 == 0 ~ "Less than 1",
    dplyr::between(Q81, 1, 5) ~ "1-5",
    dplyr::between(Q81, 6, 10) ~ "6-10",
    dplyr::between(Q81, 11, 15) ~ "11-15"
  ),
  .after = Q81
)
# Band Qx into five-year groups in the character column Qx1, placed right
# after Qx. Values below 18 match no condition and therefore become NA.
df <- mutate(
  df,
  Qx1 = case_when(
    is.na(Qx) ~ " ",
    dplyr::between(Qx, 18, 24) ~ "18-24",
    dplyr::between(Qx, 25, 29) ~ "25-29",
    dplyr::between(Qx, 30, 34) ~ "30-34",
    dplyr::between(Qx, 35, 39) ~ "35-39",
    dplyr::between(Qx, 40, 44) ~ "40-44",
    dplyr::between(Qx, 45, 49) ~ "45-49",
    dplyr::between(Qx, 50, 54) ~ "50-54",
    dplyr::between(Qx, 55, 59) ~ "55-59",
    dplyr::between(Qx, 60, 64) ~ "60-64",
    Qx >= 65 ~ "65+"
  ),
  .after = Qx
)
Export from R to SPSS
Convert the new label columns (character, created by case_when) to factor
library(haven)
# # Option 1: transform each variable <- not recommended
# df2$Q3_1 <- as.factor(df2$Q3_1)
# Option 2: convert every new label column to factor in one pass
cols <- c("Q3_1", "Q4_1", "Q5_1", "Q6_1", "Q71_X", "Q82_X", "Qx1")
df[cols] <- lapply(df[cols], function(column) as.factor(column))
sapply(df[cols], class) # check the class
## Q3_1 Q4_1 Q5_1 Q6_1 Q71_X Q82_X Qx1
## "factor" "factor" "factor" "factor" "factor" "factor" "factor"
# Write the labelled data frame to an SPSS .sav file; the relative path is
# resolved against the current working directory.
haven::write_sav(data = df, path = "SPSS_new_datasample.sav")