pacman::p_load(
here, # relative file pathways
haven, # reading sav file
sjlabelled,
labelled,
dplyr, # data cleaning
rio, # importing data
janitor, # data cleaning and tables
lubridate, # working with dates
matchmaker, # dictionary-based cleaning
epikit, # age_categories() function
tidyverse, # data management and visualization
pbkrtest,
sjPlot,
foreign,
gmodels,
datawizard, # freq
AMR, # age
skimr
)
# Work on a copy of the imported data set (`df` is created outside this section).
dt1 <- df

# Quick structural overview of the raw data.
sjPlot::view_df(dt1) # SPSS-style variable view
glimpse(dt1)
str(dt1)
dim(dt1)

# Attributes (variable label, value labels) of a single column.
attributes(dt1$Questionx3)
# Build the working data frame:
#   1. duration_in_min: Timeresponse divided by 60, rounded to 2 decimals
#      (presumably Timeresponse is in seconds -- confirm against the survey export).
#   2. every POSIXct column is converted to Date.
#   3. diff.dates: difference in days between EndDate and RecordedDate, placed
#      right after RecordedDate.
# NOTE(review): step 3 reads EndDate/RecordedDate from `dt1`, i.e. the columns
# as they were BEFORE the Date conversion in step 2 -- confirm this is intended.
data <- dt1 %>%
  mutate(duration_in_min = round(Timeresponse / 60, digits = 2)) %>%
  mutate(across(where(is.POSIXct), as.Date)) %>%
  mutate(
    diff.dates = difftime(dt1$EndDate, dt1$RecordedDate, units = "days"),
    .after = RecordedDate
  ) %>%
  as.data.frame()
# Convert from <dbl+lbl> to fct [nominal]
# Columns that should be treated as factors.
col.names <- c("Question1", "Question2", "Question4")

# Convert the selected columns to factor once and keep the result.
# (The script previously converted them with lapply()/cbind and then ran a
# second, superseded mutate_at() pass whose result was only printed.)
data <- data %>%
  mutate(across(all_of(col.names), as.factor))
# Convert from <dbl+lbl> to numeric [nominal]
# Columns that should be plain numeric vectors.
col.names <- c("Questionx3", "Questionx1")

# Replace each selected column with its as.numeric() version.
data[col.names] <- lapply(data[col.names], as.numeric)
# Basic stats
# Frequency table for a categorical item.
data_tabulate(data$Question4)

# Five-number summary (plus mean) for an ordinal item.
summary(data$Questionx6)

# "Common" summary statistics for the two numeric items.
rstatix::get_summary_stats(data, Questionx3, Questionx1, type = "common")
# Deal with missing, not-applicable, or prefer-not-to-say values
# in the survey (replace -97, -99)
# Single-column example: recode negative codes in Questionx6 to NA on a copy.
data_1 <- data

# which() keeps only positions where the comparison is TRUE, so existing NAs
# are left as they are.
negative_pos <- which(data_1$Questionx6 < 0)
data_1$Questionx6 <- replace(data_1$Questionx6, negative_pos, NA)

data_tabulate(data_1$Questionx6)
# Option 1: subset the numeric columns, then recode with case_when().
data_3 <- data %>% select(where(is.numeric))

# Drop columns by position.
# NOTE(review): these positions are tied to the current column layout of the
# data set -- confirm them if the input changes.
data_3[, 1:5] <- NULL
data_3[, 25:64] <- NULL

# Keep values >= 0; anything else (negative codes, NA) becomes NA.
# The result is assigned back so the recode actually takes effect.
data_3 <- data_3 %>%
  mutate(across(everything(), ~ case_when(.x >= 0 ~ .x)))
# Option 2: recode negative codes to NA in every numeric column of `data`.
data <- data %>%
  mutate(across(where(is.numeric), ~ replace(.x, which(.x < 0), NA)))
## Thresholds
# Scale-level items used for the threshold analysis (need to be anonymised).
scale_vars <- c(
  "Question3", "Questionx7", "Questionx8", "Questionx9",
  "Questionx10", "Questionx11", "Questionx12"
)
subset <- data %>% select(all_of(scale_vars))

# Replace value codes by their value labels. Per the original author's note,
# the last three columns then show the label (a year) rather than the code.
dta <- haven::as_factor(subset)

### Change data from dbl-labelled to numeric for analysis
# step 1: make sure every selected column is a factor
dta_1 <- dta %>% mutate(across(all_of(scale_vars), as.factor))
# step 2: factor -> character (keeps the label text, not the level index)
dta_2 <- dta_1 %>% mutate(across(where(is.factor), as.character))
# step 3: character -> numeric
labelled_data <- dta_2 %>% mutate(across(where(is.character), as.numeric))

glimpse(labelled_data) # columns should now be <dbl>
class(labelled_data$Question3) # numeric
# Add <col>_new = 2020 - <col> for the three year-valued items
# (presumably "years before 2020" -- confirm the reference year).
labelled_data <- labelled_data %>%
  mutate(across(
    c(Questionx10, Questionx11, Questionx12),
    ~ 2020 - .x,
    .names = "{.col}_new"
  ))

# Drop columns 5 to 7 (the original Questionx10-Questionx12) and keep the rest.
data_new_label <- labelled_data[-(5:7)]
# Basic stats
# Overview of every column in the numeric data set.
skim(labelled_data)

# summary() of each column. lapply() replaces the 1:ncol() index loop (which
# misbehaves on a zero-column data frame) and names each list element after
# its column.
summaries <- lapply(data_new_label, summary)
summaries
# Histogram of every column in data_new_label; the objects returned by hist()
# are stored in `outcome`. The list is sized from the data instead of a
# hard-coded 7.
outcome <- vector("list", length(data_new_label))
for (i in seq_along(data_new_label)) {
  print(i) # progress indicator
  var_name <- names(data_new_label)[i]
  outcome[[i]] <- hist(
    data_new_label[[i]],
    breaks = 60,
    main = paste0("Histogram of ", var_name, " values:"),
    xlab = var_name
  )
}
# Frequency table (datawizard) of every column in data_new_label.
# The list is sized from the data instead of a hard-coded 7.
out <- vector("list", length(data_new_label))
for (i in seq_along(data_new_label)) {
  out[[i]] <- data_tabulate(data_new_label[[i]])
}
# Data labelling
# Band the four count items into labelled groups.
# Missing values become a single blank " " (recoded to "-99" further down).
# NOTE(review): every non-missing value that matches none of the explicit
# ranges falls into the `.default` band, including values below the first
# range -- confirm that such values cannot occur.
data_new_label <- data_new_label %>%
  mutate(
    Question3_tempo = case_when(
      is.na(Question3) ~ " ",
      Question3 >= 3 & Question3 <= 6 ~ "3-6",
      Question3 >= 7 & Question3 <= 10 ~ "7-10",
      .default = "11-12"
    ),
    Questionx7_senior = case_when(
      is.na(Questionx7) ~ " ",
      Questionx7 >= 1 & Questionx7 <= 3 ~ "1-3",
      Questionx7 >= 4 & Questionx7 <= 5 ~ "4-5",
      .default = "+6"
    ),
    Questionx8_junior = case_when(
      is.na(Questionx8) ~ " ",
      Questionx8 == 0 ~ "None",
      Questionx8 >= 1 & Questionx8 <= 2 ~ "1-2",
      .default = "+3"
    ),
    Questionx9_space = case_when(
      is.na(Questionx9) ~ " ",
      Questionx9 == 0 ~ "None",
      Questionx9 == 1 ~ "1",
      Questionx9 >= 2 & Questionx9 <= 4 ~ "2-4",
      .default = "+5"
    )
  )
# Group the three "2020 - year" columns with AMR::age_groups(), using the
# "fives" split.
data_new_label <- data_new_label %>%
  mutate(
    Questionx10_long = AMR::age_groups(Questionx10_new, split_at = "fives"),
    Questionx11_short = AMR::age_groups(Questionx11_new, split_at = "fives"),
    Questionx12_group = AMR::age_groups(Questionx12_new, split_at = "fives")
  )
# In all band/group columns: code blanks and NA as "-99" (missing value),
# keep everything else as text, then store the columns as factors.
data_new_label <- data_new_label %>%
  mutate(across(
    Question3_tempo:Questionx12_group,
    ~ case_when(
      is.na(.x) | .x == " " ~ "-99",
      .default = as.character(.x)
    )
  )) %>%
  mutate(across(Question3_tempo:Questionx12_group, as.factor))
# Keep only the band/group columns (drop the first seven columns).
dt_label <- data_new_label %>% select(-c(1:7))

# Frequency table (datawizard) of every band/group column.
# The list is sized from the data instead of a hard-coded 7.
output <- vector("list", length(dt_label))
for (i in seq_along(dt_label)) {
  output[[i]] <- data_tabulate(dt_label[[i]])
}
# Columns checked for outliers.
labelled_data_outlier <- data_new_label %>%
  select(Questionx9, Questionx7, Questionx8)
str(labelled_data_outlier) # numeric check

# z scores estimation
# Rows with any NA are removed before the scores are computed.
labelled_data_outlier <- drop_na(labelled_data_outlier)
outlier_scores <- outliers::scores(labelled_data_outlier)

# Flag scores beyond the +/-3 threshold.
is_outlier <- abs(outlier_scores) > 3

# Rename the flag columns to <column>_outlier (same order as the select above).
colnames(is_outlier) <- paste0(colnames(is_outlier), "_outlier")
# basic stats
# Attach the three logical flag columns to the values they describe.
labelled_data_outlier <- cbind(labelled_data_outlier, is_outlier)

# Frequency of TRUE/FALSE for the flag columns (columns 4 to 6).
data_tabulate(labelled_data_outlier[, 4:6])

# Boxplot of Questionx9; the x aesthetic is a constant label.
labelled_data_outlier %>%
  ggplot(aes(x = "Questionx9", y = Questionx9)) +
  geom_boxplot()
# replace outliers with NA
# For each checked column, create <column>_new holding the original value as
# text, or NA where the matching *_outlier flag is TRUE.
# Fix: Questionx8_new is now built from Questionx8 (it was previously filled
# from Questionx7 by a copy-paste slip).
labelled_data_outlier <- labelled_data_outlier %>%
  mutate(
    Questionx9_new = case_when(
      Questionx9_outlier == "TRUE" ~ NA,
      .default = as.character(Questionx9)
    ),
    Questionx7_new = case_when(
      Questionx7_outlier == "TRUE" ~ NA,
      .default = as.character(Questionx7)
    ),
    Questionx8_new = case_when(
      Questionx8_outlier == "TRUE" ~ NA,
      .default = as.character(Questionx8)
    )
  )