pacman::p_load(
here, # relative file pathways
rio, # importing data
# libraries for spss data
haven,
sjlabelled,
labelled,
sjPlot,
# libraries for data processing
dplyr, # data cleaning
janitor, # data cleaning and tables
lubridate, # working with dates
matchmaker, # dictionary-based cleaning
epikit, # age_categories() function
tidyverse, # data management and visualization
surveytoolbox,
pbkrtest,
foreign,
gmodels,
datawizard, # freq
AMR, # age
skimr, # quick stat
apyramid
)
## Installing package into 'C:/Users/linhp/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
mydata %>% sjPlot::view_df() # display SPSS variable view
df <- mydata %>% janitor::clean_names()
A list of variables will be anonymised due to privacy concerns.
Threshold: 1) df <- haven::as_factor(mydata), 2)
df1$a3 <- haven::as_factor(df1$a3)
Step 1: Using haven::as_factor to read SPSS data with Value labels
# without the comma is a *list* subset
df[cols] <- lapply(df[cols], haven::as_factor)
sapply(df[cols], class) # check the class
# with the comma is a *matrix* subset
df2 <- df
df2[, cols] <- lapply(df2[, cols], haven::as_factor)
Step 2: Threshold
df[cols] <- lapply(df[cols], function(x) as.numeric(levels(x)[x]))
sapply(df[cols], class)
# Basic stats
summaries <- vector(mode = "list", ncol(threshold))
for (i in 1:ncol(threshold)) {
sm <- summary(threshold[[i]])
summaries[[i]] <- sm
}
# histogram
outcome <- vector("list", 7)
for (i in seq_along(threshold)) {
print(i)
var_name <- names(threshold[i])
title <- paste0("Histogram of ", var_name, " values:")
x_lab <- var_name
outcome[[i]] <- hist(threshold[[i]], breaks = 60,
main = title, xlab = x_lab)
}
# frequency with datawizard
out <- vector("list", 7)
for (i in seq_along(threshold)) {
freq <- data_tabulate(threshold[[i]])
out[[i]] <- freq
}
out
# lapply(threshold[cols], data_tabulate)
Step 3: Anonymise data
df <- df %>%
mutate(
a3_1 = case_when(
a3 >= 3 & a3 <= 6 ~ "3-6",
a3 >= 7 & a3 <= 10 ~ "7-10",
.default = "11-12", # TRUE ~ "11-12"
is.na(a3) ~ "No answer"
), .after = a3
)
df <- df %>% select(-c(a3)) %>% rename('a3' = 'a3_1')
data_tabulate(df$a3)
df_h5 <- haven::as_factor(df_h5)
Solution 1
# combine three income values into one value of annual_income
df_h5 <- df_h5 %>%
unite(
col = 'annual_income',
c('h5a', 'h5b', 'h5c'),
sep = " ",
remove = FALSE,
na.rm = TRUE
)
Solution 2:
Average weekly income
Step 1: remove string, text
df_h5 <- df_h5 %>%
separate(h5a, into = c("w_from", "w_to"), sep = "-", extra = "merge")
df_h5$w_from <- gsub('and more', '', as.character(df_h5$w_from)) # df$w_from <- str_replace(string = df$w_from, pattern = 'and more', replacement = '')
df_h5$w_from <- as.numeric(gsub(",","", df_h5$w_from)) # remove comma (,) and turn to numeric
df_h5$w_to <- as.numeric(gsub(",","", df_h5$w_to)) # remove comma (,) and turn to numeric
Step 2: annual income calculation
df_h5 <- df_h5 %>%
mutate(w_from_annual = (df_h5$w_from * 52), .after = w_to) %>% # 52 week per year
mutate(w_to_annual = (df_h5$w_to * 52), .after = w_from_annual) # 52 week per year
Step 3: Unite columns
df_h5$w_from_annual <- as.character(df_h5$w_from_annual)
df_h5$w_to_annual <- as.character(df_h5$w_to_annual)
df_h5 <- df_h5 %>%
unite(
col = 'annual_income (w)',
c('w_from_annual', 'w_to_annual'),
sep = '-',
remove = TRUE,
na.rm = FALSE
)
df_h5$w_from <- NULL
df_h5$w_to <- NULL
df_h5$`annual_income (w)` <- gsub('NA-NA', '', as.character(df_h5$`annual_income (w)`))
df_h5$`annual_income (w)` <- str_replace(df_h5$`annual_income (w)`, '54236-NA', '54236 and more')
Average monthly income
Step 1: remove string, text
df_h5 <- df_h5 %>%
separate(h5b, into = c("m_from", "m_to"), sep = "-", extra = "merge")
df_h5$m_from <- gsub('and more', '', as.character(df_h5$m_from))
df_h5$m_from <- as.numeric(gsub(",","",df_h5$m_from))
df_h5$m_to <- as.numeric(gsub(",","",df_h5$m_to))
Step 2: Calculate annual income
df_h5 <- df_h5 %>%
mutate(m_from_annual = (df_h5$m_from * 12), .after = m_to) %>% # 12 month per year
mutate(m_to_annual = (df_h5$m_to * 12), .after = m_from_annual) # 12 month per year
Step 3: Unite columns
df_h5$m_from_annual <- as.character(df_h5$m_from_annual)
df_h5$m_to_annual <- as.character(df_h5$m_to_annual)
df_h5 <- df_h5 %>%
unite(
col = 'annual_income (m)',
c('m_from_annual', 'm_to_annual'),
sep = '-',
remove = TRUE,
na.rm = FALSE
)
df_h5$m_from <- NULL
df_h5$m_to <- NULL
df_h5$`annual_income (m)` <- gsub('NA-NA', '', as.character(df_h5$`annual_income (m)`))
df_h5$`annual_income (m)` <- str_replace(df_h5$`annual_income (m)`, '55596-NA', '55596 and more')
str(df_h5$h5c)
df_h5$h5c <- as.character(df_h5$h5c)
df_h5 <- df_h5 %>%
unite(
col = 'annual income (£)',
c('annual_income (w)', 'annual_income (m)', 'h5c'),
sep = ' ',
remove = TRUE,
na.rm = FALSE
)
df_h5$`annual income (£)` <- gsub('NA','', df_h5$`annual income (£)`)
df_h5$`annual income (£)` <- gsub(',','', df_h5$`annual income (£)`)
df_h5$`annual income (£)` <- str_trim(df_h5$`annual income (£)`, side = c("both"))
df_h5 %>%
mutate(earnings = (fct_relevel(`annual income (£)`)), .after = `annual income (£)`) %>%
tabyl(earnings)
df_h5 <- df_h5 %>% select(c(h5, `annual income (£)`))
df_h5 %>% group_by(h5, `annual income (£)`) %>% summarise(count = n())
## `summarise()` has grouped output by 'h5'. You can override using the `.groups`
## argument.
Data preparation
names(which(colSums(is.na(df)) > 0)) # List cols having NA
df[cols] <- lapply(df[cols], as.factor) # Convert data from numeric to factor
sapply(df[cols], class)
df <- haven::as_factor(df)
Data label
# Label missing value by 'No answer'
df <- df %>% mutate(across(where(is.factor), ~fct_na_value_to_level(.,'No answer')))
df <- df %>%
mutate(region_fb = fct_recode(region_fb, 'Not applicable' = 'No answer'),
age_fb = fct_recode(age_fb, 'Not applicable' = 'No answer'),
distribution = fct_recode(distribution, 'Not applicable' = 'No answer'),
adset = fct_recode(adset, 'Not applicable' = 'No answer')
)
# Check missing value
names(which(colSums(is.na(df)) > 0))
# display SPSS variable view
df %>% sjPlot::view_df()
class(df)
# For multiple choice questions, labelling '0' by 'Not marked - Marked'
df <- df %>% mutate(across(c(b1a_1:b2_12, e5_1:e6_5),
~case_when(. == 0 ~ "Not marked",
. == "No answer" ~ "No answer", # .default =
TRUE ~ "Marked")))
lapply(df[cols], data_tabulate)
df <- haven::as_factor(df) # display SPSS variable view
class(df)
df %>% sjPlot::view_df()
write_sav(df, "Annonymised data.sav")
Likert-scale questions
Step 1: Data preparation
valid_questions <- df %>% select(c(c5a_1:c5a_5))
valid_questions <- valid_questions %>% gather(key = "Question_num", value = "Answer") # wide to long format
# valid_questions <- na.omit(valid_questions)
valid_questions_summary <- valid_questions %>%
group_by(Question_num, Answer) %>%
dplyr::summarize(freq = n()) %>%
ungroup %>% group_by(Question_num) %>%
mutate(proportion = round(freq / sum(freq),1))
## `summarise()` has grouped output by 'Question_num'. You can override using the
## `.groups` argument.
Step 2: Data visualisation
ggplot(valid_questions, aes(x=Question_num)) +
geom_bar(aes(fill=Answer), position = "fill") +
geom_text(
data = valid_questions_summary,
aes(y = freq, label = scales::percent(proportion), group = Answer),
position = position_fill(vjust = 0.5),
color = 'gray25', size = 3
) +
scale_fill_brewer(palette = 'Spectral', direction = -1) +
scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
labs(
title = "Likert Plot of Question c5", subtitle = "Valid responses",
x = "",
y = ""
) +
theme(legend.text = element_text(size= 8.5),
legend.title = element_text(size= 9)) +
theme_classic()
ggplot(valid_questions, aes(x=Question_num)) +
geom_bar(aes(fill=Answer), position = "fill") +
geom_text(
data = valid_questions_summary,
aes(y = freq, label = scales::percent(proportion), group = Answer),
position = position_fill(vjust = 0.5),
color = 'gray25', size = 3
) +
scale_fill_brewer(palette = 'Spectral', direction = -1) +
scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
labs(
title = "Likert Plot of Question c5", subtitle = "Valid responses",
x = "",
y = ""
) +
theme_classic() +
theme(legend.position = "top") +
theme(legend.text = element_text(size= 8.5),
legend.title = element_text(size= 9)) +
coord_flip()
In case we don’t want to showcase the answer No answer:
Step 1: Data preparation
valid_questions12 <- valid_questions %>% filter(Answer != "No answer")
valid_questions_summary <- valid_questions12 %>%
group_by(Question_num, Answer) %>%
dplyr::summarize(freq = n()) %>%
ungroup %>% group_by(Question_num) %>%
mutate(proportion = round(freq / sum(freq),1))
## `summarise()` has grouped output by 'Question_num'. You can override using the
## `.groups` argument.
Step 2: Data visualisation
ggplot(valid_questions12, aes(x=Question_num)) +
geom_bar(aes(fill=Answer), position = "fill") +
geom_text(
data = valid_questions_summary,
aes(y = freq, label = scales::percent(proportion), group = Answer),
position = position_fill(vjust = 0.5),
color = 'gray25', size = 3
) +
scale_fill_brewer(palette = 'Spectral', direction = -1) +
scale_y_continuous(expand = expansion(0), labels = scales::percent_format()) +
labs(
title = "Likert Plot of Question c5",
subtitle = "Valid responses",
x = "",
y = "") +
theme_classic() +
theme(legend.position = "top") +
theme(legend.text = element_text(size= 8.5),
legend.title = element_blank()) + # remove legend title
coord_flip()
Age-group questions
age <- df %>% select(c(h1_1, h2)) %>%
filter(h1_1 != "No answer") %>%
filter(!h2 %in% c("Prefer not to say", "No answer", "Non-binary", "Any other gender identity")) %>%
group_by(h1_1, h2) %>%
summarise(freq = n()) %>% ungroup() %>%
mutate(age = factor(h1_1, ordered = TRUE),
freq = ifelse(h2 == 'Male', (freq*-1), freq)) %>%
rename("gender" = "h2", "ageGroup" = "age") %>%
select(-c(h1_1))
## `summarise()` has grouped output by 'h1_1'. You can override using the
## `.groups` argument.
ggplot(age, aes(y = ageGroup, x = freq, fill = gender)) +
geom_bar(data=subset(age,gender=="Male"), stat = "identity") +
geom_bar(data=subset(age,gender=="Female"), stat = "identity") +
scale_x_continuous(breaks=c(-15, -50, -75, -100, 0, 25, 75, 100, 150, 180)) +
labs(title = "Age group ~ Gender",
subtitle = "Valid responses",
x = "",
y = "") +
theme_classic()+
scale_fill_manual(values = c('darkred', '#254061')) +
theme(legend.text = element_text(size= 8.5),
legend.title = element_blank())