survey data with r (part 3)

Objective

Create a new variable

Library

pacman::p_load(
  here,       # relative file pathways  
  haven,      # reading sav file
  sjlabelled,
  labelled,
  dplyr,      # data cleaning 
  rio,        # importing data  
  janitor,    # data cleaning and tables
  lubridate,  # working with dates
  matchmaker, # dictionary-based cleaning
  epikit,     # age_categories() function
  tidyverse,  # data management and visualization
  surveytoolbox,
  pbkrtest,
  sjPlot,
  foreign,
  gmodels,
  datawizard, # freq
  AMR,        # age  
  skimr 
)

## Installing package into 'C:/Users/linhp/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

Data import

mydata1 %>% sjPlot::view_df() # display SPSS variable view 
df <- mydata1

Data cleaning

Drop columns

df <- df[-c(99, 100:108)]

Column names

df <- df %>% 
  # janitor::clean_names() %>%                              # clean name (capital, special symbols, etc.)
  dplyr::rename(responsetime = surveytime,  # rename 
                userid = id)

Creation new variable

df_Q10 <- df

Solution 1

df_Q10 <- haven::as_factor(df_Q10) 
str(df_Q10$Q10a)                      # factor level

# combine three columns Q10a, Q10b, Q10c into one single columns annual_income 
df_Q10 <- df_Q10 %>% 
  unite(
    col = 'annual_income',
    c('Q10a', 'Q10b', 'Q10c'),
    sep = " ",
    remove = FALSE,
    na.rm = TRUE
  )

df_Q10 <- df_Q10 %>% 
  mutate(annual_income = fct_recode(
    annual_income,
    "0-8,626" = "0-718",
    "0-8,626" = "0-155",
    
    # ....
  )
)

Solution 2

Q10a - average weekly incomes

#### Step 1: remove string, text 
df <- haven::as_factor(df) 
df <- df %>% 
  separate(Q10a, into = c("w_from", "w_to"), sep = "-", extra = "merge")     

df$w_from <- gsub('and more', '', as.character(df$w_from))           # df$w_from <- str_replace(string = df$w_from, pattern = 'and more', replacement = '') 
df$w_from <- as.numeric(gsub(",","",df$w_from))                      # remove comma (,) and turn to numeric
df$w_to <- as.numeric(gsub(",","",df$w_to))                          # remove comma (,) and turn to numeric

#### Step 2: Calculate annual income 
df <- df %>% 
  mutate(w_from_annual = (df$w_from * 52), .after = w_to) %>%        # 52 week per year    
  mutate(w_to_annual = (df$w_to * 52), .after = w_from_annual)       # 52 week per year 

# # Separate thousand - hundreds,...
# # Adding "," and numeric is converted back to character
# df$w_from_annual <- format(as.numeric(df$w_from_annual),big.mark=",") 
# df$w_to_annual <- format(as.numeric(df$w_to_annual),big.mark=",")

#### Step 3: Unite columns 
df$w_from_annual <- as.character(df$w_from_annual)
df$w_to_annual <- as.character(df$w_to_annual)

df <- df %>% 
  unite(
    col = 'annual_income (w)',
    c('w_from_annual', 'w_to_annual'),
    sep = '-',
    remove = TRUE,
    na.rm = FALSE
  )

#### Step 4
df$w_from <- NULL
df$w_to <- NULL 

df$`annual_income (w)` <- gsub('NA-NA', '', as.character(df$`annual_income (w)`)) 
# df$`avg annual earnings` <- str_replace(string = df$`avg annual earnings`, pattern = 'NA-NA', replacement = '') 
df$`annual_income (w)` <- str_replace(df$`annual_income (w)`, '54236-NA', '54236 and more')
# df$`avg annual earnings`[df$`avg annual earnings` == '54236-NA'] <- '54236 and more'  <-- using R base

Q10b - average monthly income

#### Step 1: remove string, text 
df <- haven::as_factor(df) 
df <- df %>% 
  separate(Q10b, into = c("m_from", "m_to"), sep = "-", extra = "merge")     

df$m_from <- gsub('and more', '', as.character(df$m_from))           
df$m_from <- as.numeric(gsub(",","",df$m_from))                     
df$m_to <- as.numeric(gsub(",","",df$m_to))

#### Step 2: Calculate annual income 
df <- df %>% 
  mutate(m_from_annual = (df$m_from * 12), .after = m_to) %>%        # 12 month per year    
  mutate(m_to_annual = (df$m_to * 12), .after = m_from_annual)       # 12 month per year 

# # Separate thousand - hundreds,...
# df$m_from_annual <- format(as.numeric(df$m_from_annual),big.mark=",") 
# df$m_to_annual <- format(as.numeric(df$m_to_annual),big.mark=",")

#### Step 3: Unite columns  
df$m_from_annual <- as.character(df$m_from_annual) 
df$m_to_annual <- as.character(df$m_to_annual) 

df <- df %>% 
  unite(
    col = 'annual_income (m)',
    c('m_from_annual', 'm_to_annual'),
    sep = '-',
    remove = TRUE,
    na.rm = FALSE
  )

df$m_from <- NULL
df$m_to <- NULL 

df$`annual_income (m)` <- gsub('NA-NA', '', as.character(df$`annual_income (m)`)) 
df$`annual_income (m)` <- str_replace(df$`annual_income (m)`, '55596-NA', '55596 and more')

str(df$Q10c)
df$Q10c <- as.character(df$Q10c)
str(df$Q10c)

df <- df %>% 
  unite(
    col = 'annual income (£)',
    c('annual_income (w)', 'annual_income (m)', 'Q10c'),
    sep = ' ',
    remove = TRUE,
    na.rm = FALSE
  )

df$`annual income (£)` <- gsub('NA','', df$`annual income (£)`)
df$`annual income (£)` <- gsub(',','', df$`annual income (£)`)
df$`annual income (£)` <- str_trim(df$`annual income (£)`, side = c("both"))

df %>% 
  mutate(earnings = (fct_relevel(`annual income (£)`)), .after = `annual income (£)`) %>% 
  tabyl(earnings)

df2 <- df %>% select(c(Q10, `annual income (£)`))

df2 %>% group_by(Q10, `annual income (£)`) %>% summarise(count = n())

## `summarise()` has grouped output by 'Q10'. You can override using the `.groups`
## argument.

survey data with r (part 3)

2023-04-12

Objective

Library

Data import

Data cleaning

Drop columns

Column names

Creation new variable

Solution 1

Solution 2

Q10a - average weekly incomes

Q10b - average monthly income