initial set up

loading dataset and recoding variables

setwd("/Users/kaylapatricia/Desktop/soc222/REPORT")
library(dplyr)
cronos <- read_csv("CRON2W5e01.csv")
## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl  (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm  (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
portugal_data <- cronos %>%
  filter(cntry == "PT")
portugal_data <- portugal_data %>%
  mutate(health = case_when(
      w5q40 == 1 ~ 1, # much less vulnerable to 1
      w5q40 == 2 ~ 2, # less vulnerable to 2
      w5q40 == 3 ~ 3, # the same to 3
      w5q40 == 4 ~ 4, # more vulnerable to 4
      w5q40 == 5 ~ 5, # much more vulnerable to 5
      TRUE ~ NA_real_
  ))
portugal_data <- portugal_data %>% filter(!is.na(health))

selecting variables of interest

portugaldf <- portugal_data %>%
  dplyr::select(w5q2, w5q32, w5q40)
portugaldf <- na.omit(portugaldf)
head(portugaldf)
## # A tibble: 6 × 3
##    w5q2 w5q32 w5q40
##   <dbl> <dbl> <dbl>
## 1     8     2     3
## 2     8     4     3
## 3     7     1     2
## 4    10     5     2
## 5     8     5     3
## 6     9     4     3

descriptive table

portugal_data <- portugal_data %>%
  mutate(
    health = health,
    trstsnc = case_when(
      w5q2 > 10 ~ NA_real_,     # Set values above 10 to NA
      TRUE ~ w5q2               # Keep other values as is
    ),
    speed = case_when(
      w5q32 > 7 ~ NA_real_,       # Set values above 7 to NA
      w5q32 < 1 ~ NA_real_,        # Set values below 1 to NA
      TRUE ~ w5q32                  # Keep other values as is
    )
  )
dfpd <- portugal_data
dfpd <- dfpd %>% filter(!is.na(trstsnc))
dfpd <- dfpd %>% filter(!is.na(speed))

table1b <- datasummary_skim(dfpd  %>% dplyr::select(health, trstsnc, speed), title = "Table 1. Descriptive statistics for main variables", output = "flextable")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
## Warning: `type='all'` is only supported for the `tinytable` backend. Set the
##   `type` argument explicitly to suppress this warning.
table1b

Unique

Missing Pct.

Mean

SD

Min

Median

Max

health

5

0

2.8

0.9

1.0

3.0

5.0

trstsnc

10

0

7.9

1.6

0.0

8.0

10.0

speed

7

0

4.2

1.5

1.0

4.0

7.0

saving output

portugal_data <- read_csv("CRON2W5e01.csv")
## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl  (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm  (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
portugal_data <- portugal_data %>%
  filter(cntry == "PT") %>%
  
     mutate(
    trstsnc = case_when(
      w5q2 %in% 0:3 ~ "Low trust",
      w5q2 %in% 7:10 ~ "High trust",
      w5q2 %in% 4:6 ~ "Moderate trust", 
      w5q2 %in% c(77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q2)
    ), 
    
    speed = case_when(
      w5q32 %in% 1:2 ~ "slowly",
      w5q32 %in% 6:7 ~ "quickly",
      w5q32 %in% 3:5 ~ "moderately",
      w5q32 %in% c(9, 77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q32)
    ),
    
    health = case_when(
      w5q40 %in% 1:2 ~ "less vulnerable",
      w5q40 %in% 3 ~ "the same",
      w5q40 %in% 4:5 ~ "more vulnerable",
      w5q40 %in% c(9, 77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q40)
  )
     ) %>%
    filter(!is.na(trstsnc), !is.na(speed), !is.na(health)) 

checking variables

table(portugal_data$trstsnc)
## 
##     High trust      Low trust Moderate trust 
##            304              5             57
table(portugal_data$speed)
## 
## moderately    quickly     slowly 
##        242         70         54
table(portugal_data$health)
## 
## less vulnerable more vulnerable        the same 
##             108              55             203

hypothesis testing with infer – Step 1

test_stat <- portugal_data %>%
  specify(explanatory = speed, 
          response = health) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq")
print(test_stat$stat)
## X-squared 
##  4.179716

Step 2

null_distribution <- portugal_data %>%
  specify(explanatory = speed,
          response = health) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")
null_distribution
## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.54 
##  2         2 2.13 
##  3         3 4.87 
##  4         4 5.47 
##  5         5 1.68 
##  6         6 6.83 
##  7         7 0.867
##  8         8 6.43 
##  9         9 2.66 
## 10        10 4.10 
## # ℹ 990 more rows

step 3

p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "greater") 
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.444

step 4

# Check the first few rows of the data
head(null_distribution)
## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 6 × 2
##   replicate  stat
##       <int> <dbl>
## 1         1  1.54
## 2         2  2.13
## 3         3  4.87
## 4         4  5.47
## 5         5  1.68
## 6         6  6.83
# Summarize the data
summary(null_distribution)
##    replicate           stat        
##  Min.   :   1.0   Min.   : 0.1474  
##  1st Qu.: 250.8   1st Qu.: 2.0351  
##  Median : 500.5   Median : 3.6826  
##  Mean   : 500.5   Mean   : 4.2540  
##  3rd Qu.: 750.2   3rd Qu.: 5.8291  
##  Max.   :1000.0   Max.   :26.9883
# Check the structure of the data
str(null_distribution)
## infer [1,000 × 2] (S3: infer/tbl_df/tbl/data.frame)
##  $ replicate: int [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ stat     : Named num [1:1000] 1.54 2.13 4.87 5.47 1.68 ...
##   ..- attr(*, "names")= chr [1:1000] "X-squared" "X-squared" "X-squared" "X-squared" ...
##  - attr(*, "response")= symbol health
##  - attr(*, "explanatory")= symbol speed
##  - attr(*, "response_type")= chr "factor"
##  - attr(*, "explanatory_type")= Named chr "factor"
##   ..- attr(*, "names")= chr "explanatory_variable(x)"
##  - attr(*, "distr_param")= Named int 4
##   ..- attr(*, "names")= chr "df"
##  - attr(*, "theory_type")= chr "Chi-square test of indep"
##  - attr(*, "type_desc_response")= chr "mult"
##  - attr(*, "type_desc_explanatory")= chr "mult"
##  - attr(*, "null")= chr "independence"
##  - attr(*, "generated")= logi TRUE
##  - attr(*, "type")= chr "permute"
##  - attr(*, "hypothesized")= logi TRUE
##  - attr(*, "fitted")= logi FALSE
##  - attr(*, "stat")= chr "Chisq"
# Calculate confidence interval using percentile method
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")

null_distribution %>%
  visualize(data, bins = 10, method = "simulation", dens_color = "black") + shade_p_value(obs_stat = test_stat, direction = "greater") + shade_confidence_interval(endpoints = conf_int)

# saving output

plot <- null_distribution %>%
  visualize(data, bins = 10, method = "simulation", dens_color = "black") +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
ggsave("null_distribution_plot.pdf", plot = plot, device = "pdf")
## Saving 7 x 5 in image
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

regression modelling output

dfp <- portugal_data
dfp <- dfp %>%
   mutate(health = case_when(
      w5q40 %in% c(1, 2) ~ 0,  # less vulnerable to 0
      w5q40 == 3 ~ NA_integer_, # collapse "the same" category
      w5q40 %in% c(4, 5) ~ 1,   # more vulnerable to 1 
      TRUE ~ NA_integer_
  ))

regression models table

model1 <- glm(health ~ speed, family = binomial, data = dfp)  # single predictor 
model2 <- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors 
model3 <- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction

modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
tinytable_5taqv2q4dfa9ejacr1ft
(1) (2) (3)
(Intercept) -0.7 (0.2)*** -0.7 (0.2)** -0.7 (0.2)**
speedquickly -0.2 (0.4) -0.2 (0.4) -0.3 (0.4)
speedslowly 0.6 (0.4) 0.6 (0.4) 0.4 (0.5)
trstsncLow trust 0.7 (1.4) 0.7 (1.4)
trstsncModerate trust -0.3 (0.5) -0.6 (0.6)
speedslowly × trstsncModerate trust 1.3 (1.2)
Num.Obs. 163 163 163
AIC 212.2 215.6 216.4
BIC 221.5 231.1 235.0
Log.Lik. -103.093 -102.820 -102.211
RMSE 0.47 0.47 0.47

improving display

models <- list()
models[['SLR']]<- glm(health ~ speed, family = binomial, data = dfp)  # single predictor 
models[['MLR']]<- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors 
models[['Interaction']]<- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction
modelsummary(models,
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  exponentiate = TRUE, ## NOTE: this is to exponentiate directly, otherwise leave out or set to FALSE
  coef_rename = c("speedquickly" = "Speed Quickly", "speedslowly" = "Speed Slowly", "trstsncLow trust" = "Low Trust", "trstsncModerate trust" = "Moderate Trust", "speedslowly x trstsncModerate trust" = "Speed Slowly x Moderate Trust"),
  title = 'Table 2. Regression models predicting subjective mental health vulnerability')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
tinytable_0lolj36xrqmrhhrstlea
Table 2. Regression models predicting subjective mental health vulnerability
SLR MLR Interaction
(Intercept) 0.5 (0.1)*** 0.5 (0.1)** 0.5 (0.1)**
Speed Quickly 0.8 (0.4) 0.8 (0.4) 0.8 (0.3)
Speed Slowly 1.8 (0.8) 1.8 (0.8) 1.4 (0.7)
Low Trust 2.0 (2.9) 1.9 (2.8)
Moderate Trust 0.8 (0.4) 0.6 (0.3)
Speed Slowly:Moderate Trust 3.6 (4.3)
Num.Obs. 163 163 163
AIC 212.2 215.6 216.4
BIC 221.5 231.1 235.0
Log.Lik. -103.093 -102.820 -102.211
RMSE 0.47 0.47 0.47

visual of choice

interaction_plot_portugal <- effect("speed*trstsnc", model3, na.rm=TRUE)
## Note:
##   3 values in the speed*trstsnc effect are not estimable
plot(interaction_plot_portugal,
     main="Interaction effect",
     xlab="speed",
     ylab="perception of mental health vulnerability")