initial set up

loading dataset and recoding variables

setwd("/Users/kaylapatricia/Desktop/soc222/REPORT")
library(dplyr)
cronos <- read_csv("CRON2W5e01.csv")

## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl  (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm  (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

portugal_data <- cronos %>%
  filter(cntry == "PT")
portugal_data <- portugal_data %>%
  mutate(health = case_when(
      w5q40 == 1 ~ 1, # much less vulnerable to 1
      w5q40 == 2 ~ 2, # less vulnerable to 2
      w5q40 == 3 ~ 3, # the same to 3
      w5q40 == 4 ~ 4, # more vulnerable to 4
      w5q40 == 5 ~ 5, # much more vulnerable to 5
      TRUE ~ NA_real_
  ))
portugal_data <- portugal_data %>% filter(!is.na(health))

selecting variables of interest

portugaldf <- portugal_data %>%
  dplyr::select(w5q2, w5q32, w5q40)
portugaldf <- na.omit(portugaldf)
head(portugaldf)

## # A tibble: 6 × 3
##    w5q2 w5q32 w5q40
##   <dbl> <dbl> <dbl>
## 1     8     2     3
## 2     8     4     3
## 3     7     1     2
## 4    10     5     2
## 5     8     5     3
## 6     9     4     3

descriptive table

portugal_data <- portugal_data %>%
  mutate(
    health = health,
    trstsnc = case_when(
      w5q2 > 10 ~ NA_real_,     # Set values above 10 to NA
      TRUE ~ w5q2               # Keep other values as is
    ),
    speed = case_when(
      w5q32 > 7 ~ NA_real_,       # Set values above 7 to NA
      w5q32 < 1 ~ NA_real_,        # Set values below 1 to NA
      TRUE ~ w5q32                  # Keep other values as is
    )
  )
dfpd <- portugal_data
dfpd <- dfpd %>% filter(!is.na(trstsnc))
dfpd <- dfpd %>% filter(!is.na(speed))

table1b <- datasummary_skim(dfpd  %>% dplyr::select(health, trstsnc, speed), title = "Table 1. Descriptive statistics for main variables", output = "flextable")

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

## Warning: `type='all'` is only supported for the `tinytable` backend. Set the
##   `type` argument explicitly to suppress this warning.

table1b

	Unique	Mean	SD	Min	Median	Max
health	5	2.8	0.9	1.0	3.0	5.0
trstsnc	10	7.9	1.6	0.0	8.0	10.0
speed	7	4.2	1.5	1.0	4.0	7.0

saving output

portugal_data <- read_csv("CRON2W5e01.csv")

## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl  (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm  (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

portugal_data <- portugal_data %>%
  filter(cntry == "PT") %>%
  
     mutate(
    trstsnc = case_when(
      w5q2 %in% 0:3 ~ "Low trust",
      w5q2 %in% 7:10 ~ "High trust",
      w5q2 %in% 4:6 ~ "Moderate trust", 
      w5q2 %in% c(77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q2)
    ), 
    
    speed = case_when(
      w5q32 %in% 1:2 ~ "slowly",
      w5q32 %in% 6:7 ~ "quickly",
      w5q32 %in% 3:5 ~ "moderately",
      w5q32 %in% c(9, 77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q32)
    ),
    
    health = case_when(
      w5q40 %in% 1:2 ~ "less vulnerable",
      w5q40 %in% 3 ~ "the same",
      w5q40 %in% 4:5 ~ "more vulnerable",
      w5q40 %in% c(9, 77, 88, 99) ~ NA_character_,
      TRUE ~ as.character(w5q40)
  )
     ) %>%
    filter(!is.na(trstsnc), !is.na(speed), !is.na(health))

checking variables

table(portugal_data$trstsnc)

## 
##     High trust      Low trust Moderate trust 
##            304              5             57

table(portugal_data$speed)

## 
## moderately    quickly     slowly 
##        242         70         54

table(portugal_data$health)

## 
## less vulnerable more vulnerable        the same 
##             108              55             203

hypothesis testing with infer – Step 1

test_stat <- portugal_data %>%
  specify(explanatory = speed, 
          response = health) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "chisq")

print(test_stat$stat)

## X-squared 
##  4.179716

Step 2

null_distribution <- portugal_data %>%
  specify(explanatory = speed,
          response = health) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")
null_distribution

## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1 1.54 
##  2         2 2.13 
##  3         3 4.87 
##  4         4 5.47 
##  5         5 1.68 
##  6         6 6.83 
##  7         7 0.867
##  8         8 6.43 
##  9         9 2.66 
## 10        10 4.10 
## # ℹ 990 more rows

step 3

p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "greater") 
p_val

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.444

step 4

# Check the first few rows of the data
head(null_distribution)

## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 6 × 2
##   replicate  stat
##       <int> <dbl>
## 1         1  1.54
## 2         2  2.13
## 3         3  4.87
## 4         4  5.47
## 5         5  1.68
## 6         6  6.83

# Summarize the data
summary(null_distribution)

##    replicate           stat        
##  Min.   :   1.0   Min.   : 0.1474  
##  1st Qu.: 250.8   1st Qu.: 2.0351  
##  Median : 500.5   Median : 3.6826  
##  Mean   : 500.5   Mean   : 4.2540  
##  3rd Qu.: 750.2   3rd Qu.: 5.8291  
##  Max.   :1000.0   Max.   :26.9883

# Check the structure of the data
str(null_distribution)

## infer [1,000 × 2] (S3: infer/tbl_df/tbl/data.frame)
##  $ replicate: int [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ stat     : Named num [1:1000] 1.54 2.13 4.87 5.47 1.68 ...
##   ..- attr(*, "names")= chr [1:1000] "X-squared" "X-squared" "X-squared" "X-squared" ...
##  - attr(*, "response")= symbol health
##  - attr(*, "explanatory")= symbol speed
##  - attr(*, "response_type")= chr "factor"
##  - attr(*, "explanatory_type")= Named chr "factor"
##   ..- attr(*, "names")= chr "explanatory_variable(x)"
##  - attr(*, "distr_param")= Named int 4
##   ..- attr(*, "names")= chr "df"
##  - attr(*, "theory_type")= chr "Chi-square test of indep"
##  - attr(*, "type_desc_response")= chr "mult"
##  - attr(*, "type_desc_explanatory")= chr "mult"
##  - attr(*, "null")= chr "independence"
##  - attr(*, "generated")= logi TRUE
##  - attr(*, "type")= chr "permute"
##  - attr(*, "hypothesized")= logi TRUE
##  - attr(*, "fitted")= logi FALSE
##  - attr(*, "stat")= chr "Chisq"

# Calculate confidence interval using percentile method
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")

null_distribution %>%
  visualize(data, bins = 10, method = "simulation", dens_color = "black") + shade_p_value(obs_stat = test_stat, direction = "greater") + shade_confidence_interval(endpoints = conf_int)

# saving output

plot <- null_distribution %>%
  visualize(data, bins = 10, method = "simulation", dens_color = "black") +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
ggsave("null_distribution_plot.pdf", plot = plot, device = "pdf")

## Saving 7 x 5 in image

## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.

regression modelling output

dfp <- portugal_data
dfp <- dfp %>%
   mutate(health = case_when(
      w5q40 %in% c(1, 2) ~ 0,  # less vulnerable to 0
      w5q40 == 3 ~ NA_integer_, # collapse "the same" category
      w5q40 %in% c(4, 5) ~ 1,   # more vulnerable to 1 
      TRUE ~ NA_integer_
  ))

regression models table

model1 <- glm(health ~ speed, family = binomial, data = dfp)  # single predictor 
model2 <- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors 
model3 <- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction

modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

tinytable_5taqv2q4dfa9ejacr1ft

	(1)	(2)	(3)
(Intercept)	-0.7 (0.2)***	-0.7 (0.2)**	-0.7 (0.2)**
speedquickly	-0.2 (0.4)	-0.2 (0.4)	-0.3 (0.4)
speedslowly	0.6 (0.4)	0.6 (0.4)	0.4 (0.5)
trstsncLow trust		0.7 (1.4)	0.7 (1.4)
trstsncModerate trust		-0.3 (0.5)	-0.6 (0.6)
speedslowly × trstsncModerate trust			1.3 (1.2)
Num.Obs.	163	163	163
AIC	212.2	215.6	216.4
BIC	221.5	231.1	235.0
Log.Lik.	-103.093	-102.820	-102.211
RMSE	0.47	0.47	0.47

improving display

models <- list()
models[['SLR']]<- glm(health ~ speed, family = binomial, data = dfp)  # single predictor 
models[['MLR']]<- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors 
models[['Interaction']]<- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction
modelsummary(models,
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  exponentiate = TRUE, ## NOTE: this is to exponentiate directly, otherwise leave out or set to FALSE
  coef_rename = c("speedquickly" = "Speed Quickly", "speedslowly" = "Speed Slowly", "trstsncLow trust" = "Low Trust", "trstsncModerate trust" = "Moderate Trust", "speedslowly x trstsncModerate trust" = "Speed Slowly x Moderate Trust"),
  title = 'Table 2. Regression models predicting subjective mental health vulnerability')

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

tinytable_0lolj36xrqmrhhrstlea

Table 2. Regression models predicting subjective mental health vulnerability
	SLR	MLR	Interaction
(Intercept)	0.5 (0.1)***	0.5 (0.1)**	0.5 (0.1)**
Speed Quickly	0.8 (0.4)	0.8 (0.4)	0.8 (0.3)
Speed Slowly	1.8 (0.8)	1.8 (0.8)	1.4 (0.7)
Low Trust		2.0 (2.9)	1.9 (2.8)
Moderate Trust		0.8 (0.4)	0.6 (0.3)
Speed Slowly:Moderate Trust			3.6 (4.3)
Num.Obs.	163	163	163
AIC	212.2	215.6	216.4
BIC	221.5	231.1	235.0
Log.Lik.	-103.093	-102.820	-102.211
RMSE	0.47	0.47	0.47

visual of choice

interaction_plot_portugal <- effect("speed*trstsnc", model3, na.rm=TRUE)

## Note:
##   3 values in the speed*trstsnc effect are not estimable

plot(interaction_plot_portugal,
     main="Interaction effect",
     xlab="speed",
     ylab="perception of mental health vulnerability")

report

Kayla Salomon

2024-04-13

initial set up

loading dataset and recoding variables

selecting variables of interest

descriptive table

saving output

checking variables

hypothesis testing with infer – Step 1

Step 2

step 3

step 4

regression modelling output

regression models table

improving display

visual of choice