setwd("/Users/kaylapatricia/Desktop/soc222/REPORT")
library(dplyr)
cronos <- read_csv("CRON2W5e01.csv")
## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
portugal_data <- cronos %>%
filter(cntry == "PT")
portugal_data <- portugal_data %>%
mutate(health = case_when(
w5q40 == 1 ~ 1, # much less vulnerable to 1
w5q40 == 2 ~ 2, # less vulnerable to 2
w5q40 == 3 ~ 3, # the same to 3
w5q40 == 4 ~ 4, # more vulnerable to 4
w5q40 == 5 ~ 5, # much more vulnerable to 5
TRUE ~ NA_real_
))
portugal_data <- portugal_data %>% filter(!is.na(health))
portugaldf <- portugal_data %>%
dplyr::select(w5q2, w5q32, w5q40)
portugaldf <- na.omit(portugaldf)
head(portugaldf)
## # A tibble: 6 × 3
## w5q2 w5q32 w5q40
## <dbl> <dbl> <dbl>
## 1 8 2 3
## 2 8 4 3
## 3 7 1 2
## 4 10 5 2
## 5 8 5 3
## 6 9 4 3
portugal_data <- portugal_data %>%
mutate(
health = health,
trstsnc = case_when(
w5q2 > 10 ~ NA_real_, # Set values above 10 to NA
TRUE ~ w5q2 # Keep other values as is
),
speed = case_when(
w5q32 > 7 ~ NA_real_, # Set values above 7 to NA
w5q32 < 1 ~ NA_real_, # Set values below 1 to NA
TRUE ~ w5q32 # Keep other values as is
)
)
dfpd <- portugal_data
dfpd <- dfpd %>% filter(!is.na(trstsnc))
dfpd <- dfpd %>% filter(!is.na(speed))
table1b <- datasummary_skim(dfpd %>% dplyr::select(health, trstsnc, speed), title = "Table 1. Descriptive statistics for main variables", output = "flextable")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
## Warning: `type='all'` is only supported for the `tinytable` backend. Set the
## `type` argument explicitly to suppress this warning.
table1b
| Unique | Missing Pct. | Mean | SD | Min | Median | Max |
|---|---|---|---|---|---|---|---|
health | 5 | 0 | 2.8 | 0.9 | 1.0 | 3.0 | 5.0 |
trstsnc | 10 | 0 | 7.9 | 1.6 | 0.0 | 8.0 | 10.0 |
speed | 7 | 0 | 4.2 | 1.5 | 1.0 | 4.0 | 7.0 |
portugal_data <- read_csv("CRON2W5e01.csv")
## Rows: 5765 Columns: 139
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, proddate, cntry, survlng, lastansw, uagent, region
## dbl (122): edition, idno, c2weight, mode, w5q1, w5q2, w5q3, w5q4, w5q5, w5q...
## dttm (10): inwds, inwde, w5ininwe, w5tsinwe, w5osinwe, w5cdinwe, w5ipinwe, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
portugal_data <- portugal_data %>%
filter(cntry == "PT") %>%
mutate(
trstsnc = case_when(
w5q2 %in% 0:3 ~ "Low trust",
w5q2 %in% 7:10 ~ "High trust",
w5q2 %in% 4:6 ~ "Moderate trust",
w5q2 %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ as.character(w5q2)
),
speed = case_when(
w5q32 %in% 1:2 ~ "slowly",
w5q32 %in% 6:7 ~ "quickly",
w5q32 %in% 3:5 ~ "moderately",
w5q32 %in% c(9, 77, 88, 99) ~ NA_character_,
TRUE ~ as.character(w5q32)
),
health = case_when(
w5q40 %in% 1:2 ~ "less vulnerable",
w5q40 %in% 3 ~ "the same",
w5q40 %in% 4:5 ~ "more vulnerable",
w5q40 %in% c(9, 77, 88, 99) ~ NA_character_,
TRUE ~ as.character(w5q40)
)
) %>%
filter(!is.na(trstsnc), !is.na(speed), !is.na(health))
table(portugal_data$trstsnc)
##
## High trust Low trust Moderate trust
## 304 5 57
table(portugal_data$speed)
##
## moderately quickly slowly
## 242 70 54
table(portugal_data$health)
##
## less vulnerable more vulnerable the same
## 108 55 203
test_stat <- portugal_data %>%
specify(explanatory = speed,
response = health) %>%
hypothesize(null = "independence") %>%
calculate(stat = "chisq")
print(test_stat$stat)
## X-squared
## 4.179716
null_distribution <- portugal_data %>%
specify(explanatory = speed,
response = health) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
null_distribution
## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 1.54
## 2 2 2.13
## 3 3 4.87
## 4 4 5.47
## 5 5 1.68
## 6 6 6.83
## 7 7 0.867
## 8 8 6.43
## 9 9 2.66
## 10 10 4.10
## # ℹ 990 more rows
p_val <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "greater")
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.444
# Check the first few rows of the data
head(null_distribution)
## Response: health (factor)
## Explanatory: speed (factor)
## Null Hypothesis: independence
## # A tibble: 6 × 2
## replicate stat
## <int> <dbl>
## 1 1 1.54
## 2 2 2.13
## 3 3 4.87
## 4 4 5.47
## 5 5 1.68
## 6 6 6.83
# Summarize the data
summary(null_distribution)
## replicate stat
## Min. : 1.0 Min. : 0.1474
## 1st Qu.: 250.8 1st Qu.: 2.0351
## Median : 500.5 Median : 3.6826
## Mean : 500.5 Mean : 4.2540
## 3rd Qu.: 750.2 3rd Qu.: 5.8291
## Max. :1000.0 Max. :26.9883
# Check the structure of the data
str(null_distribution)
## infer [1,000 × 2] (S3: infer/tbl_df/tbl/data.frame)
## $ replicate: int [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
## $ stat : Named num [1:1000] 1.54 2.13 4.87 5.47 1.68 ...
## ..- attr(*, "names")= chr [1:1000] "X-squared" "X-squared" "X-squared" "X-squared" ...
## - attr(*, "response")= symbol health
## - attr(*, "explanatory")= symbol speed
## - attr(*, "response_type")= chr "factor"
## - attr(*, "explanatory_type")= Named chr "factor"
## ..- attr(*, "names")= chr "explanatory_variable(x)"
## - attr(*, "distr_param")= Named int 4
## ..- attr(*, "names")= chr "df"
## - attr(*, "theory_type")= chr "Chi-square test of indep"
## - attr(*, "type_desc_response")= chr "mult"
## - attr(*, "type_desc_explanatory")= chr "mult"
## - attr(*, "null")= chr "independence"
## - attr(*, "generated")= logi TRUE
## - attr(*, "type")= chr "permute"
## - attr(*, "hypothesized")= logi TRUE
## - attr(*, "fitted")= logi FALSE
## - attr(*, "stat")= chr "Chisq"
# Calculate confidence interval using percentile method
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize(data, bins = 10, method = "simulation", dens_color = "black") + shade_p_value(obs_stat = test_stat, direction = "greater") + shade_confidence_interval(endpoints = conf_int)
# saving output
plot <- null_distribution %>%
visualize(data, bins = 10, method = "simulation", dens_color = "black") +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
ggsave("null_distribution_plot.pdf", plot = plot, device = "pdf")
## Saving 7 x 5 in image
## Warning: Chi-Square usually corresponds to right-tailed tests. Proceed with
## caution.
dfp <- portugal_data
dfp <- dfp %>%
mutate(health = case_when(
w5q40 %in% c(1, 2) ~ 0, # less vulnerable to 0
w5q40 == 3 ~ NA_integer_, # collapse "the same" category
w5q40 %in% c(4, 5) ~ 1, # more vulnerable to 1
TRUE ~ NA_integer_
))
model1 <- glm(health ~ speed, family = binomial, data = dfp) # single predictor
model2 <- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors
model3 <- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction
modelsummary(
list(model1, model2, model3),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | -0.7 (0.2)*** | -0.7 (0.2)** | -0.7 (0.2)** |
| speedquickly | -0.2 (0.4) | -0.2 (0.4) | -0.3 (0.4) |
| speedslowly | 0.6 (0.4) | 0.6 (0.4) | 0.4 (0.5) |
| trstsncLow trust | 0.7 (1.4) | 0.7 (1.4) | |
| trstsncModerate trust | -0.3 (0.5) | -0.6 (0.6) | |
| speedslowly × trstsncModerate trust | 1.3 (1.2) | ||
| Num.Obs. | 163 | 163 | 163 |
| AIC | 212.2 | 215.6 | 216.4 |
| BIC | 221.5 | 231.1 | 235.0 |
| Log.Lik. | -103.093 | -102.820 | -102.211 |
| RMSE | 0.47 | 0.47 | 0.47 |
models <- list()
models[['SLR']]<- glm(health ~ speed, family = binomial, data = dfp) # single predictor
models[['MLR']]<- glm(health ~ speed + trstsnc, family = binomial, data = dfp) # two predictors
models[['Interaction']]<- glm(health ~ speed + trstsnc + speed*trstsnc, family = binomial, data = dfp) # both predictors + interaction
modelsummary(models,
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
exponentiate = TRUE, ## NOTE: this is to exponentiate directly, otherwise leave out or set to FALSE
coef_rename = c("speedquickly" = "Speed Quickly", "speedslowly" = "Speed Slowly", "trstsncLow trust" = "Low Trust", "trstsncModerate trust" = "Moderate Trust", "speedslowly x trstsncModerate trust" = "Speed Slowly x Moderate Trust"),
title = 'Table 2. Regression models predicting subjective mental health vulnerability')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
| SLR | MLR | Interaction | |
|---|---|---|---|
| (Intercept) | 0.5 (0.1)*** | 0.5 (0.1)** | 0.5 (0.1)** |
| Speed Quickly | 0.8 (0.4) | 0.8 (0.4) | 0.8 (0.3) |
| Speed Slowly | 1.8 (0.8) | 1.8 (0.8) | 1.4 (0.7) |
| Low Trust | 2.0 (2.9) | 1.9 (2.8) | |
| Moderate Trust | 0.8 (0.4) | 0.6 (0.3) | |
| Speed Slowly:Moderate Trust | 3.6 (4.3) | ||
| Num.Obs. | 163 | 163 | 163 |
| AIC | 212.2 | 215.6 | 216.4 |
| BIC | 221.5 | 231.1 | 235.0 |
| Log.Lik. | -103.093 | -102.820 | -102.211 |
| RMSE | 0.47 | 0.47 | 0.47 |
interaction_plot_portugal <- effect("speed*trstsnc", model3, na.rm=TRUE)
## Note:
## 3 values in the speed*trstsnc effect are not estimable
plot(interaction_plot_portugal,
main="Interaction effect",
xlab="speed",
ylab="perception of mental health vulnerability")