# Load the necessary packages
library(tidyverse)
library(infer)
# Set the seed for reproducibility
set.seed(02138)
# Load the data from the given URL
cbt <- read_csv("https://bit.ly/3KfrFtx")
# Create new variables
cbt <- cbt %>%
mutate(
cbt_assigned = if_else(tpassigned == 1, "CBT", "No CBT"),
unhoused_baseline = if_else(homeless_baseline == 1, "Unhoused", "Housed")
)
# Calculate the difference in proportions of being unhoused at baseline
base_diff <- cbt %>%
group_by(cbt_assigned) %>%
summarize(prop_unhoused = mean(homeless_baseline)) %>%
summarize(diff = diff(prop_unhoused)) %>%
pull(diff); base_diff
## [1] 0.006020995
# Convert homeless_baseline to a factor first
cbt <- cbt %>%
mutate(homeless_baseline = as.factor(homeless_baseline))
# Conduct a permutation test
permutation_test <- cbt %>%
specify(homeless_baseline ~ cbt_assigned, success = "1") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in props", order = c("CBT", "No CBT"))
# Calculate the p-value
base_p <- permutation_test %>%
get_p_value(obs_stat = base_diff, direction = "two-sided"); base_p
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.884
According to the report below, The Observed Difference in proportions (base_diff) is 0.006020995 and The Two-sided p-value (base_p$p_value) is 0.884. In this case, Two-sided p-value is greater than 0.05, it will be failed to reject the null hypothesis because the proportion of unhoused individuals does not differ significantly (no significant difference) between CBT treatment group and the non-CBT group. Thus, the Type II error would be concerned since it fails to reject a false null hypothesis, and there might be an actual difference that we did not detect due to the limitations of the sample size or experimental design.
# Again, set seed for reproducibility
set.seed(02138)
# Create new variables call "Attended CBT" for "cbt_attended"
cbt <- cbt %>%
mutate(
cbt_assigned = if_else(tpassigned == 1, "CBT", "No CBT"),
unhoused_baseline = if_else(homeless_baseline == 1, "Unhoused", "Housed"),
cbt_attended = if_else(attend_80 == 1, "Attended CBT", "Not Attended")
)
# Filter data for those assigned to CBT treatment only
cbt_treatment <- cbt %>% filter(tpassigned == 1)
# Calculate the difference in proportions of being unhoused at baseline
base_diff_attend <- cbt_treatment %>%
group_by(cbt_attended) %>%
summarize(prop_unhoused = mean(as.numeric(homeless_baseline))) %>%
summarize(diff = diff(prop_unhoused)) %>%
pull(diff); base_diff_attend
## [1] 0.1471161
# Convert homeless_baseline to a factor first
cbt_treatment <- cbt_treatment %>%
mutate(homeless_baseline = as.factor(homeless_baseline))
# Conduct a permutation test
permutation_test_attend <- cbt_treatment %>%
specify(homeless_baseline ~ cbt_attended, success = "1") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in props", order = c("Attended CBT", "Not Attended"))
# Calculate the p-value
base_p_attend <- permutation_test_attend %>%
get_p_value(obs_stat = base_diff_attend, direction = "two-sided"); base_p_attend
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
The report shows The Observed Difference in proportions is 0.1471161 (base_diff_attend) and Two-sided p-value is 0 (base_p_attend$p_value). In this case, the Two-sided of p-value is less than 0.05, it can reject the null hypothesis since it has a significant difference in the proportion of unhoused individuals between those who attended at least 80% of the CBT meetings and those who did not. Type I error is what needs to concern because it might be rejecting a true null hypothesis. Asides, the parameter that controls this type of error is the significance level (alpha).
# Set seed for reproducibility
set.seed(02138)
# Calculate the estimated Average Treatment Effect (ATE)
ate <- cbt %>%
group_by(cbt_assigned) %>%
summarize(mean_asb_st = mean(fam_asb_st, na.rm = TRUE)) %>%
summarize(diff = diff(mean_asb_st)) %>%
pull(diff); ate
## [1] 0.224695
# Conduct a permutation test
ate_null_dist <- cbt %>%
specify(fam_asb_st ~ cbt_assigned) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("CBT", "No CBT"))
## Warning: Removed 77 rows containing missing values.
# Plot the null distribution and the observed ATE
ggplot(data = ate_null_dist, aes(x = stat)) +
geom_histogram(binwidth = 0.05, fill = "pink3", color = "white") +
geom_vline(aes(xintercept = ate), color = "lightslateblue", linetype = "dashed") +
labs(title = "Null Distribution of the Difference in Means",
x = "Difference in Means",
y = "Frequency") +
theme_minimal()
The result shows two-sided p-value is 0 (0 < 0.05), it can reject the null hypothesis. This indicates the CBT treatment has a statistically significant effect on reducing short-term anti-social behavior. The probability of observing a difference in means at least as extreme as the one we found, assuming the null hypothesis is true since there is no actual treatment effect.
# Calculate the two-sided p-value
ate_p <- ate_null_dist %>%
get_p_value(obs_stat = ate, direction = "two-sided"); ate_p
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
# Set seed for reproducibility
set.seed(02138)
# Calculate the estimated Average Treatment Effect (ATE) for long-term measure
ate_lt <- cbt %>%
group_by(cbt_assigned) %>%
summarize(mean_asb_lt = mean(fam_asb_lt, na.rm = TRUE)) %>%
summarize(diff = diff(mean_asb_lt)) %>%
pull(diff); ate_lt
## [1] 0.1854676
# Conduct a permutation test for the long-term measure
ate_lt_null_dist <- cbt %>%
specify(fam_asb_lt ~ cbt_assigned) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("CBT", "No CBT"))
## Warning: Removed 52 rows containing missing values.
# Plot the null distribution and the observed ATE for the long-term measure
ate_lt_null_dist %>%
visualize() +
shade_p_value(obs_stat = ate_lt, direction = "two-sided") +
labs(title = "Null Distribution of the Difference in Means (Long-term)",
x = "Difference in Means",
y = "Frequency")
The p-value is 0.002 which is less than 0.05, it can reject the null hypothesis, indicating the CBT treatment has a statistically significant effect on long-term anti-social behavior. In this case, the long-term effect is significant (p < 0.05), the benefits of the CBT treatment are not just immediate but persist over a longer period, which indicates the intervention’s effectiveness in sustaining behavioral changes.
# Calculate the p-value for the long-term measure
ate_lt_p <- ate_lt_null_dist %>%
get_p_value(obs_stat = ate_lt, direction = "two-sided"); ate_lt_p
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0.002