This code and corresponding dataset are available on github: https://github.com/kareena-delrosario/regression_rcode
# read in data
data <- read_xlsx("/Users/kareenadelrosario/Desktop/local r code/regression_code/data_ttest.xlsx")
head(data)## # A tibble: 6 × 102
## `Response ID` GENDER AGE PARTY TWITTER TRUST RU1 RU2 RU3 RU4 RU5
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 R_0cj5dsJg2wfp… 1 18 1 0 95 4 26 0 -5 4
## 2 R_0rkhLjwWPHHj… 0 19 2 1 76 -5 16 3 -1 7
## 3 R_10BMNpjhInMf… 1 18 1 1 86 -5 -2 5 5 17
## 4 R_120iGR6WlLnb… 0 22 1 0 95 23 -10 -40 22 6
## 5 R_12qW8cDY0bNl… 0 19 3 0 76 18 -12 1 16 11
## 6 R_1BoktAHwyuom… 1 20 3 0 41 -4 -16 -4 -6 -2
## # ℹ 91 more variables: RU6 <dbl>, RU7 <dbl>, RU8 <dbl>, RU9 <dbl>, RU10 <dbl>,
## # RU11 <dbl>, RU12 <dbl>, RU13 <dbl>, RU14 <dbl>, RU15 <dbl>, RU16 <dbl>,
## # RU17 <dbl>, RU18 <dbl>, RU19 <dbl>, RU20 <dbl>, RU21 <dbl>, RU22 <dbl>,
## # RU23 <dbl>, RU24 <dbl>, RU25 <dbl>, RU26 <dbl>, RU27 <dbl>, RU28 <dbl>,
## # RU29 <dbl>, RU30 <dbl>, RU31 <dbl>, RU32 <dbl>, Pre1 <dbl>, Pre2 <dbl>,
## # Pre3 <dbl>, Pre4 <dbl>, Pre5 <dbl>, Pre6 <dbl>, Pre7 <dbl>, Pre8 <dbl>,
## # Pre9 <dbl>, Pre10 <dbl>, Pre11 <dbl>, Pre12 <dbl>, Pre13 <dbl>, …
# Option 1: similar to Python
## Trust in democrats
TrustD <- data[data$PARTY==1, "TRUST"]
## Trust in republicans
TrustR <- data[data$PARTY==2, "TRUST"]
# Option 2: dplyr
## Create "trustD" for Democrats
trustD <- data %>%
filter(PARTY == 1) %>%
select(TRUST)
## Create "trustR" for Republicans
trustR <- data %>%
filter(PARTY == 2) %>%
select(TRUST)# similar to Python - plots density (scales to 1)
d_hist_plot <- trustD %>%
ggplot(aes(x = TRUST)) +
geom_histogram(aes(y = ..density..), fill = "cadetblue2", bins = 10, color = "black") +
geom_density(color = "#327AAE") +
geom_vline(xintercept = 75, color = "red", size = 1) +
labs(x = "Trust in Science", y = "Density") +
ggtitle("Democrats Trust in Science") +
theme_minimal()
print(d_hist_plot)\[ t = \frac{\bar{x} - \mu_0}{s / \sqrt{N}}\]
## [1] 15.96992
Main function for t-tests
# do Democrats trust in science
t.test(TrustD$TRUST, mu = 75) # can set specific population mean to test by setting mu##
## One Sample t-test
##
## data: TrustD$TRUST
## t = 15.967, df = 126, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 75
## 95 percent confidence interval:
## 86.90615 90.27495
## sample estimates:
## mean of x
## 88.59055
# ggplot approach
ggplot(trustD, aes(sample = TRUST)) +
stat_qq() +
stat_qq_line(color = "red") + # add line to see normal dist
ggtitle("Q-Q Plot of Democrat Trust") +
theme_minimal()We can also quantitatively test whether the data are normally distributed.
##
## Shapiro-Wilk normality test
##
## data: trustD$TRUST
## W = 0.92291, p-value = 2.002e-06
##
## Shapiro-Wilk normality test
##
## data: trustR$TRUST
## W = 0.87772, p-value = 0.01325
This is the non-parametric test for non-normally distributed data.
With continuity correction (default; treats like it’s normally distributed)
##
## Wilcoxon signed rank test with continuity correction
##
## data: trustD$TRUST
## V = 8128, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
Without continuity correction (only when you have super small samples)
##
## Wilcoxon signed rank test
##
## data: trustD$TRUST
## V = 7646, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 75
bar_plot_party <- data %>%
ggplot(aes(x = PARTY, y = TRUST, fill = PARTY)) +
stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + # mean_se is an argument that calculates SD (can calculate SE with additional arguments)
labs(x = "Party", y = "Trust") + # add title to x and y-axis (labs = "labels")
theme_minimal() + # preset theme
ggtitle("Trust by Political Party")
bar_plot_partybar_plot_party <- data %>%
ggplot(aes(x = factor(PARTY), y = TRUST, fill = factor(PARTY))) +
stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + # mean_se is an argument that calculates SD (can calculate SE with additional arguments)
labs(x = "Party", y = "Trust in Science") + # add title to x and y-axis (labs = "labels")
theme_classic() + # to match python
ggtitle("Trust by Political Party")
bar_plot_partybar_plot_party +
scale_x_discrete(labels=c('Democrats', 'Republicans', 'Independents')) + # x-axis labels
theme(legend.position = "none") + # no legend
ylim(0, 100) + # change y-axis "limits" to 0 to 100
scale_fill_manual(values = c("#377eb8", "#e41a1c", "#ffcc33")) # change colors# we're just going to plot democrats and republicans
data$PartyLabel <- ifelse(data$PARTY == 1, "Democrats", "Republicans") # assign a label
p <- ggplot(data, aes(x = TRUST, fill = PartyLabel)) +
geom_histogram(aes(y = ..density..), position = "identity", alpha = 0.5, bins = 40) +
geom_density(alpha = 0.3) + # transparency of density graphs
scale_fill_manual(values = c("Democrats" = "#327AAE", "Republicans" = "#F93F17")) + # colors
labs(x = "Trust in Science", y = "Density", fill = "Party") + # labels
theme_minimal() +
guides(fill = guide_legend(title = "Party")) + # title for legend
ggtitle("Distribution of Trust in Science among Dems and Reps") +
scale_x_continuous(limits=c(0,130)) + # change x-axis to match python
scale_y_continuous(limits=c(0,.075)) # change y-axis (alternative to ylim)
print(p)# Option 1 - default welch correction
t.test(trustD$TRUST, trustR$TRUST, var.equal = FALSE) # T Test comparing Group 1 to Group 2##
## Welch Two Sample t-test
##
## data: trustD$TRUST and trustR$TRUST
## t = 2.0321, df = 21.517, p-value = 0.05468
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1996851 18.4284065
## sample estimates:
## mean of x mean of y
## 88.59055 79.47619
# get rid of party 3 (independents)
df_no_ind <- data %>%
filter(PARTY != 3)
# Option 2 - T-test without welch correction (same as Python)
t.test(TRUST ~ PARTY, data=df_no_ind, var.equal = T) ##
## Two Sample t-test
##
## data: TRUST by PARTY
## t = 3.3276, df = 146, p-value = 0.001109
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 3.701127 14.527595
## sample estimates:
## mean in group 1 mean in group 2
## 88.59055 79.47619
# package for effect size
library(lsr)
cohensD(TRUST ~ PARTY,
data = df_no_ind) # reference data without independents## [1] 0.7838837
# ggplot approach
ggplot(df_no_ind, aes(sample = TRUST)) +
stat_qq() +
stat_qq_line(color = "red") + # add line to see normal dist
ggtitle("Q-Q Plot of Trust") +
theme_minimal() +
facet_wrap(~ PartyLabel) # separate by party##
## Shapiro-Wilk normality test
##
## data: trustD$TRUST
## W = 0.92291, p-value = 2.002e-06
##
## Shapiro-Wilk normality test
##
## data: trustR$TRUST
## W = 0.87772, p-value = 0.01325
How would you interpret the Shapiro test?
FYI Main difference between this test and the Wilcoxon:
The Mann-Whitney U test (or Wilcoxon rank-sum test) is for two independent samples, while the Wilcoxon signed-rank test is for paired or matched samples.
# not necessary to add the paired argument. Just want to make it clear that this is independent.
wilcox.test(trustD$TRUST, trustR$TRUST, paired = FALSE) ##
## Wilcoxon rank sum test with continuity correction
##
## data: trustD$TRUST and trustR$TRUST
## W = 1612, p-value = 0.1244
## alternative hypothesis: true location shift is not equal to 0
We need the pre and post measurement in long format (i.e., same variable). We did this before, remember? How do we do that?
HINT: the columns we want to ‘pivot’ are Pre1 and Post1. We’re creating a new column called TimePoint (should have Pre1 and Post1 names) and its values should be in a new column called Belief.
long_df <- data %>%
pivot_longer(cols = c("Pre1", "Post1"),
names_to = "TimePoint",
values_to = "Belief")
# check our work
long_df %>%
select(`Response ID`, TimePoint, Belief)## # A tibble: 400 × 3
## `Response ID` TimePoint Belief
## <chr> <chr> <dbl>
## 1 R_0cj5dsJg2wfpiuJ Pre1 83
## 2 R_0cj5dsJg2wfpiuJ Post1 87
## 3 R_0rkhLjwWPHHjnTX Pre1 66
## 4 R_0rkhLjwWPHHjnTX Post1 61
## 5 R_10BMNpjhInMfUeO Pre1 67
## 6 R_10BMNpjhInMfUeO Post1 62
## 7 R_120iGR6WlLnbZnI Pre1 74
## 8 R_120iGR6WlLnbZnI Post1 97
## 9 R_12qW8cDY0bNlId2 Pre1 81
## 10 R_12qW8cDY0bNlId2 Post1 99
## # ℹ 390 more rows
bar_plot_paired <- long_df %>%
mutate(TimePoint = factor(TimePoint, levels = c("Pre1", "Post1"))) %>% # change order of time points
ggplot(aes(x = TimePoint, y = Belief, fill = TimePoint)) +
stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + #
labs(x = "Time Point", y = "Belief") + # add title to x and y-axis (labs = "labels")
theme_classic() +
ylim(c(0,80))
bar_plot_paired# using original data where pre and post are separate variables
t.test(data$Pre1, data$Post1, paired=T) # tests whether difference is significantly different from 0##
## Paired t-test
##
## data: data$Pre1 and data$Post1
## t = 1.9547, df = 199, p-value = 0.05202
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.01711808 3.89021808
## sample estimates:
## mean difference
## 1.93655
Let’s manually calculate Cohen’s D, borrowing from Madalina’s Python code. I’m simplifying it because we just need Cohen’s D.
cohen <- function(x1, x2) {
xd = x1 - x2
Md = mymean(xd)
sd = mysd(xd)
d = abs(Md) / sd
print(d)
}
cohen(data$Pre1, data$Post1)## [1] 0.1382166
##
## Wilcoxon signed rank test with continuity correction
##
## data: data$Pre1 and data$Post1
## V = 9406.5, p-value = 0.0715
## alternative hypothesis: true location shift is not equal to 0
You can get the means and sds the old-fashioned way, but if you plan to get descriptives from several variables, you might as well build a function.
descriptives <- function(x1, x2) {
m1 <- mymean(x1)
m2 <- mymean(x2)
s1 <- mysd(x1)
s2 <- mysd(x2)
# Create a data frame for nicer display - could use table() but it's ugly
results_df <- data.frame(
Metric = c("Mean", "SD"), # 3 columns = metric, pre, post
Pre1 = c(m1, s1),
Post1 = c(m2, s2)
)
return(results_df)
}
descriptives(data$Pre1, data$Post1)## Metric Pre1 Post1
## 1 Mean 73.3500 71.41345
## 2 SD 15.3988 17.56382