Prep the data

This code and corresponding dataset are available on github: https://github.com/kareena-delrosario/regression_rcode

# read in data
data <- read_xlsx("/Users/kareenadelrosario/Desktop/local r code/regression_code/data_ttest.xlsx")

head(data)
## # A tibble: 6 × 102
##   `Response ID`   GENDER   AGE PARTY TWITTER TRUST   RU1   RU2   RU3   RU4   RU5
##   <chr>            <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 R_0cj5dsJg2wfp…      1    18     1       0    95     4    26     0    -5     4
## 2 R_0rkhLjwWPHHj…      0    19     2       1    76    -5    16     3    -1     7
## 3 R_10BMNpjhInMf…      1    18     1       1    86    -5    -2     5     5    17
## 4 R_120iGR6WlLnb…      0    22     1       0    95    23   -10   -40    22     6
## 5 R_12qW8cDY0bNl…      0    19     3       0    76    18   -12     1    16    11
## 6 R_1BoktAHwyuom…      1    20     3       0    41    -4   -16    -4    -6    -2
## # ℹ 91 more variables: RU6 <dbl>, RU7 <dbl>, RU8 <dbl>, RU9 <dbl>, RU10 <dbl>,
## #   RU11 <dbl>, RU12 <dbl>, RU13 <dbl>, RU14 <dbl>, RU15 <dbl>, RU16 <dbl>,
## #   RU17 <dbl>, RU18 <dbl>, RU19 <dbl>, RU20 <dbl>, RU21 <dbl>, RU22 <dbl>,
## #   RU23 <dbl>, RU24 <dbl>, RU25 <dbl>, RU26 <dbl>, RU27 <dbl>, RU28 <dbl>,
## #   RU29 <dbl>, RU30 <dbl>, RU31 <dbl>, RU32 <dbl>, Pre1 <dbl>, Pre2 <dbl>,
## #   Pre3 <dbl>, Pre4 <dbl>, Pre5 <dbl>, Pre6 <dbl>, Pre7 <dbl>, Pre8 <dbl>,
## #   Pre9 <dbl>, Pre10 <dbl>, Pre11 <dbl>, Pre12 <dbl>, Pre13 <dbl>, …

Create new variable that selects trust in democrats and republicans

# Option 1: similar to Python

## Trust in democrats
TrustD <- data[data$PARTY==1, "TRUST"]
## Trust in republicans
TrustR <- data[data$PARTY==2, "TRUST"]

# Option 2: dplyr

## Create "trustD" for Democrats
trustD <- data %>% 
  filter(PARTY == 1) %>%
  select(TRUST)

## Create "trustR" for Republicans
trustR <- data %>%
  filter(PARTY == 2) %>%
  select(TRUST)

Question: Do Democrats trust in science (above chance, if chance is 75)?

Preview the data

Base R: hist()

# simple histogram - plots frequency
hist(TrustD$TRUST)

ggplot

# similar to Python - plots density (scales to 1)
d_hist_plot <- trustD %>% 
  ggplot(aes(x = TRUST)) + 
  geom_histogram(aes(y = ..density..), fill = "cadetblue2", bins = 10, color = "black") +
  geom_density(color = "#327AAE") +
  geom_vline(xintercept = 75, color = "red",  size = 1) +
  labs(x = "Trust in Science", y = "Density") +
  ggtitle("Democrats Trust in Science") +
  theme_minimal()

print(d_hist_plot)

Descriptives

We’re going to shorten the mean and sd code to make our descriptives easier.

mymean <- partial(mean, na.rm = TRUE)
mysd <- partial(sd, na.rm = TRUE)
mymean(trustD$TRUST)
## [1] 88.59055
mysd(trustD$TRUST)
## [1] 9.591956
length(trustD$TRUST)
## [1] 127

One sample t-test

\[ t = \frac{\bar{x} - \mu_0}{s / \sqrt{N}}\]

# manual calculation of t-test
# mu = population mean (75)

(88.59-75)/(9.59/sqrt(127))
## [1] 15.96992

Main function for t-tests

# do Democrats trust in science
t.test(TrustD$TRUST, mu = 75) # can set specific population mean to test by setting mu
## 
##  One Sample t-test
## 
## data:  TrustD$TRUST
## t = 15.967, df = 126, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 75
## 95 percent confidence interval:
##  86.90615 90.27495
## sample estimates:
## mean of x 
##  88.59055

Check whether TRUST is normally distributed

Base R: Q-Q Plot

qqnorm(trustD$TRUST)

ggplot

# ggplot approach
ggplot(trustD, aes(sample = TRUST)) +
  stat_qq() +
  stat_qq_line(color = "red") + # add line to see normal dist
  ggtitle("Q-Q Plot of Democrat Trust") +
  theme_minimal()

Shapiro-Wilk Test

We can also quantitatively test whether the data are normally distributed.

  • Democrats
shapiro.test(trustD$TRUST)
## 
##  Shapiro-Wilk normality test
## 
## data:  trustD$TRUST
## W = 0.92291, p-value = 2.002e-06
  • Republicans
shapiro.test(trustR$TRUST)
## 
##  Shapiro-Wilk normality test
## 
## data:  trustR$TRUST
## W = 0.87772, p-value = 0.01325

Wilcoxon Test for non-normal dist

This is the non-parametric test for non-normally distributed data.

With continuity correction (default; treats like it’s normally distributed)

wilcox.test(trustD$TRUST)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  trustD$TRUST
## V = 8128, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0

Without continuity correction (only when you have super small samples)

wilcox.test(trustD$TRUST, mu = 75, correct = FALSE)
## 
##  Wilcoxon signed rank test
## 
## data:  trustD$TRUST
## V = 7646, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 75

Two-samples t-test

Visualize trust by political party

bar_plot_party <- data %>%
  ggplot(aes(x = PARTY, y = TRUST, fill = PARTY)) + 
  stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
  stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + # mean_se is an argument that calculates SD (can calculate SE with additional arguments)
  labs(x = "Party", y = "Trust") + # add title to x and y-axis (labs = "labels")
  theme_minimal() + # preset theme
  ggtitle("Trust by Political Party")

bar_plot_party

Quick fix for categorical variables that are coded as continuous
bar_plot_party <- data %>%
  ggplot(aes(x = factor(PARTY), y = TRUST, fill = factor(PARTY))) + 
  stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
  stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + # mean_se is an argument that calculates SD (can calculate SE with additional arguments)
  labs(x = "Party", y = "Trust in Science") + # add title to x and y-axis (labs = "labels")
  theme_classic() + # to match python
  ggtitle("Trust by Political Party")

bar_plot_party

Match python plot
bar_plot_party +
    scale_x_discrete(labels=c('Democrats', 'Republicans', 'Independents')) + # x-axis labels
  theme(legend.position = "none") + # no legend
  ylim(0, 100) + # change y-axis "limits" to 0 to 100
  scale_fill_manual(values = c("#377eb8", "#e41a1c", "#ffcc33")) # change colors

Histogram

# we're just going to plot democrats and republicans
data$PartyLabel <- ifelse(data$PARTY == 1, "Democrats", "Republicans") # assign a label

p <- ggplot(data, aes(x = TRUST, fill = PartyLabel)) +
  geom_histogram(aes(y = ..density..), position = "identity", alpha = 0.5, bins = 40) +
  geom_density(alpha = 0.3) + # transparency of density graphs
  scale_fill_manual(values = c("Democrats" = "#327AAE", "Republicans" = "#F93F17")) + # colors
  labs(x = "Trust in Science", y = "Density", fill = "Party") + # labels
  theme_minimal() +
  guides(fill = guide_legend(title = "Party")) + # title for legend
  ggtitle("Distribution of Trust in Science among Dems and Reps") +
  scale_x_continuous(limits=c(0,130)) + # change x-axis to match python
  scale_y_continuous(limits=c(0,.075)) # change y-axis (alternative to ylim)

print(p)

## If you want to save the image. It'll output to your working directory
# ggsave("trust.png", plot = p, dpi = 300, width = 8, height = 6)

Separate by party using facet_wrap()

Note that they share the same y-axis scale (unlike Python)

p + facet_wrap(~PartyLabel)


Independent samples t-test

# Option 1 - default welch correction
t.test(trustD$TRUST, trustR$TRUST, var.equal = FALSE) # T Test comparing Group 1 to Group 2
## 
##  Welch Two Sample t-test
## 
## data:  trustD$TRUST and trustR$TRUST
## t = 2.0321, df = 21.517, p-value = 0.05468
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1996851 18.4284065
## sample estimates:
## mean of x mean of y 
##  88.59055  79.47619
# get rid of party 3 (independents)
df_no_ind <- data %>% 
  filter(PARTY != 3)

# Option 2 - T-test without welch correction (same as Python)
t.test(TRUST ~ PARTY, data=df_no_ind, var.equal = T) 
## 
##  Two Sample t-test
## 
## data:  TRUST by PARTY
## t = 3.3276, df = 146, p-value = 0.001109
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##   3.701127 14.527595
## sample estimates:
## mean in group 1 mean in group 2 
##        88.59055        79.47619
# package for effect size
library(lsr)

cohensD(TRUST ~ PARTY,
       data = df_no_ind) # reference data without independents
## [1] 0.7838837

Check assumptions of normality

Visualize with Q-Q Plot

# ggplot approach
ggplot(df_no_ind, aes(sample = TRUST)) +
  stat_qq() +
  stat_qq_line(color = "red") + # add line to see normal dist
  ggtitle("Q-Q Plot of Trust") +
  theme_minimal() +
  facet_wrap(~ PartyLabel) # separate by party

Test with Shapiro

# going back to our df with just democrats
shapiro.test(trustD$TRUST)
## 
##  Shapiro-Wilk normality test
## 
## data:  trustD$TRUST
## W = 0.92291, p-value = 2.002e-06
# just republicans
shapiro.test(trustR$TRUST)
## 
##  Shapiro-Wilk normality test
## 
## data:  trustR$TRUST
## W = 0.87772, p-value = 0.01325

How would you interpret the Shapiro test?

Mann–Whitney U test

FYI Main difference between this test and the Wilcoxon:

The Mann-Whitney U test (or Wilcoxon rank-sum test) is for two independent samples, while the Wilcoxon signed-rank test is for paired or matched samples.

# not necessary to add the paired argument. Just want to make it clear that this is independent.
wilcox.test(trustD$TRUST, trustR$TRUST, paired = FALSE) 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  trustD$TRUST and trustR$TRUST
## W = 1612, p-value = 0.1244
## alternative hypothesis: true location shift is not equal to 0

Paired samples t-test

We need the pre and post measurement in long format (i.e., same variable). We did this before, remember? How do we do that?

HINT: the columns we want to ‘pivot’ are Pre1 and Post1. We’re creating a new column called TimePoint (should have Pre1 and Post1 names) and its values should be in a new column called Belief.

long_df <- data %>% 
  pivot_longer(cols = c("Pre1", "Post1"),
               names_to = "TimePoint", 
               values_to = "Belief")

# check our work
long_df %>% 
  select(`Response ID`, TimePoint, Belief)
## # A tibble: 400 × 3
##    `Response ID`     TimePoint Belief
##    <chr>             <chr>      <dbl>
##  1 R_0cj5dsJg2wfpiuJ Pre1          83
##  2 R_0cj5dsJg2wfpiuJ Post1         87
##  3 R_0rkhLjwWPHHjnTX Pre1          66
##  4 R_0rkhLjwWPHHjnTX Post1         61
##  5 R_10BMNpjhInMfUeO Pre1          67
##  6 R_10BMNpjhInMfUeO Post1         62
##  7 R_120iGR6WlLnbZnI Pre1          74
##  8 R_120iGR6WlLnbZnI Post1         97
##  9 R_12qW8cDY0bNlId2 Pre1          81
## 10 R_12qW8cDY0bNlId2 Post1         99
## # ℹ 390 more rows

Visualize belief by timepoint

bar_plot_paired <- long_df %>%
    mutate(TimePoint = factor(TimePoint, levels = c("Pre1", "Post1"))) %>% # change order of time points 
  ggplot(aes(x = TimePoint, y = Belief, fill = TimePoint)) + 
  stat_summary(fun=mean, geom="bar", position = position_dodge(), width = .3) + # plot mean as barplot
  stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(0.9), width = .1) + #
  labs(x = "Time Point", y = "Belief") + # add title to x and y-axis (labs = "labels")
  theme_classic() +
  ylim(c(0,80))

bar_plot_paired

# using original data where pre and post are separate variables
t.test(data$Pre1, data$Post1, paired=T) # tests whether difference is significantly different from 0
## 
##  Paired t-test
## 
## data:  data$Pre1 and data$Post1
## t = 1.9547, df = 199, p-value = 0.05202
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.01711808  3.89021808
## sample estimates:
## mean difference 
##         1.93655

Let’s manually calculate Cohen’s D, borrowing from Madalina’s Python code. I’m simplifying it because we just need Cohen’s D.

cohen <- function(x1, x2) {
   xd = x1 - x2
   Md = mymean(xd)
   sd = mysd(xd)
   d = abs(Md) / sd
   
   print(d)
}

cohen(data$Pre1, data$Post1)
## [1] 0.1382166

Wilcoxon Test for paired data

wilcox.test(data$Pre1, data$Post1, paired = TRUE) 
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  data$Pre1 and data$Post1
## V = 9406.5, p-value = 0.0715
## alternative hypothesis: true location shift is not equal to 0

Descriptives for reporting

You can get the means and sds the old-fashioned way, but if you plan to get descriptives from several variables, you might as well build a function.

descriptives <- function(x1, x2) {
  m1 <- mymean(x1)
  m2 <- mymean(x2)
  s1 <- mysd(x1)
  s2 <- mysd(x2)
  
 # Create a data frame for nicer display - could use table() but it's ugly
  results_df <- data.frame(
    Metric = c("Mean", "SD"), # 3 columns = metric, pre, post
    Pre1 = c(m1, s1),
    Post1 = c(m2, s2)
  )
  
  return(results_df)
}

descriptives(data$Pre1, data$Post1)
##   Metric    Pre1    Post1
## 1   Mean 73.3500 71.41345
## 2     SD 15.3988 17.56382