Data Analysis Exercise

Question 1: Does knowledge differ significantly from knowledge2?

# Installing required packages
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("tidyverse"))
  install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the two variables involved
mydata$V1 <- mydata$Knowledge
mydata$V2 <- mydata$Knowledge2

# Look at the distribution of the pair differences
mydata$PairDifferences <- mydata$V2 - mydata$V1

ggplot(mydata, aes(x = PairDifferences)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  geom_vline(aes(xintercept = mean(PairDifferences)))

# Get descriptive statistics for pair differences
mydata %>%
  select(PairDifferences) %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )

##   count     mean       sd min max
## 1   600 3.106667 2.903907  -6  12

# If pair differences look non-normal, you can use a Shapiro-Wilk test to check
# whether their distribution differs significantly from normal. If the
# Shapiro-Wilk test p-value is less than 0.05, #use a Wilcoxon signed rank test
# instead of a paired-samples t-test.

# Shapiro-Wilk test
options(scipen = 999)
shapiro.test(mydata$PairDifferences)

## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$PairDifferences
## W = 0.98799, p-value = 0.00007779

# If the pair distribution is non-normal, consider # using a Wilcoxon signed rank test instead of a
# paired-samples t-test.

mydata %>%
  select(V1, V2) %>%
  summarise_all(list(Mean = mean, SD = sd))

##    V1_Mean V2_Mean    V1_SD    V2_SD
## 1 9.878333  12.985 3.079394 4.157754

wilcox.test(mydata$V1, mydata$V2, paired = TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mydata$V1 and mydata$V2
## V = 9099, p-value < 0.00000000000000022
## alternative hypothesis: true location shift is not equal to 0

# If the pair differences are normally distributed,
# though, you may use a paired-samples t-test.

mydata %>%
  select(V1, V2) %>%
  summarise_all(list(Mean = mean, SD = sd))

##    V1_Mean V2_Mean    V1_SD    V2_SD
## 1 9.878333  12.985 3.079394 4.157754

options(scipen = 999)
t.test(mydata$V2, mydata$V1,
       paired = TRUE)

## 
##  Paired t-test
## 
## data:  mydata$V2 and mydata$V1
## t = 26.205, df = 599, p-value < 0.00000000000000022
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  2.873840 3.339494
## sample estimates:
## mean difference 
##        3.106667

Question 2: Does Voted depend on Education?

# Installing required packages
if (!require("tidyverse"))
  install.packages("tidyverse")
library(ggplot2)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the DV and IV
mydata$DV <- mydata$Voted #Edit YOURDVNAME
mydata$IV <- mydata$Education #Edit YOURIVNAME

# Look at the DV and IV
ggplot(mydata, aes(x = DV)) + geom_bar(color = "black", fill = "#1f78b4")

ggplot(mydata, aes(x = IV)) + geom_histogram(color = "black", fill = "#1f78b4")

# Logistic regression plot
ggplot(mydata, aes(x = IV,
                   y = DV))+
  geom_jitter(height = .03,
              alpha = .5) +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE,
              color = "#1f78b4")

# Run the logistic regression and view the summary results
options(scipen = 999)
log.ed <- glm(DV ~ IV, data = mydata, family = "binomial")
summary(log.ed)

## 
## Call:
## glm(formula = DV ~ IV, family = "binomial", data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8396  -1.2929   0.8337   0.9458   1.3939  
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept) -1.25914    0.42907  -2.935   0.00334 ** 
## IV           0.15265    0.03601   4.239 0.0000225 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 789.68  on 599  degrees of freedom
## Residual deviance: 770.93  on 598  degrees of freedom
## AIC: 774.93
## 
## Number of Fisher Scoring iterations: 4

p <- .50
Inflection_point <- (log(p/(1-p))-coef(log.ed)[1])/coef(log.ed)[2]
Inflection_point

## (Intercept) 
##    8.248492

Question 3: Does Knowledge depend on EdCat?

# Installing required packages
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("tidyverse"))
  install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the DV and IV
mydata$DV <- mydata$Knowledge
mydata$IV <- mydata$EdCat

# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram() +
  facet_grid(IV ~ .) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  geom_vline(data = averages, aes(xintercept = mean, ))

# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE)
  )

## # A tibble: 2 × 6
##   IV                             count  mean    sd   min   max
##   <chr>                          <int> <dbl> <dbl> <int> <int>
## 1 1. High school or less           358  8.73  2.69     1    17
## 2 2. Some college/college degree   242 11.6   2.83     3    18

# If you see evidence of non-normality in the distributions of one or both groups, use a Shapiro-Wilk test.
mydata %>%
  group_by(IV) %>%
  summarise(`W Statistic` = shapiro.test(DV)$statistic,
            `p-value` = shapiro.test(DV)$p.value)

## # A tibble: 2 × 3
##   IV                             `W Statistic` `p-value`
##   <chr>                                  <dbl>     <dbl>
## 1 1. High school or less                 0.982  0.000224
## 2 2. Some college/college degree         0.977  0.000681

# A significant Shapiro-Wilk test for one or both groups means you should use a Wilcoxon signed rank test rather thana t-test.

options(scipen = 999)
wilcox.test(mydata$DV ~ mydata$IV)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  mydata$DV by mydata$IV
## W = 19846, p-value < 0.00000000000000022
## alternative hypothesis: true location shift is not equal to 0

# If both groups are normally distributed, though, you may run a t-test.
options(scipen = 999)
t.test(mydata$DV ~ mydata$IV,
       var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  mydata$DV by mydata$IV
## t = -12.277, df = 500.02, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group 1. High school or less and group 2. Some college/college degree is not equal to 0
## 95 percent confidence interval:
##  -3.289388 -2.381834
## sample estimates:
##         mean in group 1. High school or less 
##                                     8.734637 
## mean in group 2. Some college/college degree 
##                                    11.570248

Question 4: Does Knowledge depend on Education?

# Installing required packages
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("tidyverse"))
  install.packages("tidyverse")
library(dplyr)
library(ggplot2)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the DV and IV
mydata$DV <- mydata$Knowledge #Edit YOURDVNAME
mydata$IV <- mydata$Education #Edit YOURIVNAME

# Look at the DV and IV
ggplot(mydata, aes(x = DV)) + geom_histogram(color = "black", fill = "#1f78b4")

ggplot(mydata, aes(x = IV)) + geom_histogram(color = "black", fill = "#1f78b4")

# Creating and summarizing an initial regression model called myreg, and checking for bivariate outliers.
options(scipen = 999)
myreg <- lm(DV ~ IV,
            data = mydata)
summary(myreg)

## 
## Call:
## lm(formula = DV ~ IV, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0741 -1.5344 -0.0741  1.6338  9.2974 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  1.45498    0.51516   2.824              0.0049 ** 
## IV           0.70794    0.04241  16.694 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.545 on 598 degrees of freedom
## Multiple R-squared:  0.3179, Adjusted R-squared:  0.3168 
## F-statistic: 278.7 on 1 and 598 DF,  p-value: < 0.00000000000000022

plot(mydata$IV, mydata$DV)
abline(lm(mydata$DV ~ mydata$IV))

leverage <- as.data.frame(hatvalues(myreg))
view(leverage)

Question 5: Does NewsCat depend on EdCat?

# Installing required packages
if (!require("tidyverse"))
  install.packages("tidyverse")
if (!require("gmodels"))
  install.packages("gmodels")
library(gmodels)
library(ggplot2)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the DV and IV
mydata$DV <- mydata$NewsCat #Edit YOURDVNAME
mydata$IV <- mydata$EdCat #Edit YOURIVNAME

# Look at the DV and IV
ggplot(mydata, aes(x = IV, fill = DV)) +
  geom_bar(colour = "black") +
  scale_fill_brewer(palette = "Paired")

# Make the crosstab table
CrossTable(
  mydata$DV,
  mydata$IV,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.r = FALSE
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  600 
## 
##  
##              | mydata$IV 
##    mydata$DV |         1. High school or less | 2. Some college/college degree |                      Row Total | 
## -------------|--------------------------------|--------------------------------|--------------------------------|
##     1. Never |                            208 |                             32 |                            240 | 
##              |                          0.581 |                          0.132 |                                | 
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 2. Some days |                            120 |                             85 |                            205 | 
##              |                          0.335 |                          0.351 |                                | 
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 3. Every day |                             30 |                            125 |                            155 | 
##              |                          0.084 |                          0.517 |                                | 
## -------------|--------------------------------|--------------------------------|--------------------------------|
## Column Total |                            358 |                            242 |                            600 | 
##              |                          0.597 |                          0.403 |                                | 
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 
##

# Run the chi-squared test
options(scipen = 999)
chitestresults <- chisq.test(mydata$DV, mydata$IV)
chitestresults

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$DV and mydata$IV
## X-squared = 177.48, df = 2, p-value < 0.00000000000000022

Question 6: Does Knowledge depend on NewsCat?

# Installing required packages
if (!require("dplyr"))
  install.packages("dplyr")
if (!require("tidyverse"))
  install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)

# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv

# Specify the DV and IV
mydata$DV <- mydata$Knowledge
mydata$IV <- mydata$NewsCat

# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram() +
  facet_grid(IV ~ .) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  geom_vline(data = averages, aes(xintercept = mean, ))

# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE)
  )

## # A tibble: 3 × 6
##   IV           count  mean    sd   min   max
##   <chr>        <int> <dbl> <dbl> <int> <int>
## 1 1. Never       240  8.93  2.72     1    17
## 2 2. Some days   205 10.1   3.27     1    17
## 3 3. Every day   155 11.1   2.87     4    18

# If the group distributions look non-normal, consider checking for non-normality
# running a Shapiro-Wilk test for each group. Significant results indicate
# non-normality.

mydata %>%
  group_by(IV) %>%
  summarise(`W Statistic` = shapiro.test(DV)$statistic,
            `p-value` = shapiro.test(DV)$p.value)

## # A tibble: 3 × 3
##   IV           `W Statistic` `p-value`
##   <chr>                <dbl>     <dbl>
## 1 1. Never             0.986   0.0202 
## 2 2. Some days         0.979   0.00334
## 3 3. Every day         0.983   0.0575

# If one or more of the group distributions are non-normal, consider using a
# Kruskal-Wallis test instead of an ANOVA and, if the result is significant, a
# Dunn's Test instead of a Tukey's HSD as a post-hoc test.

if (!require("FSA"))
  install.packages("FSA")
library(FSA)
kruskal.test(DV ~ IV, data = mydata)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  DV by IV
## Kruskal-Wallis chi-squared = 48.758, df = 2, p-value = 0.00000000002585

dunnTest(DV ~ IV, data = mydata)

##                    Comparison         Z              P.unadj
## 1     1. Never - 2. Some days -4.041275 0.000053161290666391
## 2     1. Never - 3. Every day -6.872282 0.000000000006318296
## 3 2. Some days - 3. Every day -3.042198 0.002348572278282858
##                 P.adj
## 1 0.00010632258133278
## 2 0.00000000001895489
## 3 0.00234857227828286

# If all three group distributions are normal,
# though, you may use ANOVA. Also, ANOVA is
# powerful enough to work as long as the
# non-normality isn't severe.

options(scipen = 999)
oneway.test(mydata$DV ~ mydata$IV,
            var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  mydata$DV and mydata$IV
## F = 29.485, num df = 2.00, denom df = 365.32, p-value =
## 0.000000000001346

# If the ANOVA detects significant difference, run
# this post-hoc procedure to learn which
# group pairs differed significantly.

anova_1 <- aov(mydata$DV ~ mydata$IV)
TukeyHSD(anova_1)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = mydata$DV ~ mydata$IV)
## 
## $`mydata$IV`
##                               diff       lwr      upr     p adj
## 2. Some days-1. Never     1.128659 0.4681836 1.789134 0.0001977
## 3. Every day-1. Never     2.197581 1.4819549 2.913206 0.0000000
## 3. Every day-2. Some days 1.068922 0.3297129 1.808131 0.0020916

Data Analysis Exercise

Dr. Ken Blake

2023-11-04