Question 1: Does Knowledge differ significantly from Knowledge2?
# Installing required packages
if (!require("dplyr"))
install.packages("dplyr")
if (!require("tidyverse"))
install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the two variables involved
mydata$V1 <- mydata$Knowledge
mydata$V2 <- mydata$Knowledge2
# Look at the distribution of the pair differences
mydata$PairDifferences <- mydata$V2 - mydata$V1
ggplot(mydata, aes(x = PairDifferences)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  geom_vline(aes(xintercept = mean(PairDifferences)))
# Get descriptive statistics for pair differences
mydata %>%
  select(PairDifferences) %>%
  summarise(
    count = n(),
    mean = mean(PairDifferences, na.rm = TRUE),
    sd = sd(PairDifferences, na.rm = TRUE),
    min = min(PairDifferences, na.rm = TRUE),
    max = max(PairDifferences, na.rm = TRUE)
  )
## count mean sd min max
## 1 600 3.106667 2.903907 -6 12
# If the pair differences look non-normal, you can use a Shapiro-Wilk test to
# check whether their distribution differs significantly from normal. If the
# Shapiro-Wilk test p-value is less than 0.05, use a Wilcoxon signed rank test
# instead of a paired-samples t-test.
# Shapiro-Wilk test
options(scipen = 999)
shapiro.test(mydata$PairDifferences)
##
## Shapiro-Wilk normality test
##
## data: mydata$PairDifferences
## W = 0.98799, p-value = 0.00007779
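# Optional extra, not part of the main procedure: a normal Q-Q plot is a
# visual complement to the Shapiro-Wilk test and shows where the pair
# differences depart from normality. A minimal sketch using base R graphics:
qqnorm(mydata$PairDifferences)
qqline(mydata$PairDifferences)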
# If the pair differences are non-normal, consider using a Wilcoxon signed
# rank test instead of a paired-samples t-test.
mydata %>%
  select(V1, V2) %>%
  summarise_all(list(Mean = mean, SD = sd))
## V1_Mean V2_Mean V1_SD V2_SD
## 1 9.878333 12.985 3.079394 4.157754
wilcox.test(mydata$V1, mydata$V2, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: mydata$V1 and mydata$V2
## V = 9099, p-value < 0.00000000000000022
## alternative hypothesis: true location shift is not equal to 0
# If the pair differences are normally distributed,
# though, you may use a paired-samples t-test.
mydata %>%
  select(V1, V2) %>%
  summarise_all(list(Mean = mean, SD = sd))
## V1_Mean V2_Mean V1_SD V2_SD
## 1 9.878333 12.985 3.079394 4.157754
options(scipen = 999)
t.test(mydata$V2, mydata$V1,
       paired = TRUE)
##
## Paired t-test
##
## data: mydata$V2 and mydata$V1
## t = 26.205, df = 599, p-value < 0.00000000000000022
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 2.873840 3.339494
## sample estimates:
## mean difference
## 3.106667
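# Optional extra: the paired t-test reports significance but not magnitude.
# One common effect size for paired data is Cohen's dz, the mean of the pair
# differences divided by their standard deviation. A minimal sketch:
cohens_dz <- mean(mydata$PairDifferences, na.rm = TRUE) /
  sd(mydata$PairDifferences, na.rm = TRUE)
cohens_dz # about 1.07 given the descriptives above (3.106667 / 2.903907)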
Question 2: Does Voted depend on Education?
# Installing required packages
if (!require("tidyverse"))
install.packages("tidyverse")
library(ggplot2)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the DV and IV
mydata$DV <- mydata$Voted #Edit YOURDVNAME
mydata$IV <- mydata$Education #Edit YOURIVNAME
# Look at the DV and IV
ggplot(mydata, aes(x = DV)) + geom_bar(color = "black", fill = "#1f78b4")
ggplot(mydata, aes(x = IV)) + geom_histogram(color = "black", fill = "#1f78b4")
# Logistic regression plot
ggplot(mydata, aes(x = IV,
                   y = DV)) +
  geom_jitter(height = .03,
              alpha = .5) +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE,
              color = "#1f78b4")
# Run the logistic regression and view the summary results
options(scipen = 999)
log.ed <- glm(DV ~ IV, data = mydata, family = "binomial")
summary(log.ed)
##
## Call:
## glm(formula = DV ~ IV, family = "binomial", data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8396 -1.2929 0.8337 0.9458 1.3939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.25914 0.42907 -2.935 0.00334 **
## IV 0.15265 0.03601 4.239 0.0000225 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 789.68 on 599 degrees of freedom
## Residual deviance: 770.93 on 598 degrees of freedom
## AIC: 774.93
##
## Number of Fisher Scoring iterations: 4
# Find the IV value at which the predicted probability crosses p = .50
p <- .50
Inflection_point <- (log(p / (1 - p)) - coef(log.ed)[1]) / coef(log.ed)[2]
Inflection_point
## (Intercept)
## 8.248492
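# Optional extra: the IV coefficient above is on the log-odds scale.
# Exponentiating it gives an odds ratio, which is often easier to interpret;
# this sketch uses confint.default() for Wald confidence intervals.
exp(cbind(OR = coef(log.ed), confint.default(log.ed)))
# exp(0.15265) is about 1.17: each one-unit increase in Education multiplies
# the estimated odds of voting by roughly 1.17.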
Question 3: Does Knowledge depend on EdCat?
# Installing required packages
if (!require("dplyr"))
install.packages("dplyr")
if (!require("tidyverse"))
install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the DV and IV
mydata$DV <- mydata$Knowledge
mydata$IV <- mydata$EdCat
# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  facet_grid(IV ~ .) +
  geom_vline(data = averages, aes(xintercept = mean))
# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE)
  )
## # A tibble: 2 × 6
## IV count mean sd min max
## <chr> <int> <dbl> <dbl> <int> <int>
## 1 1. High school or less 358 8.73 2.69 1 17
## 2 2. Some college/college degree 242 11.6 2.83 3 18
# If you see evidence of non-normality in the distribution of one or both
# groups, use a Shapiro-Wilk test for each group to check.
mydata %>%
  group_by(IV) %>%
  summarise(`W Statistic` = shapiro.test(DV)$statistic,
            `p-value` = shapiro.test(DV)$p.value)
## # A tibble: 2 × 3
## IV `W Statistic` `p-value`
## <chr> <dbl> <dbl>
## 1 1. High school or less 0.982 0.000224
## 2 2. Some college/college degree 0.977 0.000681
# A significant Shapiro-Wilk test for one or both groups means you should use
# a Wilcoxon rank sum test rather than a t-test.
options(scipen = 999)
wilcox.test(mydata$DV ~ mydata$IV)
##
## Wilcoxon rank sum test with continuity correction
##
## data: mydata$DV by mydata$IV
## W = 19846, p-value < 0.00000000000000022
## alternative hypothesis: true location shift is not equal to 0
# If both groups are normally distributed, though, you may run a t-test.
options(scipen = 999)
t.test(mydata$DV ~ mydata$IV,
       var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: mydata$DV by mydata$IV
## t = -12.277, df = 500.02, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group 1. High school or less and group 2. Some college/college degree is not equal to 0
## 95 percent confidence interval:
## -3.289388 -2.381834
## sample estimates:
## mean in group 1. High school or less
## 8.734637
## mean in group 2. Some college/college degree
## 11.570248
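# Optional extra: the Welch test reports significance but not magnitude. A
# minimal sketch of Cohen's d with a pooled standard deviation, computed by
# hand from the two groups:
grp <- split(mydata$DV, mydata$IV)
n1 <- length(grp[[1]])
n2 <- length(grp[[2]])
sd_pooled <- sqrt(((n1 - 1) * var(grp[[1]]) + (n2 - 1) * var(grp[[2]])) /
                    (n1 + n2 - 2))
cohens_d <- (mean(grp[[2]]) - mean(grp[[1]])) / sd_pooled
cohens_d # roughly 1.0 given the group means and SDs reported above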
Question 4: Does Knowledge depend on Education?
# Installing required packages
if (!require("dplyr"))
install.packages("dplyr")
if (!require("tidyverse"))
install.packages("tidyverse")
library(dplyr)
library(ggplot2)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the DV and IV
mydata$DV <- mydata$Knowledge #Edit YOURDVNAME
mydata$IV <- mydata$Education #Edit YOURIVNAME
# Look at the DV and IV
ggplot(mydata, aes(x = DV)) + geom_histogram(color = "black", fill = "#1f78b4")
ggplot(mydata, aes(x = IV)) + geom_histogram(color = "black", fill = "#1f78b4")
# Creating and summarizing an initial regression model called myreg, and
# checking for bivariate outliers.
options(scipen = 999)
myreg <- lm(DV ~ IV, data = mydata)
summary(myreg)
##
## Call:
## lm(formula = DV ~ IV, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0741 -1.5344 -0.0741 1.6338 9.2974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.45498 0.51516 2.824 0.0049 **
## IV 0.70794 0.04241 16.694 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.545 on 598 degrees of freedom
## Multiple R-squared: 0.3179, Adjusted R-squared: 0.3168
## F-statistic: 278.7 on 1 and 598 DF, p-value: < 0.00000000000000022
plot(mydata$IV, mydata$DV)
abline(lm(mydata$DV ~ mydata$IV))
# Leverage values flag observations with unusual IV values
leverage <- as.data.frame(hatvalues(myreg))
View(leverage)
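# Optional extra, not part of the original procedure: leverage alone flags
# unusual IV values, while Cook's distance combines leverage with residual
# size to flag points that actually move the fitted line. A common rule of
# thumb inspects points with distance greater than 4/n. A minimal sketch:
cooksd <- cooks.distance(myreg)
plot(cooksd, type = "h", ylab = "Cook's distance")
abline(h = 4 / nrow(mydata), lty = 2)
which(cooksd > 4 / nrow(mydata)) # rows of potentially influential points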
Question 5: Does NewsCat depend on EdCat?
# Installing required packages
if (!require("tidyverse"))
install.packages("tidyverse")
if (!require("gmodels"))
install.packages("gmodels")
library(gmodels)
library(ggplot2)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the DV and IV
mydata$DV <- mydata$NewsCat #Edit YOURDVNAME
mydata$IV <- mydata$EdCat #Edit YOURIVNAME
# Look at the DV and IV
ggplot(mydata, aes(x = IV, fill = DV)) +
  geom_bar(color = "black") +
  scale_fill_brewer(palette = "Paired")
# Make the crosstab table
CrossTable(
  mydata$DV,
  mydata$IV,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.r = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 600
##
##
## | mydata$IV
## mydata$DV | 1. High school or less | 2. Some college/college degree | Row Total |
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 1. Never | 208 | 32 | 240 |
## | 0.581 | 0.132 | |
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 2. Some days | 120 | 85 | 205 |
## | 0.335 | 0.351 | |
## -------------|--------------------------------|--------------------------------|--------------------------------|
## 3. Every day | 30 | 125 | 155 |
## | 0.084 | 0.517 | |
## -------------|--------------------------------|--------------------------------|--------------------------------|
## Column Total | 358 | 242 | 600 |
## | 0.597 | 0.403 | |
## -------------|--------------------------------|--------------------------------|--------------------------------|
##
##
# Run the chi-squared test
options(scipen = 999)
chitestresults <- chisq.test(mydata$DV, mydata$IV)
chitestresults
##
## Pearson's Chi-squared test
##
## data: mydata$DV and mydata$IV
## X-squared = 177.48, df = 2, p-value < 0.00000000000000022
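# Optional extra: the chi-squared test shows that an association exists but
# not how strong it is. Cramér's V can be computed by hand from the test
# statistic (values near 0 mean no association, near 1 a very strong one).
# A minimal sketch:
n <- sum(!is.na(mydata$DV) & !is.na(mydata$IV))
k <- min(length(unique(mydata$DV)), length(unique(mydata$IV)))
sqrt(chitestresults$statistic / (n * (k - 1))) # about 0.54 here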
Question 6: Does Knowledge depend on NewsCat?
# Installing required packages
if (!require("dplyr"))
install.packages("dplyr")
if (!require("tidyverse"))
install.packages("tidyverse")
library(dplyr)
library(ggplot2)
options(scipen = 999)
# Read the data
mydata <- read.csv("DataForExercise.csv") #Edit YOURFILENAME.csv
# Specify the DV and IV
mydata$DV <- mydata$Knowledge
mydata$IV <- mydata$NewsCat
# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  facet_grid(IV ~ .) +
  geom_vline(data = averages, aes(xintercept = mean))
# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE)
  )
## # A tibble: 3 × 6
## IV count mean sd min max
## <chr> <int> <dbl> <dbl> <int> <int>
## 1 1. Never 240 8.93 2.72 1 17
## 2 2. Some days 205 10.1 3.27 1 17
## 3 3. Every day 155 11.1 2.87 4 18
# If the group distributions look non-normal, consider checking for
# non-normality by running a Shapiro-Wilk test for each group. Significant
# results indicate non-normality.
mydata %>%
  group_by(IV) %>%
  summarise(`W Statistic` = shapiro.test(DV)$statistic,
            `p-value` = shapiro.test(DV)$p.value)
## # A tibble: 3 × 3
## IV `W Statistic` `p-value`
## <chr> <dbl> <dbl>
## 1 1. Never 0.986 0.0202
## 2 2. Some days 0.979 0.00334
## 3 3. Every day 0.983 0.0575
# If one or more of the group distributions are non-normal, consider using a
# Kruskal-Wallis test instead of an ANOVA and, if the result is significant,
# Dunn's test instead of Tukey's HSD as the post-hoc test.
if (!require("FSA"))
install.packages("FSA")
library(FSA)
kruskal.test(DV ~ IV, data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: DV by IV
## Kruskal-Wallis chi-squared = 48.758, df = 2, p-value = 0.00000000002585
dunnTest(DV ~ IV, data = mydata)
## Comparison Z P.unadj
## 1 1. Never - 2. Some days -4.041275 0.000053161290666391
## 2 1. Never - 3. Every day -6.872282 0.000000000006318296
## 3 2. Some days - 3. Every day -3.042198 0.002348572278282858
## P.adj
## 1 0.00010632258133278
## 2 0.00000000001895489
## 3 0.00234857227828286
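# Note: the P.adj column above reflects dunnTest()'s default multiple-
# comparison adjustment (Holm, judging by the pattern in the output). If you
# prefer a different adjustment, pass the method argument; a hedged example:
dunnTest(DV ~ IV, data = mydata, method = "bonferroni")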
# If all three group distributions are normal, though, you may use ANOVA.
# ANOVA is also robust enough to give valid results as long as the
# non-normality isn't severe.
options(scipen = 999)
oneway.test(mydata$DV ~ mydata$IV,
            var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: mydata$DV and mydata$IV
## F = 29.485, num df = 2.00, denom df = 365.32, p-value =
## 0.000000000001346
# If the ANOVA detects a significant difference, run this post-hoc procedure
# to learn which group pairs differed significantly.
anova_1 <- aov(mydata$DV ~ mydata$IV)
TukeyHSD(anova_1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mydata$DV ~ mydata$IV)
##
## $`mydata$IV`
## diff lwr upr p adj
## 2. Some days-1. Never 1.128659 0.4681836 1.789134 0.0001977
## 3. Every day-1. Never 2.197581 1.4819549 2.913206 0.0000000
## 3. Every day-2. Some days 1.068922 0.3297129 1.808131 0.0020916
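# Optional extra: eta-squared, the proportion of DV variance associated with
# the groups, can be computed from the aov object's sums of squares. A
# minimal sketch:
ss <- summary(anova_1)[[1]][["Sum Sq"]]
ss[1] / sum(ss) # eta-squared for the IV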