midterm <- anes2024[c("V241025","V241457","V242067","V241169", "V241173", "V241177", "V241458x", "V242135")]
names(midterm)[1] <- "partyreg"
names(midterm)[2] <- "ageatsurvey"
names(midterm)[3] <- "votedfor"
names(midterm)[4] <- "likesdems"
names(midterm)[5] <- "likesreps"
names(midterm)[6] <- "ideology"
names(midterm)[7] <- "ageonelection"
names(midterm)[8] <- "vancescore"
#I had this above because I was playing with a bunch of different variables; please ignore
nrow(midterm)
## [1] 5521
# 5521 rows
midterm <- filter(midterm, !ageonelection %in% "-2")
nrow(midterm)
## [1] 5242
# filtering to only respondents with age; now only 5242 rows
midterm <- filter(midterm, !vancescore %in% c("-9", "-7", "-6", "5", "-4", "-1", "998", "999", "200", "-5"))
nrow(midterm)
## [1] 4452
# filtering out respondents that refused or didn't know answer; brought down rows to 4452
ggplot(midterm, mapping=aes(x=ageonelection)) + geom_density(alpha=0.3)
# bulk of respondents were in their mid-60's or mid-40's
ggplot(midterm, mapping=aes(x=vancescore)) + geom_density(alpha=0.3)
# major heaping around 0 and 50
tabyl(midterm$ageonelection)
## midterm$ageonelection n percent
## 18 21 0.004716981
## 19 18 0.004043127
## 20 19 0.004267745
## 21 36 0.008086253
## 22 30 0.006738544
## 23 25 0.005615454
## 24 27 0.006064690
## 25 34 0.007637017
## 26 53 0.011904762
## 27 53 0.011904762
## 28 62 0.013926325
## 29 54 0.012129380
## 30 59 0.013252471
## 31 60 0.013477089
## 32 70 0.015723270
## 33 67 0.015049416
## 34 71 0.015947889
## 35 77 0.017295597
## 36 68 0.015274034
## 37 69 0.015498652
## 38 70 0.015723270
## 39 78 0.017520216
## 40 95 0.021338724
## 41 76 0.017070979
## 42 79 0.017744834
## 43 78 0.017520216
## 44 77 0.017295597
## 45 82 0.018418688
## 46 59 0.013252471
## 47 64 0.014375562
## 48 71 0.015947889
## 49 63 0.014150943
## 50 74 0.016621743
## 51 64 0.014375562
## 52 40 0.008984726
## 53 70 0.015723270
## 54 87 0.019541779
## 55 66 0.014824798
## 56 67 0.015049416
## 57 65 0.014600180
## 58 67 0.015049416
## 59 69 0.015498652
## 60 74 0.016621743
## 61 98 0.022012579
## 62 89 0.019991015
## 63 83 0.018643306
## 64 88 0.019766397
## 65 91 0.020440252
## 66 108 0.024258760
## 67 94 0.021114106
## 68 87 0.019541779
## 69 86 0.019317161
## 70 91 0.020440252
## 71 82 0.018418688
## 72 84 0.018867925
## 73 84 0.018867925
## 74 70 0.015723270
## 75 81 0.018194070
## 76 75 0.016846361
## 77 71 0.015947889
## 78 53 0.011904762
## 79 43 0.009658580
## 80 286 0.064240791
tabyl(midterm$vancescore)
## midterm$vancescore n percent
## 0 1301 0.2922282120
## 1 19 0.0042677448
## 2 14 0.0031446541
## 3 3 0.0006738544
## 4 2 0.0004492363
## 6 5 0.0011230907
## 7 3 0.0006738544
## 9 1 0.0002246181
## 10 87 0.0195417790
## 11 2 0.0004492363
## 12 1 0.0002246181
## 15 309 0.0694070081
## 16 1 0.0002246181
## 18 1 0.0002246181
## 20 39 0.0087601078
## 25 19 0.0042677448
## 26 1 0.0002246181
## 30 173 0.0388589398
## 32 1 0.0002246181
## 34 1 0.0002246181
## 35 17 0.0038185085
## 40 159 0.0357142857
## 41 1 0.0002246181
## 43 1 0.0002246181
## 45 12 0.0026954178
## 47 1 0.0002246181
## 50 548 0.1230907457
## 51 2 0.0004492363
## 53 1 0.0002246181
## 55 8 0.0017969452
## 58 1 0.0002246181
## 59 1 0.0002246181
## 60 237 0.0532345013
## 62 1 0.0002246181
## 65 16 0.0035938904
## 66 2 0.0004492363
## 69 2 0.0004492363
## 70 226 0.0507637017
## 72 1 0.0002246181
## 75 53 0.0119047619
## 77 1 0.0002246181
## 78 3 0.0006738544
## 79 1 0.0002246181
## 80 68 0.0152740341
## 82 1 0.0002246181
## 85 393 0.0882749326
## 86 3 0.0006738544
## 88 1 0.0002246181
## 89 3 0.0006738544
## 90 132 0.0296495957
## 92 3 0.0006738544
## 95 51 0.0114555256
## 96 2 0.0004492363
## 97 2 0.0004492363
## 98 3 0.0006738544
## 99 9 0.0020215633
## 100 503 0.1129829290
# Just another way to illustrate the distribution
hist(midterm$ageonelection)
# similar to the chart above
hist(midterm$vancescore)
# very similar to the other plot for this variable
summary(midterm$ageonelection)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 39.00 55.00 53.72 68.00 80.00
# median age is 55, which makes sense given how this looked when plotted
summary(midterm$vancescore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 50.00 43.18 80.00 100.00
# median is 50, which makes sense given what we saw before
boxplot(midterm$ageonelection)
#shows slight skew towards older americans
boxplot(midterm$vancescore)
# if I'm interpreting it right, the lower bound isn't visible because of the heaping around 0
midterm$quartile_age <- cut(midterm$ageonelection,
breaks = quantile(midterm$ageonelection, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE),
include.lowest = TRUE,
labels = c("Q1", "Q2", "Q3", "Q4"))
midterm %>%
select(vancescore, quartile_age) %>%
tbl_summary(
by = quartile_age,
statistic = list(
all_continuous() ~ "Mean = {mean}, SD = {sd}, Median = {median}"
),
digits = all_continuous() ~ 2)
| Characteristic | Q1 N = 1,1211 |
Q2 N = 1,1451 |
Q3 N = 1,0801 |
Q4 N = 1,1061 |
|---|---|---|---|---|
| vancescore | Mean = 41.82, SD = 34.98, Median = 50.00 | Mean = 42.36, SD = 36.24, Median = 50.00 | Mean = 43.77, SD = 38.09, Median = 50.00 | Mean = 44.83, SD = 39.15, Median = 50.00 |
| 1 Mean = Mean, SD = SD, Median = Median | ||||
#Decided to split the ages into quartiles and you can see that the mean score for JD Vance increases slightly with age, though not as sharply as I would've thought.
boxplot(vancescore ~ quartile_age, midterm,
main = "Scores of JD Vance (2024) by Age Quartiles",
xlab = "Age Quartiles",
ylab = "Scores/Feelings Towards JD Vance")
#Again, slight increase on feelings towards JD Vance as respondents increase in age
mean_age <- mean(midterm$ageonelection)
mean_age
## [1] 53.7208
mean_vance <- mean(midterm$vancescore)
mean_vance
## [1] 43.17902
n_age <- length(midterm$ageonelection)
n_vance <- length(midterm$vancescore)
sd_age <- sd(midterm$ageonelection)
sd_age
## [1] 17.22181
# mean is 53, so most respondents are between 36 and 70
sd_vance <- sd(midterm$vancescore)
sd_vance
## [1] 37.13463
# so if the mean is 43, then most respondents scored him between 26 and 60
pooled_sd <- sqrt(((n_age - 1)*sd_age^2 + (n_vance - 1)*sd_vance^2) / (n_age + n_vance - 2))
t_stat <- (mean_age - mean_vance) / (pooled_sd * sqrt(1/n_age + 1/n_vance))
t_stat
## [1] 17.18344
# t statistic is showing as 17.18
df <- n_age + n_vance - 2
p_value <- 2 * pt(-abs(t_stat), df)
p_value
## [1] 3.941692e-65
#super small p value (yay!)
cat("t =", round(t_stat, 3), "\n")
## t = 17.183
cat("df =", df, "\n")
## df = 8902
cat("p-value =", round(p_value, 4), "\n")
## p-value = 0
# I messed up somewhere along the way and can't figure out where; not sure why it's giving me a P of 0 unless it's just not showing a bunch of decimal places for some reason; at least the t statistic lines up with what I got above
quantiles_age <- midterm$ageonelection
qnt <- quantile(quantiles_age, probs=c(.25, .75), na.rm = T)
caps <- quantile(quantiles_age, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(quantiles_age, na.rm = T)
quantiles_age[quantiles_age < (qnt[1] - H)] <- caps[1]
quantiles_age[quantiles_age > (qnt[2] + H)] <- caps[2]
caps
## 5% 95%
## 26 80
# extremely low values are around 25 and upper end is 80
qnt
## 25% 75%
## 39 68
# lower quantile hovers around 39 and upper hovers around 68 years
quantiles_vance <- midterm$vancescore
qnt2 <- quantile(quantiles_vance, probs=c(.25, .75), na.rm = T)
caps2 <- quantile(quantiles_vance, probs=c(.05, .95), na.rm = T)
H2 <- 1.5 * IQR(quantiles_vance, na.rm = T)
quantiles_vance[quantiles_vance < (qnt[1] - H)] <- caps2[1]
quantiles_vance[quantiles_vance > (qnt[2] + H)] <- caps2[2]
caps2
## 5% 95%
## 0 100
# extremely low values are 0 and 100; surprised that the upper end wasn't in the mid 90's given how the distribution looked
qnt2
## 25% 75%
## 0 80
# lower quantile is 0 and upper is 80; this makes more sense to me
mean_val_age <- mean(quantiles_age, na.rm = TRUE)
se_age <- sd(quantiles_age, na.rm = TRUE) / sqrt(length(quantiles_age))
ci_age <- mean_val_age + c(-1.96, 1.96) * se_age
# confidence intervals = 53.2 and 54.2 at 95% confidence; so true mean is likely 54
mean_val_vance <- mean(quantiles_vance, na.rm = TRUE)
se_vance <- sd(quantiles_vance, na.rm = TRUE) / sqrt(length(quantiles_vance))
ci_vance <- mean_val_vance + c(-1.96, 1.96) * se_vance
#confidence intervals = 42.0 and 44.2 at 95% confidence; so we're pretty sure the true mean hovers around 43
stem(quantiles_age, scale = 1, width = 80, atom = 1e-08)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 1 | 888888888888888888888999999999999999999
## 2 | 00000000000000000001111111111111111111111111111111111112222222222222+57
## 2 | 55555555555555555555555555555555556666666666666666666666666666666666+176
## 3 | 00000000000000000000000000000000000000000000000000000000000111111111+247
## 3 | 55555555555555555555555555555555555555555555555555555555555555555555+282
## 4 | 00000000000000000000000000000000000000000000000000000000000000000000+325
## 4 | 55555555555555555555555555555555555555555555555555555555555555555555+259
## 5 | 00000000000000000000000000000000000000000000000000000000000000000000+255
## 5 | 55555555555555555555555555555555555555555555555555555555555555555566+254
## 6 | 00000000000000000000000000000000000000000000000000000000000000000000+352
## 6 | 55555555555555555555555555555555555555555555555555555555555555555555+386
## 7 | 00000000000000000000000000000000000000000000000000000000000000000000+331
## 7 | 55555555555555555555555555555555555555555555555555555555555555555555+243
## 8 | 00000000000000000000000000000000000000000000000000000000000000000000+206
#stem and leaf plot for age; you can see the heaping around 45 and 65 very clearly
stem(quantiles_vance, scale = 1, width = 80, atom = 1e-08)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 0 | 00000000000000000000000000000000000000000000000000000000000000000000+1268
## 1 | 00000000000000000000000000000000000000000000000000000000000000000000+321
## 2 | 00000000000000000000000000000000000000055555555555555555556
## 3 | 00000000000000000000000000000000000000000000000000000000000000000000+112
## 4 | 00000000000000000000000000000000000000000000000000000000000000000000+94
## 5 | 00000000000000000000000000000000000000000000000000000000000000000000+481
## 6 | 00000000000000000000000000000000000000000000000000000000000000000000+178
## 7 | 00000000000000000000000000000000000000000000000000000000000000000000+205
## 8 | 00000000000000000000000000000000000000000000000000000000000000000000+389
## 9 | 00000000000000000000000000000000000000000000000000000000000000000000+122
## 10 | 00000000000000000000000000000000000000000000000000000000000000000000+423
#here you can see the heaping around the bottom
#adding logrithmic stuff below; unsure if needed but included anyway
midterm$logage<-log(midterm$ageonelection)
summary(midterm$logage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.890 3.664 4.007 3.924 4.220 4.382
sd(midterm$logage)
## [1] 0.3622443
ggplot(data=midterm, mapping=aes(x=logage)) + geom_density(alpha=0.3)
# logrithm smooths it out a lot;
midterm$logvance<-log(midterm$vancescore)
summary(midterm$logvance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -Inf -Inf 3.912 -Inf 4.382 4.605
sd(midterm$logvance)
## [1] NaN
ggplot(data=midterm, mapping=aes(x=logvance)) + geom_density(alpha=0.3)
## Warning: Removed 1301 rows containing non-finite outside the scale range
## (`stat_density()`).
# interesting that this one shows the heaping differently distributed visually; unsure of why it is now looking more like it's skewed towards the top of the range...
chart <-midterm %>%
ggplot(aes(x = ageonelection)) +
geom_bar(aes(y = (..count..) / sum(..count..)), fill = "skyblue") +
geom_text(
aes(
y = (..count..) / sum(..count..),
label = scales::percent((..count..) / sum(..count..), accuracy=1)
),
stat = "count",
vjust = -0.5
) +
scale_y_continuous(labels = scales::percent) +
labs(
y = "Percentage",
x = "Category",
title = "Bar chart of Age on Election"
) +
theme_minimal()
chart
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# bar graph of the age on election made me realize that the upper bound is off because it is grouping those 80 and older
chart2 <-midterm %>%
ggplot(aes(x = vancescore)) +
geom_bar(aes(y = (..count..) / sum(..count..)), fill = "skyblue") +
geom_text(
aes(
y = (..count..) / sum(..count..),
label = scales::percent((..count..) / sum(..count..), accuracy=1)
),
stat = "count",
vjust = -0.5
) +
scale_y_continuous(labels = scales::percent) +
labs(
y = "Percentage",
x = "Category",
title = "Bar chart of Temperature Score for JD Vance"
) +
theme_minimal()
chart2
#very useful to see where the heaping really is
ggplot(midterm, aes(x = ageonelection, y = vancescore)) +
geom_point(color = "darkred", size = 3) +
labs(title = "Scatterplot of Age versus Ideology",
x = "Age on Election",
y = "Feeling Towards JD Vance") +
theme_minimal()
# normal scatterplot doesn't show much correlation between age and ideology
mean_diff <- mean_age - mean_vance
n1 <- length(midterm$ageonelection)
n2 <- length(midterm$vancescore)
sd1 <- sd(midterm$ageonelection)
sd2 <- sd(midterm$vancescore)
se <- sqrt((sd1^2 / n1) + (sd2^2 / n2))
mu <- 0
x <- seq(mu - 4*se, mu + 4*se, length = 100)
y <- dnorm(x, mean = mu, sd = se)
plot(x, y, type = "l", lwd = 2, col = "gray",
main = "Normal Curve of Mean Difference",
xlab = "Mean Difference", ylab = "Density")
abline(v = mu, col = "red", lwd = 2)
text(mu, dnorm(mu, mu, se), "Hypothesized = 0", pos = 2, col = "red")
abline(v = mean_diff, col = "blue", lwd = 2)
text(mean_diff, dnorm(mean_diff, mu, se), paste("Observed =", round(mean_diff, 2)), pos = 4, col = "blue")
#I cannot figure out why the observed line is not showing on the chart
midterm <- midterm %>%
mutate(age_quartile = ntile(ageonelection, 4))
# Summary table of vancescore by age quartile
midterm %>%
select(age_quartile, vancescore) %>%
tbl_summary(
by = age_quartile,
statistic = all_continuous() ~ "{mean} ({sd})",
digits = all_continuous() ~ 2,
label = list(vancescore ~ "Vance Score")
) %>%
add_p(test = all_continuous() ~ "t.test") %>%
modify_header(label ~ "**Age Quartile**") %>%
modify_caption("**Descriptive Statistics and T-Test for Vance Score by Age Quartile**")
## The following errors were returned during `modify_caption()`:
## ✖ For variable `vancescore` (`age_quartile`) and "estimate", "statistic",
## "p.value", "parameter", "conf.low", and "conf.high" statistics: grouping
## factor must have exactly 2 levels
| Age Quartile | 1 N = 1,1131 |
2 N = 1,1131 |
3 N = 1,1131 |
4 N = 1,1131 |
p-value2 |
|---|---|---|---|---|---|
| Vance Score | 41.87 (35.02) | 41.72 (36.18) | 44.43 (38.00) | 44.69 (39.15) | |
| 1 Mean (SD) | |||||
| 2 NA | |||||
#hypothesis: older voters scored JD Vance higher for the 2020 election
#null hypothesis: older voters did not score JD Vance higher for the 2020 election
#The theory here is that older voters typically lean conservative and tend to favor the MAGA movement. As such, one would expect that they would score JD Vance higher than younger voters who are more likely to lean liberal/progressive. For the purposes of this assignment, the sentiment towards JD Vance is considered a reflection of affective polarization between generations/age groups.
cat("VanceScoreᵢ = β₀ + β₁·Ageᵢ + εᵢ")
## VanceScoreᵢ = β₀ + β₁·Ageᵢ + εᵢ
# Notation for the regression^^
model <- lm(vancescore ~ ageonelection, data = midterm)
summary(model)
##
## Call:
## lm(formula = vancescore ~ ageonelection, data = midterm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.366 -41.537 5.133 38.546 59.794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.70737 1.82212 21.243 <2e-16 ***
## ageonelection 0.08324 0.03230 2.577 0.01 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.11 on 4450 degrees of freedom
## Multiple R-squared: 0.00149, Adjusted R-squared: 0.001266
## F-statistic: 6.641 on 1 and 4450 DF, p-value: 0.009996
# confirms that there's a slight increase in JD Vance scoring the older the respondents get
# R-squared shows the model doesn't explain much; the P-value is decent but different from the previous section...
# Create prediction data
new_data <- data.frame(ageonelection = seq(min(midterm$ageonelection, na.rm = TRUE),
max(midterm$ageonelection, na.rm = TRUE),
length.out = 100))
# Get predicted values with confidence intervals
predictions <- predict(model, newdata = new_data, interval = "confidence")
plot_data <- cbind(new_data, predictions)
ggplot(plot_data, aes(x = ageonelection, y = fit)) +
geom_line(color = "blue", size = 1.2) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill = "skyblue") +
labs(title = "Predicted Vance Score by Age with 95% Confidence Interval",
x = "Age",
y = "Predicted Vance Score") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# the plot aligns with what we've seen previously with this model, though the Y axis is spaced in a way
# that kind of makes it look more drastic than it is in reality
# t-value
t_value <- (mean_age - mean_vance) / se
t_value
## [1] 17.18344
#t = 17.1; means the observed effect is 17.1 times larger than the expected variation due to random chance; good for statistical significance and rejecting the null hypothesis
# Degrees of freedom (approximate)
df <- n1 + n2 - 2
# Two-tailed p-value
p_value <- 2 * pt(-abs(t_value), df)
p_value
## [1] 3.941692e-65
# Not sure why the P isn't aligning with what I got from the regression... but interpretation: highly statistically significant and would allow me to reject the null hypothesis