Stats_midterm

midterm <- anes2024[c("V241025","V241457","V242067","V241169", "V241173", "V241177", "V241458x", "V242135")]

names(midterm)[1] <- "partyreg"

names(midterm)[2] <- "ageatsurvey"

names(midterm)[3] <- "votedfor"

names(midterm)[4] <- "likesdems"

names(midterm)[5] <- "likesreps"

names(midterm)[6] <- "ideology"

names(midterm)[7] <- "ageonelection"

names(midterm)[8] <- "vancescore"

#I had this above because I was playing with a bunch of different variables; please ignore

nrow(midterm)

## [1] 5521

# 5521 rows

midterm <- filter(midterm, !ageonelection %in% "-2")
nrow(midterm)

## [1] 5242

# filtering to only respondents with age; now only 5242 rows

midterm <- filter(midterm, !vancescore %in% c("-9", "-7", "-6", "5", "-4", "-1", "998", "999", "200", "-5"))
nrow(midterm)

## [1] 4452

# filtering out respondents that refused or didn't know answer; brought down rows to 4452

ggplot(midterm, mapping=aes(x=ageonelection)) + geom_density(alpha=0.3)

# bulk of respondents were in their mid-60's or mid-40's

ggplot(midterm, mapping=aes(x=vancescore)) + geom_density(alpha=0.3)

# major heaping around 0 and 50

tabyl(midterm$ageonelection)

##  midterm$ageonelection   n     percent
##                     18  21 0.004716981
##                     19  18 0.004043127
##                     20  19 0.004267745
##                     21  36 0.008086253
##                     22  30 0.006738544
##                     23  25 0.005615454
##                     24  27 0.006064690
##                     25  34 0.007637017
##                     26  53 0.011904762
##                     27  53 0.011904762
##                     28  62 0.013926325
##                     29  54 0.012129380
##                     30  59 0.013252471
##                     31  60 0.013477089
##                     32  70 0.015723270
##                     33  67 0.015049416
##                     34  71 0.015947889
##                     35  77 0.017295597
##                     36  68 0.015274034
##                     37  69 0.015498652
##                     38  70 0.015723270
##                     39  78 0.017520216
##                     40  95 0.021338724
##                     41  76 0.017070979
##                     42  79 0.017744834
##                     43  78 0.017520216
##                     44  77 0.017295597
##                     45  82 0.018418688
##                     46  59 0.013252471
##                     47  64 0.014375562
##                     48  71 0.015947889
##                     49  63 0.014150943
##                     50  74 0.016621743
##                     51  64 0.014375562
##                     52  40 0.008984726
##                     53  70 0.015723270
##                     54  87 0.019541779
##                     55  66 0.014824798
##                     56  67 0.015049416
##                     57  65 0.014600180
##                     58  67 0.015049416
##                     59  69 0.015498652
##                     60  74 0.016621743
##                     61  98 0.022012579
##                     62  89 0.019991015
##                     63  83 0.018643306
##                     64  88 0.019766397
##                     65  91 0.020440252
##                     66 108 0.024258760
##                     67  94 0.021114106
##                     68  87 0.019541779
##                     69  86 0.019317161
##                     70  91 0.020440252
##                     71  82 0.018418688
##                     72  84 0.018867925
##                     73  84 0.018867925
##                     74  70 0.015723270
##                     75  81 0.018194070
##                     76  75 0.016846361
##                     77  71 0.015947889
##                     78  53 0.011904762
##                     79  43 0.009658580
##                     80 286 0.064240791

tabyl(midterm$vancescore)

##  midterm$vancescore    n      percent
##                   0 1301 0.2922282120
##                   1   19 0.0042677448
##                   2   14 0.0031446541
##                   3    3 0.0006738544
##                   4    2 0.0004492363
##                   6    5 0.0011230907
##                   7    3 0.0006738544
##                   9    1 0.0002246181
##                  10   87 0.0195417790
##                  11    2 0.0004492363
##                  12    1 0.0002246181
##                  15  309 0.0694070081
##                  16    1 0.0002246181
##                  18    1 0.0002246181
##                  20   39 0.0087601078
##                  25   19 0.0042677448
##                  26    1 0.0002246181
##                  30  173 0.0388589398
##                  32    1 0.0002246181
##                  34    1 0.0002246181
##                  35   17 0.0038185085
##                  40  159 0.0357142857
##                  41    1 0.0002246181
##                  43    1 0.0002246181
##                  45   12 0.0026954178
##                  47    1 0.0002246181
##                  50  548 0.1230907457
##                  51    2 0.0004492363
##                  53    1 0.0002246181
##                  55    8 0.0017969452
##                  58    1 0.0002246181
##                  59    1 0.0002246181
##                  60  237 0.0532345013
##                  62    1 0.0002246181
##                  65   16 0.0035938904
##                  66    2 0.0004492363
##                  69    2 0.0004492363
##                  70  226 0.0507637017
##                  72    1 0.0002246181
##                  75   53 0.0119047619
##                  77    1 0.0002246181
##                  78    3 0.0006738544
##                  79    1 0.0002246181
##                  80   68 0.0152740341
##                  82    1 0.0002246181
##                  85  393 0.0882749326
##                  86    3 0.0006738544
##                  88    1 0.0002246181
##                  89    3 0.0006738544
##                  90  132 0.0296495957
##                  92    3 0.0006738544
##                  95   51 0.0114555256
##                  96    2 0.0004492363
##                  97    2 0.0004492363
##                  98    3 0.0006738544
##                  99    9 0.0020215633
##                 100  503 0.1129829290

# Just another way to illustrate the distribution


hist(midterm$ageonelection)

# similar to the chart above

hist(midterm$vancescore)

# very similar to the other plot for this variable

summary(midterm$ageonelection)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   39.00   55.00   53.72   68.00   80.00

# median age is 55, which makes sense given how this looked when plotted
summary(midterm$vancescore)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   50.00   43.18   80.00  100.00

# median is 50, which makes sense given what we saw before

boxplot(midterm$ageonelection)

#shows slight skew towards older americans

boxplot(midterm$vancescore)

# if I'm interpreting it right, the lower bound isn't visible because of the heaping around 0



midterm$quartile_age <- cut(midterm$ageonelection, 
                          breaks = quantile(midterm$ageonelection, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE),
                          include.lowest = TRUE,
                          labels = c("Q1", "Q2", "Q3", "Q4"))

midterm %>%
  select(vancescore, quartile_age) %>%
  tbl_summary(
    by = quartile_age,  
    statistic = list(
      all_continuous() ~ "Mean = {mean}, SD = {sd}, Median = {median}"
    ),
    digits = all_continuous() ~ 2)

Characteristic	Q1 N = 1,121¹	Q2 N = 1,145¹	Q3 N = 1,080¹	Q4 N = 1,106¹
vancescore	Mean = 41.82, SD = 34.98, Median = 50.00	Mean = 42.36, SD = 36.24, Median = 50.00	Mean = 43.77, SD = 38.09, Median = 50.00	Mean = 44.83, SD = 39.15, Median = 50.00
¹ Mean = Mean, SD = SD, Median = Median

#Decided to split the ages into quartiles and you can see that the mean score for JD Vance increases slightly with age, though not as sharply as I would've thought.

boxplot(vancescore ~ quartile_age, midterm,            
  main = "Scores of JD Vance (2024) by Age Quartiles",
  xlab = "Age Quartiles",
  ylab = "Scores/Feelings Towards JD Vance")

#Again, slight increase on feelings towards JD Vance as respondents increase in age



mean_age <- mean(midterm$ageonelection)
mean_age

## [1] 53.7208

mean_vance <- mean(midterm$vancescore)
mean_vance

## [1] 43.17902

n_age <- length(midterm$ageonelection)
n_vance <- length(midterm$vancescore)

sd_age <- sd(midterm$ageonelection)
sd_age

## [1] 17.22181

# mean is 53, so most respondents are between 36 and 70

sd_vance <- sd(midterm$vancescore)
sd_vance

## [1] 37.13463

# so if the mean is 43, then most respondents scored him between 26 and 60

pooled_sd <- sqrt(((n_age - 1)*sd_age^2 + (n_vance - 1)*sd_vance^2) / (n_age + n_vance - 2))
t_stat <- (mean_age - mean_vance) / (pooled_sd * sqrt(1/n_age + 1/n_vance))
t_stat

## [1] 17.18344

# t statistic is showing as 17.18

df <- n_age + n_vance - 2
p_value <- 2 * pt(-abs(t_stat), df)
p_value

## [1] 3.941692e-65

#super small p value (yay!)  

cat("t =", round(t_stat, 3), "\n")

## t = 17.183

cat("df =", df, "\n")

## df = 8902

cat("p-value =", round(p_value, 4), "\n")

## p-value = 0

# I messed up somewhere along the way and can't figure out where; not sure why it's giving me a P of 0 unless it's just not showing a bunch of decimal places for some reason; at least the t statistic lines up with what I got above

quantiles_age <- midterm$ageonelection
qnt <- quantile(quantiles_age, probs=c(.25, .75), na.rm = T)
caps <- quantile(quantiles_age, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(quantiles_age, na.rm = T)
quantiles_age[quantiles_age < (qnt[1] - H)] <- caps[1]
quantiles_age[quantiles_age > (qnt[2] + H)] <- caps[2]
caps

##  5% 95% 
##  26  80

# extremely low values are around 25 and upper end is 80
qnt

## 25% 75% 
##  39  68

# lower quantile hovers around 39 and upper hovers around 68 years

quantiles_vance <- midterm$vancescore
qnt2 <- quantile(quantiles_vance, probs=c(.25, .75), na.rm = T)
caps2 <- quantile(quantiles_vance, probs=c(.05, .95), na.rm = T)
H2 <- 1.5 * IQR(quantiles_vance, na.rm = T)
quantiles_vance[quantiles_vance < (qnt[1] - H)] <- caps2[1]
quantiles_vance[quantiles_vance > (qnt[2] + H)] <- caps2[2]
caps2

##  5% 95% 
##   0 100

# extremely low values are 0 and 100; surprised that the upper end wasn't in the mid 90's given how the distribution looked
qnt2

## 25% 75% 
##   0  80

# lower quantile is 0 and upper is 80; this makes more sense to me

mean_val_age <- mean(quantiles_age, na.rm = TRUE)
se_age <- sd(quantiles_age, na.rm = TRUE) / sqrt(length(quantiles_age))
ci_age <- mean_val_age + c(-1.96, 1.96) * se_age
# confidence intervals = 53.2 and 54.2 at 95% confidence; so true mean is likely 54

mean_val_vance <- mean(quantiles_vance, na.rm = TRUE)
se_vance <- sd(quantiles_vance, na.rm = TRUE) / sqrt(length(quantiles_vance))
ci_vance <- mean_val_vance + c(-1.96, 1.96) * se_vance
#confidence intervals = 42.0 and 44.2 at 95% confidence; so we're pretty sure the true mean hovers around 43


stem(quantiles_age, scale = 1, width = 80, atom = 1e-08)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 888888888888888888888999999999999999999
##   2 | 00000000000000000001111111111111111111111111111111111112222222222222+57
##   2 | 55555555555555555555555555555555556666666666666666666666666666666666+176
##   3 | 00000000000000000000000000000000000000000000000000000000000111111111+247
##   3 | 55555555555555555555555555555555555555555555555555555555555555555555+282
##   4 | 00000000000000000000000000000000000000000000000000000000000000000000+325
##   4 | 55555555555555555555555555555555555555555555555555555555555555555555+259
##   5 | 00000000000000000000000000000000000000000000000000000000000000000000+255
##   5 | 55555555555555555555555555555555555555555555555555555555555555555566+254
##   6 | 00000000000000000000000000000000000000000000000000000000000000000000+352
##   6 | 55555555555555555555555555555555555555555555555555555555555555555555+386
##   7 | 00000000000000000000000000000000000000000000000000000000000000000000+331
##   7 | 55555555555555555555555555555555555555555555555555555555555555555555+243
##   8 | 00000000000000000000000000000000000000000000000000000000000000000000+206

#stem and leaf plot for age; you can see the heaping around 45 and 65 very clearly

stem(quantiles_vance, scale = 1, width = 80, atom = 1e-08)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    0 | 00000000000000000000000000000000000000000000000000000000000000000000+1268
##    1 | 00000000000000000000000000000000000000000000000000000000000000000000+321
##    2 | 00000000000000000000000000000000000000055555555555555555556
##    3 | 00000000000000000000000000000000000000000000000000000000000000000000+112
##    4 | 00000000000000000000000000000000000000000000000000000000000000000000+94
##    5 | 00000000000000000000000000000000000000000000000000000000000000000000+481
##    6 | 00000000000000000000000000000000000000000000000000000000000000000000+178
##    7 | 00000000000000000000000000000000000000000000000000000000000000000000+205
##    8 | 00000000000000000000000000000000000000000000000000000000000000000000+389
##    9 | 00000000000000000000000000000000000000000000000000000000000000000000+122
##   10 | 00000000000000000000000000000000000000000000000000000000000000000000+423

#here you can see the heaping around the bottom



#adding logrithmic stuff below; unsure if needed but included anyway
midterm$logage<-log(midterm$ageonelection)
summary(midterm$logage)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.890   3.664   4.007   3.924   4.220   4.382

sd(midterm$logage)

## [1] 0.3622443

ggplot(data=midterm, mapping=aes(x=logage)) + geom_density(alpha=0.3)

# logrithm smooths it out a lot; 

midterm$logvance<-log(midterm$vancescore)
summary(midterm$logvance)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -Inf    -Inf   3.912    -Inf   4.382   4.605

sd(midterm$logvance)

## [1] NaN

ggplot(data=midterm, mapping=aes(x=logvance)) + geom_density(alpha=0.3)

## Warning: Removed 1301 rows containing non-finite outside the scale range
## (`stat_density()`).

# interesting that this one shows the heaping differently distributed visually; unsure of why it is now looking more like it's skewed towards the top of the range...


chart <-midterm %>%
  ggplot(aes(x = ageonelection)) +
  geom_bar(aes(y = (..count..) / sum(..count..)), fill = "skyblue") +
  geom_text(
    aes(
      y = (..count..) / sum(..count..),
      label = scales::percent((..count..) / sum(..count..), accuracy=1)
    ),
    stat = "count",
    vjust = -0.5
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    y = "Percentage",
    x = "Category",
    title = "Bar chart of Age on Election"
  ) +
  theme_minimal()

chart

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# bar graph of the age on election made me realize that the upper bound is off because it is grouping those 80 and older


chart2 <-midterm %>%
  ggplot(aes(x = vancescore)) +
  geom_bar(aes(y = (..count..) / sum(..count..)), fill = "skyblue") +
  geom_text(
    aes(
      y = (..count..) / sum(..count..),
      label = scales::percent((..count..) / sum(..count..), accuracy=1)
    ),
    stat = "count",
    vjust = -0.5
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    y = "Percentage",
    x = "Category",
    title = "Bar chart of Temperature Score for JD Vance"
  ) +
  theme_minimal()

chart2

#very useful to see where the heaping really is


ggplot(midterm, aes(x = ageonelection, y = vancescore)) +
  geom_point(color = "darkred", size = 3) +
  labs(title = "Scatterplot of Age versus Ideology",
       x = "Age on Election",
       y = "Feeling Towards JD Vance") +
  theme_minimal()

# normal scatterplot doesn't show much correlation between age and ideology 



mean_diff <- mean_age - mean_vance
n1 <- length(midterm$ageonelection)
n2 <- length(midterm$vancescore)
sd1 <- sd(midterm$ageonelection)
sd2 <- sd(midterm$vancescore)
se <- sqrt((sd1^2 / n1) + (sd2^2 / n2))

mu <- 0
x <- seq(mu - 4*se, mu + 4*se, length = 100)
y <- dnorm(x, mean = mu, sd = se)

plot(x, y, type = "l", lwd = 2, col = "gray",
     main = "Normal Curve of Mean Difference",
     xlab = "Mean Difference", ylab = "Density")

abline(v = mu, col = "red", lwd = 2)
text(mu, dnorm(mu, mu, se), "Hypothesized = 0", pos = 2, col = "red")
abline(v = mean_diff, col = "blue", lwd = 2)
text(mean_diff, dnorm(mean_diff, mu, se), paste("Observed =", round(mean_diff, 2)), pos = 4, col = "blue")

#I cannot figure out why the observed line is not showing on the chart

midterm <- midterm %>%
  mutate(age_quartile = ntile(ageonelection, 4))

# Summary table of vancescore by age quartile
midterm %>%
  select(age_quartile, vancescore) %>%
  tbl_summary(
    by = age_quartile,
    statistic = all_continuous() ~ "{mean} ({sd})",
    digits = all_continuous() ~ 2,
    label = list(vancescore ~ "Vance Score")
  ) %>%
  add_p(test = all_continuous() ~ "t.test") %>%
  modify_header(label ~ "**Age Quartile**") %>%
  modify_caption("**Descriptive Statistics and T-Test for Vance Score by Age Quartile**")

## The following errors were returned during `modify_caption()`:
## ✖ For variable `vancescore` (`age_quartile`) and "estimate", "statistic",
##   "p.value", "parameter", "conf.low", and "conf.high" statistics: grouping
##   factor must have exactly 2 levels

**Descriptive Statistics and T-Test for Vance Score by Age Quartile**
Age Quartile	1 N = 1,113¹	2 N = 1,113¹	3 N = 1,113¹	4 N = 1,113¹
Vance Score	41.87 (35.02)	41.72 (36.18)	44.43 (38.00)	44.69 (39.15)
¹ Mean (SD)
² NA

#hypothesis: older voters scored JD Vance higher for the 2020 election
#null hypothesis: older voters did not score JD Vance higher for the 2020 election

#The theory here is that older voters typically lean conservative and tend to favor the MAGA movement. As such, one would expect that they would score JD Vance higher than younger voters who are more likely to lean liberal/progressive. For the purposes of this assignment, the sentiment towards JD Vance is considered a reflection of affective polarization between generations/age groups. 

cat("VanceScoreᵢ = β₀ + β₁·Ageᵢ + εᵢ")

## VanceScoreᵢ = β₀ + β₁·Ageᵢ + εᵢ

# Notation for the regression^^


model <- lm(vancescore ~ ageonelection, data = midterm)
summary(model)

## 
## Call:
## lm(formula = vancescore ~ ageonelection, data = midterm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.366 -41.537   5.133  38.546  59.794 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   38.70737    1.82212  21.243   <2e-16 ***
## ageonelection  0.08324    0.03230   2.577     0.01 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.11 on 4450 degrees of freedom
## Multiple R-squared:  0.00149,    Adjusted R-squared:  0.001266 
## F-statistic: 6.641 on 1 and 4450 DF,  p-value: 0.009996

# confirms that there's a slight increase in JD Vance scoring the older the respondents get
# R-squared shows the model doesn't explain much; the P-value is decent but different from the previous section...

# Create prediction data
new_data <- data.frame(ageonelection = seq(min(midterm$ageonelection, na.rm = TRUE),
                                 max(midterm$ageonelection, na.rm = TRUE),
                                 length.out = 100))

# Get predicted values with confidence intervals
predictions <- predict(model, newdata = new_data, interval = "confidence")

plot_data <- cbind(new_data, predictions)

ggplot(plot_data, aes(x = ageonelection, y = fit)) +
  geom_line(color = "blue", size = 1.2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill = "skyblue") +
  labs(title = "Predicted Vance Score by Age with 95% Confidence Interval",
       x = "Age",
       y = "Predicted Vance Score") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# the plot aligns with what we've seen previously with this model, though the Y axis is spaced in a way
# that kind of makes it look more drastic than it is in reality 


# t-value
t_value <- (mean_age - mean_vance) / se
t_value

## [1] 17.18344

#t = 17.1; means the observed effect is 17.1 times larger than the expected variation due to random chance; good for statistical significance and rejecting the null hypothesis


# Degrees of freedom (approximate)
df <- n1 + n2 - 2

# Two-tailed p-value
p_value <- 2 * pt(-abs(t_value), df)
p_value

## [1] 3.941692e-65

# Not sure why the P isn't aligning with what I got from the regression... but interpretation: highly statistically significant and would allow me to reject the null hypothesis

Stats_midterm

Matthew Reyes

2025-10-13