Thesis: Democrat win for a state is predicted by education level and marital status and polling data.(<30%+ with bachelors degree, 15+ single) https://rpubs.com/schen0181/1232361
People in college tends to be democrats and people on public assistance(usually no bachelors)also tends to be democrats because they give more support for assistance programs.
cor(polls_joined)
## Warning in cor(polls_joined): the standard deviation is zero
## state percent households_number population_number
## state 1 NA NA NA
## percent NA 1.0000000 0.1765633 0.1753554
## households_number NA 0.1765633 1.0000000 0.9970344
## population_number NA 0.1753554 0.9970344 1.0000000
## over_15_and_never_married NA 0.6349567 0.4156117 0.4174108
## bachelors_degree_or_higher NA 0.7852374 0.1378334 0.1361334
## votes NA 0.1954136 0.9577294 0.9551741
## dem_win NA 0.8191977 0.1268895 0.1278574
## over_15_and_never_married bachelors_degree_or_higher
## state NA NA
## percent 0.6349567 0.7852374
## households_number 0.4156117 0.1378334
## population_number 0.4174108 0.1361334
## over_15_and_never_married 1.0000000 0.4000547
## bachelors_degree_or_higher 0.4000547 1.0000000
## votes 0.4089223 0.1623111
## dem_win 0.5451682 0.6442623
## votes dem_win
## state NA NA
## percent 0.1954136 0.8191977
## households_number 0.9577294 0.1268895
## population_number 0.9551741 0.1278574
## over_15_and_never_married 0.4089223 0.5451682
## bachelors_degree_or_higher 0.1623111 0.6442623
## votes 1.0000000 0.1592046
## dem_win 0.1592046 1.0000000
lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married, data = polls_joined)
summary(lm_dem)
##
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + over_15_and_never_married,
## data = polls_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1583 -2.8512 -0.6446 1.9136 12.0665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31.998 6.111 -5.236 9.50e-07 ***
## bachelors_degree_or_higher 107.319 9.602 11.177 < 2e-16 ***
## over_15_and_never_married 135.378 20.056 6.750 1.08e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.751 on 97 degrees of freedom
## Multiple R-squared: 0.7391, Adjusted R-squared: 0.7338
## F-statistic: 137.4 on 2 and 97 DF, p-value: < 2.2e-16
hist(lm_dem$residuals)
correlations
cor.test(polls_joined$dem_win, polls_joined$households_number)
##
## Pearson's product-moment correlation
##
## data: polls_joined$dem_win and polls_joined$households_number
## t = 1.2664, df = 98, p-value = 0.2084
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07130583 0.31544561
## sample estimates:
## cor
## 0.1268895
pairs( select( polls_joined, percent, bachelors_degree_or_higher, over_15_and_never_married, households_number, votes, population_number) )
ggplot for residual
training and testing
test01 <- sample(x = c(0, 1), #give vector of what i want to sample
nrow(polls_joined), #how big is sample going to be
replace = TRUE, #if i pull out a 0, dont delete it
prob = c(0.8, 0.2))
train <- polls_joined[test01, ]
test <- filter(polls_joined, test01 == 1)
lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married, data = polls_joined)
#trained <- train %>%
# mutate(predicted = predict(lm_dem, newdata = polls_joined),
# residuals = predicted - percent)
lm_edu <- lm(percent ~ bachelors_degree_or_higher, data = polls_joined)
summary(lm_edu)
##
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher, data = polls_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3682 -3.5071 -0.3971 3.7524 13.1523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.792 3.664 1.035 0.303
## bachelors_degree_or_higher 133.247 10.614 12.554 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.73 on 98 degrees of freedom
## Multiple R-squared: 0.6166, Adjusted R-squared: 0.6127
## F-statistic: 157.6 on 1 and 98 DF, p-value: < 2.2e-16
polls_joined_edu_pred <- polls_joined %>%
mutate(prediction = predict(lm_edu, newdata = polls_joined))
ggplot(data = polls_joined_edu_pred)+
geom_boxplot(mapping = aes(x = prediction, y = percent))
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
plot(polls_joined$bachelors_degree_or_higher, polls_joined$percent, xlab = "bachelors", ylab = "percent that voted Demcrat from those polled")
lm_dem <- lm(percent ~ bachelors_degree_or_higher + votes, data = polls_joined)
summary(lm_dem)
##
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + votes, data = polls_joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0539 -3.4929 -0.7866 3.1535 13.6931
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.855e+00 3.660e+00 1.053 0.295
## bachelors_degree_or_higher 1.313e+02 1.074e+01 12.222 <2e-16 ***
## votes 3.821e-07 3.466e-07 1.102 0.273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.724 on 97 degrees of freedom
## Multiple R-squared: 0.6213, Adjusted R-squared: 0.6135
## F-statistic: 79.58 on 2 and 97 DF, p-value: < 2.2e-16
polls_historical_1 <- president_polls_historical_raw %>%
janitor::clean_names() %>%
select(pollster_id, pollster, -sponsors, sample_size, -population, pollster_rating_id, -methodology, -transparency_score, state, -race_id, -ranked_choice_reallocated, party, answer, pct) %>%
filter(!is.na(state)) %>%
mutate(is_dem = ifelse(party == "DEM", 1, 0)) %>%
filter(party == "DEM" | party == "REP") %>%
filter(state != "Maine CD-1") %>%
filter(state != "Maine CD-2") %>%
group_by(state) %>%
summarize(percent = mean(pct))
polls_joined_1 <- polls_historical_1 %>%
#inner_join(pollster_combined, by = "pollster_rating_id") %>%
inner_join(states_raw, by = "state") %>%
inner_join(state_win , join_by("state"))
lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married + dem_win, data = polls_joined_1)
summary(lm_dem)
##
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + over_15_and_never_married +
## dem_win, data = polls_joined_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1411 -0.4190 0.1325 0.6034 1.7244
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.44786 1.43424 35.871 <2e-16 ***
## bachelors_degree_or_higher -2.59425 2.18538 -1.187 0.2381
## over_15_and_never_married -8.00545 4.16439 -1.922 0.0575 .
## dem_win 0.02119 0.25795 0.082 0.9347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8997 on 96 degrees of freedom
## Multiple R-squared: 0.09484, Adjusted R-squared: 0.06656
## F-statistic: 3.353 on 3 and 96 DF, p-value: 0.02214