Thesis: Democrat win for a state is predicted by education level and marital status and polling data.(<30%+ with bachelors degree, 15+ single) https://rpubs.com/schen0181/1232361

People in college tends to be democrats and people on public assistance(usually no bachelors)also tends to be democrats because they give more support for assistance programs.

cor(polls_joined)
## Warning in cor(polls_joined): the standard deviation is zero
##                            state   percent households_number population_number
## state                          1        NA                NA                NA
## percent                       NA 1.0000000         0.1765633         0.1753554
## households_number             NA 0.1765633         1.0000000         0.9970344
## population_number             NA 0.1753554         0.9970344         1.0000000
## over_15_and_never_married     NA 0.6349567         0.4156117         0.4174108
## bachelors_degree_or_higher    NA 0.7852374         0.1378334         0.1361334
## votes                         NA 0.1954136         0.9577294         0.9551741
## dem_win                       NA 0.8191977         0.1268895         0.1278574
##                            over_15_and_never_married bachelors_degree_or_higher
## state                                             NA                         NA
## percent                                    0.6349567                  0.7852374
## households_number                          0.4156117                  0.1378334
## population_number                          0.4174108                  0.1361334
## over_15_and_never_married                  1.0000000                  0.4000547
## bachelors_degree_or_higher                 0.4000547                  1.0000000
## votes                                      0.4089223                  0.1623111
## dem_win                                    0.5451682                  0.6442623
##                                votes   dem_win
## state                             NA        NA
## percent                    0.1954136 0.8191977
## households_number          0.9577294 0.1268895
## population_number          0.9551741 0.1278574
## over_15_and_never_married  0.4089223 0.5451682
## bachelors_degree_or_higher 0.1623111 0.6442623
## votes                      1.0000000 0.1592046
## dem_win                    0.1592046 1.0000000
lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married, data = polls_joined)
summary(lm_dem)
## 
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + over_15_and_never_married, 
##     data = polls_joined)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1583  -2.8512  -0.6446   1.9136  12.0665 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -31.998      6.111  -5.236 9.50e-07 ***
## bachelors_degree_or_higher  107.319      9.602  11.177  < 2e-16 ***
## over_15_and_never_married   135.378     20.056   6.750 1.08e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.751 on 97 degrees of freedom
## Multiple R-squared:  0.7391, Adjusted R-squared:  0.7338 
## F-statistic: 137.4 on 2 and 97 DF,  p-value: < 2.2e-16
hist(lm_dem$residuals)

correlations

cor.test(polls_joined$dem_win, polls_joined$households_number)
## 
##  Pearson's product-moment correlation
## 
## data:  polls_joined$dem_win and polls_joined$households_number
## t = 1.2664, df = 98, p-value = 0.2084
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07130583  0.31544561
## sample estimates:
##       cor 
## 0.1268895
pairs( select( polls_joined, percent, bachelors_degree_or_higher, over_15_and_never_married, households_number, votes, population_number) )

ggplot for residual

training and testing

test01 <- sample(x = c(0, 1),  #give vector of what i want to sample
                 nrow(polls_joined),    #how big is sample going to be
                 replace = TRUE,   #if i pull out a 0, dont delete it
                 prob = c(0.8, 0.2))



train <- polls_joined[test01, ]
test <- filter(polls_joined, test01 == 1)
lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married, data = polls_joined)

#trained <- train %>% 
#  mutate(predicted = predict(lm_dem, newdata = polls_joined),
#         residuals = predicted - percent)
lm_edu <- lm(percent ~ bachelors_degree_or_higher, data = polls_joined)
summary(lm_edu)
## 
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher, data = polls_joined)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3682  -3.5071  -0.3971   3.7524  13.1523 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   3.792      3.664   1.035    0.303    
## bachelors_degree_or_higher  133.247     10.614  12.554   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.73 on 98 degrees of freedom
## Multiple R-squared:  0.6166, Adjusted R-squared:  0.6127 
## F-statistic: 157.6 on 1 and 98 DF,  p-value: < 2.2e-16
polls_joined_edu_pred <- polls_joined %>% 
  mutate(prediction = predict(lm_edu, newdata = polls_joined))


ggplot(data = polls_joined_edu_pred)+
  geom_boxplot(mapping = aes(x = prediction, y = percent))
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

plot(polls_joined$bachelors_degree_or_higher, polls_joined$percent, xlab = "bachelors", ylab = "percent that voted Demcrat from those polled")

lm_dem <- lm(percent ~ bachelors_degree_or_higher + votes, data = polls_joined)
summary(lm_dem)
## 
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + votes, data = polls_joined)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0539  -3.4929  -0.7866   3.1535  13.6931 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.855e+00  3.660e+00   1.053    0.295    
## bachelors_degree_or_higher 1.313e+02  1.074e+01  12.222   <2e-16 ***
## votes                      3.821e-07  3.466e-07   1.102    0.273    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.724 on 97 degrees of freedom
## Multiple R-squared:  0.6213, Adjusted R-squared:  0.6135 
## F-statistic: 79.58 on 2 and 97 DF,  p-value: < 2.2e-16
polls_historical_1 <- president_polls_historical_raw %>% 
  janitor::clean_names() %>% 
  select(pollster_id, pollster, -sponsors, sample_size, -population, pollster_rating_id, -methodology, -transparency_score, state, -race_id, -ranked_choice_reallocated, party, answer, pct) %>% 
  filter(!is.na(state)) %>% 
  mutate(is_dem = ifelse(party == "DEM", 1, 0)) %>% 
  filter(party == "DEM" | party == "REP") %>% 
  filter(state != "Maine CD-1") %>% 
  filter(state != "Maine CD-2") %>% 
  group_by(state) %>% 
  summarize(percent = mean(pct))

polls_joined_1 <- polls_historical_1 %>% 
  #inner_join(pollster_combined, by = "pollster_rating_id") %>% 
  inner_join(states_raw, by = "state") %>% 
  inner_join(state_win , join_by("state"))

lm_dem <- lm(percent ~ bachelors_degree_or_higher + over_15_and_never_married + dem_win, data = polls_joined_1)
summary(lm_dem)
## 
## Call:
## lm(formula = percent ~ bachelors_degree_or_higher + over_15_and_never_married + 
##     dem_win, data = polls_joined_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1411 -0.4190  0.1325  0.6034  1.7244 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                51.44786    1.43424  35.871   <2e-16 ***
## bachelors_degree_or_higher -2.59425    2.18538  -1.187   0.2381    
## over_15_and_never_married  -8.00545    4.16439  -1.922   0.0575 .  
## dem_win                     0.02119    0.25795   0.082   0.9347    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8997 on 96 degrees of freedom
## Multiple R-squared:  0.09484,    Adjusted R-squared:  0.06656 
## F-statistic: 3.353 on 3 and 96 DF,  p-value: 0.02214