#import libraries
library(car)
## Loading required package: carData
#install.packages("ggcorrplot")
library(ggcorrplot)
## Loading required package: ggplot2
# Import the 2008 primary data set; the first row of the CSV holds the column names.
# NOTE(review): this is a machine-specific absolute path, so the script only
# runs where this exact file location exists.
d <- read.csv("C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/hw 3/Prim2008.csv", header = TRUE)
head(d) # preview the first rows to check the columns were read as expected
## town County Dean04 Kerry04 Clinton08 Edwards08 Obama08 diebold
## 1 Acworth Sullivan 0.4731707 0.2341463 0.2676580 0.1784387 0.4386617 -1
## 2 Albany Carroll 0.3082707 0.3759399 0.3005181 0.1761658 0.4248705 -1
## 3 Alexandria Grafton 0.2434783 0.4043478 0.3420290 0.2405797 0.3507246 -1
## 4 Allenstown Merrimac 0.1935028 0.4816384 0.4656566 0.2141414 0.2585859 1
## 5 Alstead Cheshire 0.4563107 0.2233010 0.2854369 0.1281553 0.4660194 -1
## 6 Alton Belknap 0.2786145 0.3207831 0.3712821 0.1938462 0.3805128 1
## lat long TotPop Medianage PerCapitaInc. MedianInc. ClintonCampaignPresence
## 1 43.19 72.29 907 42.6 18132 37386 -1
## 2 43.95 71.17 714 37.4 20690 36635 -1
## 3 43.61 71.79 1501 40.3 19323 42667 -1
## 4 41.13 71.39 4951 35.5 18851 41958 -1
## 5 43.15 72.36 2045 39.5 20444 43191 -1
## 6 43.45 71.22 5109 41.4 25940 43451 -1
## PopDensity PcntHS.Grad PcntColl.Grad Tot.LaborF Tot.Unemp
## 1 23.256410 85.9 28.6 448 13
## 2 9.431968 80.4 18.3 365 13
## 3 34.426606 84.1 18.4 823 28
## 4 240.339806 79.8 9.7 2859 103
## 5 44.915440 86.7 27.4 1218 37
## 6 74.043478 87.9 21.5 2755 86
# ---- Derived variables ----
# 2004 primary: Dean04 and Kerry04 are the vote shares for Dean and Kerry.
# KD_diff is the Kerry share minus the Dean share, kept so the 2008 pattern
# can be compared with the older election.
d$KD_diff <- with(d, Kerry04 - Dean04)

# diebold: 1 where Diebold voting machines were used, -1 where they were not.
# lat, long: latitude and longitude.

# 2008 primary: Clinton08 and Obama08 are the vote shares for Clinton and Obama.
# CO_diff is the Clinton share minus the Obama share.
d$CO_diff <- with(d, Clinton08 - Obama08)

# Unemployed as a fraction of the labor force (a proportion, not a percentage).
d$Punem <- with(d, Tot.Unemp / Tot.LaborF)

# Natural-log transforms; PopDensity is measured in persons per square mile.
d$LogPopDens <- with(d, log(PopDensity))
d$LogTotPop <- with(d, log(TotPop))
d$LogMedianInc <- with(d, log(MedianInc.))
# ---- Descriptive statistics ----
# Unweighted mean and standard deviation of the Clinton-minus-Obama difference.
with(d, mean(CO_diff))
## [1] -0.0197
with(d, sd(CO_diff))
## [1] 0.1413792
# Mean and standard deviation of total population.
with(d, mean(TotPop))
## [1] 5618.154
with(d, sd(TotPop))
## [1] 10732.43
# Mean Clinton-minus-Obama difference with each row weighted by its total population.
with(d, weighted.mean(CO_diff, w = TotPop))
## [1] 0.04445515
# Mean and standard deviation of the unemployment proportion.
with(d, mean(Punem))
## [1] 0.02886504
with(d, sd(Punem))
## [1] 0.01308751
# ---- Exploratory plots ----
# Draw the plots on a 2 x 2 grid. `op` holds the value returned by par() and is
# passed back to par() at the end of this section to restore the settings.
# The shape remarks below are the author's visual impressions of each plot.
op <- par(mfrow = c(2, 2)) # four panels per page
hist(d$CO_diff) # looks roughly normal
hist(d$LogPopDens) # looks roughly normal
hist(d$Punem) # does not look normal
hist(d$Medianage) # does not look normal
hist(d$PcntHS.Grad) # skewed left
hist(asin(sqrt(d$PcntHS.Grad/100))) # arcsine square-root transform of the proportion (percent / 100); looks closer to normal
hist(d$PcntColl.Grad) # skewed right
hist(log(d$PcntColl.Grad)) # log transform gives a more normal-looking distribution
hist(d$PerCapitaInc.) # skewed right
hist(log(d$PerCapitaInc.)) # log transform gives a more normal-looking distribution
hist(d$MedianInc.) # skewed
hist(log(d$MedianInc.)) # log transform looks more normal
hist(d$TotPop) # very strongly skewed right
hist(log(d$TotPop)) # log transform looks more normal
# Scatterplots of the outcome (CO_diff) against single predictors, each with
# the fitted simple-regression line overlaid by abline().
plot(d$CO_diff ~ d$Punem)
abline(lm(d$CO_diff ~ d$Punem)) # odd pattern: points bunch up near zero
plot(d$CO_diff ~ d$PcntColl.Grad)
abline(lm(d$CO_diff ~ d$PcntColl.Grad)) # strong relationship
plot(d$CO_diff ~ d$LogMedianInc)
abline(lm(d$CO_diff ~ d$LogMedianInc)) # almost no relationship (slope near zero)
plot(d$CO_diff ~ d$Medianage)
abline(lm(d$CO_diff ~ d$Medianage)) # slight relationship
par(op) # restore the graphics settings saved in `op`
# ---- Correlation overview ----
# Gather the outcome (CO_diff) and the candidate predictors into one data
# frame, in the order they should appear in the correlation plot.
# NOTE(review): the variable name `c` is the same as base R's c() function;
# the name is kept here because code outside this section may refer to it.
c <- d[, c("CO_diff", "LogMedianInc", "LogTotPop", "Punem", "PcntColl.Grad",
           "Medianage", "LogPopDens", "lat", "long")]

# Pairwise correlations computed on complete rows only, then plotted with
# ggcorrplot using the "upper" layout and coefficient labels switched on.
ggcorrplot(
  cor(c, use = "complete.obs"),
  type = "upper",
  lab = TRUE,
  title = "correlations"
)
ANSWER
All variables except LogMedianInc are correlated with CO_diff. I will include LogTotPop, Punem, PcntColl.Grad, Medianage, LogPopDens, lat, and long in my analyses.
# Recode the diebold contrast from -1 / 1 to -1/2 / 1/2 so the two codes are
# exactly one unit apart (Diebold machines used = 1/2, not used = -1/2).
# which() keeps only the positions where the comparison is TRUE, so rows with
# a missing diebold value are left untouched.
d$diebold <- replace(d$diebold, which(d$diebold == -1), -1/2)
d$diebold <- replace(d$diebold, which(d$diebold == 1), 1/2)

# Model 2008a: Clinton-minus-Obama difference predicted by diebold alone.
m2008a <- lm(
  CO_diff ~ diebold,
  data = d
)
summary(m2008a)
##
## Call:
## lm(formula = CO_diff ~ diebold, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39897 -0.09419 0.00131 0.10352 0.32542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.014438 0.008803 -1.640 0.102
## diebold 0.096253 0.017607 5.467 1.19e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1336 on 231 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1146, Adjusted R-squared: 0.1107
## F-statistic: 29.89 on 1 and 231 DF, p-value: 1.185e-07
ANSWER
Clinton's vote share relative to Obama's (CO_diff) was higher in areas using Diebold voting machines, b = .10, t(231) = 5.47, p < .001.
# Model 2008b: diebold effect on the Clinton-minus-Obama difference while
# controlling for demographic variables (education, unemployment, median age,
# log population density, log median income).
# na.exclude leaves incomplete rows out of the fit.
m2008b <- lm(
  CO_diff ~ diebold + PcntColl.Grad + Punem + Medianage + LogPopDens +
    LogMedianInc,
  data = d,
  na.action = na.exclude
)
summary(m2008b)
##
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + Medianage +
## LogPopDens + LogMedianInc, data = d, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.245569 -0.064519 -0.004626 0.056204 0.228537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5269465 0.3792002 -4.027 7.77e-05 ***
## diebold 0.0435496 0.0148157 2.939 0.00364 **
## PcntColl.Grad -0.0080621 0.0006107 -13.202 < 2e-16 ***
## Punem 2.3371565 0.5213950 4.483 1.18e-05 ***
## Medianage -0.0013604 0.0016980 -0.801 0.42391
## LogPopDens 0.0163865 0.0055303 2.963 0.00338 **
## LogMedianInc 0.1528195 0.0357827 4.271 2.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0896 on 221 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6159, Adjusted R-squared: 0.6055
## F-statistic: 59.06 on 6 and 221 DF, p-value: < 2.2e-16
ANSWER
Even while controlling for other variables, the Diebold effect remains significant, b = .04, t(221) = 2.94, p = .004. Also, median age is not significant, so I remove it in the next model.
# Model 2008c: same outcome and predictors as the previous demographic model,
# but without Medianage.
# na.exclude leaves incomplete rows out of the fit.
m2008c <- lm(
  CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + LogMedianInc,
  data = d,
  na.action = na.exclude
)
summary(m2008c)
##
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
## LogMedianInc, data = d, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.253948 -0.063460 -0.005563 0.054350 0.232221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5967131 0.3687675 -4.330 2.26e-05 ***
## diebold 0.0464185 0.0143648 3.231 0.00142 **
## PcntColl.Grad -0.0080987 0.0006085 -13.310 < 2e-16 ***
## Punem 2.3935868 0.5161983 4.637 6.04e-06 ***
## LogPopDens 0.0172609 0.0054171 3.186 0.00165 **
## LogMedianInc 0.1539268 0.0357272 4.308 2.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08953 on 222 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6148, Adjusted R-squared: 0.6061
## F-statistic: 70.86 on 5 and 222 DF, p-value: < 2.2e-16
ANSWER
Even while controlling for other variables, the diebold effect remains significant, b = .05, t(222) = 3.23, p = .001.
# Model 2004a: the 2004 outcome KD_diff regressed on diebold, PcntColl.Grad,
# Punem, LogPopDens and LogMedianInc.
# na.exclude leaves incomplete rows out of the fit.
m2004a <- lm(
  KD_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + LogMedianInc,
  data = d,
  na.action = na.exclude
)
summary(m2004a)
##
## Call:
## lm(formula = KD_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
## LogMedianInc, data = d, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33507 -0.08112 0.00453 0.08277 0.38661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.9298366 0.5000435 -3.859 0.000149 ***
## diebold 0.0854954 0.0193978 4.407 1.63e-05 ***
## PcntColl.Grad -0.0046533 0.0008203 -5.672 4.38e-08 ***
## Punem 3.4324286 0.6959579 4.932 1.60e-06 ***
## LogPopDens 0.0121531 0.0073233 1.659 0.098434 .
## LogMedianInc 0.1825087 0.0484352 3.768 0.000211 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1207 on 221 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.4332, Adjusted R-squared: 0.4204
## F-statistic: 33.79 on 5 and 221 DF, p-value: < 2.2e-16
ANSWER
The 2004 election showed results similar to the 2008 election, except that LogPopDens is only marginally significant.
# Model 2004b: despite the name, the outcome is the 2008 difference (CO_diff);
# the 2004 difference (KD_diff) enters as an additional control alongside
# diebold and the demographic predictors.
# na.exclude leaves incomplete rows out of the fit.
m2004b <- lm(
  CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + LogMedianInc +
    KD_diff,
  data = d,
  na.action = na.exclude
)
summary(m2004b)
##
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
## LogMedianInc + KD_diff, data = d, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.280715 -0.054778 0.002407 0.053082 0.230357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.1645655 0.3628110 -3.210 0.00153 **
## diebold 0.0247737 0.0142088 1.744 0.08264 .
## PcntColl.Grad -0.0069585 0.0006166 -11.285 < 2e-16 ***
## Punem 1.5632930 0.5149521 3.036 0.00269 **
## LogPopDens 0.0139075 0.0051750 2.687 0.00775 **
## LogMedianInc 0.1130768 0.0350907 3.222 0.00146 **
## KD_diff 0.2438234 0.0472404 5.161 5.47e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08476 on 220 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.6543, Adjusted R-squared: 0.6449
## F-statistic: 69.4 on 6 and 220 DF, p-value: < 2.2e-16
ANSWER
The effect of Diebold in the 2008 election drops to marginal significance when controlling for the 2004 election difference, b = .02, t(220) = 1.74, p = .083.
I’m not too sure. There is a bias—votes for Clinton were highest in Diebold areas—but does this imply fraud? It could be that other factors need to be accounted for, and the pattern appears to mirror the 2004 election too—Kerry also did better in Diebold areas. Also, when controlling for the difference between Kerry and Dean, the Diebold effect drops to marginal significance.