Lecture/HW 13: Learning R Markdown

HW Problem 2

(a) import libraries and data sets

#import libraries
library(car)
## Loading required package: carData
#install.packages("ggcorrplot")
library(ggcorrplot)
## Loading required package: ggplot2
# Import the data set: one row per town, first row of the CSV holds the
# column names.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs on the author's computer; consider a project-relative path.
d <- read.csv("C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/hw 3/Prim2008.csv", header = TRUE)

- view dataset

# Preview the first six rows of the data set
head(d, n = 6L)
##         town   County    Dean04   Kerry04 Clinton08 Edwards08   Obama08 diebold
## 1    Acworth Sullivan 0.4731707 0.2341463 0.2676580 0.1784387 0.4386617      -1
## 2     Albany  Carroll 0.3082707 0.3759399 0.3005181 0.1761658 0.4248705      -1
## 3 Alexandria  Grafton 0.2434783 0.4043478 0.3420290 0.2405797 0.3507246      -1
## 4 Allenstown Merrimac 0.1935028 0.4816384 0.4656566 0.2141414 0.2585859       1
## 5    Alstead Cheshire 0.4563107 0.2233010 0.2854369 0.1281553 0.4660194      -1
## 6      Alton  Belknap 0.2786145 0.3207831 0.3712821 0.1938462 0.3805128       1
##     lat  long TotPop Medianage PerCapitaInc. MedianInc. ClintonCampaignPresence
## 1 43.19 72.29    907      42.6         18132      37386                      -1
## 2 43.95 71.17    714      37.4         20690      36635                      -1
## 3 43.61 71.79   1501      40.3         19323      42667                      -1
## 4 41.13 71.39   4951      35.5         18851      41958                      -1
## 5 43.15 72.36   2045      39.5         20444      43191                      -1
## 6 43.45 71.22   5109      41.4         25940      43451                      -1
##   PopDensity PcntHS.Grad PcntColl.Grad Tot.LaborF Tot.Unemp
## 1  23.256410        85.9          28.6        448        13
## 2   9.431968        80.4          18.3        365        13
## 3  34.426606        84.1          18.4        823        28
## 4 240.339806        79.8           9.7       2859       103
## 5  44.915440        86.7          27.4       1218        37
## 6  74.043478        87.9          21.5       2755        86

- create variables

# 2004 primary, kept for comparison with the older election:
#   Dean04  - share of votes for Dean in 2004
#   Kerry04 - share of votes for Kerry in 2004
# Difference score: Kerry votes minus Dean votes.
d$KD_diff <- with(d, Kerry04 - Dean04)

# diebold   - 1 if these voting machines were used, -1 if not
# lat, long - latitude and longitude

# 2008 primary:
#   Clinton08 - share of votes for Clinton in the 2008 primaries
#   Obama08   - share of votes for Obama in the 2008 primaries
# Difference score: Clinton votes minus Obama votes.
d$CO_diff <- with(d, Clinton08 - Obama08)

# Unemployed as a proportion of the total labor force
d$Punem <- with(d, Tot.Unemp / Tot.LaborF)

# PopDensity is in persons per square mile; work with its natural log.
# (With the data set attached this would simply be
#  LogPopDens <- log(PopDensity).)
d$LogPopDens <- with(d, log(PopDensity))

# Natural log of total population
d$LogTotPop <- with(d, log(TotPop))

# Natural log of median income
d$LogMedianInc <- with(d, log(MedianInc.))

(b) analyze the data

- Clinton - Obama difference score

# Mean and standard deviation of the Clinton - Obama difference score
mean(d[["CO_diff"]])
## [1] -0.0197
sd(d[["CO_diff"]])
## [1] 0.1413792

- total population

# Mean and standard deviation of total population
mean(d[["TotPop"]])
## [1] 5618.154
sd(d[["TotPop"]])
## [1] 10732.43

- weighted mean

# Mean Clinton - Obama difference, weighting each row by its total population
weighted.mean(x = d$CO_diff, w = d$TotPop)
## [1] 0.04445515

- percent unemployed

# Mean and standard deviation of the unemployed proportion
mean(d[["Punem"]])
## [1] 0.02886504
sd(d[["Punem"]])
## [1] 0.01308751

(c) graphs

# Exploratory plots: histograms of the outcome and candidate predictors (raw
# and transformed), then scatterplots of CO_diff against four predictors with
# a fitted regression line. The trailing notes record the author's visual
# reading of each plot.
# The expressions passed to hist()/plot() are left exactly as written because
# the default titles and axis labels are taken from them.
op <- par(mfrow = c(2, 2)) # 2 x 2 plotting grid (4 graphs per page); old settings saved in op

hist(d$CO_diff) # looks normally distributed

hist(d$LogPopDens) # looks normally distributed

hist(d$Punem) # does not look normally distributed

hist(d$Medianage) # does not look normally distributed

hist(d$PcntHS.Grad) # skewed left
hist(asin(sqrt(d$PcntHS.Grad/100))) # arcsine-square-root transform of the proportion to normalize it better

hist(d$PcntColl.Grad) # skewed right
hist(log(d$PcntColl.Grad)) # log transform gives a more normal-ish distribution

hist(d$PerCapitaInc.) # skewed right
hist(log(d$PerCapitaInc.)) # log transform gives a more normal distribution

hist(d$MedianInc.) # skewed
hist(log(d$MedianInc.)) # more normal after the log transform

hist(d$TotPop) # very skewed right
hist(log(d$TotPop)) # more normal after the log transform

plot(d$CO_diff ~ d$Punem)
abline(lm(d$CO_diff ~ d$Punem)) # odd: a lot of bunching at zero

plot(d$CO_diff ~ d$PcntColl.Grad)
abline(lm(d$CO_diff ~ d$PcntColl.Grad)) # strong correlation

plot(d$CO_diff ~ d$LogMedianInc)
abline(lm(d$CO_diff ~ d$LogMedianInc)) # almost no correlation (close to 0)

plot(d$CO_diff ~ d$Medianage)
abline(lm(d$CO_diff ~ d$Medianage)) # slight correlation

par(op) # restore the graphics settings saved above

# Collect the outcome and the candidate predictors into one data frame for
# the correlation plot (column order determines the plot's axis order).
# NOTE(review): the variable name `c` shadows base::c(); it is kept here so
# that any later code referring to `c` keeps working.
c <- d[c(
  "CO_diff", "LogMedianInc", "LogTotPop", "Punem", "PcntColl.Grad",
  "Medianage", "LogPopDens", "lat", "long"
)]

# Upper-triangle correlation plot with the coefficients printed in each
# cell; correlations are computed on rows with no missing values.
ggcorrplot(
  cor(c, use = "complete.obs"),
  type = "upper",
  lab = TRUE,
  title = "correlations"
)


ANSWER

All variables except LogMedianInc are related to CO_diff. I will include LogTotPop, Punem, PcntColl.Grad, Medianage, LogPopDens, lat, and long in my analyses.


# diebold - whether these voting machines were used (= 1/2) or not (= -1/2).
# Recode the original -1 / 1 values to -1/2 / 1/2 so the contrast code is
# exactly one unit wide; the regression slope for diebold is then the
# difference between the two groups.
# %in% is used instead of == because it never returns NA, so rows with a
# missing diebold value are left untouched instead of putting NA into the
# logical subscript.
d$diebold[d$diebold %in% -1] <- -1 / 2
d$diebold[d$diebold %in% 1] <- 1 / 2

# Baseline model: Clinton - Obama difference regressed on the diebold
# contrast code alone, with no covariates.
m2008a <- lm(formula = CO_diff ~ diebold, data = d)

summary(m2008a)
## 
## Call:
## lm(formula = CO_diff ~ diebold, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39897 -0.09419  0.00131  0.10352  0.32542 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.014438   0.008803  -1.640    0.102    
## diebold      0.096253   0.017607   5.467 1.19e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1336 on 231 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1146, Adjusted R-squared:  0.1107 
## F-statistic: 29.89 on 1 and 231 DF,  p-value: 1.185e-07

ANSWER

Clinton's vote share relative to Obama's was higher in areas using Diebold voting machines, b = .10, t(231) = 5.47, p < .001


# Same outcome, now controlling for the demographic variables.
# na.exclude drops incomplete rows when fitting but keeps their positions
# (as NA) in residuals and fitted values.
m2008b <- lm(
  formula = CO_diff ~ diebold + PcntColl.Grad + Punem + Medianage +
    LogPopDens + LogMedianInc,
  data = d,
  na.action = na.exclude
)
summary(m2008b)
## 
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + Medianage + 
##     LogPopDens + LogMedianInc, data = d, na.action = na.exclude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.245569 -0.064519 -0.004626  0.056204  0.228537 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.5269465  0.3792002  -4.027 7.77e-05 ***
## diebold        0.0435496  0.0148157   2.939  0.00364 ** 
## PcntColl.Grad -0.0080621  0.0006107 -13.202  < 2e-16 ***
## Punem          2.3371565  0.5213950   4.483 1.18e-05 ***
## Medianage     -0.0013604  0.0016980  -0.801  0.42391    
## LogPopDens     0.0163865  0.0055303   2.963  0.00338 ** 
## LogMedianInc   0.1528195  0.0357827   4.271 2.89e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0896 on 221 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6159, Adjusted R-squared:  0.6055 
## F-statistic: 59.06 on 6 and 221 DF,  p-value: < 2.2e-16

ANSWER

Even while controlling for other variables, the diebold effect remains significant, b = .04, t(221) = 2.94, p = .004. Also, median age is not significant, so I remove it in the next model.


# Model m2008b refitted without Medianage; all other terms are unchanged.
m2008c <- lm(
  formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
    LogMedianInc,
  data = d,
  na.action = na.exclude
)
summary(m2008c)
## 
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + 
##     LogMedianInc, data = d, na.action = na.exclude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.253948 -0.063460 -0.005563  0.054350  0.232221 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.5967131  0.3687675  -4.330 2.26e-05 ***
## diebold        0.0464185  0.0143648   3.231  0.00142 ** 
## PcntColl.Grad -0.0080987  0.0006085 -13.310  < 2e-16 ***
## Punem          2.3935868  0.5161983   4.637 6.04e-06 ***
## LogPopDens     0.0172609  0.0054171   3.186  0.00165 ** 
## LogMedianInc   0.1539268  0.0357272   4.308 2.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08953 on 222 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.6148, Adjusted R-squared:  0.6061 
## F-statistic: 70.86 on 5 and 222 DF,  p-value: < 2.2e-16

ANSWER

Even while controlling for other variables, the diebold effect remains significant, b = .05, t(222) = 3.23, p = .001.


# Same predictors as m2008c, but the outcome is the 2004 Kerry - Dean
# difference, to see whether the diebold pattern was already there in 2004.
m2004a <- lm(
  formula = KD_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
    LogMedianInc,
  data = d,
  na.action = na.exclude
)

summary(m2004a)
## 
## Call:
## lm(formula = KD_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + 
##     LogMedianInc, data = d, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33507 -0.08112  0.00453  0.08277  0.38661 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.9298366  0.5000435  -3.859 0.000149 ***
## diebold        0.0854954  0.0193978   4.407 1.63e-05 ***
## PcntColl.Grad -0.0046533  0.0008203  -5.672 4.38e-08 ***
## Punem          3.4324286  0.6959579   4.932 1.60e-06 ***
## LogPopDens     0.0121531  0.0073233   1.659 0.098434 .  
## LogMedianInc   0.1825087  0.0484352   3.768 0.000211 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1207 on 221 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.4332, Adjusted R-squared:  0.4204 
## F-statistic: 33.79 on 5 and 221 DF,  p-value: < 2.2e-16

ANSWER

The 2004 election had similar results to the 2008 election, except that LogPopDens is only marginally significant.


# 2008 Clinton - Obama model with the 2004 Kerry - Dean difference added as
# an extra control alongside the m2008c predictors.
m2004b <- lm(
  formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens +
    LogMedianInc + KD_diff,
  data = d,
  na.action = na.exclude
)

summary(m2004b)
## 
## Call:
## lm(formula = CO_diff ~ diebold + PcntColl.Grad + Punem + LogPopDens + 
##     LogMedianInc + KD_diff, data = d, na.action = na.exclude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.280715 -0.054778  0.002407  0.053082  0.230357 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.1645655  0.3628110  -3.210  0.00153 ** 
## diebold        0.0247737  0.0142088   1.744  0.08264 .  
## PcntColl.Grad -0.0069585  0.0006166 -11.285  < 2e-16 ***
## Punem          1.5632930  0.5149521   3.036  0.00269 ** 
## LogPopDens     0.0139075  0.0051750   2.687  0.00775 ** 
## LogMedianInc   0.1130768  0.0350907   3.222  0.00146 ** 
## KD_diff        0.2438234  0.0472404   5.161 5.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08476 on 220 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.6543, Adjusted R-squared:  0.6449 
## F-statistic:  69.4 on 6 and 220 DF,  p-value: < 2.2e-16

ANSWER

The effect of diebold in the 2008 election remains only marginally significant when controlling for the 2004 election difference, b = .02, t(220) = 1.74, p = .083.

I’m not too sure. There is a bias — Clinton did better relative to Obama in Diebold areas — but does this imply fraud? Other factors may still need to be accounted for, and the pattern mirrors the 2004 election too: Kerry also did better relative to Dean in Diebold areas. Also, when controlling for the difference between Kerry and Dean, the diebold effect drops to marginal significance.