Question 1

Question 1.1

First, I load the relevant libraries for the assignment. I then clear the workspace, set the working directory, and load the data set ca2006.csv.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
rm(list=ls())
setwd("~/Desktop/PS150B")

dat <- read.csv("ca2006.csv")

Question 1.2

Second, I plot the proportion of votes for the Democratic congressional candidate (prop_d) against the Democratic presidential candidate's proportion of the two-party vote in 2004 (dem_pres_2004) in each district.

plot <- ggplot(dat, aes(x=dem_pres_2004, y=prop_d)) +
          geom_point()+
          ggtitle("Proportion of Vote Share In CA Districts") + 
          labs(x="2-Party Vote Share Dem Presidential Candidate (2004)", 
               y="Vote Share Dem Congressional Candidate (2006)") +
          theme_minimal() 
plot 

Question 1.3

Third, I fit a linear regression of prop_d on dem_pres_2004, print the results, and overlay the regression line on the scatter plot.

## fit linear regression
linear1 <- lm(prop_d ~ dem_pres_2004, data = dat) 
summary(linear1)
## 
## Call:
## lm(formula = prop_d ~ dem_pres_2004, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36168 -0.04314 -0.00830  0.01233  0.44754 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.15390    0.05978  -2.574    0.013 *  
## dem_pres_2004  1.38268    0.10291  13.436   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1125 on 51 degrees of freedom
## Multiple R-squared:  0.7797, Adjusted R-squared:  0.7754 
## F-statistic: 180.5 on 1 and 51 DF,  p-value: < 2.2e-16
## plot regression line
plot + geom_smooth(method=lm, se=FALSE)

Question 1.4

Fourth, I write a simple function that predicts vote share from a fitted model and a vector of covariate values, then apply it with dem_pres_2004 = 0.5. The predicted Democratic vote share is 0.537.

## the function
predict.me <- function(mod, star) {
  beta.hat <- mod$coefficients   # estimated coefficients
  ## dot product X'beta-hat; star must be ordered to match coef(mod)
  dot.prod <- beta.hat %*% star
  return(dot.prod)
}

## load up star 1, then predict
star1 <- c("(Intercept)" = 1, "dem_pres_2004" = .5)
predict.me(linear1, star1)
##           [,1]
## [1,] 0.5374445
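
As a sanity check (not required by the question), the built-in predict() function should return the same value:

predict(linear1, newdata = data.frame(dem_pres_2004 = 0.5))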

Question 1.5

Now, I regress prop_d on dem_pres_2004, dem_pres_2000, and dem_inc. The summary is provided below.

linear2 <- lm(prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc,
                 data = dat) 
summary(linear2)
## 
## Call:
## lm(formula = prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc, 
##     data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34354 -0.04843 -0.01123  0.01629  0.33092 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   -0.036772   0.079456  -0.463  0.64556   
## dem_pres_2004 -0.007185   0.470300  -0.015  0.98787   
## dem_pres_2000  0.963297   0.497967   1.934  0.05884 . 
## dem_inc        0.173460   0.054945   3.157  0.00273 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09862 on 49 degrees of freedom
## Multiple R-squared:  0.8373, Adjusted R-squared:  0.8274 
## F-statistic: 84.09 on 3 and 49 DF,  p-value: < 2.2e-16

Question 1.6

Using the multivariate regression from 1.5 and the function from 1.4, I predict the vote share for the Democratic candidate when dem_pres_2004 = 0.5, dem_pres_2000 = 0.5, and dem_inc = 1. The predicted vote share is 0.615.

## load up x star (aka star2), then predict
star2 <- c("(Intercept)" = 1, "dem_pres_2004" = .5, "dem_pres_2000" = .5,
           "dem_inc" = 1)
predict.me(linear2, star2)
##           [,1]
## [1,] 0.6147444
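
Again as a check, predict() should reproduce this number from the fitted model:

predict(linear2, newdata = data.frame(dem_pres_2004 = 0.5, dem_pres_2000 = 0.5,
                                      dem_inc = 1))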

Question 1.7

In question 1.7, I implement the bootstrap to characterize the uncertainty in the predictions of the response variable. Parts (a) through (d) are nested inside the for loop.

set.seed(2)

## Set modifiable parameters
B <- 10000       # number of bootstrap replicates
n <- nrow(dat)   # sample size

## Create objects that will store the predictions
boot.predict1 <- rep(NA, B)
boot.predict2 <- rep(NA, B)

## for loop for the bootstrap
for (i in 1:B) {

  #### Question 1.7a -- create bootstrapped data set
  x.b <- dat[sample(n, n, replace = TRUE), ]

  #### Question 1.7b -- fit specified regressions
  boot.lin1 <- lm(prop_d ~ dem_pres_2004, data = x.b)
  boot.lin2 <- lm(prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc,
                  data = x.b)

  #### Question 1.7c and d -- make and store predictions
  boot.predict1[i] <- predict.me(boot.lin1, star1)
  boot.predict2[i] <- predict.me(boot.lin2, star2)
}
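
As a quick check on the bootstrap (not requested by the question), the standard errors of the two predictions can be read directly off the stored vectors:

sd(boot.predict1)
sd(boot.predict2)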

Question 1.8

Here, I report 95% confidence intervals and plot histograms of the bootstrapped predictions for both models.

For model 1:

## 95% Confidence Interval
quantile(boot.predict1, probs = c(.025, 0.975))
##      2.5%     97.5% 
## 0.5055570 0.5720341
## Histogram
ggplot()+ aes(boot.predict1) + 
  geom_histogram(alpha = .7, colour="black", fill="blue", bins=40) +
  labs(title="Histogram for Model 1 Predictions", x = "Model 1 Predictions") +
  theme_minimal()

For model 2:

## 95% Confidence Interval
quantile(boot.predict2, probs = c(.025, 0.975))
##      2.5%     97.5% 
## 0.5503845 0.6945578
## Histogram
ggplot()+ aes(boot.predict2) + 
  geom_histogram(alpha = .7, colour="black", fill="red", bins = 40) +
  labs(title="Histogram for Model 2 Predictions", x = "Model 2 Predictions") +
  theme_minimal()

Question 1.9

What proportion of the time does each model predict the incumbent will win?

Model 1: 98.86% of the time. Model 2: 99.98% of the time.

## boot model 1
sum(boot.predict1 > .5)/B
## [1] 0.9886
## Boot model 2
sum(boot.predict2 > .5)/B
## [1] 0.9998

Question 2

Question 2.1

First, I load the data for the second question.

cat <- read.csv("vote92.csv")

Question 2.2

The proportion of respondents who reported voting for Bill Clinton is 0.458.

sum(cat$clintonvote)/length(cat$clintonvote) 
## [1] 0.4576458

Question 2.3

Using a logistic regression, I regress clintonvote on dem, female, and clintondist. The summary is reported.

logit <- glm(clintonvote ~ dem + female + clintondist, family = 
               binomial(link = "logit"), data = cat)
summary(logit)
## 
## Call:
## glm(formula = clintonvote ~ dem + female + clintondist, family = binomial(link = "logit"), 
##     data = cat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9866  -0.6183  -0.3559   0.6317   2.7461  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.40692    0.18758  -7.500 6.37e-14 ***
## dem          3.05648    0.18687  16.357  < 2e-16 ***
## female       0.17417    0.18413   0.946    0.344    
## clintondist -0.14482    0.02777  -5.215 1.84e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1253.61  on 908  degrees of freedom
## Residual deviance:  769.17  on 905  degrees of freedom
## AIC: 777.17
## 
## Number of Fisher Scoring iterations: 5

Question 2.4

In this question, I write a function to predict the probability that a voter supports Clinton based on a logistic regression.

## take in the model and covariate values, return a predicted probability
predict.clint <- function(mod, party, gender, dist) {
  beta.cat <- mod$coefficients
  c.star <- c("(Intercept)" = 1, "dem" = party, "female" = gender,
              "clintondist" = dist)
  dot.c <- beta.cat %*% c.star              # linear predictor X'beta-hat
  pred.prob <- exp(dot.c)/(1 + exp(dot.c))  # inverse logit
  return(pred.prob)
}

Question 2.5

Using my function from 2.4, I determine the probability that a female Democrat with clintondist = 1 votes for Clinton. The probability is 0.8428.

predict.clint(logit, 1, 1, 1)
##           [,1]
## [1,] 0.8427606
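
As a cross-check (not required), predict() with type = "response" should return the same probability:

predict(logit, newdata = data.frame(dem = 1, female = 1, clintondist = 1),
        type = "response")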

Question 2.6

Now, I use a linear regression to predict clintonvote as a function of dem, female, and clintondist.

## store length of clintonvote vector
C <- length(cat$clintonvote)

## linear predict prob
linear.clint <- lm(clintonvote ~ dem + female + clintondist, data = cat)
summary(linear.clint)
## 
## Call:
## lm(formula = clintonvote ~ dem + female + clintondist, data = cat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8472 -0.1876 -0.0467  0.1899  1.0760 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.205695   0.025706   8.002 3.74e-15 ***
## dem          0.621171   0.025950  23.937  < 2e-16 ***
## female       0.020339   0.024311   0.837    0.403    
## clintondist -0.017433   0.003322  -5.247 1.92e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3627 on 905 degrees of freedom
## Multiple R-squared:  0.4724, Adjusted R-squared:  0.4706 
## F-statistic: 270.1 on 3 and 905 DF,  p-value: < 2.2e-16
## create empty vector
pred.prob.lin <- rep(NA, C)

## make predictions using earlier function
for (i in 1:C) {
  party1 <- cat$dem[i]
  gender1 <- cat$female[i]
  dist1 <- cat$clintondist[i]
  star.clint <- c(1, party1, gender1, dist1)
  pred.prob.lin[i] <- predict.me(linear.clint, star.clint)
}

I do the same for the logistic regression, below.

pred.prob.log <- rep(NA, C)

## logit predict prob
for (i in 1:C) {
  party1 <- cat$dem[i]
  gender1 <- cat$female[i]
  dist1 <- cat$clintondist[i]
  pred.prob.log[i] <- predict.clint(logit, party1, gender1, dist1)
}
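
For reference, both loops compute in-sample predictions, so they can be replaced by vectorized one-liners: fitted() on the lm object reproduces pred.prob.lin, and fitted() on the glm object returns fitted values on the probability scale, reproducing pred.prob.log.

## a sketch of the vectorized equivalents; both checks should be TRUE
all.equal(unname(fitted(linear.clint)), pred.prob.lin)
all.equal(unname(fitted(logit)), pred.prob.log)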

Finally, I plot the predicted probabilities from the logistic regression (on the x-axis) against those from the linear regression (on the y-axis).

## create data frame, storing predicted probabilities
prob.plot <- data.frame(pred.prob.log, pred.prob.lin)

## plot
ggplot(prob.plot, aes(x = pred.prob.log, y = pred.prob.lin)) +
          geom_point() +
          labs(title = "Predicted Probabilities of clintonvote",
               x = "Probabilities from Logistic Regression",
               y = "Probabilities from Linear Regression") +
          theme_minimal()

Bonus

For this question, I take the predicted probabilities for all voters from the logistic regression and assess how well calibrated they are with a calibration plot: within each of ten probability bins, I compare the mean predicted probability to the actual proportion of Clinton voters.

## put in predicted probs from logistic function into data set
cat$pred.prob.log <- pred.prob.log

## create empty vectors
mean.bin <- rep(NA,10)
prop.bin <- rep(NA,10)

## calculate mean predicted probs and actual proportion of positives
## for each of ten bins
for (i in 1:10) {
  a.group <- subset(cat, 
                  (pred.prob.log < .1*i & pred.prob.log >= .1*(i-1)), 
                  select=c(clintonvote,pred.prob.log))
  mean.bin[i] <- mean(a.group$pred.prob.log)
  prop.bin[i] <- sum(a.group$clintonvote)/length(a.group$clintonvote)
}

## store mean.bin and prop.bin in data frame
bonus.df <- data.frame(cbind(mean.bin,prop.bin))
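
Since dplyr is already loaded, the same binning can be done without the loop. A sketch using cut() (the object name bonus.dplyr is my own; note that empty bins are simply dropped rather than stored as NaN):

bonus.dplyr <- cat %>%
  mutate(bin = cut(pred.prob.log, breaks = seq(0, 1, by = 0.1), right = FALSE)) %>%
  group_by(bin) %>%
  summarise(mean.bin = mean(pred.prob.log),
            prop.bin = mean(clintonvote))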

## plot
ggplot(bonus.df, aes(x=mean.bin, y=prop.bin)) +
          geom_point()+
          geom_line()+
          geom_abline(intercept = 0, slope = 1, linetype = "dashed", 
                      colour = "red") +
          labs(title="Average Predicted Probability versus Actual Proportion", 
               x="Mean Predicted Probability", y="Actual Proportion") +
          theme_minimal() 
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).

The two warnings arise because two of the ten bins contain no observations, so their entries in mean.bin and prop.bin are NaN and are dropped from the plot.