First, I open up relevant libraries for the assignment. I then set the working directory and load the data set ca2006.csv.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(list=ls())
setwd("~/Desktop/PS150B")
dat <- read.csv("ca2006.csv")
Second, I create a plot of the proportion of votes for the Democratic candidate (prop_d), against the proportion of the two-party vote for the Democratic presidential candidate in 2004 (dem pres 2004) in the district.
plot <- ggplot(dat, aes(x=dem_pres_2004, y=prop_d)) +
geom_point()+
ggtitle("Proportion of Vote Share In CA Districts") +
labs(x="2-Party Vote Share Dem Presidential Candidate (2004)",
y="Vote Share Dem Congressional Candidate (2006)") +
theme_minimal()
plot
Third, I print out regression results and plot the regression line.
## fit linear regression
linear1 <- lm(prop_d ~ dem_pres_2004, data = dat)
summary(linear1)
##
## Call:
## lm(formula = prop_d ~ dem_pres_2004, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36168 -0.04314 -0.00830 0.01233 0.44754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15390 0.05978 -2.574 0.013 *
## dem_pres_2004 1.38268 0.10291 13.436 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1125 on 51 degrees of freedom
## Multiple R-squared: 0.7797, Adjusted R-squared: 0.7754
## F-statistic: 180.5 on 1 and 51 DF, p-value: < 2.2e-16
## plot regression line
plot + geom_smooth(method=lm, se=FALSE)
Fourth, I create a simple function to predict vote share for the Democratic candidate when dem_pres_2004 = .5. I find that the predicted vote share is .537.
## the function
predict.me <- function(mod, star) {
beta.hat <- mod$coefficients
x.star <- star
dot.prod <- beta.hat %*% star
return(dot.prod)
}
## load up star 1, then predict
star1 <- c("(Intercept)" = 1, "dem_pres_2004" = .5)
predict.me(linear1, star1)
## [,1]
## [1,] 0.5374445
Now, I regress prop_d against: dem_pres_2004, dem_pres_2000, and dem_inc. The summary is provided.
linear2 <- lm(prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc,
data = dat)
summary(linear2)
##
## Call:
## lm(formula = prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34354 -0.04843 -0.01123 0.01629 0.33092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.036772 0.079456 -0.463 0.64556
## dem_pres_2004 -0.007185 0.470300 -0.015 0.98787
## dem_pres_2000 0.963297 0.497967 1.934 0.05884 .
## dem_inc 0.173460 0.054945 3.157 0.00273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09862 on 49 degrees of freedom
## Multiple R-squared: 0.8373, Adjusted R-squared: 0.8274
## F-statistic: 84.09 on 3 and 49 DF, p-value: < 2.2e-16
Using the multivariate regression from 1.5 and the function from 1.4, I predict the vote share for the Democratic candidate when: dem_pres_2004 = 0.5, dem_pres_2000 = 0.5, and dem_inc = 1. I predict the vote share will be .615.
## load up x star (aka star2), then predict
star2 <- c("(Intercept)" = 1, "dem_pres_2004" = .5, "dem_pres_2002" = .5,
"dem_inc" = 1)
predict.me(linear2, star2)
## [,1]
## [1,] 0.6147444
In question 1.7, I implement the bootstrap to characterize the uncertainty for the response variable predictions. Parts a though d are nested inside of the for loop.
set.seed(2)
## Set modifiable parameters
B <- 10000
n <- length(dat$district)
## Create objects that will store stuff
boot.predict1 <- rep(NA, B)
boot.predict2 <- rep(NA, B)
df <- dat
## for loop for the bootstrap
for (i in 1:B) {
#### Question 1.7a -- create bootstrapped data set
x.b <- df[sample(nrow(dat), n, replace = TRUE),]
#### Question 1.7b -- fit specified regressions
boot.lin1 <- lm(prop_d ~ dem_pres_2004, data = x.b)
boot.lin2 <- lm(prop_d ~ dem_pres_2004 + dem_pres_2000 + dem_inc,
data = x.b)
#### Question 1.7c and d -- make and store predictions
boot.predict1[i] <- predict.me(boot.lin1, star1)
boot.predict2[i] <- predict.me(boot.lin2, star2)
}
Here, I report the confidence intervals and create histograms for both models.
For model 1:
## 95% Confidence Interval
quantile(boot.predict1, probs = c(.025, 0.975))
## 2.5% 97.5%
## 0.5055570 0.5720341
## Histogram
ggplot()+ aes(boot.predict1) +
geom_histogram(alpha = .7, colour="black", fill="blue", bins=40) +
labs(title="Histogram for Model 1 Predictions", x = "Model 1 Predictions") +
theme_minimal()
For model 2:
## 95% Confidence Interval
quantile(boot.predict2, probs = c(.025, 0.975))
## 2.5% 97.5%
## 0.5503845 0.6945578
## Histogram
ggplot()+ aes(boot.predict2) +
geom_histogram(alpha = .7, colour="black", fill="red", bins = 40) +
labs(title="Histogram for Model 2 Predictions", x = "Model 2 Predictions") +
theme_minimal()
What proportion of time does each model predict the incumbent will win?
Model 1: 98.86% of time. Model 2: 99.98% of the time.
## boot model 1
sum(boot.predict1 > .5)/B
## [1] 0.9886
## Boot model 2
sum(boot.predict2 > .5)/B
## [1] 0.9998
First, I load the data for the second question.
cat <- read.csv("vote92.csv")
The proportion of respondents that reported voting for Bill Clinton was .458.
sum(cat$clintonvote)/length(cat$clintonvote)
## [1] 0.4576458
Using a logistic regression, I regress clintonvote on dem, female, and clintondist. The summary is reported.
logit <- glm(clintonvote ~ dem + female + clintondist, family =
binomial(link = "logit"), data = cat)
summary(logit)
##
## Call:
## glm(formula = clintonvote ~ dem + female + clintondist, family = binomial(link = "logit"),
## data = cat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9866 -0.6183 -0.3559 0.6317 2.7461
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.40692 0.18758 -7.500 6.37e-14 ***
## dem 3.05648 0.18687 16.357 < 2e-16 ***
## female 0.17417 0.18413 0.946 0.344
## clintondist -0.14482 0.02777 -5.215 1.84e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1253.61 on 908 degrees of freedom
## Residual deviance: 769.17 on 905 degrees of freedom
## AIC: 777.17
##
## Number of Fisher Scoring iterations: 5
In this question, I write a function to predict the probability that a voter supports Clinton based on a logistic regression.
## take in model and variables
predict.clint <- function(mod, party, gender, dist) {
beta.cat <- mod$coefficients
c.star <- c("(Intercept)" = 1, "dem" = party, "female" = gender,
"clintondist" = dist)
dot.c <- beta.cat %*% c.star
pred.prob <- exp(dot.c)/(1+exp(dot.c))
return(pred.prob)
}
Using my function from 2.4, I determine the probability a female, Democrat, with clintondist = 1 votes for Clinton. The probability is .8428.
predict.clint(logit, 1, 1, 1)
## [,1]
## [1,] 0.8427606
Now, I use a linear regression to predict clintonvote as a function of dem, female, and clintondist.
## store length of clintonvote vector
C <- length(cat$clintonvote)
## linear predict prob
linear.clint <- lm(clintonvote ~ dem + female + clintondist, data = cat)
summary(linear.clint)
##
## Call:
## lm(formula = clintonvote ~ dem + female + clintondist, data = cat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8472 -0.1876 -0.0467 0.1899 1.0760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.205695 0.025706 8.002 3.74e-15 ***
## dem 0.621171 0.025950 23.937 < 2e-16 ***
## female 0.020339 0.024311 0.837 0.403
## clintondist -0.017433 0.003322 -5.247 1.92e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3627 on 905 degrees of freedom
## Multiple R-squared: 0.4724, Adjusted R-squared: 0.4706
## F-statistic: 270.1 on 3 and 905 DF, p-value: < 2.2e-16
## create empty vector
pred.prob.lin <- rep(NA, C)
## make predictions using earlier function
for (i in 1:C) {
party1 <- cat$dem[i]
gender1 <- cat$female[i]
dist1 <- cat$clintondist[i]
star.clint <- c(1, party1, gender1, dist1)
pred.prob.lin[i] <- predict.me(linear.clint, star.clint)
}
I do the same for the logistic regression, below.
pred.prob.log <- rep(NA, C)
## logit predict prob
for (i in 1:C) {
party1 <- cat$dem[i]
gender1 <- cat$female[i]
dist1 <- cat$clintondist[i]
pred.prob.log[i] <- predict.clint(logit, party1, gender1, dist1)
}
Finally, I plot the predicted probabilities from the logistic regression (on the x-axis) against those from the linear regression (on the y-axis).
## create data frame, storing predicted probabilities
prob.plot <- data.frame(cbind(pred.prob.log, pred.prob.lin))
## plot
ggplot(prob.plot, aes(x=pred.prob.log, y=pred.prob.lin)) +
geom_point()+
labs(title = "Predicted Probabilities of clintonvote") +
xlab("Probabilities from Logistic Regression") +
ylab("Probabilities from Linear Regression") +
theme_minimal()
For this question, I use the predicted probabilities for all voters from a logistic regression and visualize how well they perform using a calibration plot.
## put in predicted probs from logistic function into data set
cat$pred.prob.log <- pred.prob.log
## create empty vectors
mean.bin <- rep(NA,10)
prop.bin <- rep(NA,10)
## calculate mean predicted probs and actual proportion of positives
## for each of ten bins
for (i in 1:10) {
a.group <- subset(cat,
(pred.prob.log < .1*i & pred.prob.log >= .1*(i-1)),
select=c(clintonvote,pred.prob.log))
mean.bin[i] <- mean(a.group$pred.prob.log)
prop.bin[i] <- sum(a.group$clintonvote)/length(a.group$clintonvote)
}
## store mean.bin and prop.bin in data frame
bonus.df <- data.frame(cbind(mean.bin,prop.bin))
## plot
ggplot(bonus.df, aes(x=mean.bin, y=prop.bin)) +
geom_point()+
geom_line()+
geom_abline(intercept = 0, slope = 1, linetype = "dashed",
colour = "red") +
labs(title="Average Predicted Probability versus Actual Proportion",
x="Mean Predicted Probability", y="Actual Proportion") +
theme_minimal()
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_path).