> library(plyr)
> library(e1071)
> library(caret)
> library(ggplot2)
> library(gridExtra)
> library(choroplethr)
> library(choroplethrMaps)

The data.

USA Counties features over 6,600 data items for the United States, States and counties from a variety of sources. Files include data published for 2010 population as well as many other items from the 2010 Census of Population and Housing, the 1990 census, the 1980 census and the 2007, 2002, 1997, and 1992 economic censuses.1

The Cook Partisan Voting Index.

The Cook Partisan Voting Index (Cook PVI) is a measurement of how strongly a United States congressional district leans toward the Democratic or Republican Party, compared to the nation as a whole.2.

\[ pvi\_2008_{area\_name} = dem\_2004_{area\_name} - rep\_2004_{area\_name} - (dem\_2004_{national} - rep\_2004_{national}) \]

> set.seed(1234)
> df <- read.csv("clean_data.csv", sep=",")

Partisanship in the 2008 election.

First we explore how democratic or republican counties are using a choropleth map. Deep red counties are strongly republican while deep blue counties are strongly democratic.

> mapdf <- df[c("fips_code", "pvi_2008")]
> names(mapdf) <- c("region", "value")
> map <- county_choropleth(mapdf, title="Partisanship in the 2008 election")
> map <- map + scale_fill_brewer("", palette="RdBu", 
+                                labels=c("strong_republican",
+                                         "lean_republican",
+                                         "swing_county",
+                                         "lean_democratic",
+                                         "strong_democratic")); map

How the democratic vote varied by region in the 2008 election.

First we calculate the skewnews of the democratic vote in the 2008 elections by region and explore how the democratic vote was distributed in the 2008 election by region.

We closely explored the number of counties that voted democratic or republican by region in the 2008 election.

> skew_west <- skewness(df[df$region=="West",]$dem_2008) 
> skew_northeast <- skewness(df[df$region=="Northeast",]$dem_2008) 
> skew_south <- skewness(df[df$region=="South",]$dem_2008) 
> skew_west <- skewness(df[df$region=="West",]$dem_2008) 
> 
> d <- data.frame(region=c("West","Northeast","South","West"),
+                 skewness=c(skew_west,skew_northeast,skew_south,skew_west)); d
##      region   skewness
## 1      West -0.1268280
## 2 Northeast  0.3176624
## 3     South  0.7160284
## 4      West -0.1268280
> data <- df[c("pvi_2008","region","sub_region","dem_2008","rep_2008","resident_population","pop_per_sq_mile")]
> data$winner <- ifelse(data$dem_2008 > data$rep_2008, "democrat", "republican")
> data$affiliation_2008 <- with(data, cut(pvi_2008,
+                                         breaks=quantile(pvi_2008, probs=seq(0,1, by=0.50)),
+                                         labels=c("republican", "democratic"),
+                                         include.lowest=TRUE, ordered_result=TRUE))
> 
> cdf <- ddply(df, "region", summarise, d.mean=mean(dem_2008))
> 
> plot1 <- ggplot(data, aes(x=dem_2008)) 
> plot1 <- plot1 + geom_histogram(colour="white", binwidth=10)
> plot1 <- plot1 + geom_vline(data=cdf, aes(xintercept=d.mean),
+                             linetype="dashed", size=1, colour="red")
> plot1 <- plot1 + facet_wrap(~region, nrow=1, ncol=4)
> plot1 <- plot1 + ggtitle("Counts of democratic votes in 2008")
> 
> plot2 <- ggplot(data, aes(x=winner))
> plot2 <- plot2 + geom_bar(fill=c("blue","red"), colour="white")
> plot2 <- plot2 + facet_wrap(~region, nrow=1, ncol=4)
> plot2 <- plot2 + labs(title="2008 winner by region")
> 
> grid.arrange(plot1, plot2, ncol=1)

How do other variables vary against the Partisanship?

Since the affiliation_2008 is just a factor variable derived from pvi_2008, we can explore how it varies with some other variables.

NOTE: The affiliation_2008 was derived by assigning factor variables to the quatiles of pvi_2008.

> feature_df <- df[c("median_household_income",
+                    "percent_in_poverty",
+                    "percent_unemployed",
+                    "federal_gov_expenditure",
+                    "percent_below_65_no_insurance",
+                    "infant_deaths_per_1000",
+                    "affiliation_2008")]
> 
> featurePlot(x=feature_df[,1:6], 
+             y=feature_df$affiliation_2008, 
+             plot="box",
+             scales=list(x=list(rot=90), y=list(relation="free")),
+             adjust=1.5,pch="|", 
+             layout=c(3,2), 
+             auto.key=list(columns=3))

Party affiliation and percentage of people in poverty.

\(H_0\): Party affiliation is independent of percentage of people in poverty.

\(H_1\): Party affiliation is not independent of percentage of people in poverty.

> tab <- table(df$poverty_levels, df$affiliation_2008); tab
##         
##          lean_democratic lean_republican strong_democratic
##   high               194             195               207
##   low                231             194               259
##   medium             199             250               149
##         
##          strong_republican swing
##   high                 262   193
##   low                  143   214
##   medium               233   217
> chisq <- function(obs){
+     expected <- outer(rowSums(obs), colSums(obs))/sum(obs)
+     sum((obs-expected)^2/expected)
+ }
> observed <- chisq(tab); observed
## [1] 80.9962
> n <- 10^4 - 1
> result <- numeric(n)
> for(i in 1:n){
+     permutation <- sample(df$affiliation_2008)
+     the_table <- table(df$poverty_levels, permutation)
+     result[i] <- chisq(the_table)
+ }
> d <- which(result >= observed)
> pval <- (length(d)+1)/(length(result)+1); pval
## [1] 1e-04
> plot2 <- ggplot(data=NULL, aes(x=result))
> plot2 <- plot2 + geom_histogram(colour="white", binwidth=1)
> plot2 <- plot2 + aes(y=..density..)
> plot2 <- plot2 + geom_density(adjust=2, colour="blue")
> plot2 <- plot2 + geom_vline(x=observed, colour="red")
> plot2 <- plot2 + ylab("density")
> plot2 <- plot2 + xlab("Chi-square statistic"); plot2

Since the p-value \(1e-04\) is less than the level of significance \(0.05\), we cannot accept the null hypothesis. therefore, we conclude that there is a relationship between party affiliation and percentage of people in poverty.

Party affiliation and infant mortality rates.

\(H_0\): Party affiliation is independent of infant mortality rates.

\(H_1\): Party affiliation is not independent of infant mortality rates.

> tab <- table(df$infant_death_levels, df$affiliation_2008); tab
##         
##          lean_democratic lean_republican strong_democratic
##   high               192             212               175
##   low                192             236               210
##   medium             240             191               230
##         
##          strong_republican swing
##   high                 293   194
##   low                  201   214
##   medium               144   216
> chisq <- function(obs){
+     expected <- outer(rowSums(obs), colSums(obs))/sum(obs)
+     sum((obs-expected)^2/expected)
+ }
> observed <- chisq(tab); observed
## [1] 73.3772
> n <- 10^4 - 1
> result <- numeric(n)
> for(i in 1:n){
+     permutation <- sample(df$affiliation_2008)
+     the_table <- table(df$infant_death_levels, permutation)
+     result[i] <- chisq(the_table)
+ }
> d <- which(result >= observed)
> pval <- (length(d)+1)/(length(result)+1); pval
## [1] 1e-04
> plot3 <- ggplot(data=NULL, aes(x=result))
> plot3 <- plot3 + geom_histogram(colour="white", binwidth=1)
> plot3 <- plot3 + aes(y=..density..)
> plot3 <- plot3 + geom_density(adjust=2, colour="blue")
> plot3 <- plot3 + geom_vline(x=observed, colour="red")
> plot3 <- plot3 + ylab("density")
> plot3 <- plot3 + xlab("Chi-square statistic"); plot3

Since the p-value \(1e-04\) is less than the level of significance \(0.05\), we cannot accept the null hypothesis. therefore, we conclude that there is a relationship between party affiliation and infant mortality levels.

Difference in means between democratic and republican states.

\(H_0\): Federal government expenditure between democratic and republican counties is propotional.

\(H_1\): Federal government expenditure between democratic and republican counties is not propotional.

To find the value for \(z_{1 - \alpha /2}\) for 99% confidence interval, we take \(\alpha = 0.01\) i.e. 100(1 - 0.01) = 99% and calculate \(1 - \alpha /2 = 0.995\) and plug it into qnorm to find \(z_{1 - \alpha /2} = 2.575829\).

The confidence intervals is the given by \(CI = \bar{x} \pm z_{1 - \alpha /2} \times SE[\bar{x}]\) where \(\bar{x}\) is the sample mean and \(SE[\bar{x}]\) is the sample standard error estimate.

> index <- sample(nrow(df), 250)
> data <- df[index,]
> data$affiliation_2008 <- with(data, cut(pvi_2008,
+                                         breaks=quantile(pvi_2008, probs=seq(0,1, by=0.50)),
+                                         labels=c("republican", "democratic"),
+                                         include.lowest=TRUE, ordered_result=TRUE))
> ## Our sample
> fed_gov_expenditure <- data$federal_gov_expenditure
> 
> ## 99% confidence interval
> critical_value <- qnorm(0.995, lower.tail = TRUE)
> low_99 <- mean(fed_gov_expenditure) - sd(fed_gov_expenditure)/sqrt(250)*critical_value
> upper_99 <- mean(fed_gov_expenditure) + sd(fed_gov_expenditure)/sqrt(250)*critical_value
> 
> dem_county_exp <- data[data$affiliation_2008=="democratic",]$federal_gov_expenditure
> rep_county_exp <- data[data$affiliation_2008=="republican",]$federal_gov_expenditure
> sample_mean <- mean(dem_county_exp)-mean(rep_county_exp)
> 
> standard_error <- sqrt((var(dem_county_exp)/length(dem_county_exp))+(var(rep_county_exp)/length(rep_county_exp)))
> 
> margin_of_error = critical_value * standard_error
> 
> test_statistic <- (mean(dem_county_exp)-mean(rep_county_exp))/standard_error
> 
> p_value <- 2*pnorm(test_statistic, lower.tail=FALSE)
> 
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+     index <- sample(length(fed_gov_expenditure), length(dem_county_exp), replace=FALSE)
+     dem_county_exp <- fed_gov_expenditure[index]
+     rep_county_exp <- fed_gov_expenditure[-index]
+     result[i] <- mean(dem_county_exp)-mean(rep_county_exp)
+ }
> 
> d <- data.frame(statistic=c("sample_mean","critical_value","low_99","upper_99","standard_error","margin_of_error","test_statistic","p_value"), value=c(sample_mean,critical_value,low_99,upper_99,standard_error,margin_of_error,test_statistic,p_value)); d
##         statistic        value
## 1     sample_mean 8.779551e+05
## 2  critical_value 2.575829e+00
## 3          low_99 3.581895e+05
## 4        upper_99 1.143861e+06
## 5  standard_error 3.005034e+05
## 6 margin_of_error 7.740456e+05
## 7  test_statistic 2.921614e+00
## 8         p_value 3.482224e-03
> plot4 <- ggplot(data=NULL, aes(x=result))
> plot4 <- plot4 + geom_histogram(binwidt=50000, colour="white")
> plot4 <- plot4 + aes(y=..density..)
> plot4 <- plot4 + xlab("democratic_mean - republican_mean") + ylab("density")
> plot4 <- plot4 + geom_density(adjust=2, colour="blue"); plot4

Since the P-value = 0.003482224 is less than the significance level 0.05, we cannot accept the null hypothesis.

Therefore, it is safe to conclude that the federal government expenditure between democratic and republican counties is not propotional.

Do election results follow an unknow distribution?

The old cliché is that “As Ohio goes, so goes the nation”, although the inverse of that is also true: “As the nation goes, so goes Ohio.”

The Buckeye State, long recognized as the premier presidential bellwether, deserves its status. In the 28 presidential elections since 1900, Ohio has correctly picked the winner 26 times.3

\(H_0:\) Distribution of counties amoung partisan groups follow the same distribution as that in Ohio.

\(H_1:\) Distribution of counties amoung partisan groups do not follow the same distribution as that in Ohio.

> ohiodf <- df[df$state=="OH",]
> number <- length(df$affiliation_2008)
> levels <- as.character(unique(df$affiliation_2008))
> 
> observed <- table(df$affiliation_2008); observed
## 
##   lean_democratic   lean_republican strong_democratic strong_republican 
##               624               639               615               638 
##             swing 
##               624
> expected <- table(ohiodf$affiliation_2008)/sum(table(ohiodf$affiliation_2008))*sum(observed); expected
## 
##   lean_democratic   lean_republican strong_democratic strong_republican 
##          785.0000          606.5909          570.9091          249.7727 
##             swing 
##          927.7273
> chisquare1 <- function(observed, expected){
+   sum((observed-expected)^2/expected)
+ }
> test_statistic <- chisquare1(observed, expected); test_statistic
## [1] 741.0241
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+   data = sample(levels, number, replace=TRUE)
+   counts = numeric(5)
+   counts[1] = sum(data=="lean_democratic")
+   counts[2] = sum(data=="lean_republican")
+   counts[3] = sum(data=="strong_democratic")
+   counts[4] = sum(data=="strong_republican")
+   counts[5] = sum(data=="swing")
+   result[i] = chisquare1(counts, expected)
+ }
> pval <- (sum(result >= test_statistic)+1)/(N+1); pval
## [1] 0.3407
> plot5 <- ggplot(data=NULL, aes(x=result))
> plot5 <- plot5 + geom_histogram(binwidt=10, colour="white")
> plot5 <- plot5 + aes(y=..density..)
> plot5 <- plot5 + ylab("density")
> plot5 <- plot5 + geom_vline(x=test_statistic, colour="red")
> plot5 <- plot5 + geom_density(adjust=2, colour="blue"); plot5

Since the p-value (0.3407) is more than the significance level (0.05), we state that there is sufficiently enough evidence to support the null hypothesis that the distribution of counties amoung partisan groups follow the same distribution as that in Ohio.

\(H_0:\) Distribution of counties amoung partisan groups are from a uniform distribution.

\(H_1:\) Distribution of counties amoung partisan groups are not from a uniform distribution.

> observed <- table(df$affiliation_2008); observed
## 
##   lean_democratic   lean_republican strong_democratic strong_republican 
##               624               639               615               638 
##             swing 
##               624
> chisquare2 <- function(observed){
+   expected <- rep(sum(observed)/length(observed),length(observed))
+   sum((observed-expected)^2/expected)
+ }
> test_statistic <- chisquare2(observed); test_statistic
## [1] 0.6719745
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+   data = sample(levels, number, replace=TRUE)
+   result[i] = chisquare2(table(data))
+ }
> pval <- (sum(result >= test_statistic)+1)/(N+1); pval
## [1] 0.95509
> plot6 <- ggplot(data=NULL, aes(x=result))
> plot6 <- plot6 + geom_histogram(binwidt=0.5, colour="white")
> plot6 <- plot6 + aes(y=..density..)
> plot6 <- plot6 + ylab("density")
> plot6 <- plot6 + geom_vline(x=test_statistic, colour="red")
> plot6 <- plot6 + geom_density(adjust=2, colour="blue"); plot6

Since the p-value (0.95334) is sufficiently more than the significance level (0.05), we state that there is enough evidence supporting the null hypothesis that the distribution of counties amoung partisan groups in the USA are from a uniform distribution.


  1. http://censtats.census.gov/usa/usa.shtml

  2. http://en.wikipedia.org/wiki/Cook_Partisan_Voting_Index

  3. http://www.centerforpolitics.org/crystalball/articles/kdk2011091502/