> library(plyr)
> library(e1071)
> library(caret)
> library(ggplot2)
> library(gridExtra)
> library(choroplethr)
> library(choroplethrMaps)
USA Counties features over 6,600 data items for the United States, States and counties from a variety of sources. Files include data published for 2010 population as well as many other items from the 2010 Census of Population and Housing, the 1990 census, the 1980 census and the 2007, 2002, 1997, and 1992 economic censuses.1
The Cook Partisan Voting Index (Cook PVI) is a measurement of how strongly a United States congressional district leans toward the Democratic or Republican Party, compared to the nation as a whole.2.
\[ pvi\_2008_{area\_name} = dem\_2004_{area\_name} - rep\_2004_{area\_name} - (dem\_2004_{national} - rep\_2004_{national}) \]
> set.seed(1234)
> df <- read.csv("clean_data.csv", sep=",")
First we explore how democratic or republican counties are using a choropleth map. Deep red counties are strongly republican while deep blue counties are strongly democratic.
> mapdf <- df[c("fips_code", "pvi_2008")]
> names(mapdf) <- c("region", "value")
> map <- county_choropleth(mapdf, title="Partisanship in the 2008 election")
> map <- map + scale_fill_brewer("", palette="RdBu",
+ labels=c("strong_republican",
+ "lean_republican",
+ "swing_county",
+ "lean_democratic",
+ "strong_democratic")); map
First we calculate the skewnews of the democratic vote in the 2008 elections by region and explore how the democratic vote was distributed in the 2008 election by region.
We closely explored the number of counties that voted democratic or republican by region in the 2008 election.
> skew_west <- skewness(df[df$region=="West",]$dem_2008)
> skew_northeast <- skewness(df[df$region=="Northeast",]$dem_2008)
> skew_south <- skewness(df[df$region=="South",]$dem_2008)
> skew_west <- skewness(df[df$region=="West",]$dem_2008)
>
> d <- data.frame(region=c("West","Northeast","South","West"),
+ skewness=c(skew_west,skew_northeast,skew_south,skew_west)); d
## region skewness
## 1 West -0.1268280
## 2 Northeast 0.3176624
## 3 South 0.7160284
## 4 West -0.1268280
> data <- df[c("pvi_2008","region","sub_region","dem_2008","rep_2008","resident_population","pop_per_sq_mile")]
> data$winner <- ifelse(data$dem_2008 > data$rep_2008, "democrat", "republican")
> data$affiliation_2008 <- with(data, cut(pvi_2008,
+ breaks=quantile(pvi_2008, probs=seq(0,1, by=0.50)),
+ labels=c("republican", "democratic"),
+ include.lowest=TRUE, ordered_result=TRUE))
>
> cdf <- ddply(df, "region", summarise, d.mean=mean(dem_2008))
>
> plot1 <- ggplot(data, aes(x=dem_2008))
> plot1 <- plot1 + geom_histogram(colour="white", binwidth=10)
> plot1 <- plot1 + geom_vline(data=cdf, aes(xintercept=d.mean),
+ linetype="dashed", size=1, colour="red")
> plot1 <- plot1 + facet_wrap(~region, nrow=1, ncol=4)
> plot1 <- plot1 + ggtitle("Counts of democratic votes in 2008")
>
> plot2 <- ggplot(data, aes(x=winner))
> plot2 <- plot2 + geom_bar(fill=c("blue","red"), colour="white")
> plot2 <- plot2 + facet_wrap(~region, nrow=1, ncol=4)
> plot2 <- plot2 + labs(title="2008 winner by region")
>
> grid.arrange(plot1, plot2, ncol=1)
Since the affiliation_2008
is just a factor variable derived from pvi_2008
, we can explore how it varies with some other variables.
NOTE: The affiliation_2008
was derived by assigning factor variables to the quatiles of pvi_2008
.
> feature_df <- df[c("median_household_income",
+ "percent_in_poverty",
+ "percent_unemployed",
+ "federal_gov_expenditure",
+ "percent_below_65_no_insurance",
+ "infant_deaths_per_1000",
+ "affiliation_2008")]
>
> featurePlot(x=feature_df[,1:6],
+ y=feature_df$affiliation_2008,
+ plot="box",
+ scales=list(x=list(rot=90), y=list(relation="free")),
+ adjust=1.5,pch="|",
+ layout=c(3,2),
+ auto.key=list(columns=3))
\(H_0\): Party affiliation is independent of percentage of people in poverty.
\(H_1\): Party affiliation is not independent of percentage of people in poverty.
> tab <- table(df$poverty_levels, df$affiliation_2008); tab
##
## lean_democratic lean_republican strong_democratic
## high 194 195 207
## low 231 194 259
## medium 199 250 149
##
## strong_republican swing
## high 262 193
## low 143 214
## medium 233 217
> chisq <- function(obs){
+ expected <- outer(rowSums(obs), colSums(obs))/sum(obs)
+ sum((obs-expected)^2/expected)
+ }
> observed <- chisq(tab); observed
## [1] 80.9962
> n <- 10^4 - 1
> result <- numeric(n)
> for(i in 1:n){
+ permutation <- sample(df$affiliation_2008)
+ the_table <- table(df$poverty_levels, permutation)
+ result[i] <- chisq(the_table)
+ }
> d <- which(result >= observed)
> pval <- (length(d)+1)/(length(result)+1); pval
## [1] 1e-04
> plot2 <- ggplot(data=NULL, aes(x=result))
> plot2 <- plot2 + geom_histogram(colour="white", binwidth=1)
> plot2 <- plot2 + aes(y=..density..)
> plot2 <- plot2 + geom_density(adjust=2, colour="blue")
> plot2 <- plot2 + geom_vline(x=observed, colour="red")
> plot2 <- plot2 + ylab("density")
> plot2 <- plot2 + xlab("Chi-square statistic"); plot2
Since the p-value \(1e-04\) is less than the level of significance \(0.05\), we cannot accept the null hypothesis. therefore, we conclude that there is a relationship between party affiliation and percentage of people in poverty.
\(H_0\): Party affiliation is independent of infant mortality rates.
\(H_1\): Party affiliation is not independent of infant mortality rates.
> tab <- table(df$infant_death_levels, df$affiliation_2008); tab
##
## lean_democratic lean_republican strong_democratic
## high 192 212 175
## low 192 236 210
## medium 240 191 230
##
## strong_republican swing
## high 293 194
## low 201 214
## medium 144 216
> chisq <- function(obs){
+ expected <- outer(rowSums(obs), colSums(obs))/sum(obs)
+ sum((obs-expected)^2/expected)
+ }
> observed <- chisq(tab); observed
## [1] 73.3772
> n <- 10^4 - 1
> result <- numeric(n)
> for(i in 1:n){
+ permutation <- sample(df$affiliation_2008)
+ the_table <- table(df$infant_death_levels, permutation)
+ result[i] <- chisq(the_table)
+ }
> d <- which(result >= observed)
> pval <- (length(d)+1)/(length(result)+1); pval
## [1] 1e-04
> plot3 <- ggplot(data=NULL, aes(x=result))
> plot3 <- plot3 + geom_histogram(colour="white", binwidth=1)
> plot3 <- plot3 + aes(y=..density..)
> plot3 <- plot3 + geom_density(adjust=2, colour="blue")
> plot3 <- plot3 + geom_vline(x=observed, colour="red")
> plot3 <- plot3 + ylab("density")
> plot3 <- plot3 + xlab("Chi-square statistic"); plot3
Since the p-value \(1e-04\) is less than the level of significance \(0.05\), we cannot accept the null hypothesis. therefore, we conclude that there is a relationship between party affiliation and infant mortality levels.
\(H_0\): Federal government expenditure between democratic and republican counties is propotional.
\(H_1\): Federal government expenditure between democratic and republican counties is not propotional.
To find the value for \(z_{1 - \alpha /2}\) for 99% confidence interval, we take \(\alpha = 0.01\) i.e. 100(1 - 0.01) = 99% and calculate \(1 - \alpha /2 = 0.995\) and plug it into
qnorm
to find \(z_{1 - \alpha /2} = 2.575829\).
The confidence intervals is the given by \(CI = \bar{x} \pm z_{1 - \alpha /2} \times SE[\bar{x}]\) where \(\bar{x}\) is the sample mean and \(SE[\bar{x}]\) is the sample standard error estimate.
> index <- sample(nrow(df), 250)
> data <- df[index,]
> data$affiliation_2008 <- with(data, cut(pvi_2008,
+ breaks=quantile(pvi_2008, probs=seq(0,1, by=0.50)),
+ labels=c("republican", "democratic"),
+ include.lowest=TRUE, ordered_result=TRUE))
> ## Our sample
> fed_gov_expenditure <- data$federal_gov_expenditure
>
> ## 99% confidence interval
> critical_value <- qnorm(0.995, lower.tail = TRUE)
> low_99 <- mean(fed_gov_expenditure) - sd(fed_gov_expenditure)/sqrt(250)*critical_value
> upper_99 <- mean(fed_gov_expenditure) + sd(fed_gov_expenditure)/sqrt(250)*critical_value
>
> dem_county_exp <- data[data$affiliation_2008=="democratic",]$federal_gov_expenditure
> rep_county_exp <- data[data$affiliation_2008=="republican",]$federal_gov_expenditure
> sample_mean <- mean(dem_county_exp)-mean(rep_county_exp)
>
> standard_error <- sqrt((var(dem_county_exp)/length(dem_county_exp))+(var(rep_county_exp)/length(rep_county_exp)))
>
> margin_of_error = critical_value * standard_error
>
> test_statistic <- (mean(dem_county_exp)-mean(rep_county_exp))/standard_error
>
> p_value <- 2*pnorm(test_statistic, lower.tail=FALSE)
>
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+ index <- sample(length(fed_gov_expenditure), length(dem_county_exp), replace=FALSE)
+ dem_county_exp <- fed_gov_expenditure[index]
+ rep_county_exp <- fed_gov_expenditure[-index]
+ result[i] <- mean(dem_county_exp)-mean(rep_county_exp)
+ }
>
> d <- data.frame(statistic=c("sample_mean","critical_value","low_99","upper_99","standard_error","margin_of_error","test_statistic","p_value"), value=c(sample_mean,critical_value,low_99,upper_99,standard_error,margin_of_error,test_statistic,p_value)); d
## statistic value
## 1 sample_mean 8.779551e+05
## 2 critical_value 2.575829e+00
## 3 low_99 3.581895e+05
## 4 upper_99 1.143861e+06
## 5 standard_error 3.005034e+05
## 6 margin_of_error 7.740456e+05
## 7 test_statistic 2.921614e+00
## 8 p_value 3.482224e-03
> plot4 <- ggplot(data=NULL, aes(x=result))
> plot4 <- plot4 + geom_histogram(binwidt=50000, colour="white")
> plot4 <- plot4 + aes(y=..density..)
> plot4 <- plot4 + xlab("democratic_mean - republican_mean") + ylab("density")
> plot4 <- plot4 + geom_density(adjust=2, colour="blue"); plot4
Since the P-value = 0.003482224 is less than the significance level 0.05, we cannot accept the null hypothesis.
Therefore, it is safe to conclude that the federal government expenditure between democratic and republican counties is not propotional.
The old cliché is that “As Ohio goes, so goes the nation”, although the inverse of that is also true: “As the nation goes, so goes Ohio.”
The Buckeye State, long recognized as the premier presidential bellwether, deserves its status. In the 28 presidential elections since 1900, Ohio has correctly picked the winner 26 times.3
\(H_0:\) Distribution of counties amoung partisan groups follow the same distribution as that in Ohio.
\(H_1:\) Distribution of counties amoung partisan groups do not follow the same distribution as that in Ohio.
> ohiodf <- df[df$state=="OH",]
> number <- length(df$affiliation_2008)
> levels <- as.character(unique(df$affiliation_2008))
>
> observed <- table(df$affiliation_2008); observed
##
## lean_democratic lean_republican strong_democratic strong_republican
## 624 639 615 638
## swing
## 624
> expected <- table(ohiodf$affiliation_2008)/sum(table(ohiodf$affiliation_2008))*sum(observed); expected
##
## lean_democratic lean_republican strong_democratic strong_republican
## 785.0000 606.5909 570.9091 249.7727
## swing
## 927.7273
> chisquare1 <- function(observed, expected){
+ sum((observed-expected)^2/expected)
+ }
> test_statistic <- chisquare1(observed, expected); test_statistic
## [1] 741.0241
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+ data = sample(levels, number, replace=TRUE)
+ counts = numeric(5)
+ counts[1] = sum(data=="lean_democratic")
+ counts[2] = sum(data=="lean_republican")
+ counts[3] = sum(data=="strong_democratic")
+ counts[4] = sum(data=="strong_republican")
+ counts[5] = sum(data=="swing")
+ result[i] = chisquare1(counts, expected)
+ }
> pval <- (sum(result >= test_statistic)+1)/(N+1); pval
## [1] 0.3407
> plot5 <- ggplot(data=NULL, aes(x=result))
> plot5 <- plot5 + geom_histogram(binwidt=10, colour="white")
> plot5 <- plot5 + aes(y=..density..)
> plot5 <- plot5 + ylab("density")
> plot5 <- plot5 + geom_vline(x=test_statistic, colour="red")
> plot5 <- plot5 + geom_density(adjust=2, colour="blue"); plot5
Since the p-value (0.3407) is more than the significance level (0.05), we state that there is sufficiently enough evidence to support the null hypothesis that the distribution of counties amoung partisan groups follow the same distribution as that in Ohio.
\(H_0:\) Distribution of counties amoung partisan groups are from a uniform distribution.
\(H_1:\) Distribution of counties amoung partisan groups are not from a uniform distribution.
> observed <- table(df$affiliation_2008); observed
##
## lean_democratic lean_republican strong_democratic strong_republican
## 624 639 615 638
## swing
## 624
> chisquare2 <- function(observed){
+ expected <- rep(sum(observed)/length(observed),length(observed))
+ sum((observed-expected)^2/expected)
+ }
> test_statistic <- chisquare2(observed); test_statistic
## [1] 0.6719745
> N <- 10^5-1
> result <- numeric(N)
> for(i in 1:N){
+ data = sample(levels, number, replace=TRUE)
+ result[i] = chisquare2(table(data))
+ }
> pval <- (sum(result >= test_statistic)+1)/(N+1); pval
## [1] 0.95509
> plot6 <- ggplot(data=NULL, aes(x=result))
> plot6 <- plot6 + geom_histogram(binwidt=0.5, colour="white")
> plot6 <- plot6 + aes(y=..density..)
> plot6 <- plot6 + ylab("density")
> plot6 <- plot6 + geom_vline(x=test_statistic, colour="red")
> plot6 <- plot6 + geom_density(adjust=2, colour="blue"); plot6
Since the p-value (0.95334) is sufficiently more than the significance level (0.05), we state that there is enough evidence supporting the null hypothesis that the distribution of counties amoung partisan groups in the USA are from a uniform distribution.