library(ggplot2)
library(Rcpp)
library(GGally)
library(scales)
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'memisc'
## The following object is masked from 'package:scales':
##
## percent
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
library(lattice)
library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:memisc':
##
## collect, recode, rename
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(tidyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(knitr)
library(tidyr)
library(gridExtra)
library(lattice)
library(splitstackshape)
library(Rmisc)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:memisc':
##
## rename
library(ggthemes)
library(compare)
##
## Attaching package: 'compare'
## The following object is masked from 'package:base':
##
## isTRUE
library(scatterplot3d)
library(plot3D)
library(rgl)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:memisc':
##
## recode
library(installr)
## Loading required package: stringr
##
## Welcome to installr version 0.19.0
##
## More information is available on the installr project website:
## https://github.com/talgalili/installr/
##
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/installr/issues
##
## To suppress this message use:
## suppressPackageStartupMessages(library(installr))
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
##
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:car':
##
## ellipse
Lets keep exponenets out of our graphs.
options("scipen"=1000000)
Political writig usually discusses either the grand sums colected or statewide poling results. Here instead we will look at indications of enthusiasm at the zip code level. This paper will address the same questions to three different levels: all politcial parties lumped in together , Republicans v Democrats, and the candidates against each other regardless of party. We will look at the number of donors per zip, median donation per zip, highest participation level per zip, people who gave mu;ltiple times per zip.
Primarily we ill be working with the Federal Election Data for NY stat as provided by Udacity. We will suppliment this dataset with dataset (ZipPop2) which provides the population per zip code as per 2010 and lastly with a dataset that provides each candidates political party and that party’s color.
The Links to the two data sets that I provided. https://drive.google.com/open?id=0B-ygjCols7cANTR3WGFjbUM4UU0
https://drive.google.com/open?id=0B-ygjCols7cAaUZ2ME1RN0tZM1k
The NYS election data is very large so we will be working with a sample of it.
NY2016<-read.csv(“NY2016.csv”)
NYSample<-NY2016[sample(nrow(NY_1), 100000),]
NYSample<-read.csv("NYSample.csv")
ZipPop2<- read.csv("Zip_Pop2.csv")
ZipPop2<- read.csv("Zip_Pop2.csv")
CandidateParty<-read.csv("CandidateParty.csv")
Let us drop unneceassary columns
NY_1<- select(NYSample,cmte_id,contbr_nm,contbr_city,Zip,
contbr_employer,contbr_occupation,contb_receipt_amt,
contb_receipt_dt,cand_nm)
The Zip codes of NY2016 are 9 digits long so we will shorten them to 5 digits.
NY_1$Zip<- substr(NY_1$Zip , 0,5)
Some of the zips are only 3 numbers but these three numbers tell us it is a NYS zip.
Lets shorten the candidate’s name to just their last name.
NY_1<-cSplit(NY_1 , "cand_nm", sep=",")
Lets drop extra the now unneeded cand_nm_2
NY_1$cand_nm_2<-NULL
Lets rename cand_nm to Last_Name.
setnames(NY_1,"cand_nm_1", "Last_Name")
Now we will join our 3 datasets together. Joining Zip_Pop2 adds population per zip to our dataset. But first lets make sure all of the data types are the matching.
ZipPop2$Zip<-as.factor(ZipPop2$Zip)
NY_1<- NY_1%>% left_join(ZipPop2, by= c("Zip"="Zip"))
## Warning: Column `Zip` joining character vector and factor, coercing into
## character vector
Now lets join Candidate Party this will bring in the candidates respective political parties and colors.
NY_1<- NY_1%>% left_join(CandidateParty, by= c("Last_Name"="Last_Name"))
## Warning: Column `Last_Name` joining factors with different levels, coercing
## to character vector
Convert our dataframe to a tbl_df.
dplyr::tbl_df(NY_1)
## # A tibble: 100,000 x 12
## cmte_id contbr_nm contbr_city Zip
## <fctr> <fctr> <fctr> <chr>
## 1 C00575795 SHIMANSKY, REBA NEW YORK 10023
## 2 C00575795 WEBB, LOYANA RIDGEWOOD 11385
## 3 C00580100 FELDMAN, ROB COMMACK 11725
## 4 C00577130 KING, MICHAEL RIDGEWOOD 11385
## 5 C00577130 DUNLOP, DAVID S. NEWBURGH 12550
## 6 C00575795 PRIORE, PATRICK NEW YORK 10011
## 7 C00458844 GANZ, ZEV HEWLETT 11557
## 8 C00575795 SMAYLOVSKY, BELLA BROOKLYN 11229
## 9 C00575795 COOPER, ANDREW BROOKLYN 11226
## 10 C00577130 ALLISON, MATT BROOKLYN 11206
## # ... with 99,990 more rows, and 8 more variables: contbr_employer <fctr>,
## # contbr_occupation <fctr>, contb_receipt_amt <dbl>,
## # contb_receipt_dt <fctr>, Last_Name <chr>, Population <int>,
## # Party <fctr>, Party_Color <fctr>
str(NY_1)
## 'data.frame': 100000 obs. of 12 variables:
## $ cmte_id : Factor w/ 25 levels "C00458844","C00500587",..: 6 6 15 7 7 6 1 6 6 7 ...
## $ contbr_nm : Factor w/ 47556 levels " CALVEY, DONNA MORRIS",..: 39537 45251 12870 22387 11418 34406 14763 40339 8413 743 ...
## $ contbr_city : Factor w/ 1596 levels "11590 SALLELES D'AUDE",..: 971 1186 304 1186 976 971 619 174 174 174 ...
## $ Zip : chr "10023" "11385" "11725" "11385" ...
## $ contbr_employer : Factor w/ 17350 levels "","'SELF'","'TRANSPARENT'",..: 10102 13267 7217 5552 10158 904 7218 10824 13650 12002 ...
## $ contbr_occupation: Factor w/ 8689 levels "","''RETIRED''",..: 6685 4765 3917 720 852 3428 5638 63 734 2185 ...
## $ contb_receipt_amt: num 10 50 85.7 10 200 25 -2700 25 19 5 ...
## $ contb_receipt_dt : Factor w/ 652 levels "1-Apr-15","1-Apr-16",..: 396 63 648 414 285 652 502 306 26 456 ...
## $ Last_Name : chr "Clinton" "Clinton" "Trump" "Sanders" ...
## $ Population : int 60998 98592 29150 98592 54447 50984 7823 80018 101572 81677 ...
## $ Party : Factor w/ 5 levels "Conservative",..: 2 2 5 2 2 2 5 2 2 2 ...
## $ Party_Color : Factor w/ 5 levels "Blue","Green",..: 1 1 4 1 1 1 4 1 1 1 ...
Contrb_name came in as a factor. Needless, but I suppose but will change to character. Zip came back as chr and we specifically do not want that. Will change it to factor. Employer too needlessly came back as factor. Guess I will change that. Occupation cmae back as factor , that is good. Date came back as a factor. . Last_Name is chr. Specifically wanted this as a factor. Party and Party_Color came in as desired. Well that is something, I suppose.
NY_1$Last_Name <- as.factor(NY_1$Last_Name)
NY_1$Zip <- as.factor(NY_1$Zip)
NY_1$contbr_nm <- as.character(NY_1$contbr_nm)
NY_1$contbr_employer<-as.character(NY_1$contbr_employer)
NY_1$contbr_city<-as.character(NY_1$contbr_city)
Ok, now let us create Enthusiasm as a variable. Enthusiasm is the percentage of a population that donated to a candidate.
The first step is to get the number of donors per zip.
Frequency<-data.frame(table(NY_1$Zip))
Donors.Per.Zip<- data.frame(table(NY_1$Zip))
There are some blank observations which is odd since it is only a record of donors and all donors have to give a complete address. In any event lets drop any rows that contain a blank.
Let us revise Var1 to Zip as a column heading.
setnames(Donors.Per.Zip,"Var1","Zip")
Donors.Per.Zip<-Donors.Per.Zip%>%
filter(Zip !="")
Join Donors.Per.Zip to NY_1
NY_1<- NY_1%>% left_join(Donors.Per.Zip, by= c("Zip"="Zip"))
Lets create a variable called Enthusiasm which is the percentage of each zip that donated.
str(Donors.Per.Zip)
## 'data.frame': 1631 obs. of 2 variables:
## $ Zip : Factor w/ 1632 levels "","`1136","0",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Freq: int 1 2 4 1 883 628 1495 132 162 98 ...
NY_1<- transform(NY_1, Enthusiasm=(Donors.Per.Zip$Freq*100) / Population)
## Warning in (Donors.Per.Zip$Freq * 100)/Population: longer object length is
## not a multiple of shorter object length
Weirdly , some Enthusiasm levels are over 100%. Cannot imagine how.
NY_1<-filter(NY_1 , Enthusiasm<=100)
Next we will create varible of how many times each donore gave.
Donor.Times<- as.data.frame(count(NY_1$contbr_nm))
Revise the names:
setnames(Donor.Times,"x","contbr_nm")
setnames(Donor.Times,"freq","Times.Given")
Now let us join our new our new variable Multi_Donor to our dataset.
NY_1<- NY_1%>% left_join(Donor.Times,by = c("contbr_nm"="contbr_nm"))
## Warning: Column `contbr_nm` joining character vector and factor, coercing
## into character vector
Some donations are negative ,apparently they recieved money from the candidate rather than giving it. I will be sure to look into this for the next cycle. In any event , for our purposes let us limit the donations under consideration to those of $1 or more. In our sample the min donation is -$9300.
min(NY_1$contb_receipt_amt)
## [1] -9300
NY_1<- dplyr::filter(NY_1,contb_receipt_amt >=1)
Most of our data wrangling is now done. We sill assign our work a new name to “firewall”" so we do not inadvertantly alter it graphing it.
NY1 <- NY_1
How many donations per zip were made?
Donors.By.Zip<-ggplot(data = NY1, aes(x= Zip ))+
geom_bar(stat = "count")+
ggtitle("Donors Per Zip")+
scale_x_discrete(name="Zip") +
scale_y_continuous(name="Number of Donors")+
theme_wsj()
Somewhat skewed. We will put the count on log scale so we can see the data better.
Donors.By.Zip.Logged<-ggplot(data = NY1, aes(x= Zip ))+
geom_bar(stat = "count")+
scale_y_log10()+
ggtitle("Donors Per Zip(Logged)")+
theme_wsj()
Donors.By.Zip.Logged
What is the median donation per zip?
Med.Don.By.Zip<-
NY1%>%
group_by(Zip) %>%
dplyr::summarise( Median=median(contb_receipt_amt))%>%
arrange(Median)
Median.Donation.Per.Zip<-
ggplot(data =Med.Don.By.Zip , aes(x= Zip,y=Median ))+
geom_bar(stat="identity")+
geom_hline(yintercept = 27,color="red", size = 1 )+
theme_wsj()+
ggtitle("Med Don. Per Zip ")
Median.Donation.Per.Zip
Hmmm. Lets try logging that.
Median.Donation.Per.Zip.Log<-
ggplot(data =Med.Don.By.Zip , aes(x= Zip,y=Median ))+
geom_bar(stat="identity")+
geom_hline(yintercept = 27,color="red", size = 1 )+
scale_y_log10()+
theme_wsj()+
ggtitle("Med Don. Per Zip Logged")
Median.Donation.Per.Zip.Log
So the pattern reflects the population distribution of New York state. One of the most populous urban ares inthe US set inside an otherwise rural area. Hence the very skewed data. We are trying to get past this skesed characteristic in this project.
grid.arrange(Donors.By.Zip,Median.Donation.Per.Zip, ncol=2)
Which Zip codes were the most enthusiatic, i.e., which had the highest percentage of donors?
This is our Enthusiasm variable. We just need to drop duplicate zips and rank order the results on the Enthusiasm column.
Unique.Zips<-distinct(NY1,Zip, .keep_all=TRUE)
Rank.Enthusiasm<- Unique.Zips%>% arrange(desc(Enthusiasm))
Lets get the top ten most enthused zips
Top.Ten.Enthused.Zips<- Rank.Enthusiasm[1:10,]
ok , let us finally graph these most enthused zips.
Top.Ten.Enthused.Zips.Plot<- ggplot(data = Top.Ten.Enthused.Zips, aes(x=Zip, y=Enthusiasm ))+
geom_bar(stat = "identity")+
ggtitle("Zips with Highest Participation Rate")+
theme(panel.background = element_rect( colour = 'red'))
Top.Ten.Enthused.Zips.Plot
Our Second Question for the combined party phase is which occupations show the most enthuseiasm. In other words what are the most popular jobs.
Most.Enthused.Jobs<-as.data.frame(table(NY1$contbr_occupation))
setnames(Most.Enthused.Jobs,"Var1","Job")
setnames(Most.Enthused.Jobs,"Freq","Num.w.Job")
There is blank cell for Job with a count of 32. Lets change give that cell a name.
Most.Enthused.Jobs[1,1]="NA"
## Warning in `[<-.factor`(`*tmp*`, iseq, value = "NA"): invalid factor level,
## NA generated
View(Most.Enthused.Jobs)
Now lets arrange the jobs in descending order.
Rank.Jobs<-Most.Enthused.Jobs%>%arrange(desc(Num.w.Job))
View(Rank.Jobs)
Lets only look at the ten most enthused jobs. Top.Ten.Enthused.Zips<- Rank.Enthusiasm[1:10,]
Ten.Most.Enthused.Jobs<-Rank.Jobs[1:10,]
View(Ten.Most.Enthused.Jobs)
Rank.Jobs.Plot<- ggplot(data = Ten.Most.Enthused.Jobs, aes(x=Job, y=Num.w.Job ))+
geom_bar(stat = "identity")+
ggtitle("Jobs That Donated The Most Frequently")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" , y= "Number of Donations")
Rank.Jobs.Plot
Who were the people that donated the most number of times?
Most.Freq.Donors<- as.data.frame(count(NY1,"contbr_nm"))
Revise “frew” to “Donation.Times”
setnames(Most.Freq.Donors, "freq", "Donation.Times")
Next we will arrange them in descending order.
Most.Freq.Donors<-Most.Freq.Donors%>% arrange(desc(Donation.Times))
Most.Freq.Donors.Plot<-
ggplot(data=Most.Freq.Donors,aes(x=contbr_nm , y=Donation.Times))+
geom_bar(stat = "identity")
Most.Freq.Donors.Plot
Not what I was expecting. What is the range of the donors?
summary(Most.Freq.Donors)
## contbr_nm Donation.Times
## Length:46623 Min. : 1.000
## Class :character 1st Qu.: 1.000
## Mode :character Median : 1.000
## Mean : 2.094
## 3rd Qu.: 2.000
## Max. :199.000
View(Most.Freq.Donors)
Instead lets focus on the top 25 donors donors who gave not the most money but the most number of times.
TwentyFive.Most.Enthused.Donors<-Most.Freq.Donors[1:25,]
TwentyFive.Most.Enthused.Donors.Plot<-
ggplot(data=TwentyFive.Most.Enthused.Donors,aes(x=contbr_nm , y=Donation.Times))+
geom_bar(stat = "identity")+
coord_flip()
TwentyFive.Most.Enthused.Donors.Plot
Now lets see how similar the Republican and Democrat response to these same inqueries is. Our first question again is what zips each gave the most times. This is an absilute nuber and is not eh same as Enthusiasm s=which is a percentage of the zip population.
target <- c("Democrat","Republican" )
Rep.Dem.Only<-filter(NY1, Party %in% target)
count table zips color by party
Donations.By.Zip.Party<-ggplot(data = Rep.Dem.Only, aes(x= Zip ,fill=Party))+
geom_bar(stat = "count")+
ggtitle("Number of Donors By Zip")+
scale_x_discrete(name="Zip") +
scale_y_continuous(name="Number of Donors")+
theme_wsj()+
theme(panel.background = element_rect(fill = "black"))
Donations.By.Zip.Party
What would this look like logged?
Donations.By.Zip.Party.Log<-ggplot(data = Rep.Dem.Only, aes(x= Zip ,fill=Party))+
geom_bar(stat = "count")+
ggtitle("Number of Donors By Zip Logged")+
scale_x_discrete(name="Zip") +
scale_y_continuous(name="Number of Donors")+
theme_wsj()+
scale_y_log10()+
theme(panel.background = element_rect(fill = "black"))
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
Donations.By.Zip.Party.Log
Median Donation By Party Per Zip.
Republican<-filter(Rep.Dem.Only , Party=="Republican")
View(Republican)
Republican.Median.Donation.Per.Zip<-
Republican%>%
group_by(Zip)%>%
dplyr::summarise(Median=median(contb_receipt_amt))%>%
arrange(Median)
summary(Republican.Median.Donation.Per.Zip)
## Zip Median
## 10001 : 1 Min. : 2.0
## 10002 : 1 1st Qu.: 35.0
## 10003 : 1 Median : 50.0
## 10004 : 1 Mean : 114.7
## 10005 : 1 3rd Qu.: 100.0
## 10006 : 1 Max. :2700.0
## (Other):1132
Republican.Median.Donation.Per.Zip.Plot<-
ggplot(data =Republican.Median.Donation.Per.Zip , aes(x= Zip,y=Median))+
geom_bar(stat="identity", color="red")+
scale_y_log10()+
theme_wsj()+
geom_hline(yintercept = 50,color="black", size = 2 )+
ggtitle("Republican Median Donation Zip")+
annotate("text",x=500, y= 800,label="Median = 50")
Republican.Median.Donation.Per.Zip.Plot
Now regarding the Democrats:
Democrat<-filter(Rep.Dem.Only , Party=="Democrat")
Democratic.Median.Donation.Per.Zip<-
Democrat%>%
group_by(Zip)%>%
dplyr::summarise(Median=median(contb_receipt_amt))%>%
arrange(Median)
summary(Democratic.Median.Donation.Per.Zip)
## Zip Median
## 10001 : 1 Min. : 2.50
## 10002 : 1 1st Qu.: 25.00
## 10003 : 1 Median : 25.00
## 10004 : 1 Mean : 42.73
## 10005 : 1 3rd Qu.: 40.00
## 10006 : 1 Max. :2700.00
## (Other):1370
Democrat.Median.Donation.Per.Zip.Plot<-
ggplot(data =Democratic.Median.Donation.Per.Zip , aes(x= Zip,y=Median))+
geom_bar(stat="identity", color="blue")+
scale_y_log10()+
geom_hline(yintercept = 25,color="black", size = 2 )+
annotate("text",x=700, y= 800,label="Median = 25")+
theme_wsj()+
ggtitle("Democrat Median Donation Zip")
Democrat.Median.Donation.Per.Zip.Plot
And side by side?
grid.arrange(Republican.Median.Donation.Per.Zip.Plot,Democrat.Median.Donation.Per.Zip.Plot,ncol=2)
Which zip codes were the most enthusiastic but now we will have seperate answers for the Republicans and the Democrats.
Republican.Unique.Zips<-
filter(Unique.Zips , Party=="Republican")
Republican.Enthused.Zips<-Republican.Unique.Zips%>%
arrange(desc(Enthusiasm))
summary(Republican.Enthused.Zips)
## cmte_id contbr_nm contbr_city Zip
## C00580100:187 Length:336 Length:336 10165 : 1
## C00574624: 74 Class :character Class :character 10302 : 1
## C00573519: 49 Mode :character Mode :character 10314 : 1
## C00458844: 16 10454 : 1
## C00579458: 6 10470 : 1
## C00577981: 2 10509 : 1
## (Other) : 2 (Other):330
## contbr_employer contbr_occupation
## Length:336 RETIRED :141
## Class :character INFORMATION REQUESTED : 34
## Mode :character INFORMATION REQUESTED PER BEST EFFORTS: 11
## SALES : 5
## SELF-EMPLOYED : 5
## PHYSICIAN : 4
## (Other) :136
## contb_receipt_amt contb_receipt_dt Last_Name Population
## Min. : 2.00 12-Jul-16: 14 Trump :187 Min. : 2
## 1st Qu.: 25.75 11-Jul-16: 11 Cruz : 74 1st Qu.: 1592
## Median : 50.00 8-Aug-16 : 9 Carson : 49 Median : 3698
## Mean : 159.18 1-Jul-16 : 8 Rubio : 16 Mean : 9137
## 3rd Qu.: 100.00 9-Aug-16 : 8 Bush : 6 3rd Qu.:11456
## Max. :2700.00 19-Jul-16: 7 Huckabee: 2 Max. :99598
## (Other) :279 (Other) : 2
## Party Party_Color Freq Enthusiasm
## Conservative: 0 Blue : 0 Min. : 1.00 Min. : 0.00244
## Democrat : 0 Green : 0 1st Qu.: 3.00 1st Qu.: 0.08200
## Green : 0 Orange: 0 Median : 9.00 Median : 0.41503
## Libeterian : 0 Red :336 Mean : 24.37 Mean : 3.41790
## Republican :336 Yellow: 0 3rd Qu.: 30.00 3rd Qu.: 2.08818
## Max. :210.00 Max. :81.51724
##
## Times.Given
## Min. : 1.00
## 1st Qu.: 1.00
## Median : 1.00
## Mean : 1.78
## 3rd Qu.: 2.00
## Max. :36.00
##
TwentyFive.Rep.Enthused.Zips<-Republican.Enthused.Zips[1:25,]
TwentyFive.Rep.Enthused.Zips.Plot<-
ggplot(data=TwentyFive.Rep.Enthused.Zips,aes(x=Zip , y=Enthusiasm))+
geom_bar(stat = "identity", fill ="red")+
coord_flip()+
annotate("text",x=25, y= 25,label="Zips are in numerical order")+
geom_hline(yintercept = 38,color="black", size = 2 )+
ggtitle("Republicans Highest Participation Rate")
TwentyFive.Rep.Enthused.Zips.Plot
Democratic.Unique.Zips<-
filter(Unique.Zips , Party=="Democrat")
Democratic.Enthused.Zips<-Democratic.Unique.Zips%>%
arrange(desc(Enthusiasm))
summary(Democratic.Enthused.Zips)
## cmte_id contbr_nm contbr_city Zip
## C00575795:611 Length:1124 Length:1124 10001 : 1
## C00577130:513 Class :character Class :character 10002 : 1
## C00458844: 0 Mode :character Mode :character 10003 : 1
## C00500587: 0 10004 : 1
## C00573519: 0 10005 : 1
## C00574624: 0 10006 : 1
## (Other) : 0 (Other):1118
## contbr_employer contbr_occupation contb_receipt_amt
## Length:1124 RETIRED :183 Min. : 1.00
## Class :character NOT EMPLOYED :168 1st Qu.: 15.00
## Mode :character TEACHER : 36 Median : 27.00
## PROFESSOR : 24 Mean : 74.01
## INFORMATION REQUESTED: 19 3rd Qu.: 50.00
## ATTORNEY : 17 Max. :2700.00
## (Other) :677
## contb_receipt_dt Last_Name Population Party
## 29-Feb-16: 17 Clinton :611 Min. : 2 Conservative: 0
## 30-Apr-16: 14 Sanders :513 1st Qu.: 1670 Democrat :1124
## 31-May-16: 14 Bush : 0 Median : 5460 Green : 0
## 6-Nov-16 : 13 Carson : 0 Mean : 14403 Libeterian : 0
## 25-Oct-16: 12 Christie: 0 3rd Qu.: 19161 Republican : 0
## 30-Mar-16: 12 Cruz : 0 Max. :109931
## (Other) :1042 (Other) : 0
## Party_Color Freq Enthusiasm Times.Given
## Blue :1124 Min. : 1.00 Min. : 0.00107 Min. : 1.000
## Green : 0 1st Qu.: 6.00 1st Qu.: 0.06357 1st Qu.: 1.000
## Orange: 0 Median : 22.00 Median : 0.31572 Median : 3.000
## Red : 0 Mean : 80.47 Mean : 2.68706 Mean : 4.018
## Yellow: 0 3rd Qu.: 68.00 3rd Qu.: 1.52381 3rd Qu.: 5.000
## Max. :2582.00 Max. :100.00000 Max. :46.000
##
TwentyFive.Dem.Enthused.Zips<-Democratic.Enthused.Zips[1:25,]
TwentyFive.Dem.Enthused.Zips.Plot<-
ggplot(data=TwentyFive.Dem.Enthused.Zips,aes(x=Zip , y=Enthusiasm))+
geom_bar(stat = "identity", fill ="blue")+
coord_flip()+
annotate("text",x=25, y= 75,label="Zips are in numerical order")+
geom_hline(yintercept = 31,color="black", size = 2 )+
ggtitle("Democrats Highest Participation Rate")
TwentyFive.Dem.Enthused.Zips.Plot
Do the Republicans and Democrats have any Enthused zips in common? No.
intersect(TwentyFive.Rep.Enthused.Zips$Zip ,TwentyFive.Dem.Enthused.Zips$Zip)
## character(0)
multiplot(TwentyFive.Rep.Enthused.Zips.Plot ,TwentyFive.Dem.Enthused.Zips.Plot ,cols = 1 )
Do Republicans and Democrats appeal to different kinds of occupations?
Rep.Most.Enthused.Jobs<-as.data.frame(table(Republican$contbr_occupation))
View(Rep.Most.Enthused.Jobs)
setnames(Rep.Most.Enthused.Jobs,"Var1","contbr_occupation")
setnames(Rep.Most.Enthused.Jobs,"Freq","Numb.With.Job")
Rep.Rank.Jobs<-Rep.Most.Enthused.Jobs%>%arrange(desc(Numb.With.Job))
Lets only look at the ten most enthused jobs. Top.Ten.Enthused.Zips<- Rank.Enthusiasm[1:10,]
Rep.Ten.Most.Enthused.Jobs<-Rep.Rank.Jobs[1:10,]
View(Rep.Ten.Most.Enthused.Jobs)
Rep.Ten.Most.Enthused.Jobs.Plot<- ggplot(data = Rep.Ten.Most.Enthused.Jobs, aes(x=contbr_occupation, y=Numb.With.Job ))+
geom_bar(stat = "identity", fill="red")+
ggtitle("Jobs That Donated To The Republicans The Most Frequently")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" , y= "Number of Donations")
Rep.Ten.Most.Enthused.Jobs.Plot
Dem.Most.Enthused.Jobs<-as.data.frame(table(Democrat$contbr_occupation))
View(Dem.Most.Enthused.Jobs)
setnames(Dem.Most.Enthused.Jobs,"Var1","contbr_occupation")
setnames(Dem.Most.Enthused.Jobs,"Freq","Numb.With.Job")
Dem.Rank.Jobs<-Dem.Most.Enthused.Jobs%>%arrange(desc(Numb.With.Job))
Lets only look at the ten most enthused jobs. Top.Ten.Enthused.Zips<- Rank.Enthusiasm[1:10,]
Dem.Ten.Most.Enthused.Jobs<-Dem.Rank.Jobs[1:10,]
View(Dem.Ten.Most.Enthused.Jobs)
Dem.Ten.Most.Enthused.Jobs.Plot<- ggplot(data = Dem.Ten.Most.Enthused.Jobs, aes(x=contbr_occupation, y=Numb.With.Job ))+
geom_bar(stat = "identity", fill="blue")+
ggtitle("Jobs That Donated To The Republicans The Most Frequently")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" , y= "Number of Donations")
Dem.Ten.Most.Enthused.Jobs.Plot
Do the Republicans and Democrats share any occupations as their most freqeunt donors? Yes, Attorneys, Physiacians, Retired people, were both in each party’s 10 most frequent donors. More intersting though are what they lack in common. The republicans had Engineers, Homemakers, Sales, and Self Employed and the the Democrats did not. Converselym the Democrats had Not Employed, Professor, Teacher, and Writer, in their top 10 and the Republicans did not. I suspect these self-labels are , or should be, targeted cohorts for the parties.
Which party had the most multi donors?
View(Republican)
Rep.Multi.Donors<- as.data.frame(table(Republican$contbr_nm))
setnames(Rep.Multi.Donors,"Var1","contbr_nm")
Rep.Rank.Multi.Donors<-Rep.Multi.Donors%>%arrange(desc(Freq))
Rep.Rank.Multi.Donors<-Rep.Rank.Multi.Donors[1:10,]
Rep.Rank.Multi.Donors.Plot<- ggplot(data = Rep.Rank.Multi.Donors, aes(x=contbr_nm, y=Freq ))+
geom_bar(stat = "identity", fill="red")+
ggtitle("Indivuduals Who Donated to Rep Party the Most Times")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" , y= "Number of Donations")
Rep.Rank.Multi.Donors.Plot
View(Democrat)
Dem.Multi.Donors<- as.data.frame(table(Democrat$contbr_nm))
View(Dem.Multi.Donors)
setnames(Dem.Multi.Donors,"Var1","contbr_nm")
Dem.Rank.Multi.Donors<-Dem.Multi.Donors%>%arrange(desc(Freq))
Dem.Rank.Multi.Donors<-Dem.Rank.Multi.Donors[1:10,]
Dem.Rank.Multi.Donors.Plot<- ggplot(data = Dem.Rank.Multi.Donors, aes(x=contbr_nm, y=Freq ))+
geom_bar(stat = "identity", fill="blue")+
ggtitle("Individuals Who Donated to Dem Party the Most Times")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" , y= "Number of Donations")
Dem.Rank.Multi.Donors.Plot
Did the Republicans and Democrats have any contributors in common in their top ten list? No.
intersect(Rep.Rank.Multi.Donors$contbr_nm,Dem.Rank.Multi.Donors$contbr_nm)
## character(0)
Finally, let us put our four questions to the candidates themselves. If we focused on all 25 we would loose the forest for the trees but 2016 had the great fortune to have 3 viable candidates with seemingly very different bases of support. A mainstream democrat who leaned right, a populist republican something that has not been a main attraction since at least 1970, and a populist democrat who almost won the party nomination. Having these 3 viable candidates gioves us the rare opportunity to refract each in turn to see the others in different light. In other words instead of everyone lining up behind their usual more money forschools Democrats or increase the military budget republican this cycle we chose from a more moeny for schools demiocrat v. a free college and healthcasre for all democrat both of v a anti-trade, anti-elite, anti bring on the future but rather turn back the clock republican.
How do the candidates donations compare?
TCS<- NY1%>%filter(Last_Name==c("Trump","Clinton","Sanders"))
ggplot(TCS,aes(x=Last_Name,fill = Last_Name))+
geom_bar(stat="count")
Clinton certainly recieved more donations but lets look into this a little closer.
ggplot(TCS,aes(x=Last_Name,y=contb_receipt_amt, fill=Last_Name))+
geom_boxplot()+
scale_y_log10()
This is surprising. Trump has the highest median donation
Were some occupations more likely to give to one candidate thather than another?
Trump<-filter(NY1,Last_Name=="Trump")
Trump.Jobs<-as.data.frame(table(Trump$contbr_occupation))
View(Trump.Jobs)
setnames(Trump.Jobs,"Var1","Job")
Trump.Rank.Jobs<-Trump.Jobs%>%arrange(desc(Freq))
Trump.Rank.Jobs<-Trump.Rank.Jobs[1:5,]
View(Trump.Rank.Jobs)
Trump.Rank.Jobs.Plot<- ggplot(data = Trump.Rank.Jobs, aes(x=Job, y=Freq ))+
geom_bar(stat = "identity", fill="red")+
ggtitle("Occupations Who Most Supported Trump")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" )
Trump.Rank.Jobs.Plot
Peculiarly.Trump<- anti_join(Trump.Rank.Jobs,TCS,by=c("Job"="contbr_occupation"))
Peculiarly.Trump
## [1] Job Freq
## <0 rows> (or 0-length row.names)
Sanders<-filter(NY1,Last_Name=="Sanders")
Sanders.Jobs<-as.data.frame(table(Sanders$contbr_occupation))
setnames(Sanders.Jobs,"Var1","Job")
Sanders.Rank.Jobs<-Sanders.Jobs%>%arrange(desc(Freq))
Sanders.Rank.Jobs<-Sanders.Rank.Jobs[1:5,]
Sanders.Rank.Jobs.Plot<- ggplot(data = Sanders.Rank.Jobs, aes(x=Job, y=Freq ))+
geom_bar(stat = "identity", fill="blue")+
ggtitle("Occupations Who Most Supported Sanders")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" )
Sanders.Rank.Jobs.Plot
Clinton<-filter(NY1,Last_Name=="Clinton")
Clinton.Jobs<-as.data.frame(table(Clinton$contbr_occupation))
setnames(Clinton.Jobs,"Var1","Job")
Clinton.Rank.Jobs<-Clinton.Jobs%>%arrange(desc(Freq))
Clinton.Rank.Jobs<-Clinton.Rank.Jobs[1:5,]
Clinton.Rank.Jobs.Plot<- ggplot(data = Clinton.Rank.Jobs, aes(x=Job, y=Freq ))+
geom_bar(stat = "identity", fill="deepskyblue1")+
ggtitle("Occupations Who Most Supported Clinton")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x="Contribitor's Occupation" )
Clinton.Rank.Jobs.Plot
grid.arrange(Trump.Rank.Jobs.Plot,Clinton.Rank.Jobs.Plot,Sanders.Rank.Jobs.Plot,ncol=3)
VENN DIAGRAMS
Trump5<- c("Physician","Retired","Sales","SelfEmployed")
Sanders5<-c("Attorney","Not_Employed","Professor","Retired","Teacher")
Trump and Sanders
Trump.Sanders<-semi_join(Trump.Rank.Jobs,Sanders.Rank.Jobs,by=c("Job"="Job"))
Trump.Sanders
## Job Freq
## 1 RETIRED 1815
grid.newpage()
TS<-draw.pairwise.venn(area1 = 5,area2 = 5,cross.area =1,
category = c("Trump5","Sanders5"),fill=c("red","mediumblue"))
require(gridExtra)
grid.arrange(gTree(children=TS), top="Trump & Sanders Common Donor Occupation", bottom="Retired")
Sanders.Clinton<-semi_join(Clinton.Rank.Jobs,Sanders.Rank.Jobs,by=c("Job"="Job"))
Sanders.Clinton
## Job Freq
## 1 RETIRED 10457
## 2 ATTORNEY 3128
## 3 TEACHER 1403
grid.newpage()
SC<-draw.pairwise.venn(area1 = 5,area2 = 5,cross.area =3,
category = c("Clinton5","Sanders5"),fill = c("steelblue1","mediumblue"))
require(gridExtra)
grid.arrange(gTree(children=SC), top="Clinton & Sanders Common Donor Occupations", bottom="Retired, Attorney, Teacher")
Trump.Clinton<-semi_join(Clinton.Rank.Jobs,Trump.Rank.Jobs,by=c("Job"="Job"))
Trump.Clinton
## Job Freq
## 1 RETIRED 10457
## 2 INFORMATION REQUESTED 1370
grid.newpage()
TC<-draw.pairwise.venn(area1 = 5,area2 = 5,cross.area = 2,
category = c("Clinton5","Trump"),fill=c("red","steelblue1"))
require(gridExtra)
grid.arrange(gTree(children=TC), top="Trump & Clintons Common Donor Occupation", bottom="Retired, Information Requested")
Were there any jobs that were peculair to each candidate? Peculiar to Clinton? No.
SandersAndTrump<-filter(TCS,Last_Name==c("Sanders","Trump"))
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(Last_Name, c("Sanders", "Trump")): longer object
## length is not a multiple of shorter object length
Peculiarly.Clinton<- anti_join(Clinton.Rank.Jobs,SandersAndTrump,by=c("Job"="contbr_occupation"))
Peculiarly.Clinton
## [1] Job Freq
## <0 rows> (or 0-length row.names)
Peculiar to Trump? No.
SandersAndClinton<-filter(TCS,Last_Name==c("Sanders","Clinton"))
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(Last_Name, c("Sanders", "Clinton")): longer object
## length is not a multiple of shorter object length
Peculiarly.Trump<- anti_join(Trump.Rank.Jobs,SandersAndTrump,by=c("Job"="contbr_occupation"))
Peculiarly.Trump
## [1] Job Freq
## <0 rows> (or 0-length row.names)
Peculiar to Sanders? No.
TrumpAndClinton<-filter(TCS,Last_Name==c("Trump","Clinton"))
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(Last_Name, c("Trump", "Clinton")): longer object
## length is not a multiple of shorter object length
Peculiarly.Sanders<- anti_join(Sanders.Rank.Jobs, TrumpAndClinton,by=c("Job"="contbr_occupation"))
Peculiarly.Sanders
## [1] Job Freq
## <0 rows> (or 0-length row.names)
NY1$Last_Name.char <- as.character(NY1$Last_Name)
TCS<- NY1%>%filter(Last_Name==c("Trump","Clinton","Sanders"))
TCS$Last_Name<-as.factor(TCS$Last_Name)
ggplot(TCS,aes(x=Last_Name , y=contb_receipt_amt,color=Last_Name))+
geom_point()+
ggtitle("Comparing Clinton Sanders and Trumps Donations")
ggplot(TCS,aes(x=contb_receipt_amt,fill=Last_Name))+
geom_histogram()+
scale_y_log10()+
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 4 rows containing missing values (geom_bar).
#scale_x_continuous(breaks = c(5,10,25,100,500,1000,2000))
ggplot(TCS,aes(x=contb_receipt_amt,color=Last_Name))+
geom_freqpoly(size=2)+
scale_y_log10()+
scale_x_log10()+
scale_x_continuous(breaks = c(5,10,25,100,500,1000,2000))+
ggtitle("The Count of the Candidates Donations")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
Now let us compare our three candidates levels of Enthusiasm.