# Question Statement
# This is data from pollsters on the 2008 US Presidential Election for the months of August and September.
# We would like to know how the polls moved for both candidates, and whether the trends were in line with the outcome of the election.
# Load the packages this script relies on: data.table supplies fread(),
# ggplot2 supplies the scatter plots further down.
library(data.table)
library(ggplot2)

# Read the Pollster08 CSV straight from GitHub. fread() returns a data.table,
# so it is converted to a plain data.frame for the base-R steps that follow.
poll_url <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/Stat2Data/Pollster08.csv"
mydata <- data.frame(fread(poll_url))
head(mydata)
## V1 PollTaker PollDates MidDate Days n Pop McCain Obama Margin
## 1 1 Rasmussen 8/28-30/08 8/29 1 3000 LV 46 49 3
## 2 2 Zogby 8/29-30/08 8/30 2 2020 LV 47 45 -2
## 3 3 Diageo/Hotline 8/29-31/08 8/30 2 805 RV 39 48 9
## 4 4 CBS 8/29-31/08 8/30 2 781 RV 40 48 8
## 5 5 CNN 8/29-31/08 8/30 2 927 RV 48 49 1
## 6 6 Rasmussen 8/30-9/1/08 8/31 3 3000 LV 45 51 6
## Charlie Meltdown
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# Column-by-column overview of the raw poll data: quartiles and mean for the
# numeric columns, length and class for the character columns.
summary(mydata)
## V1 PollTaker PollDates
## Length:102 Length:102 Length:102
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## MidDate Days n Pop
## Length:102 Min. : 1.0 Min. : 697 Length:102
## Class :character 1st Qu.:11.0 1st Qu.: 912 Class :character
## Mode :character Median :18.0 Median :1085 Mode :character
## Mean :18.3 Mean :1523
## 3rd Qu.:26.0 3rd Qu.:2310
## Max. :33.0 Max. :4752
## NA's :3
## McCain Obama Margin Charlie
## Min. :36.0 Min. :40.00 Min. :-10.000 Min. :0.0000
## 1st Qu.:43.0 1st Qu.:45.25 1st Qu.: 0.000 1st Qu.:0.0000
## Median :45.0 Median :47.00 Median : 2.000 Median :1.0000
## Mean :44.6 Mean :46.92 Mean : 2.324 Mean :0.6275
## 3rd Qu.:46.0 3rd Qu.:49.00 3rd Qu.: 5.000 3rd Qu.:1.0000
## Max. :54.0 Max. :52.00 Max. : 11.000 Max. :1.0000
##
## Meltdown
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4902
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Observation: On average, the polls were in line with the final outcome of the election.
# The mean and median poll margin between the two candidates is approximately two points.
# Data Wrangling.
# newdata keeps only the first ten columns of mydata, which leaves out the
# trailing Charlie and Meltdown 0/1 columns.
newdata <- mydata[, 1:10, drop = FALSE]
head(newdata)
## V1 PollTaker PollDates MidDate Days n Pop McCain Obama Margin
## 1 1 Rasmussen 8/28-30/08 8/29 1 3000 LV 46 49 3
## 2 2 Zogby 8/29-30/08 8/30 2 2020 LV 47 45 -2
## 3 3 Diageo/Hotline 8/29-31/08 8/30 2 805 RV 39 48 9
## 4 4 CBS 8/29-31/08 8/30 2 781 RV 40 48 8
## 5 5 CNN 8/29-31/08 8/30 2 927 RV 48 49 1
## 6 6 Rasmussen 8/30-9/1/08 8/31 3 3000 LV 45 51 6
# resultData spells out the population codes in the Pop column only:
# LV -> Likely Voters, RV -> Reg Voters.
# Earlier this ran gsub() over every column via sapply(), which coerced the
# numeric columns (Days, n, McCain, Obama, Margin) to text and could rewrite
# "lv"/"rv" inside unrelated values such as pollster names.
# NOTE: the summary() listings recorded further down were captured from that
# earlier version, so they show value counts rather than numeric summaries.
resultData <- newdata
resultData$Pop <- gsub(
  pattern = "LV", replacement = "Likely Voters",
  x = resultData$Pop, ignore.case = TRUE
)
resultData$Pop <- gsub(
  pattern = "RV", replacement = "Reg Voters",
  x = resultData$Pop, ignore.case = TRUE
)
head(resultData)
## V1 PollTaker PollDates MidDate Days n Pop McCain
## 1 1 Rasmussen 8/28-30/08 8/29 1 3000 Likely Voters 46
## 2 2 Zogby 8/29-30/08 8/30 2 2020 Likely Voters 47
## 3 3 Diageo/Hotline 8/29-31/08 8/30 2 805 Reg Voters 39
## 4 4 CBS 8/29-31/08 8/30 2 781 Reg Voters 40
## 5 5 CNN 8/29-31/08 8/30 2 927 Reg Voters 48
## 6 6 Rasmussen 8/30-9/1/08 8/31 3 3000 Likely Voters 45
## Obama Margin
## 1 49 3
## 2 45 -2
## 3 48 9
## 4 48 8
## 5 49 1
## 6 51 6
# subdata holds the polls in which Obama is either leading or tied with McCain
# (Margin of zero or more). The margin is read from mydata, whose rows line up
# one-to-one with resultData.
obama_ahead_or_tied <- which(mydata$Margin >= 0)
subdata <- resultData[obama_ahead_or_tied, , drop = FALSE]
summary(subdata)
## V1 PollTaker PollDates MidDate
## 1 : 1 Diageo/Hotline : 9 9/1-3/08 : 4 9/23 : 7
## 10 : 1 Rasmussen : 9 9/22-24/08: 4 9/28 : 6
## 100 : 1 DailyKos.com : 8 9/27-29/08: 4 9/29 : 6
## 101 : 1 Gallup : 6 9/28-30/08: 4 9/20 : 5
## 102 : 1 ARG : 4 8/29-31/08: 3 9/21 : 5
## 11 : 1 Economist/YouGov: 4 9/19-21/08: 3 9/2 : 4
## (Other):72 (Other) :38 (Other) :56 (Other):45
## Days n Pop McCain Obama
## 26 : 7 3000 : 9 A : 5 45 :14 47 :16
## 31 : 6 1100 : 8 Likely Voters:44 46 :14 48 :16
## 32 : 6 1000 : 6 Reg Voters :29 43 :12 49 :14
## 23 : 5 1200 : 4 42 :11 46 : 9
## 24 : 5 1007 : 2 44 :10 50 : 7
## 30 : 4 (Other):47 40 : 5 45 : 5
## (Other):45 NA's : 2 (Other):12 (Other):11
## Margin
## 4 :15
## 5 :13
## 0 :10
## 1 :10
## 2 : 8
## 6 : 6
## (Other):16
# Split the Obama-leading-or-tied polls by the surveyed population:
# registered voters (regVot) versus likely voters (likVot).
regVot <- subdata[which(subdata$Pop == "Reg Voters"), , drop = FALSE]
likVot <- subdata[which(subdata$Pop == "Likely Voters"), , drop = FALSE]
summary(regVot)
## V1 PollTaker PollDates MidDate
## 100 : 1 Diageo/Hotline :9 8/29-31/08: 3 8/30 : 3
## 11 : 1 Gallup :6 9/1-3/08 : 2 9/17 : 3
## 14 : 1 Ipsos/McClatchy:3 9/16-18/08: 2 9/20 : 3
## 16 : 1 CBS :2 9/19-21/08: 2 9/23 : 3
## 20 : 1 CNN :2 9/22-24/08: 2 9/2 : 2
## 23 : 1 NBC/WSJ :2 9/25-27/08: 2 9/26 : 2
## (Other):23 (Other) :5 (Other) :16 (Other):13
## Days n Pop McCain Obama
## 2 : 3 902 : 2 A : 0 42 :8 46 :6
## 20 : 3 915 : 2 Likely Voters: 0 44 :4 48 :6
## 23 : 3 1007 : 1 Reg Voters :29 46 :4 45 :5
## 26 : 3 1038 : 1 40 :3 47 :5
## 29 : 2 1046 : 1 45 :3 49 :3
## 32 : 2 1085 : 1 39 :2 40 :1
## (Other):13 (Other):21 (Other):5 (Other):3
## Margin
## 0 :6
## 4 :6
## 1 :5
## 5 :5
## 6 :2
## 8 :2
## (Other):3
# Summary of the likely-voter subset, for side-by-side comparison with the
# registered-voter summary.
summary(likVot)
## V1 PollTaker PollDates MidDate Days
## 1 : 1 Rasmussen : 9 9/20-22/08 : 3 9/21 : 5 24 : 5
## 101 : 1 DailyKos.com: 8 9/27-29/08 : 3 9/27 : 4 30 : 4
## 102 : 1 ARG : 4 8/30-9/1/08: 2 9/28 : 4 31 : 4
## 12 : 1 Democracy : 3 9/11-13/08 : 2 9/29 : 4 32 : 4
## 13 : 1 ABC/Post : 2 9/17-19/08 : 2 9/12 : 3 15 : 3
## 24 : 1 CBS/Times : 2 9/19-22/08 : 2 9/23 : 3 26 : 3
## (Other):38 (Other) :16 (Other) :30 (Other):21 (Other):21
## n Pop McCain Obama Margin
## 3000 : 9 A : 0 45 :10 49 :11 4 : 9
## 1100 : 8 Likely Voters:44 46 :10 48 :10 5 : 7
## 1200 : 4 Reg Voters : 0 43 : 9 47 : 9 2 : 6
## 1000 : 2 44 : 6 50 : 6 0 : 4
## 1007 : 1 41 : 2 51 : 5 1 : 4
## (Other):18 42 : 2 46 : 2 6 : 4
## NA's : 2 (Other): 5 (Other): 1 (Other):10
# Observation : On comparing the summary of Likely and Reg Voters, Obama has done better among Likely Voters compared with Reg Voters.
# Including Plots:
# Scatter plots of each candidate's poll share over time. Days counts up with
# MidDate (1 for 8/29, 2 for 8/30, ...), so it goes on the x-axis to show the
# polls in chronological order; PollDates is free text such as "8/30-9/1/08"
# and would only sort alphabetically. Bare column names are used inside aes()
# because ggplot2 discourages the `data$column` form there.
ggplot(mydata, aes(x = Days, y = Obama)) + geom_point()

ggplot(mydata, aes(x = Days, y = McCain)) + geom_point()

# McCain share against Obama share, one point per poll
# (read by the original author as a positive correlation).
ggplot(mydata, aes(x = McCain, y = Obama)) + geom_point()

# Box plots of each candidate's share per polling window. reorder() sorts the
# PollDates labels by their Days value so the boxes run chronologically rather
# than alphabetically; the formula uses bare column names because
# data = mydata already supplies them.
boxplot(Obama ~ reorder(PollDates, Days), data = mydata,
        main = "Obama Poll Numbers", xlab = "Dates", ylab = "Obama Poll Data")

boxplot(McCain ~ reorder(PollDates, Days), data = mydata,
        main = "McCain Poll Numbers", xlab = "Dates", ylab = "McCain Poll Data")

# Histograms of each candidate's poll share. with() looks the column up inside
# mydata for the duration of the call only; it replaces attach(), which left
# every column of mydata on the search path for the rest of the session.
# Passing the bare column name keeps the default title ("Histogram of Obama").
with(mydata, hist(Obama, breaks = 50))

with(mydata, hist(McCain, breaks = 50))

# Solution: From this analysis, I was able to see how the polls moved for both candidates.
# The polls were more favorable for Obama compared to McCain.
# Obama was found to lead among Likely Voters compared to Reg Voters, which is one of the important predictors of who will win the election. The poll data also showed that the race was not close.
# By August/September the race was already set, with the polls predicting Obama to be the 44th POTUS.