# Question Statement
# This is data from pollsters for the 2008 US Presidential Election, covering the months of August and September.
# I would like to know how the polls moved for both candidates, and whether the trends were in line with the outcome of the election.

# Read the Pollster08 CSV from GitHub (Rdatasets mirror) into a plain data.frame.
# fread() is called through its namespace so the script does not depend on
# data.table having been attached beforehand; data.table = FALSE makes fread()
# return a data.frame directly.
polls_url <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/Stat2Data/Pollster08.csv"
mydata <- data.table::fread(polls_url, data.table = FALSE)
head(mydata)
##   V1      PollTaker   PollDates MidDate Days    n Pop McCain Obama Margin
## 1  1      Rasmussen  8/28-30/08    8/29    1 3000  LV     46    49      3
## 2  2          Zogby  8/29-30/08    8/30    2 2020  LV     47    45     -2
## 3  3 Diageo/Hotline  8/29-31/08    8/30    2  805  RV     39    48      9
## 4  4            CBS  8/29-31/08    8/30    2  781  RV     40    48      8
## 5  5            CNN  8/29-31/08    8/30    2  927  RV     48    49      1
## 6  6      Rasmussen 8/30-9/1/08    8/31    3 3000  LV     45    51      6
##   Charlie Meltdown
## 1       0        0
## 2       0        0
## 3       0        0
## 4       0        0
## 5       0        0
## 6       0        0
# Per-column summary of the raw poll data.
summary(mydata)
##       V1             PollTaker          PollDates        
##  Length:102         Length:102         Length:102        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    MidDate               Days            n            Pop           
##  Length:102         Min.   : 1.0   Min.   : 697   Length:102        
##  Class :character   1st Qu.:11.0   1st Qu.: 912   Class :character  
##  Mode  :character   Median :18.0   Median :1085   Mode  :character  
##                     Mean   :18.3   Mean   :1523                     
##                     3rd Qu.:26.0   3rd Qu.:2310                     
##                     Max.   :33.0   Max.   :4752                     
##                                    NA's   :3                        
##      McCain         Obama           Margin           Charlie      
##  Min.   :36.0   Min.   :40.00   Min.   :-10.000   Min.   :0.0000  
##  1st Qu.:43.0   1st Qu.:45.25   1st Qu.:  0.000   1st Qu.:0.0000  
##  Median :45.0   Median :47.00   Median :  2.000   Median :1.0000  
##  Mean   :44.6   Mean   :46.92   Mean   :  2.324   Mean   :0.6275  
##  3rd Qu.:46.0   3rd Qu.:49.00   3rd Qu.:  5.000   3rd Qu.:1.0000  
##  Max.   :54.0   Max.   :52.00   Max.   : 11.000   Max.   :1.0000  
##                                                                   
##     Meltdown     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4902  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
# Observation: On average, the polls were in line with the final outcome of the election.
# The mean and median difference in the polls between the two candidates is approximately two points.

# Data Wrangling.
# newdata is mydata without the Charlie and Meltdown columns. They are dropped
# by name, so the result does not depend on where those columns sit.
dropped_cols <- c("Charlie", "Meltdown")
newdata <- mydata[, setdiff(names(mydata), dropped_cols), drop = FALSE]
head(newdata)
##   V1      PollTaker   PollDates MidDate Days    n Pop McCain Obama Margin
## 1  1      Rasmussen  8/28-30/08    8/29    1 3000  LV     46    49      3
## 2  2          Zogby  8/29-30/08    8/30    2 2020  LV     47    45     -2
## 3  3 Diageo/Hotline  8/29-31/08    8/30    2  805  RV     39    48      9
## 4  4            CBS  8/29-31/08    8/30    2  781  RV     40    48      8
## 5  5            CNN  8/29-31/08    8/30    2  927  RV     48    49      1
## 6  6      Rasmussen 8/30-9/1/08    8/31    3 3000  LV     45    51      6
# resultData is newdata with the population codes in Pop spelled out:
# LV -> "Likely Voters", RV -> "Reg Voters". Any other code (e.g. "A") is kept.
# The recode is restricted to Pop so that the numeric columns keep their types
# and no other text column (e.g. a pollster name containing "rv") is altered.
# NOTE: the recorded `##` summaries further down were captured when every
# column had been coerced to text/factor; re-running will show numeric
# summaries for the numeric columns instead.
pop_labels <- c(LV = "Likely Voters", RV = "Reg Voters")
resultData <- newdata
resultData$Pop <- as.character(resultData$Pop)
# Match case-insensitively, as the earlier substitution did.
pop_codes <- toupper(resultData$Pop)
is_coded <- pop_codes %in% names(pop_labels)
resultData$Pop[is_coded] <- unname(pop_labels[pop_codes[is_coded]])
head(resultData)
##   V1      PollTaker   PollDates MidDate Days    n           Pop McCain
## 1  1      Rasmussen  8/28-30/08    8/29    1 3000 Likely Voters     46
## 2  2          Zogby  8/29-30/08    8/30    2 2020 Likely Voters     47
## 3  3 Diageo/Hotline  8/29-31/08    8/30    2  805    Reg Voters     39
## 4  4            CBS  8/29-31/08    8/30    2  781    Reg Voters     40
## 5  5            CNN  8/29-31/08    8/30    2  927    Reg Voters     48
## 6  6      Rasmussen 8/30-9/1/08    8/31    3 3000 Likely Voters     45
##   Obama Margin
## 1    49      3
## 2    45     -2
## 3    48      9
## 4    48      8
## 5    49      1
## 6    51      6
# Keep only the polls in which Obama is leading or tied with McCain.
# The condition is evaluated on mydata$Margin and applied to resultData by row
# position; rows whose Margin is missing are dropped.
obama_ahead_or_tied <- mydata$Margin >= 0
keep_rows <- obama_ahead_or_tied & !is.na(obama_ahead_or_tied)
subdata <- resultData[keep_rows, , drop = FALSE]
summary(subdata)
##        V1                PollTaker       PollDates     MidDate  
##  1      : 1   Diageo/Hotline  : 9   9/1-3/08  : 4   9/23   : 7  
##  10     : 1   Rasmussen       : 9   9/22-24/08: 4   9/28   : 6  
##  100    : 1   DailyKos.com    : 8   9/27-29/08: 4   9/29   : 6  
##  101    : 1   Gallup          : 6   9/28-30/08: 4   9/20   : 5  
##  102    : 1   ARG             : 4   8/29-31/08: 3   9/21   : 5  
##  11     : 1   Economist/YouGov: 4   9/19-21/08: 3   9/2    : 4  
##  (Other):72   (Other)         :38   (Other)   :56   (Other):45  
##       Days          n                 Pop         McCain       Obama   
##  26     : 7   3000   : 9   A            : 5   45     :14   47     :16  
##  31     : 6   1100   : 8   Likely Voters:44   46     :14   48     :16  
##  32     : 6   1000   : 6   Reg Voters   :29   43     :12   49     :14  
##  23     : 5   1200   : 4                      42     :11   46     : 9  
##  24     : 5   1007   : 2                      44     :10   50     : 7  
##  30     : 4   (Other):47                      40     : 5   45     : 5  
##  (Other):45   NA's   : 2                      (Other):12   (Other):11  
##      Margin  
##  4      :15  
##  5      :13  
##  0      :10  
##  1      :10  
##  2      : 8  
##  6      : 6  
##  (Other):16
# Split the Obama-leading/tied polls by surveyed population:
# registered voters (regVot) and likely voters (likVot).
is_reg_voter <- subdata$Pop %in% "Reg Voters"
is_likely_voter <- subdata$Pop %in% "Likely Voters"
regVot <- subdata[is_reg_voter, , drop = FALSE]
likVot <- subdata[is_likely_voter, , drop = FALSE]
summary(regVot)
##        V1               PollTaker      PollDates     MidDate  
##  100    : 1   Diageo/Hotline :9   8/29-31/08: 3   8/30   : 3  
##  11     : 1   Gallup         :6   9/1-3/08  : 2   9/17   : 3  
##  14     : 1   Ipsos/McClatchy:3   9/16-18/08: 2   9/20   : 3  
##  16     : 1   CBS            :2   9/19-21/08: 2   9/23   : 3  
##  20     : 1   CNN            :2   9/22-24/08: 2   9/2    : 2  
##  23     : 1   NBC/WSJ        :2   9/25-27/08: 2   9/26   : 2  
##  (Other):23   (Other)        :5   (Other)   :16   (Other):13  
##       Days          n                 Pop         McCain      Obama  
##  2      : 3   902    : 2   A            : 0   42     :8   46     :6  
##  20     : 3   915    : 2   Likely Voters: 0   44     :4   48     :6  
##  23     : 3   1007   : 1   Reg Voters   :29   46     :4   45     :5  
##  26     : 3   1038   : 1                      40     :3   47     :5  
##  29     : 2   1046   : 1                      45     :3   49     :3  
##  32     : 2   1085   : 1                      39     :2   40     :1  
##  (Other):13   (Other):21                      (Other):5   (Other):3  
##      Margin 
##  0      :6  
##  4      :6  
##  1      :5  
##  5      :5  
##  6      :2  
##  8      :2  
##  (Other):3
# Per-column summary of the likely-voter subset (likVot).
summary(likVot)
##        V1            PollTaker        PollDates     MidDate        Days   
##  1      : 1   Rasmussen   : 9   9/20-22/08 : 3   9/21   : 5   24     : 5  
##  101    : 1   DailyKos.com: 8   9/27-29/08 : 3   9/27   : 4   30     : 4  
##  102    : 1   ARG         : 4   8/30-9/1/08: 2   9/28   : 4   31     : 4  
##  12     : 1   Democracy   : 3   9/11-13/08 : 2   9/29   : 4   32     : 4  
##  13     : 1   ABC/Post    : 2   9/17-19/08 : 2   9/12   : 3   15     : 3  
##  24     : 1   CBS/Times   : 2   9/19-22/08 : 2   9/23   : 3   26     : 3  
##  (Other):38   (Other)     :16   (Other)    :30   (Other):21   (Other):21  
##        n                 Pop         McCain       Obama        Margin  
##  3000   : 9   A            : 0   45     :10   49     :11   4      : 9  
##  1100   : 8   Likely Voters:44   46     :10   48     :10   5      : 7  
##  1200   : 4   Reg Voters   : 0   43     : 9   47     : 9   2      : 6  
##  1000   : 2                      44     : 6   50     : 6   0      : 4  
##  1007   : 1                      41     : 2   51     : 5   1      : 4  
##  (Other):18                      42     : 2   46     : 2   6      : 4  
##  NA's   : 2                      (Other): 5   (Other): 1   (Other):10
# Observation: Comparing the summaries of Likely and Reg Voters, Obama did better among Likely Voters than among Reg Voters.

# Including Plots:

# Scatter plots to see how the polls moved for Obama and McCain over the period.
# Time is taken from the numeric Days column (it increases with MidDate in the
# data shown above) rather than the PollDates text, which would be ordered
# alphabetically instead of chronologically. Columns are named bare inside
# aes() so ggplot2 looks them up in `mydata`; ggplot2 is called through its
# namespace so the script does not rely on it being attached.
ggplot2::ggplot(mydata, ggplot2::aes(x = Days, y = Obama)) +
  ggplot2::geom_point()

ggplot2::ggplot(mydata, ggplot2::aes(x = Days, y = McCain)) +
  ggplot2::geom_point()

# McCain share against Obama share for each poll (author's note: positive correlation).
ggplot2::ggplot(mydata, ggplot2::aes(x = McCain, y = Obama)) +
  ggplot2::geom_point()

#Box Plot
# PollDates is text, so grouping on it directly orders the boxes alphabetically.
# Build a factor whose levels follow Days so the boxes run chronologically;
# the axis labels are still the original PollDates strings.
poll_dates_chrono <- factor(
  mydata$PollDates,
  levels = unique(mydata$PollDates[order(mydata$Days)])
)

boxplot(mydata$Obama ~ poll_dates_chrono, main = "Obama Poll Numbers",
        xlab = "Dates", ylab = "Obama Poll Data")

boxplot(mydata$McCain ~ poll_dates_chrono, main = "McCain Poll Numbers",
        xlab = "Dates", ylab = "McCain Poll Data")

#Histogram
# with() evaluates each call inside mydata without attach(), so nothing is left
# on the search path to mask other objects. The bare column name is still what
# hist() sees, so the default title and x-axis label are unchanged.
with(mydata, hist(Obama, breaks = 50))

with(mydata, hist(McCain, breaks = 50))

# Solution: From this analysis, I was able to figure out how the polls moved for both candidates.
# The polls were more favorable for Obama than for McCain.
# Obama was found to lead among Likely Voters compared to Reg Voters, which is one of the important predictors of who will win the election. The poll data also showed that the race was not close.
# By August/September, the race was already set toward predicting Obama as the 44th POTUS.