Importing and cleaning data

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidyr)
## Loading required package: tidyr
require(stringr)
## Loading required package: stringr
# Read the poll CSV. Strings are kept as character vectors (not factors) so
# the text columns can be filtered and parsed below.
popedat <- read.csv(
  file = "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Project 2\\Ex2Popes\\popefavs.csv",
  stringsAsFactors = FALSE
)

# Rename the first and fourth columns to short, syntactic names.
names(popedat)[c(1, 4)] <- c("date", "noopinion")

# Create a column of pope names.
# The column is created (all NA) before it is filled. Assigning into a slice
# of a column that does not exist yet builds a vector only as long as the
# largest index used, which is rejected unless that length happens to fit
# nrow(popedat).
# NOTE(review): the row ranges are hard-coded positions in the CSV -- confirm
# they still line up if the file ever changes.
if (!"name" %in% names(popedat)) {
  popedat$name <- NA_character_
}
popedat$name[14:18] <- "John Paul II"
popedat$name[7:11] <- "Benedict XVI"
popedat$name[3:4] <- "Francis"

# Keep only rows that carry poll figures: drop those whose Unfavorable cell is
# blank or holds just "%".
popepoll <- popedat[!(popedat$Unfavorable == "" | popedat$Unfavorable == "%"), ]

# Coerce columns to the types used downstream: the fielding dates stay text,
# the three response columns become numeric.
popepoll$date <- as.character(popepoll$date)
popepoll[c("Favorable", "Unfavorable", "noopinion")] <- lapply(
  popepoll[c("Favorable", "Unfavorable", "noopinion")],
  as.numeric
)

# Break the free-text fielding period in popepoll$date into start/end pieces.
# NOTE(review): the patterns presume strings shaped like "Mon D-D, YYYY" or
# "Mon D-Mon D, YYYY" -- confirm against the CSV.

# Year: the last four characters of the string.
year <- popepoll$date %>%
  str_sub(start = -4) %>%
  as.numeric()

# Start month: the first word, mapped to a month number through its
# three-letter abbreviation.
startmonth <- word(popepoll$date, 1)
startmonthnum <- startmonth %>%
  str_sub(end = 3) %>%
  match(month.abb)

# End month: the alphabetic part of the second word, if there is one. When the
# second word contains no letters, fall back to the start month.
endmonth <- popepoll$date %>%
  word(start = 2) %>%
  str_extract("[:alpha:]+")
endmonth <- replace(endmonth, is.na(endmonth), startmonth[is.na(endmonth)])
endmonthnum <- endmonth %>%
  str_sub(end = 3) %>%
  match(month.abb)

# Days: the digits immediately before "-" (start) and before "," (end); the
# trailing delimiter is trimmed off before converting to a number.
startday <- popepoll$date %>%
  str_extract("[:digit:]+-") %>%
  str_sub(end = -2) %>%
  as.numeric()
endday <- popepoll$date %>%
  str_extract("[:digit:]+,") %>%
  str_sub(end = -2) %>%
  as.numeric()

# Assemble month/day/year strings and parse them into Date columns.
popepoll$startdate <- paste(startmonthnum, startday, year, sep = "/") %>%
  as.Date(format = "%m/%d/%Y")
popepoll$enddate <- paste(endmonthnum, endday, year, sep = "/") %>%
  as.Date(format = "%m/%d/%Y")

Tidying data (although I'm not entirely sure at what point the tidying process technically started)

# Reshape to long form: keep the identifying columns, then stack the three
# response columns into a key column (favorability) and a value column
# (favscores).
popepolltidy <- gather(
  popepoll[, c("name", "startdate", "enddate", "Favorable", "Unfavorable", "noopinion")],
  favorability,
  favscores,
  Favorable:noopinion
)

# Inspect the structure of the long table.
str(popepolltidy)
## 'data.frame':    36 obs. of  5 variables:
##  $ name        : chr  "Francis" "Francis" "Benedict XVI" "Benedict XVI" ...
##  $ startdate   : Date, format: "2014-02-06" "2013-04-11" ...
##  $ enddate     : Date, format: "2014-02-09" "2013-04-14" ...
##  $ favorability: Factor w/ 3 levels "Favorable","Unfavorable",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ favscores   : num  76 58 40 63 52 50 55 78 73 61 ...

Averaged over their tenures, Francis and John Paul II had similarly high favorables, while Benedict XVI was less popular and slightly less well known.

# Average each response category per pope, widen back to one row per pope,
# and add net favorability (Favorable minus Unfavorable). The result is
# printed, not stored.
popepolltidy %>%
  group_by(name, favorability) %>%
  summarise(averagescores = mean(favscores)) %>%
  spread(key = favorability, value = averagescores) %>%
  mutate(netfav = Favorable - Unfavorable)
## Source: local data frame [3 x 5]
## 
##           name Favorable Unfavorable noopinion netfav
##          (chr)     (dbl)       (dbl)     (dbl)  (dbl)
## 1 Benedict XVI      52.0        17.8      30.2   34.2
## 2      Francis      67.0         9.5      23.5   57.5
## 3 John Paul II      72.4        15.4      12.2   57.0

Although Francis has held the papacy for only a fraction of the time his two predecessors did, his popularity has grown quickly, a gain similar to the one John Paul II saw over his entire tenure.

Benedict, however, became increasingly unpopular during his reign. Despite being a well-educated and liberal theologian, he advocated a return to fundamental Christian values to counter the increasing secularism of Western countries, and eventually was moved to resign, giving way to the more open and accepting direction of Francis.

# Create table of the last opinion poll of each pope's tenure.
# The latest poll is picked by its start date explicitly rather than with
# top_n(3, startdate), which hard-codes the number of rows a poll contributes:
# with fewer than three response rows per poll it would also pull in rows from
# the previous poll.
lastpolls <- popepolltidy %>%
  group_by(name) %>%
  filter(startdate == max(startdate, na.rm = TRUE)) %>%
  spread(favorability, favscores) %>%
  mutate(
    # Net favorability: favorable share minus unfavorable share.
    netfavorable = Favorable - Unfavorable,
    # Awareness: share of respondents holding either opinion.
    awareness = Favorable + Unfavorable
  )

# Create table of the first opinion poll of each pope's tenure.
# Within each pope, keep the rows whose start date is the earliest one. This
# replaces the arrange() + do(tail(., n = 3)) workaround, which depended on
# row order and on every poll contributing exactly three rows.
firstpolls <- popepolltidy %>%
  group_by(name) %>%
  filter(startdate == min(startdate, na.rm = TRUE)) %>%
  arrange(name) %>%
  spread(favorability, favscores) %>%
  mutate(
    # Net favorability: favorable share minus unfavorable share.
    netfavorable = Favorable - Unfavorable,
    # Awareness: share of respondents holding either opinion.
    awareness = Favorable + Unfavorable
  )

# Calculate opinion movement between each pope's first and last poll.
# The subtractions below pair rows of lastpolls and firstpolls by position, so
# stop early if the two tables do not list the popes in the same order.
stopifnot(
  identical(as.character(lastpolls$name), as.character(firstpolls$name))
)

# The first column keeps the name data.frame(lastpolls$name) would generate.
popepollsdiff <- data.frame(
  lastpolls.name = lastpolls$name,
  # Time elapsed between the end dates of the first and last poll.
  timeframedays = lastpolls$enddate - firstpolls$enddate,
  favchange = lastpolls$Favorable - firstpolls$Favorable,
  unfavchange = lastpolls$Unfavorable - firstpolls$Unfavorable,
  noopinchange = lastpolls$noopinion - firstpolls$noopinion,
  netfavchange = lastpolls$netfavorable - firstpolls$netfavorable,
  awarenesschange = lastpolls$awareness - firstpolls$awareness
)
popepollsdiff
##   lastpolls.name timeframedays favchange unfavchange noopinchange
## 1   Benedict XVI     1792 days       -15          23           -8
## 2        Francis      301 days        18          -1          -15
## 3   John Paul II     4210 days        14          -4          -10
##   netfavchange awarenesschange
## 1          -38               8
## 2           19              17
## 3           18              10