Additional practice with data clean-up and transformations using tidyr and dplyr. CRAN documentation available for tidyr is available here and dplr documentation can be found here.

library(tidyr)
library(dplyr)
library(ggplot2)

This chart describes past Popes’ favorability ratings over time:

x

  1. Create a CSV file that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations:
# Create a dataframe
pope <- c(rep('Francis', 2), rep('Benedict_XVI', 5), rep('JohnPaul_II',5))
end_date <- as.Date(c("2014-02-09", "2013-04-14", "2010-03-28", "2008-04-20", "2007-06-03", "2005-12-18","2005-05-01","2005-02-27", "2003-10-08", "2002-05-01", "1998-12-29", "1993-08-10")) # Used end date of surveys for favorability rating
p_favorable <- c(76,58,40,63,52,50,55,78,73,61,86,64)
p_unfavorable <- c(9,10,35,15,16,11,12,11,17,26,8,15)
p_no_op = c(16,31,25,22,32,39,33,11,10,13,6,21)

pope_data <- data.frame(pope, end_date, p_favorable, p_unfavorable, p_no_op)

# Write the dataframe to a CSV in current working directory
write.csv(pope_data, "pope_data.csv")
  1. Read the information from your CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
# Read the CSV file into R
popes <- read.csv("pope_data.csv")

# Remove auto-generated indicator column from dataset
popes <- popes[,2:5]

# Convert the dates column back to date format
popes$end_date <- as.Date(popes$end_date)

# Check the structure
str(popes)
## 'data.frame':    12 obs. of  4 variables:
##  $ pope         : Factor w/ 3 levels "Benedict_XVI",..: 2 2 1 1 1 1 1 3 3 3 ...
##  $ end_date     : Date, format: "2014-02-09" "2013-04-14" ...
##  $ p_favorable  : int  76 58 40 63 52 50 55 78 73 61 ...
##  $ p_unfavorable: int  9 10 35 15 16 11 12 11 17 26 ...

Restructure the dataset

# Create a column indicating favorability with two levels and another column with the   value

# Change the colnames so transformation better populates the new structure
colnames(popes) <- c("pope", "date", "favorable", "unfavorable")

tidy_popes <- popes %>%
  gather(favorability, percentage, -pope, -date) # Convert the data into a long format using gather() from tidyr package

str(tidy_popes) 
## 'data.frame':    24 obs. of  4 variables:
##  $ pope        : Factor w/ 3 levels "Benedict_XVI",..: 2 2 1 1 1 1 1 3 3 3 ...
##  $ date        : Date, format: "2014-02-09" "2013-04-14" ...
##  $ favorability: Factor w/ 2 levels "favorable","unfavorable": 1 1 1 1 1 1 1 1 1 1 ...
##  $ percentage  : int  76 58 40 63 52 50 55 78 73 61 ...
tidy_popes
##            pope       date favorability percentage
## 1       Francis 2014-02-09    favorable         76
## 2       Francis 2013-04-14    favorable         58
## 3  Benedict_XVI 2010-03-28    favorable         40
## 4  Benedict_XVI 2008-04-20    favorable         63
## 5  Benedict_XVI 2007-06-03    favorable         52
## 6  Benedict_XVI 2005-12-18    favorable         50
## 7  Benedict_XVI 2005-05-01    favorable         55
## 8   JohnPaul_II 2005-02-27    favorable         78
## 9   JohnPaul_II 2003-10-08    favorable         73
## 10  JohnPaul_II 2002-05-01    favorable         61
## 11  JohnPaul_II 1998-12-29    favorable         86
## 12  JohnPaul_II 1993-08-10    favorable         64
## 13      Francis 2014-02-09  unfavorable          9
## 14      Francis 2013-04-14  unfavorable         10
## 15 Benedict_XVI 2010-03-28  unfavorable         35
## 16 Benedict_XVI 2008-04-20  unfavorable         15
## 17 Benedict_XVI 2007-06-03  unfavorable         16
## 18 Benedict_XVI 2005-12-18  unfavorable         11
## 19 Benedict_XVI 2005-05-01  unfavorable         12
## 20  JohnPaul_II 2005-02-27  unfavorable         11
## 21  JohnPaul_II 2003-10-08  unfavorable         17
## 22  JohnPaul_II 2002-05-01  unfavorable         26
## 23  JohnPaul_II 1998-12-29  unfavorable          8
## 24  JohnPaul_II 1993-08-10  unfavorable         15
  1. For analysis: find the net favorables (favorable - unfavorable) and awareness (fav + unfav) for each pope over their full tenure.
f_popes <- filter(tidy_popes, favorability == "favorable")
u_popes <- filter(tidy_popes, favorability == "unfavorable")

# Create a table to contain all Popes with their net favorability by date
net_f_popes <- abs(f_popes$percentage - u_popes$percentage)
n_popes <- f_popes[,1:2]
n_popes <- cbind(n_popes, net_f_popes)

# Rename the columns to make it cleaner
colnames(n_popes) <- c("Pope", "Survey_Date", "Net_Favorability")
n_popes 
##            Pope Survey_Date Net_Favorability
## 1       Francis  2014-02-09               67
## 2       Francis  2013-04-14               48
## 3  Benedict_XVI  2010-03-28                5
## 4  Benedict_XVI  2008-04-20               48
## 5  Benedict_XVI  2007-06-03               36
## 6  Benedict_XVI  2005-12-18               39
## 7  Benedict_XVI  2005-05-01               43
## 8   JohnPaul_II  2005-02-27               67
## 9   JohnPaul_II  2003-10-08               56
## 10  JohnPaul_II  2002-05-01               35
## 11  JohnPaul_II  1998-12-29               78
## 12  JohnPaul_II  1993-08-10               49
# Check out when each Pope was least favorable
min_net <- n_popes %>% group_by(Pope) %>% slice(which.min(Net_Favorability))
min_net
## Source: local data frame [3 x 3]
## Groups: Pope [3]
## 
##           Pope Survey_Date Net_Favorability
##         (fctr)      (date)            (int)
## 1 Benedict_XVI  2010-03-28                5
## 2      Francis  2013-04-14               48
## 3  JohnPaul_II  2002-05-01               35
# Check out when each Pope was most favorable
max_net <- n_popes %>% group_by(Pope) %>% slice(which.max(Net_Favorability))
max_net
## Source: local data frame [3 x 3]
## Groups: Pope [3]
## 
##           Pope Survey_Date Net_Favorability
##         (fctr)      (date)            (int)
## 1 Benedict_XVI  2008-04-20               48
## 2      Francis  2014-02-09               67
## 3  JohnPaul_II  1998-12-29               78
# Plot to compare Pope's net favorability over time
qplot(Survey_Date, Net_Favorability, data = n_popes, stat="identity", geom = "bar", fill = Pope, color = Pope)

# Create a table to contain all Popes with their awareness by date
aware_popes <- (f_popes$percentage + u_popes$percentage)
a_popes <- f_popes[,1:2]
a_popes <- cbind(a_popes, aware_popes)

# Rename the columns to make it cleaner
colnames(a_popes) <- c("Pope", "Survey_Date", "Percent_Awareness")
a_popes
##            Pope Survey_Date Percent_Awareness
## 1       Francis  2014-02-09                85
## 2       Francis  2013-04-14                68
## 3  Benedict_XVI  2010-03-28                75
## 4  Benedict_XVI  2008-04-20                78
## 5  Benedict_XVI  2007-06-03                68
## 6  Benedict_XVI  2005-12-18                61
## 7  Benedict_XVI  2005-05-01                67
## 8   JohnPaul_II  2005-02-27                89
## 9   JohnPaul_II  2003-10-08                90
## 10  JohnPaul_II  2002-05-01                87
## 11  JohnPaul_II  1998-12-29                94
## 12  JohnPaul_II  1993-08-10                79
# Plot to compare Pope's percent awareness over time
qplot(Survey_Date, Percent_Awareness, data = a_popes, stat="identity", geom = "bar", fill = Pope, color = Pope)

# Check out when each Pope had the least general awareness
min_aware <- a_popes %>% group_by(Pope) %>% slice(which.min(Percent_Awareness))
min_aware
## Source: local data frame [3 x 3]
## Groups: Pope [3]
## 
##           Pope Survey_Date Percent_Awareness
##         (fctr)      (date)             (int)
## 1 Benedict_XVI  2005-12-18                61
## 2      Francis  2013-04-14                68
## 3  JohnPaul_II  1993-08-10                79
# Check out when each Pope had the most general awareness
max_aware <- a_popes %>% group_by(Pope) %>% slice(which.max(Percent_Awareness))
max_aware
## Source: local data frame [3 x 3]
## Groups: Pope [3]
## 
##           Pope Survey_Date Percent_Awareness
##         (fctr)      (date)             (int)
## 1 Benedict_XVI  2008-04-20                78
## 2      Francis  2014-02-09                85
## 3  JohnPaul_II  1998-12-29                94