Additional practice with data clean-up and transformations using tidyr and dplyr. CRAN documentation available for tidyr is available here and dplr documentation can be found here.
library(tidyr)
library(dplyr)
library(ggplot2)
This chart describes past Popes’ favorability ratings over time:
# Create a dataframe
pope <- c(rep('Francis', 2), rep('Benedict_XVI', 5), rep('JohnPaul_II',5))
end_date <- as.Date(c("2014-02-09", "2013-04-14", "2010-03-28", "2008-04-20", "2007-06-03", "2005-12-18","2005-05-01","2005-02-27", "2003-10-08", "2002-05-01", "1998-12-29", "1993-08-10")) # Used end date of surveys for favorability rating
p_favorable <- c(76,58,40,63,52,50,55,78,73,61,86,64)
p_unfavorable <- c(9,10,35,15,16,11,12,11,17,26,8,15)
p_no_op = c(16,31,25,22,32,39,33,11,10,13,6,21)
pope_data <- data.frame(pope, end_date, p_favorable, p_unfavorable, p_no_op)
# Write the dataframe to a CSV in current working directory
write.csv(pope_data, "pope_data.csv")
# Read the CSV file into R
popes <- read.csv("pope_data.csv")
# Remove auto-generated indicator column from dataset
popes <- popes[,2:5]
# Convert the dates column back to date format
popes$end_date <- as.Date(popes$end_date)
# Check the structure
str(popes)
## 'data.frame': 12 obs. of 4 variables:
## $ pope : Factor w/ 3 levels "Benedict_XVI",..: 2 2 1 1 1 1 1 3 3 3 ...
## $ end_date : Date, format: "2014-02-09" "2013-04-14" ...
## $ p_favorable : int 76 58 40 63 52 50 55 78 73 61 ...
## $ p_unfavorable: int 9 10 35 15 16 11 12 11 17 26 ...
Restructure the dataset
# Create a column indicating favorability with two levels and another column with the value
# Change the colnames so transformation better populates the new structure
colnames(popes) <- c("pope", "date", "favorable", "unfavorable")
tidy_popes <- popes %>%
gather(favorability, percentage, -pope, -date) # Convert the data into a long format using gather() from tidyr package
str(tidy_popes)
## 'data.frame': 24 obs. of 4 variables:
## $ pope : Factor w/ 3 levels "Benedict_XVI",..: 2 2 1 1 1 1 1 3 3 3 ...
## $ date : Date, format: "2014-02-09" "2013-04-14" ...
## $ favorability: Factor w/ 2 levels "favorable","unfavorable": 1 1 1 1 1 1 1 1 1 1 ...
## $ percentage : int 76 58 40 63 52 50 55 78 73 61 ...
tidy_popes
## pope date favorability percentage
## 1 Francis 2014-02-09 favorable 76
## 2 Francis 2013-04-14 favorable 58
## 3 Benedict_XVI 2010-03-28 favorable 40
## 4 Benedict_XVI 2008-04-20 favorable 63
## 5 Benedict_XVI 2007-06-03 favorable 52
## 6 Benedict_XVI 2005-12-18 favorable 50
## 7 Benedict_XVI 2005-05-01 favorable 55
## 8 JohnPaul_II 2005-02-27 favorable 78
## 9 JohnPaul_II 2003-10-08 favorable 73
## 10 JohnPaul_II 2002-05-01 favorable 61
## 11 JohnPaul_II 1998-12-29 favorable 86
## 12 JohnPaul_II 1993-08-10 favorable 64
## 13 Francis 2014-02-09 unfavorable 9
## 14 Francis 2013-04-14 unfavorable 10
## 15 Benedict_XVI 2010-03-28 unfavorable 35
## 16 Benedict_XVI 2008-04-20 unfavorable 15
## 17 Benedict_XVI 2007-06-03 unfavorable 16
## 18 Benedict_XVI 2005-12-18 unfavorable 11
## 19 Benedict_XVI 2005-05-01 unfavorable 12
## 20 JohnPaul_II 2005-02-27 unfavorable 11
## 21 JohnPaul_II 2003-10-08 unfavorable 17
## 22 JohnPaul_II 2002-05-01 unfavorable 26
## 23 JohnPaul_II 1998-12-29 unfavorable 8
## 24 JohnPaul_II 1993-08-10 unfavorable 15
f_popes <- filter(tidy_popes, favorability == "favorable")
u_popes <- filter(tidy_popes, favorability == "unfavorable")
# Create a table to contain all Popes with their net favorability by date
net_f_popes <- abs(f_popes$percentage - u_popes$percentage)
n_popes <- f_popes[,1:2]
n_popes <- cbind(n_popes, net_f_popes)
# Rename the columns to make it cleaner
colnames(n_popes) <- c("Pope", "Survey_Date", "Net_Favorability")
n_popes
## Pope Survey_Date Net_Favorability
## 1 Francis 2014-02-09 67
## 2 Francis 2013-04-14 48
## 3 Benedict_XVI 2010-03-28 5
## 4 Benedict_XVI 2008-04-20 48
## 5 Benedict_XVI 2007-06-03 36
## 6 Benedict_XVI 2005-12-18 39
## 7 Benedict_XVI 2005-05-01 43
## 8 JohnPaul_II 2005-02-27 67
## 9 JohnPaul_II 2003-10-08 56
## 10 JohnPaul_II 2002-05-01 35
## 11 JohnPaul_II 1998-12-29 78
## 12 JohnPaul_II 1993-08-10 49
# Check out when each Pope was least favorable
min_net <- n_popes %>% group_by(Pope) %>% slice(which.min(Net_Favorability))
min_net
## Source: local data frame [3 x 3]
## Groups: Pope [3]
##
## Pope Survey_Date Net_Favorability
## (fctr) (date) (int)
## 1 Benedict_XVI 2010-03-28 5
## 2 Francis 2013-04-14 48
## 3 JohnPaul_II 2002-05-01 35
# Check out when each Pope was most favorable
max_net <- n_popes %>% group_by(Pope) %>% slice(which.max(Net_Favorability))
max_net
## Source: local data frame [3 x 3]
## Groups: Pope [3]
##
## Pope Survey_Date Net_Favorability
## (fctr) (date) (int)
## 1 Benedict_XVI 2008-04-20 48
## 2 Francis 2014-02-09 67
## 3 JohnPaul_II 1998-12-29 78
# Plot to compare Pope's net favorability over time
qplot(Survey_Date, Net_Favorability, data = n_popes, stat="identity", geom = "bar", fill = Pope, color = Pope)
# Create a table to contain all Popes with their awareness by date
aware_popes <- (f_popes$percentage + u_popes$percentage)
a_popes <- f_popes[,1:2]
a_popes <- cbind(a_popes, aware_popes)
# Rename the columns to make it cleaner
colnames(a_popes) <- c("Pope", "Survey_Date", "Percent_Awareness")
a_popes
## Pope Survey_Date Percent_Awareness
## 1 Francis 2014-02-09 85
## 2 Francis 2013-04-14 68
## 3 Benedict_XVI 2010-03-28 75
## 4 Benedict_XVI 2008-04-20 78
## 5 Benedict_XVI 2007-06-03 68
## 6 Benedict_XVI 2005-12-18 61
## 7 Benedict_XVI 2005-05-01 67
## 8 JohnPaul_II 2005-02-27 89
## 9 JohnPaul_II 2003-10-08 90
## 10 JohnPaul_II 2002-05-01 87
## 11 JohnPaul_II 1998-12-29 94
## 12 JohnPaul_II 1993-08-10 79
# Plot to compare Pope's percent awareness over time
qplot(Survey_Date, Percent_Awareness, data = a_popes, stat="identity", geom = "bar", fill = Pope, color = Pope)
# Check out when each Pope had the least general awareness
min_aware <- a_popes %>% group_by(Pope) %>% slice(which.min(Percent_Awareness))
min_aware
## Source: local data frame [3 x 3]
## Groups: Pope [3]
##
## Pope Survey_Date Percent_Awareness
## (fctr) (date) (int)
## 1 Benedict_XVI 2005-12-18 61
## 2 Francis 2013-04-14 68
## 3 JohnPaul_II 1993-08-10 79
# Check out when each Pope had the most general awareness
max_aware <- a_popes %>% group_by(Pope) %>% slice(which.max(Percent_Awareness))
max_aware
## Source: local data frame [3 x 3]
## Groups: Pope [3]
##
## Pope Survey_Date Percent_Awareness
## (fctr) (date) (int)
## 1 Benedict_XVI 2008-04-20 78
## 2 Francis 2014-02-09 85
## 3 JohnPaul_II 1998-12-29 94