Part 3 of my additional practice with data clean-up and transformations using tidyr and dplyr. CRAN documentation available for tidyr is available here and dplr documentation can be found here.
library(tidyr)
library(dplyr)
library(plyr)
library(ggplot2)
This chart lists digital ad revenue by billion(s) of US dollars available via the Pew Research Center:
year <- c('2009','2010','2011','2012','2013')
google <- c('0.36','0.86','1.67','2.26','2.99')
facebook <- c('0.56','1.21','1.73','2.18','3.17')
yahoo <- c('1.26','1.43','1.36','1.35','1.27')
microsoft <- c('0.37','0.51','0.60','0.90','0.79')
aol <- c('0.51','0.47','0.53','0.70','0.73')
ad_data <- data.frame(year,google,facebook,yahoo,microsoft,aol)
ad_data
## year google facebook yahoo microsoft aol
## 1 2009 0.36 0.56 1.26 0.37 0.51
## 2 2010 0.86 1.21 1.43 0.51 0.47
## 3 2011 1.67 1.73 1.36 0.60 0.53
## 4 2012 2.26 2.18 1.35 0.90 0.70
## 5 2013 2.99 3.17 1.27 0.79 0.73
#Write the CSV
write.csv(ad_data, "ad_data.csv")
ad <- read.csv("ad_data.csv")
ad
## X year google facebook yahoo microsoft aol
## 1 1 2009 0.36 0.56 1.26 0.37 0.51
## 2 2 2010 0.86 1.21 1.43 0.51 0.47
## 3 3 2011 1.67 1.73 1.36 0.60 0.53
## 4 4 2012 2.26 2.18 1.35 0.90 0.70
## 5 5 2013 2.99 3.17 1.27 0.79 0.73
Restructure the dataset
colnames(ad)
## [1] "X" "year" "google" "facebook" "yahoo" "microsoft"
## [7] "aol"
# Remove X column automatically created by csv import
ad <- ad[,2:6]
ad
## year google facebook yahoo microsoft
## 1 2009 0.36 0.56 1.26 0.37
## 2 2010 0.86 1.21 1.43 0.51
## 3 2011 1.67 1.73 1.36 0.60
## 4 2012 2.26 2.18 1.35 0.90
## 5 2013 2.99 3.17 1.27 0.79
# Convert the dataframe to a long format using gather()
tidy_ad <- ad %>% gather(company, ad_rev, -year)
# Arrange the data by year
tidy_ad <- arrange(tidy_ad, year)
# Check the new structure
str(tidy_ad)
## 'data.frame': 20 obs. of 3 variables:
## $ year : int 2009 2009 2009 2009 2010 2010 2010 2010 2011 2011 ...
## $ company: Factor w/ 4 levels "google","facebook",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ ad_rev : num 0.36 0.56 1.26 0.37 0.86 1.21 1.43 0.51 1.67 1.73 ...
tidy_ad
## year company ad_rev
## 1 2009 google 0.36
## 2 2009 facebook 0.56
## 3 2009 yahoo 1.26
## 4 2009 microsoft 0.37
## 5 2010 google 0.86
## 6 2010 facebook 1.21
## 7 2010 yahoo 1.43
## 8 2010 microsoft 0.51
## 9 2011 google 1.67
## 10 2011 facebook 1.73
## 11 2011 yahoo 1.36
## 12 2011 microsoft 0.60
## 13 2012 google 2.26
## 14 2012 facebook 2.18
## 15 2012 yahoo 1.35
## 16 2012 microsoft 0.90
## 17 2013 google 2.99
## 18 2013 facebook 3.17
## 19 2013 yahoo 1.27
## 20 2013 microsoft 0.79
# Check out the data graphically first to see any general trends in the data
qplot(year, ad_rev, data = tidy_ad, geom = c("point", "smooth"), method = lm, facets = . ~ company)
# Calculate the total revenue by company
# This can be done using ddply()
ddply(tidy_ad, .(company), summarise, sum = sum(ad_rev))
## company sum
## 1 google 8.14
## 2 facebook 8.85
## 3 yahoo 6.67
## 4 microsoft 3.17
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$company, FUN=sum)
## google facebook yahoo microsoft
## 8.14 8.85 6.67 3.17
# Calculate the total revenue by year -- this can be done very similarly to the calculation by company
ddply(tidy_ad, .(year), summarise, sum = sum(ad_rev))
## year sum
## 1 2009 2.55
## 2 2010 4.01
## 3 2011 5.36
## 4 2012 6.69
## 5 2013 8.22
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$year, FUN=sum)
## 2009 2010 2011 2012 2013
## 2.55 4.01 5.36 6.69 8.22
# Calculate the average revenue by company and year -- also very similar to the last two calculations
# Average by company using ddply()
ddply(tidy_ad, .(company), summarise, avg = mean(ad_rev))
## company avg
## 1 google 1.628
## 2 facebook 1.770
## 3 yahoo 1.334
## 4 microsoft 0.634
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$company, FUN=mean)
## google facebook yahoo microsoft
## 1.628 1.770 1.334 0.634
# Average by year using ddply()
ddply(tidy_ad, .(year), summarise, avg = mean(ad_rev))
## year avg
## 1 2009 0.6375
## 2 2010 1.0025
## 3 2011 1.3400
## 4 2012 1.6725
## 5 2013 2.0550
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$year, FUN=mean)
## 2009 2010 2011 2012 2013
## 0.6375 1.0025 1.3400 1.6725 2.0550
To look a further we can check to see what year each company had their highest and lowest revenue by year
# Company max revenue
max_rev <- tidy_ad %>% group_by(company) %>% slice(which.max(ad_rev))
max_rev
## Source: local data frame [4 x 3]
## Groups: company [4]
##
## year company ad_rev
## (int) (fctr) (dbl)
## 1 2013 google 2.99
## 2 2013 facebook 3.17
## 3 2010 yahoo 1.43
## 4 2012 microsoft 0.90
# Company min revenue
min_rev <- tidy_ad %>% group_by(company) %>% slice(which.min(ad_rev))
min_rev
## Source: local data frame [4 x 3]
## Groups: company [4]
##
## year company ad_rev
## (int) (fctr) (dbl)
## 1 2009 google 0.36
## 2 2009 facebook 0.56
## 3 2009 yahoo 1.26
## 4 2009 microsoft 0.37