Part 3 of my additional practice with data clean-up and transformations using tidyr and dplyr. CRAN documentation available for tidyr is available here and dplr documentation can be found here.

library(tidyr)
library(dplyr)
library(plyr)
library(ggplot2)

This chart lists digital ad revenue by billion(s) of US dollars available via the Pew Research Center:

x

  1. Create a CSV file that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations:
year <- c('2009','2010','2011','2012','2013')
google <- c('0.36','0.86','1.67','2.26','2.99')
facebook <- c('0.56','1.21','1.73','2.18','3.17')
yahoo <- c('1.26','1.43','1.36','1.35','1.27') 
microsoft <- c('0.37','0.51','0.60','0.90','0.79')
aol <- c('0.51','0.47','0.53','0.70','0.73')
ad_data <- data.frame(year,google,facebook,yahoo,microsoft,aol)
ad_data
##   year google facebook yahoo microsoft  aol
## 1 2009   0.36     0.56  1.26      0.37 0.51
## 2 2010   0.86     1.21  1.43      0.51 0.47
## 3 2011   1.67     1.73  1.36      0.60 0.53
## 4 2012   2.26     2.18  1.35      0.90 0.70
## 5 2013   2.99     3.17  1.27      0.79 0.73
#Write the CSV
write.csv(ad_data, "ad_data.csv")
  1. Read the information from your CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
ad <- read.csv("ad_data.csv")
ad
##   X year google facebook yahoo microsoft  aol
## 1 1 2009   0.36     0.56  1.26      0.37 0.51
## 2 2 2010   0.86     1.21  1.43      0.51 0.47
## 3 3 2011   1.67     1.73  1.36      0.60 0.53
## 4 4 2012   2.26     2.18  1.35      0.90 0.70
## 5 5 2013   2.99     3.17  1.27      0.79 0.73

Restructure the dataset

colnames(ad)
## [1] "X"         "year"      "google"    "facebook"  "yahoo"     "microsoft"
## [7] "aol"
# Remove X column automatically created by csv import
ad <- ad[,2:6]
ad
##   year google facebook yahoo microsoft
## 1 2009   0.36     0.56  1.26      0.37
## 2 2010   0.86     1.21  1.43      0.51
## 3 2011   1.67     1.73  1.36      0.60
## 4 2012   2.26     2.18  1.35      0.90
## 5 2013   2.99     3.17  1.27      0.79
# Convert the dataframe to a long format using gather()
tidy_ad <- ad %>% gather(company, ad_rev, -year)
# Arrange the data by year
tidy_ad <- arrange(tidy_ad, year)
# Check the new structure
str(tidy_ad)
## 'data.frame':    20 obs. of  3 variables:
##  $ year   : int  2009 2009 2009 2009 2010 2010 2010 2010 2011 2011 ...
##  $ company: Factor w/ 4 levels "google","facebook",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ ad_rev : num  0.36 0.56 1.26 0.37 0.86 1.21 1.43 0.51 1.67 1.73 ...
tidy_ad
##    year   company ad_rev
## 1  2009    google   0.36
## 2  2009  facebook   0.56
## 3  2009     yahoo   1.26
## 4  2009 microsoft   0.37
## 5  2010    google   0.86
## 6  2010  facebook   1.21
## 7  2010     yahoo   1.43
## 8  2010 microsoft   0.51
## 9  2011    google   1.67
## 10 2011  facebook   1.73
## 11 2011     yahoo   1.36
## 12 2011 microsoft   0.60
## 13 2012    google   2.26
## 14 2012  facebook   2.18
## 15 2012     yahoo   1.35
## 16 2012 microsoft   0.90
## 17 2013    google   2.99
## 18 2013  facebook   3.17
## 19 2013     yahoo   1.27
## 20 2013 microsoft   0.79
  1. For analysis: total revenue by year and company, average revenue by year and company.
# Check out the data graphically first to see any general trends in the data
qplot(year, ad_rev, data = tidy_ad, geom = c("point", "smooth"), method = lm,  facets =  . ~ company)

# Calculate the total revenue by company
# This can be done using ddply()
ddply(tidy_ad, .(company), summarise, sum = sum(ad_rev))
##     company  sum
## 1    google 8.14
## 2  facebook 8.85
## 3     yahoo 6.67
## 4 microsoft 3.17
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$company, FUN=sum)
##    google  facebook     yahoo microsoft 
##      8.14      8.85      6.67      3.17
# Calculate the total revenue by year -- this can be done very similarly to the calculation by company
ddply(tidy_ad, .(year), summarise, sum = sum(ad_rev))
##   year  sum
## 1 2009 2.55
## 2 2010 4.01
## 3 2011 5.36
## 4 2012 6.69
## 5 2013 8.22
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$year, FUN=sum)
## 2009 2010 2011 2012 2013 
## 2.55 4.01 5.36 6.69 8.22
# Calculate the average revenue by company and year -- also very similar to the last two calculations
# Average by company using ddply()
ddply(tidy_ad, .(company), summarise, avg = mean(ad_rev))
##     company   avg
## 1    google 1.628
## 2  facebook 1.770
## 3     yahoo 1.334
## 4 microsoft 0.634
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$company, FUN=mean)
##    google  facebook     yahoo microsoft 
##     1.628     1.770     1.334     0.634
# Average by year using ddply()
ddply(tidy_ad, .(year), summarise, avg = mean(ad_rev))
##   year    avg
## 1 2009 0.6375
## 2 2010 1.0025
## 3 2011 1.3400
## 4 2012 1.6725
## 5 2013 2.0550
# or tapply()
tapply(tidy_ad$ad_rev, tidy_ad$year, FUN=mean)
##   2009   2010   2011   2012   2013 
## 0.6375 1.0025 1.3400 1.6725 2.0550

To look a further we can check to see what year each company had their highest and lowest revenue by year

# Company max revenue
max_rev <- tidy_ad %>% group_by(company) %>% slice(which.max(ad_rev))
max_rev
## Source: local data frame [4 x 3]
## Groups: company [4]
## 
##    year   company ad_rev
##   (int)    (fctr)  (dbl)
## 1  2013    google   2.99
## 2  2013  facebook   3.17
## 3  2010     yahoo   1.43
## 4  2012 microsoft   0.90
# Company min revenue
min_rev <- tidy_ad %>% group_by(company) %>% slice(which.min(ad_rev))
min_rev
## Source: local data frame [4 x 3]
## Groups: company [4]
## 
##    year   company ad_rev
##   (int)    (fctr)  (dbl)
## 1  2009    google   0.36
## 2  2009  facebook   0.56
## 3  2009     yahoo   1.26
## 4  2009 microsoft   0.37