US Digital Ad Sales Revenues
library(gridExtra)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(XML)
library(tidyr)
url<-"http://www.journalism.org/media-indicators/digital-u-s-display-advertising-by-company/"
tbl<-readHTMLTable(url)
tbl <- data.frame(tbl[[1]],stringsAsFactors = FALSE)
temp<-names(tbl)
temp[1]<-"Year"
names(tbl)<-temp
#Convert Factors to Numbers
tbl<-sapply(tbl, function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
tbl<-data.frame(tbl)
#The data
tbl
## Year Google Facebook Yahoo Microsoft AOL Total
## 1 2009 0.36 0.56 1.26 0.37 0.51 7.97
## 2 2010 0.86 1.21 1.43 0.51 0.47 9.91
## 3 2011 1.67 1.73 1.36 0.60 0.53 12.33
## 4 2012 2.26 2.18 1.35 0.90 0.70 14.78
## 5 2013 2.99 3.17 1.27 0.79 0.73 17.72
#Gather Data into one row per Company per Year
gather(tbl,Company,Revenue,2:ncol(tbl))
## Year Company Revenue
## 1 2009 Google 0.36
## 2 2010 Google 0.86
## 3 2011 Google 1.67
## 4 2012 Google 2.26
## 5 2013 Google 2.99
## 6 2009 Facebook 0.56
## 7 2010 Facebook 1.21
## 8 2011 Facebook 1.73
## 9 2012 Facebook 2.18
## 10 2013 Facebook 3.17
## 11 2009 Yahoo 1.26
## 12 2010 Yahoo 1.43
## 13 2011 Yahoo 1.36
## 14 2012 Yahoo 1.35
## 15 2013 Yahoo 1.27
## 16 2009 Microsoft 0.37
## 17 2010 Microsoft 0.51
## 18 2011 Microsoft 0.60
## 19 2012 Microsoft 0.90
## 20 2013 Microsoft 0.79
## 21 2009 AOL 0.51
## 22 2010 AOL 0.47
## 23 2011 AOL 0.53
## 24 2012 AOL 0.70
## 25 2013 AOL 0.73
## 26 2009 Total 7.97
## 27 2010 Total 9.91
## 28 2011 Total 12.33
## 29 2012 Total 14.78
## 30 2013 Total 17.72
tbl_tidy<-gather(tbl,Company,Revenue,2:ncol(tbl))
#Plot data
g1<-ggplot(data=tbl_tidy[tbl_tidy$Company=="Total",], aes(x=Year, y=Revenue, colour=Company)) + geom_line(aes(group=Company))
g2<-ggplot(data=tbl_tidy[tbl_tidy$Company!="Total",], aes(x=Year, y=Revenue, colour=Company)) + geom_line(aes(group=Company))
grid.arrange(g1, g2, nrow=2)

#The revenues are rising fast; but all gains are limited to Google and Facebook. Facebook seems to have overtaken Google sometime in 2012.
#Calculate percentage change along columns (Companies)
goog<-round(tbl$Google/lag(tbl$Google),2)-1
fb<-round(tbl$Facebook/lag(tbl$Facebook),2)-1
#Aggregate, gather and plot data
top2<-data.frame(cbind(2010:2013,goog[2:5],fb[2:5]))
names(top2)<-c('Year','Google','Facebook')
#Google & Facebook growth rate comparision
top2
## Year Google Facebook
## 1 2010 1.39 1.16
## 2 2011 0.94 0.43
## 3 2012 0.35 0.26
## 4 2013 0.32 0.45
top2<-gather(top2,Company,Percent_Chg,2:3)
ggplot(data=top2, aes(x=Year, y=Percent_Chg, colour=Company)) + geom_line(aes(group=Company))

#For both Google and Facebook the growth rate has tempered but Facebook seems to be beating Google.
Popularity of Past Popes
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 3.2.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
x<-read.csv('https://raw.githubusercontent.com/vskrelja/607_DataAcqMgt_Skrelja/master/Popes.csv')
x$Pope<-c(rep("POPE FRANCIS",5),rep("POPE BENEDICT XVI",7),rep("POPE JOHN PAUL II",6))
x<-x[!(is.na(x$Favorable) | x$Favorable==""), ]
x<-x[!(is.na(x$X) | x$X==""), ]
colnames(x)[1]<-c("Period")
colnames(x)[4]<-c("No_Opinion")
#Determine date of the survey
x<-separate(x,Period,c("Start","End"),sep="-")
x$End<-str_sub(x$End, start= -4)
x<-unite(x,Date,Start,End,sep = " ")
x$Date<-stripWhitespace(x$Date)
x$Date<-as.Date(x$Date,format = "%b %d %Y")
#Convert factors to numeric before Mutate
x[,2:4]<-sapply(x[,2:4], function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
x<-mutate(x,Percent_Favorable = round(100*Favorable/(Favorable+Unfavorable+No_Opinion),2))
ggplot(data=x, aes(x=Date, y=Percent_Favorable, fill=Pope)) + geom_bar(stat='identity',aes(group=Pope))

#Pope Frank has already superceded Pope Benedict and seems all set to give John Paul a run for his money!
NY Times Best Seller List
url<-"http://www.nytimes.com/best-sellers-books/2015-10-04/combined-print-and-e-book-fiction/list.html"
tbl<-htmlTreeParse(url,useInternal=TRUE)
#Get the values from the webpage
ranks<-xpathSApply(tbl,"//span[@class='ranking']",xmlValue)
lastWeek_numWeeks<-xpathSApply(tbl,"//td[@class='weeklyPosition']",xmlValue)
books<-xpathSApply(tbl,"//span[@class='bookName']",xmlValue)
#Create table using the vectors extracted above
#lastWeek_numWeeks contains both last week's rank and number of weeks on the charts
lastWeek_numWeeks<-matrix(lastWeek_numWeeks,nrow=length(ranks),2,byrow=TRUE)
lastWeek_numWeeks<-data.frame(lastWeek_numWeeks)
colnames(lastWeek_numWeeks)<-c("lastWeek","numWeeks")
#Create a main data frame
df<-cbind(books,ranks,lastWeek_numWeeks)
colnames(df)[2]<-c("currWeek")
#Remove comma's from the book name
df$books<-gsub(",","",df$books)
#convert to numeric
df[,2:4]<-sapply(df[,2:4], function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
df<-arrange(df,-numWeeks)
#The List
df
## books currWeek lastWeek numWeeks
## 1 ALL THE LIGHT WE CANNOT SEE 11 9 53
## 2 THE GIRL ON THE TRAIN 8 4 36
## 3 GRAY MOUNTAIN 10 7 21
## 4 THE MARTIAN 1 3 17
## 5 GO SET A WATCHMAN 12 5 10
## 6 X 13 6 4
## 7 THE GIRL IN THE SPIDER'S WEB 4 2 3
## 8 MAKE ME 3 1 2
## 9 HOLLYWOOD DIRT 15 8 2
## 10 DEVOTED IN DEATH 2 NA 1
## 11 THE END GAME 5 NA 1
## 12 HARD LOVE 6 NA 1
## 13 THE SCAM 7 NA 1
## 14 FATES AND FURIES 9 NA 1
## 15 FATAL FRENZY 14 NA 1
#Find the New Arrivals on the charts
df<-mutate(df,New_Arrival=numWeeks==1)
summary <- df %>% group_by(New_Arrival) %>% summarise(NumOfBooks=n(),AvgRank=mean(currWeek),RankStdDev=sd(currWeek))
#gather to plot
summary <- gather(summary,SummaryStatitic,Value,3:4)
summary
## Source: local data frame [4 x 4]
##
## New_Arrival NumOfBooks SummaryStatitic Value
## (lgl) (int) (fctr) (dbl)
## 1 FALSE 9 AvgRank 8.555556
## 2 TRUE 6 AvgRank 7.166667
## 3 FALSE 9 RankStdDev 4.876246
## 4 TRUE 6 RankStdDev 4.070217
ggplot(data=summary, aes(x=New_Arrival, y=Value, fill=SummaryStatitic)) + geom_bar(stat='identity',aes(group=SummaryStatitic),position="dodge")

#The new arrivals seem to arrive on the charts at a higher rank and with a lower dispersion of ranks arround that higher number.