US Digital Ad Sales Revenues

library(gridExtra)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.2

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(XML)
library(tidyr)


url<-"http://www.journalism.org/media-indicators/digital-u-s-display-advertising-by-company/"
tbl<-readHTMLTable(url)

tbl <- data.frame(tbl[[1]],stringsAsFactors = FALSE)
temp<-names(tbl)
temp[1]<-"Year"
names(tbl)<-temp

#Convert Factors to Numbers
tbl<-sapply(tbl, function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
tbl<-data.frame(tbl)

#The data
tbl

##   Year Google Facebook Yahoo Microsoft  AOL Total
## 1 2009   0.36     0.56  1.26      0.37 0.51  7.97
## 2 2010   0.86     1.21  1.43      0.51 0.47  9.91
## 3 2011   1.67     1.73  1.36      0.60 0.53 12.33
## 4 2012   2.26     2.18  1.35      0.90 0.70 14.78
## 5 2013   2.99     3.17  1.27      0.79 0.73 17.72

#Gather Data into one row per Company per Year
gather(tbl,Company,Revenue,2:ncol(tbl))

##    Year   Company Revenue
## 1  2009    Google    0.36
## 2  2010    Google    0.86
## 3  2011    Google    1.67
## 4  2012    Google    2.26
## 5  2013    Google    2.99
## 6  2009  Facebook    0.56
## 7  2010  Facebook    1.21
## 8  2011  Facebook    1.73
## 9  2012  Facebook    2.18
## 10 2013  Facebook    3.17
## 11 2009     Yahoo    1.26
## 12 2010     Yahoo    1.43
## 13 2011     Yahoo    1.36
## 14 2012     Yahoo    1.35
## 15 2013     Yahoo    1.27
## 16 2009 Microsoft    0.37
## 17 2010 Microsoft    0.51
## 18 2011 Microsoft    0.60
## 19 2012 Microsoft    0.90
## 20 2013 Microsoft    0.79
## 21 2009       AOL    0.51
## 22 2010       AOL    0.47
## 23 2011       AOL    0.53
## 24 2012       AOL    0.70
## 25 2013       AOL    0.73
## 26 2009     Total    7.97
## 27 2010     Total    9.91
## 28 2011     Total   12.33
## 29 2012     Total   14.78
## 30 2013     Total   17.72

tbl_tidy<-gather(tbl,Company,Revenue,2:ncol(tbl))

#Plot data
g1<-ggplot(data=tbl_tidy[tbl_tidy$Company=="Total",], aes(x=Year, y=Revenue, colour=Company)) + geom_line(aes(group=Company))

g2<-ggplot(data=tbl_tidy[tbl_tidy$Company!="Total",], aes(x=Year, y=Revenue, colour=Company)) + geom_line(aes(group=Company))

grid.arrange(g1, g2, nrow=2)

#The revenues are rising fast; but all gains are limited to Google and Facebook. Facebook seems to have overtaken Google sometime in 2012.

#Calculate percentage change along columns (Companies)
goog<-round(tbl$Google/lag(tbl$Google),2)-1
fb<-round(tbl$Facebook/lag(tbl$Facebook),2)-1

#Aggregate, gather and plot data
top2<-data.frame(cbind(2010:2013,goog[2:5],fb[2:5]))
names(top2)<-c('Year','Google','Facebook')

#Google & Facebook growth rate comparision
top2

##   Year Google Facebook
## 1 2010   1.39     1.16
## 2 2011   0.94     0.43
## 3 2012   0.35     0.26
## 4 2013   0.32     0.45

top2<-gather(top2,Company,Percent_Chg,2:3)
ggplot(data=top2, aes(x=Year, y=Percent_Chg, colour=Company)) + geom_line(aes(group=Company))

#For both Google and Facebook the growth rate has tempered but Facebook seems to be beating Google.

Popularity of Past Popes

library(stringr)
library(tm)

## Warning: package 'tm' was built under R version 3.2.2

## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate

x<-read.csv('https://raw.githubusercontent.com/vskrelja/607_DataAcqMgt_Skrelja/master/Popes.csv')

x$Pope<-c(rep("POPE FRANCIS",5),rep("POPE BENEDICT XVI",7),rep("POPE JOHN PAUL II",6))


x<-x[!(is.na(x$Favorable) | x$Favorable==""), ]
x<-x[!(is.na(x$X) | x$X==""), ]
colnames(x)[1]<-c("Period")
colnames(x)[4]<-c("No_Opinion")

#Determine date of the survey
x<-separate(x,Period,c("Start","End"),sep="-")
x$End<-str_sub(x$End, start= -4)
x<-unite(x,Date,Start,End,sep = " ")
x$Date<-stripWhitespace(x$Date)
x$Date<-as.Date(x$Date,format = "%b %d %Y")

#Convert factors to numeric before Mutate
x[,2:4]<-sapply(x[,2:4], function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
x<-mutate(x,Percent_Favorable = round(100*Favorable/(Favorable+Unfavorable+No_Opinion),2))

ggplot(data=x, aes(x=Date, y=Percent_Favorable, fill=Pope)) + geom_bar(stat='identity',aes(group=Pope))

#Pope Frank has already superceded Pope Benedict and seems all set to give John Paul a run for his money!

NY Times Best Seller List

url<-"http://www.nytimes.com/best-sellers-books/2015-10-04/combined-print-and-e-book-fiction/list.html"
tbl<-htmlTreeParse(url,useInternal=TRUE)

#Get the values from the webpage
ranks<-xpathSApply(tbl,"//span[@class='ranking']",xmlValue)
lastWeek_numWeeks<-xpathSApply(tbl,"//td[@class='weeklyPosition']",xmlValue)
books<-xpathSApply(tbl,"//span[@class='bookName']",xmlValue)


#Create table using the vectors extracted above
#lastWeek_numWeeks contains both last week's rank and number of weeks on the charts
lastWeek_numWeeks<-matrix(lastWeek_numWeeks,nrow=length(ranks),2,byrow=TRUE)
lastWeek_numWeeks<-data.frame(lastWeek_numWeeks)
colnames(lastWeek_numWeeks)<-c("lastWeek","numWeeks")

#Create a main data frame
df<-cbind(books,ranks,lastWeek_numWeeks)
colnames(df)[2]<-c("currWeek")

#Remove comma's from the book name
df$books<-gsub(",","",df$books)
#convert to numeric
df[,2:4]<-sapply(df[,2:4], function(x) if(is.factor(x)) { as.numeric(as.character(x))} else {x})
df<-arrange(df,-numWeeks)

#The List
df

##                            books currWeek lastWeek numWeeks
## 1   ALL THE LIGHT WE CANNOT SEE        11        9       53
## 2         THE GIRL ON THE TRAIN         8        4       36
## 3                 GRAY MOUNTAIN        10        7       21
## 4                   THE MARTIAN         1        3       17
## 5             GO SET A WATCHMAN        12        5       10
## 6                             X        13        6        4
## 7  THE GIRL IN THE SPIDER'S WEB         4        2        3
## 8                       MAKE ME         3        1        2
## 9                HOLLYWOOD DIRT        15        8        2
## 10             DEVOTED IN DEATH         2       NA        1
## 11                 THE END GAME         5       NA        1
## 12                    HARD LOVE         6       NA        1
## 13                     THE SCAM         7       NA        1
## 14             FATES AND FURIES         9       NA        1
## 15                 FATAL FRENZY        14       NA        1

#Find the New Arrivals on the charts
df<-mutate(df,New_Arrival=numWeeks==1)
summary <- df %>% group_by(New_Arrival) %>% summarise(NumOfBooks=n(),AvgRank=mean(currWeek),RankStdDev=sd(currWeek))

#gather to plot
summary <- gather(summary,SummaryStatitic,Value,3:4)
summary

## Source: local data frame [4 x 4]
## 
##   New_Arrival NumOfBooks SummaryStatitic    Value
##         (lgl)      (int)          (fctr)    (dbl)
## 1       FALSE          9         AvgRank 8.555556
## 2        TRUE          6         AvgRank 7.166667
## 3       FALSE          9      RankStdDev 4.876246
## 4        TRUE          6      RankStdDev 4.070217

ggplot(data=summary, aes(x=New_Arrival, y=Value, fill=SummaryStatitic)) + geom_bar(stat='identity',aes(group=SummaryStatitic),position="dodge")

#The new arrivals seem to arrive on the charts at a higher rank and with a lower dispersion of ranks arround that higher number.

607 HW7

Randi Skrelja

October 11, 2015

US Digital Ad Sales Revenues

Popularity of Past Popes

NY Times Best Seller List