Load Libraries Used
library(ggplot2)
library(gridExtra)
library(lubridate)
library(dplyr)
library(gplots)
Load the Data
# load the data into R (saved as bid_data.txt in my working directory)
# read.table is easily used to import text file data into an R data frame
# header = TRUE since the data file contains variable names in its first line
bid <- read.table("bid_data.txt",header=TRUE,sep=",")
dim(bid)
## [1] 1452783 7
str(bid)
## 'data.frame': 1452783 obs. of 7 variables:
## $ age : int 13 14 15 16 17 18 19 20 21 22 ...
## $ audience_size: int 360000 1000000 1580000 2400000 3400000 5000000 5800000 5800000 5800000 5800000 ...
## $ cpm_max : int 612 432 383 366 330 378 372 382 396 278 ...
## $ cpm_median : int 314 212 184 182 170 235 201 207 226 151 ...
## $ cpm_min : int 36 23 23 27 33 85 28 27 36 18 ...
## $ gender : Factor w/ 3 levels "all","female",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : Factor w/ 9137 levels "2014-06-25 23:02:27",..: 1 1 1 1 1 1 1 1 1 1 ...
# convert variables of class "integer" to appropriate class for analyis
# note: making age a factor variable
bid$age <- as.factor(bid$age)
bid$audience_size <- as.numeric(bid$audience_size)
bid$cpm_max <- as.numeric(bid$cpm_max)
bid$cpm_median <- as.numeric(bid$cpm_median)
bid$cpm_min <- as.numeric(bid$cpm_min)
head(bid,10)
## age audience_size cpm_max cpm_median cpm_min gender time
## 1 13 360000 612 314 36 all 2014-06-25 23:02:27
## 2 14 1000000 432 212 23 all 2014-06-25 23:02:27
## 3 15 1580000 383 184 23 all 2014-06-25 23:02:27
## 4 16 2400000 366 182 27 all 2014-06-25 23:02:27
## 5 17 3400000 330 170 33 all 2014-06-25 23:02:27
## 6 18 5000000 378 235 85 all 2014-06-25 23:02:27
## 7 19 5800000 372 201 28 all 2014-06-25 23:02:27
## 8 20 5800000 382 207 27 all 2014-06-25 23:02:27
## 9 21 5800000 396 226 36 all 2014-06-25 23:02:27
## 10 22 5800000 278 151 18 all 2014-06-25 23:02:27
tail(bid,10)
## age audience_size cpm_max cpm_median cpm_min gender
## 1452774 56 1400000 380 176 35 female
## 1452775 57 1400000 396 158 43 female
## 1452776 58 1300000 455 206 51 female
## 1452777 59 1300000 394 152 40 female
## 1452778 60 1200000 536 210 44 female
## 1452779 61 1100000 405 149 43 female
## 1452780 62 1100000 411 120 35 female
## 1452781 63 1000000 432 166 39 female
## 1452782 64 980000 469 172 49 female
## 1452783 65 9000000 364 163 36 female
## time
## 1452774 2015-07-24 14:01:07
## 1452775 2015-07-24 14:01:07
## 1452776 2015-07-24 14:01:07
## 1452777 2015-07-24 14:01:07
## 1452778 2015-07-24 14:01:07
## 1452779 2015-07-24 14:01:07
## 1452780 2015-07-24 14:01:07
## 1452781 2015-07-24 14:01:07
## 1452782 2015-07-24 14:01:07
## 1452783 2015-07-24 14:01:07
levels(bid$gender) # all, female, male
## [1] "all" "female" "male"
levels(bid$age)
## [1] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
## [15] "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40"
## [29] "41" "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54"
## [43] "55" "56" "57" "58" "59" "60" "61" "62" "63" "64" "65"
table(bid$age) # 27411 observations for each age in 13,14,...,65
##
## 13 14 15 16 17 18 19 20 21 22 23 24
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411
## 25 26 27 28 29 30 31 32 33 34 35 36
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411
## 37 38 39 40 41 42 43 44 45 46 47 48
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411
## 49 50 51 52 53 54 55 56 57 58 59 60
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411
## 61 62 63 64 65
## 27411 27411 27411 27411 27411
length(unique(bid$time)) # 9137 unique hours
## [1] 9137
unique(table(bid$time)) # 159 demographics
## [1] 159
summary(bid)
## age audience_size cpm_max cpm_median
## 13 : 27411 Min. : 20 Min. : 0.0 Min. : 0.0
## 14 : 27411 1st Qu.: 1380000 1st Qu.:233.0 1st Qu.:100.0
## 15 : 27411 Median : 1980000 Median :289.0 Median :154.0
## 16 : 27411 Mean : 2352902 Mean :284.7 Mean :151.6
## 17 : 27411 3rd Qu.: 2800000 3rd Qu.:340.0 3rd Qu.:200.0
## 18 : 27411 Max. :15000000 Max. :964.0 Max. :687.0
## (Other):1288317
## cpm_min gender time
## Min. : 0.00 all :484261 2014-06-25 23:02:27: 159
## 1st Qu.: 25.00 female:484261 2014-06-26 00:02:19: 159
## Median : 37.00 male :484261 2014-06-26 01:02:13: 159
## Mean : 41.72 2014-06-26 02:02:13: 159
## 3rd Qu.: 55.00 2014-06-26 03:02:14: 159
## Max. :306.00 2014-06-26 04:02:11: 159
## (Other) :1451829
# any missing data?
apply(bid, 2, function(x) { sum(is.na(x)) })
## age audience_size cpm_max cpm_median cpm_min
## 0 0 0 0 0
## gender time
## 0 0
# any duplicates?
nrow(bid) - nrow(unique(bid))
## [1] 0
### plot histograms for quantitative variables
# audience_size (using binwidth = 150000). note the outliers
hist1 <- ggplot(data=bid,aes(x=audience_size)) + geom_histogram(binwidth=150000) + theme_light()
# cpm_max. left_skewwed distribution
hist2 <- ggplot(data=bid,aes(x=cpm_max)) + geom_histogram(binwidth=15) + theme_light()
# cpm_median. left_skewed distribution
hist3 <- ggplot(data=bid,aes(x=cpm_median)) + geom_histogram(binwidth=7) + theme_light()
# cpm_min
hist4 <- ggplot(data=bid,aes(x=cpm_min)) + geom_histogram(binwidth=5) + theme_light()
# 2 x 2 panel plot
grid.arrange(hist1,hist2,hist3,hist4)
# density plots for continuous variables
dens1 <- ggplot(data=bid,aes(x=audience_size)) + geom_density(size=1) + theme_light()
# cpm_max. left_skewwed distribution
dens2 <- ggplot(data=bid,aes(x=cpm_max)) + geom_density(size=1) + theme_light()
# cpm_median. left_skewed distribution
dens3 <- ggplot(data=bid,aes(x=cpm_median)) + geom_density(size=1) + theme_light()
# cpm_min
dens4 <- ggplot(data=bid,aes(x=cpm_min)) + geom_density(size=1) + theme_light()
# panel plot (2x2)
grid.arrange(dens1,dens2,dens3,dens4)
# add new variable indicating age group
# age groups seem to be the advertising industry standard
bid$agegrp <- 0
bid$agegrp[which(bid$age %in% 13:17)] <- c("13-17")
bid$agegrp[which(bid$age %in% 18:24)] <- c("18-24")
bid$agegrp[which(bid$age %in% 25:34)] <- c("25-34")
bid$agegrp[which(bid$age %in% 35:44)] <- c("35-44")
bid$agegrp[which(bid$age %in% 45:54)] <- c("45-54")
bid$agegrp[which(bid$age %in% 55:64)] <- c("55-64")
bid$agegrp[which(bid$age == 65)] <- c("65")
bid$agegrp <- as.factor(bid$agegrp)
Based off the density and histogram, outliers appear to be present for the audience_size variable.
# by eye (approx. 0.63 percent)
length(which(bid$aud > 10000000))/length(bid$aud) # percentage of outliers
## [1] 0.006287932
# general rule of thumb: 3 or more standard deviations from the mean
outliers <- which(bid$aud >= (mean(bid$aud) + 3*sd(bid$aud)))
length(outliers) # 17939
## [1] 17939
# investigate demographics of outliers
table(bid[outliers,1]) # all outliers are age 65 (perhaps really 65+?)
##
## 13 14 15 16 17 18 19 20 21 22 23 24
## 0 0 0 0 0 0 0 0 0 0 0 0
## 25 26 27 28 29 30 31 32 33 34 35 36
## 0 0 0 0 0 0 0 0 0 0 0 0
## 37 38 39 40 41 42 43 44 45 46 47 48
## 0 0 0 0 0 0 0 0 0 0 0 0
## 49 50 51 52 53 54 55 56 57 58 59 60
## 0 0 0 0 0 0 0 0 0 0 0 0
## 61 62 63 64 65
## 0 0 0 0 17939
table(bid[outliers,6]) # all female
##
## all female male
## 9135 8804 0
# summary of audience_size values for 65 year olds appear illegitimately high
summary(bid$audience_size[which(bid$age == 65)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20 5700000 8200000 9078000 13000000 15000000
table(bid$audience_size[which(bid$age == 65)])
##
## 20 5e+06 5200000 5400000 5600000 5700000 5800000 5900000
## 6 1656 2271 1323 1220 1288 987 390
## 7200000 7400000 7600000 7800000 8e+06 8200000 8400000 8500000
## 331 478 1200 1916 538 1119 577 149
## 8600000 8700000 8800000 8900000 9e+06 12400000 12600000 12800000
## 870 716 685 384 172 480 546 982
## 1.3e+07 13200000 13400000 13600000 13800000 1.4e+07 14200000 14400000
## 1118 799 327 212 385 1708 408 312
## 1.5e+07
## 1858
# audience reach vs age groups: barplot
reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# audience reach vs. age groups: boxplot
reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
grid.arrange(reach.agebar,reach.agebox)
# remove outliers
bid <- bid[-outliers,]
# redo reach vs age groups barplot
reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# redo reach vs age groups boxplot
reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
grid.arrange(reach.agebar,reach.agebox)
# remove all 65 year olds (almost all appear to be outliers)
bid <- bid[-which(bid$age == 65),]
# redo histogram of audience size variable with outliers removed
hist1 <- ggplot(data=bid,aes(x=audience_size)) + geom_histogram(binwidth=150000) + theme_light()
# redo density of audience size variable with outliers removed
dens1 <- ggplot(data=bid,aes(x=audience_size)) + geom_density(size=1) + theme_light()
grid.arrange(hist1,dens1)
The 17939 outliers above three standard deviations from the mean are all for observations with age 65. 8804 are female and 9135 are for gender “all” (I’m guessing female + other). Even if 65 was used to denote the 65+ population, it should still be the smallest age group of facebook users (see this link). This information and the above plots suggest that the outliers are illegitimate. They most likely are from data error or intentional misreporting. Thus, I chose to drop them from the analyis. Additionally, I made the decision to drop ALL observations of age 65 as even the observations who were within three standard deviations of the mean seemed illegitimately high.
# CREATE SINGLE VARIABLE FOR GENDER AND AGE GROUP (i.e. demographic)
bid$age_gender <- c("0")
bid$agegrp <- as.character(bid$agegrp)
bid$gender <- as.character(bid$gender)
bid$age_gender <- paste(bid$agegrp,bid$gender)
bid$age_gender <- as.factor(bid$age_gender)
levels(bid$age_gender)
## [1] "13-17 all" "13-17 female" "13-17 male" "18-24 all"
## [5] "18-24 female" "18-24 male" "25-34 all" "25-34 female"
## [9] "25-34 male" "35-44 all" "35-44 female" "35-44 male"
## [13] "45-54 all" "45-54 female" "45-54 male" "55-64 all"
## [17] "55-64 female" "55-64 male"
Next, EDA is performed to explore relationships between various demographics and the median_cpm
# # redo reach vs age groups barplot
# reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# # redo reach vs age groups boxplot
# reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
# grid.arrange(reach.agebar,reach.agebox)
#
#
#
# # audience reach vs. gender: barplot
# reach.genbar <- ggplot(data=bid,aes(x=gender,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# # audience reach vs. gender: boxplot
# reach.genbox <- ggplot(data=bid,aes(x=gender,y=audience_size)) + geom_boxplot() + theme_light()
# grid.arrange(reach.genbar,reach.genbox)
#
# # cpm min vs. age groups barplot
# cpmin.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# # cpm median vs. age groups barplot
# cpmed.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# # cpm max vs. age groups barplot
# cpmax.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# # cpm min vs. age groups boxplot
# cpmin.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_min)) + geom_boxplot() + theme_light()
# # cpm median vs. age groups boxplot
# cpmed.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_median)) + geom_boxplot() + theme_light()
# # cpm max vs. age groups boxplot
# cpmax.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_max)) + geom_boxplot() + theme_light()
# # 2x3 panel plot
# grid.arrange(cpmin.agebar,cpmed.agebar,cpmax.agebar,cpmin.agebox,cpmed.agebox,cpmax.agebox,ncol=3)
#
#
#
# # cpm min vs gender groups barplot
# cpmin.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# # cpm median vs. gender groups barplot
# cpmed.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# # cpm max vs. gender groups barplot
# cpmax.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# # cpm min vs. gender groups boxplot
# cpmin.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_min)) + geom_boxplot() + theme_light()
# # cpm median vs. gender groups boxplot
# cpmed.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_median)) + geom_boxplot() + theme_light()
# # cpm max vs. gender groups boxplot
# cpmax.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_max)) + geom_boxplot() + theme_light()
# # 2x3 panel plot
# grid.arrange(cpmin.genbar,cpmed.genbar,cpmax.genbar,cpmin.genbox,cpmed.genbox,cpmax.genbox,ncol=3)
# cpm min vs gender and age groups barplot
cpmin.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# cpm median vs. gender and age groups barplot
cpmed.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# cpm max vs. gender and age groups barplot
cpmax.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# cpm min vs. gender and age groups boxplot
cpmin.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_min)) + geom_boxplot() + theme_light()
# cpm median vs. gender and age groups boxplot
cpmed.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_median)) + geom_boxplot() + theme_light()
# cpm max vs. gender and age groups boxplot
cpmax.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_max)) + geom_boxplot() + theme_light()
grid.arrange(cpmin.genagebar,cpmin.genagebox)
grid.arrange(cpmed.genagebar,cpmed.genagebox)
grid.arrange(cpmax.genagebar,cpmax.genagebox)
These results sopport a hypothesis that the least expensive demographics are of ages 55-64, followed closely by ages 45-54. Within either of these age groups, the least expensive gender group is male, closely followed by all, then female. Females have notably higher cost than male or all within these demographics. These results are consistent when considering either cpm min, median or max with one exception in cpm max (55-64 is the only clear least expensive demographic… and 18-24 beats 45-54). An important takeaway is that the 18-24 age group (all genders) is a relatively inexpensive demographic compared to 25-34 and 35-44.
The results also support a hypothesis that the most expensive demographics are of ages 25-34 and 35-44. Within either of these age groups, gender has very little effect. 13-17 year olds also are relatively expensive and have the most outliers for cost of any demographic (and the range of outlier values is very large).
Also, 18-24 year olds and 25-34 year olds seem to be the most active, and females more active than males in all age groups, but this is explored in greater detail later on.
See which demographics change the most quickly, and the most unpredictably by computing percentage changes for various time intervals.
bid2 <- bid
# time variable now in POSIXct format
bid2$time <- ymd_hms(as.character(bid2$time))
# break dataset into time of day component (hour)
bid2$hour <- sapply(bid2$time,hour)
bid2$hour <- as.numeric(bid2$hour)
# break dataset into weekday component
bid2$wday <- sapply(bid2$time,wday)
# break dataset into month component
bid2$month <- sapply(bid2$time,month)
# break dataset into season component
bid2$season <- c(0)
bid2$season[which(bid2$month %in% c(12,1,2))] <- c("Winter")
bid2$season[which(bid2$month %in% 3:5)] <- c("Spring")
bid2$season[which(bid2$month %in% 6:8)] <- c("Summer")
bid2$season[which(bid2$month %in% 9:11)] <- c("Fall")
bid2$season <- as.factor(bid2$season)
bid2$year <- sapply(bid2$time,year)
bid2$day <- sapply(bid2$time,day)
# compute average audience_size and average cpm_median for each demographic group and for each hour
bid2hour <- group_by(bid2,age_gender,hour)
bid2hour <- summarise(bid2hour, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )
# initializations
bid2hour$time_int <- c("NA") # for the time interval of the change
bid2hour$pct_chg_aud <- c(0) # for percentage change in mean audience size
bid2hour$pct_chg_cpm <- c(0) # for percentage change in mean cpm
# compute percentage changes of average audience_size and average cpm_median between each hour, and for each demographic
for(i in 0:17){ # for each demographic (18 of them)
for(j in 1:24){ # for each hour
index <- (24*i) + j # compute index into bid2hour dataset
if(j==1){ # special case: 11pm to midnight change
bid2hour$pct_chg_aud[index] <- ((bid2hour$mean_audsize[index] - bid2hour$mean_audsize[24*i+24])/(bid2hour$mean_audsize[24*i+24])) * 100
bid2hour$pct_chg_cpm[index] <- ((bid2hour$mean_medcpm[index] - bid2hour$mean_medcpm[24*i+24])/(bid2hour$mean_medcpm[24*i+24])) * 100
bid2hour$time_int[index] <- c("23-0")
}
else{ # all other cases are easy. use observation before current index
bid2hour$pct_chg_aud[index] <- ((bid2hour$mean_audsize[index] - bid2hour$mean_audsize[index-1])/(bid2hour$mean_audsize[index-1])) * 100
bid2hour$pct_chg_cpm[index] <- ((bid2hour$mean_medcpm[index] - bid2hour$mean_medcpm[index-1])/(bid2hour$mean_medcpm[index-1])) * 100
bid2hour$time_int[index] <- paste(j-2,"-",j-1,sep="")
} # end else
} # end inner loop
} # end outer loop
head(bid2hour)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
##
## age_gender hour mean_audsize mean_medcpm time_int pct_chg_aud
## (fctr) (dbl) (dbl) (dbl) (chr) (dbl)
## 1 13-17 all 0 1553147 188.1031 23-0 0.01975103
## 2 13-17 all 1 1553488 188.7775 0-1 0.02199752
## 3 13-17 all 2 1552956 188.8360 1-2 -0.03428652
## 4 13-17 all 3 1553286 189.1708 2-3 0.02130420
## 5 13-17 all 4 1553547 189.6823 3-4 0.01676553
## 6 13-17 all 5 1553268 189.4395 4-5 -0.01792376
## Variables not shown: pct_chg_cpm (dbl)
bid2hour$time_int <- as.factor(bid2hour$time_int)
# use boxplots to evaluate distribution of percentage changes of mean audience size hour to hour
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23"))
# NOTE: X = i on the x-axis represents the percentage change from hour i to hour i+1
# same, but compare by demographic now
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23")) + facet_wrap( ~ age_gender)
# use boxplots to evaluate distribution of percentage changes of mean median_cpm hour to hour
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23"))
# NOTE: X = i on the x-axis represents the percentage change from hour i to hour i+1
# Same, but compare by demographic now
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23")) + facet_wrap( ~ age_gender)
######## REPEAT FOR WEEKDAYS #########
# compute average audience_size and average cpm_median for each demographic group and for each weekday
bid2day <- group_by(bid2,age_gender,wday)
bid2day <- summarise(bid2day, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )
# initializations
bid2day$time_int <- c("NA") # for the time interval of the change
bid2day$pct_chg_aud <- c(0) # for percentage change in mean audience size
bid2day$pct_chg_cpm <- c(0) # for percentage change in mean cpm
# compute percentage changes of average audience_size and average cpm_median between each day, and for each demographic
for(i in 0:17){ # for each demographic (18 of them)
for(j in 1:7){ # for each day
index <- (7*i) + j # compute index into bid2day dataset
if(j==1){ # special case: Saturday to Sunday
bid2day$pct_chg_aud[index] <- ((bid2day$mean_audsize[index] - bid2day$mean_audsize[7*i+7])/(bid2day$mean_audsize[7*i+7])) * 100
bid2day$pct_chg_cpm[index] <- ((bid2day$mean_medcpm[index] - bid2day$mean_medcpm[7*i+7])/(bid2day$mean_medcpm[7*i+7])) * 100
bid2day$time_int[index] <- c("7-1")
}
else{ # all other cases are easy. use previous day
bid2day$pct_chg_aud[index] <- ((bid2day$mean_audsize[index] - bid2day$mean_audsize[index-1])/(bid2day$mean_audsize[index-1])) * 100
bid2day$pct_chg_cpm[index] <- ((bid2day$mean_medcpm[index] - bid2day$mean_medcpm[index-1])/(bid2day$mean_medcpm[index-1])) * 100
bid2day$time_int[index] <- paste(j-1,"-",j,sep="")
} # end else
} # end inner loop
} # end outer loop
head(bid2day)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
##
## age_gender wday mean_audsize mean_medcpm time_int pct_chg_aud
## (fctr) (dbl) (dbl) (dbl) (chr) (dbl)
## 1 13-17 all 1 1549491 189.6078 7-1 -0.14300722
## 2 13-17 all 2 1553701 172.5391 1-2 0.27175859
## 3 13-17 all 3 1552619 177.8483 2-3 -0.06967239
## 4 13-17 all 4 1553027 185.3371 3-4 0.02628457
## 5 13-17 all 5 1553727 189.8716 4-5 0.04505131
## 6 13-17 all 6 1552942 199.6174 5-6 -0.05050139
## Variables not shown: pct_chg_cpm (dbl)
bid2day$time_int <- as.factor(bid2day$time_int)
# use boxplots to evaluate distribution of percentage changes of mean audience size day to day
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light()
# same, but compare by demographic now
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# use boxplots to evaluate distribution of percentage changes of mean median_cpm day to day
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()
# Same, but compare by demographic now
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
########### REPEAT FOR MONTHS ############
# compute average audience_size and average cpm_median for each demographic group and for each month
bid2month <- group_by(bid2,age_gender,month)
bid2month <- summarise(bid2month, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )
# initializations
bid2month$time_int <- c("NA") # for the time interval of the change
bid2month$pct_chg_aud <- c(0) # for percentage change in mean audience size
bid2month$pct_chg_cpm <- c(0) # for percentage change in mean cpm
# compute percentage changes of average audience_size and average cpm_median between each hour, and for each demographic
for(i in 0:17){ # for each demographic (18 of them)
for(j in 1:12){ # for each month
index <- (12*i) + j # compute index into bid2month dataset
if(j==1){ # special case: 11pm to midnight change
bid2month$pct_chg_aud[index] <- ((bid2month$mean_audsize[index] - bid2month$mean_audsize[12*i+12])/(bid2month$mean_audsize[12*i+12])) * 100
bid2month$pct_chg_cpm[index] <- ((bid2month$mean_medcpm[index] - bid2month$mean_medcpm[12*i+12])/(bid2month$mean_medcpm[12*i+12])) * 100
bid2month$time_int[index] <- c("12-1")
}
else{ # all other cases are easy. use previous month
bid2month$pct_chg_aud[index] <- ((bid2month$mean_audsize[index] - bid2month$mean_audsize[index-1])/(bid2month$mean_audsize[index-1])) * 100
bid2month$pct_chg_cpm[index] <- ((bid2month$mean_medcpm[index] - bid2month$mean_medcpm[index-1])/(bid2month$mean_medcpm[index-1])) * 100
bid2month$time_int[index] <- paste(j-1,"-",j,sep="")
} # end else
} # end inner loop
} # end outer loop
head(bid2month)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
##
## age_gender month mean_audsize mean_medcpm time_int pct_chg_aud
## (fctr) (dbl) (dbl) (dbl) (chr) (dbl)
## 1 13-17 all 1 1507801 149.8935 12-1 -0.9238891
## 2 13-17 all 2 1502012 181.5802 1-2 -0.3839054
## 3 13-17 all 3 1503828 212.3731 2-3 0.1208854
## 4 13-17 all 4 1501672 215.4407 3-4 -0.1433619
## 5 13-17 all 5 1459015 223.5705 4-5 -2.8406208
## 6 13-17 all 6 1459786 224.8904 5-6 0.0528698
## Variables not shown: pct_chg_cpm (dbl)
bid2month$time_int <- as.factor(bid2month$time_int)
# use boxplots to evaluate distribution of percentage changes of mean audience size month to month
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12"))
# NOTE: x = i on the x-axis represents the percentage change from month i to month i+1
# same, but compare by demographic now
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12")) + facet_wrap( ~ age_gender)
# use boxplots to evaluate distribution of percentage changes of mean median_cpm month to month
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12"))
# NOTE: x = i on the x-axis represents the percentage change from month i to month i+1
# Same, but compare by demographic now
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12")) + facet_wrap( ~ age_gender)
A good strategy might be to advertise on Monday, as it showed substantial average cpm percent change decrease from Sunday and has low unpredictability (variance). On the flip side, advertisers may want to stay away from advertising on Saturday, as there was a substantial average cpm percent change increase from Friday (but not as strong of a result since more people tend to be on FB on the weekend).
Looking at percentage change in mean audience size values from month to month might be helpful to an ad strategy.
A good strategy might be to advertise in July or January as these months saw drastic average cpm percent change decreases from their previous months and have low unpredictability (variance). On the flip side, advertisers may want to stay away from advertising in Oct. and Nov., as these months saw drastic average cpm percent change increases from their previous months and have high unpredictability (variance).
by time of day, day of the week, month
# To analyze "when", considering by time of day, day of the week, and month separately. To analyze "active", I am using audience_size
# group by demographic and time of day
bid3hour <- group_by(bid2, age_gender, hour )
bid3hour$hour <- as.factor(bid3hour$hour)
bid3hour$wday <- as.factor(bid3hour$wday)
bid3hour$month <- as.factor(bid3hour$month)
# PLOT "ACTIVE-NESS"
# Audience reach vs age and gender groups barplot
ggplot(data = bid3hour, aes(x = age_gender, y =audience_size)) + geom_bar(stat="identity") + theme_light() + scale_x_discrete(labels=c("13-17 all" = "13-17 \n A" , "13-17 female" = "13-17 \n F", "13-17 male" = "13-17 \n M" , "18-24 all" = "18-24 \n A", "18-24 female" = "18-24 \n F", "18-24 male" = "18-24 \n M" , "25-34 all" = "25-34 \n A", "25-34 female" = "25-34 \n F", "25-34 male" = "25-34 \n M", "35-44 all" = "35-44 \n A", "35-44 female" = "35-44 \n F", "35-44 male" = "35-44 \n M", "45-54 all" = "45-54 \n A","45-54 female" = "45-54 \n F", "45-54 male" = "45-54 \n M", "55-64 all" = "55-64 \n A", "55-64 female" = "55-64 \n F", "55-64 male" = "55-64 \n M" ))
# Audience reach vs age and gender groups boxplot
ggplot(data=bid3hour,aes(x = age_gender , y= audience_size)) + geom_boxplot() + theme_light() + scale_x_discrete(labels=c("13-17 all" = "13-17 \n A" , "13-17 female" = "13-17 \n F", "13-17 male" = "13-17 \n M" , "18-24 all" = "18-24 \n A", "18-24 female" = "18-24 \n F", "18-24 male" = "18-24 \n M" , "25-34 all" = "25-34 \n A", "25-34 female" = "25-34 \n F", "25-34 male" = "25-34 \n M", "35-44 all" = "35-44 \n A", "35-44 female" = "35-44 \n F", "35-44 male" = "35-44 \n M", "45-54 all" = "45-54 \n A","45-54 female" = "45-54 \n F", "45-54 male" = "45-54 \n M", "55-64 all" = "55-64 \n A", "55-64 female" = "55-64 \n F", "55-64 male" = "55-64 \n M" ))
# Audience reach vs. time of day (and by demographic)
ggplot(data=bid3hour,aes(x = hour , y= audience_size)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: hour of day has no impact on estimated audience reach within each demographic from this plot
# summarize audience reach and cost by demographic and time of day
bid3hour <- summarise(bid3hour, mean_audsize = mean(audience_size), med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min), mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3hour,24)
## Source: local data frame [24 x 10]
## Groups: age_gender [1]
##
## age_gender hour mean_audsize med_audsize mean_mincpm med_mincpm
## (fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 13-17 all 0 1553147 1500000 51.18010 45
## 2 13-17 all 1 1553488 1500000 51.48564 45
## 3 13-17 all 2 1552956 1500000 51.35144 45
## 4 13-17 all 3 1553286 1500000 51.52813 45
## 5 13-17 all 4 1553547 1500000 51.66667 45
## 6 13-17 all 5 1553268 1500000 51.46684 45
## 7 13-17 all 6 1548565 1500000 51.65288 46
## 8 13-17 all 7 1548665 1500000 51.43717 45
## 9 13-17 all 8 1551830 1500000 51.69628 46
## 10 13-17 all 9 1552534 1500000 51.53508 45
## .. ... ... ... ... ... ...
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
## (dbl), med_maxcpm (dbl)
bid3hour$hour <- as.factor(bid3hour$hour)
# plot median audience reach vs. time of day, grouped by demographic
ggplot(data=bid3hour,aes(x = hour , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: hour of day has no impact on estimated audience reach within each demographic from this plot as well
# plot median cpm_median vs. time of day, grouped by demographic
ggplot(data=bid3hour,aes(x = hour , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: hour of day has little impact on the median of the estimated cpm_median within each demographic. There seems to be a slight decrease in the afternoon/evening for most demographics
# alternative plot: group means of median cpm_median for each hour and 95% confidence intervals (each group is a demographic)
plotmeans(med_mediancpm ~ hour, main="Group means of median cpm_median by hour", data=bid3hour)
######### REPEAT FOR DAY TO DAY ##########
bid3day <- group_by(bid2, age_gender, wday )
bid3day$hour <- as.factor(bid3day$hour)
bid3day$wday <- as.factor(bid3day$wday)
bid3day$month <- as.factor(bid3day$month)
# Audience reach vs. day of week (and by demographic)
ggplot(data=bid3day,aes(x = wday , y= audience_size)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: day of week has no effect on audience reach within demographics from this plot
# summarize audience reach and cost by demographic and day of week
bid3day <- summarise(bid3day, mean_audsize = mean(audience_size), med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min), mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3day,7)
## Source: local data frame [7 x 10]
## Groups: age_gender [1]
##
## age_gender wday mean_audsize med_audsize mean_mincpm med_mincpm
## (fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 13-17 all 1 1549491 1500000 53.94028 48
## 2 13-17 all 2 1553701 1500000 46.90503 41
## 3 13-17 all 3 1552619 1500000 47.40123 40
## 4 13-17 all 4 1553027 1500000 48.68617 42
## 5 13-17 all 5 1553727 1500000 50.55901 45
## 6 13-17 all 6 1552942 1500000 53.99162 48
## 7 13-17 all 7 1551710 1500000 57.21622 51
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
## (dbl), med_maxcpm (dbl)
bid3day$wday <- as.factor(bid3day$wday)
# plot median audience reach vs. day of week, grouped by demographic
ggplot(data=bid3day,aes(x = wday , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: wday has no impact on median estimated audience reach within each demographic from this plot. This does not seem reasonable... should greater activity on weekends. Starting to conclude that audience reach is not a good variable for measuring "active-ness of FB user demographics"
# plot median cpm_median vs. day of week (all demographics)
ggplot(data=bid3day,aes(x = wday , y= med_mediancpm)) + geom_boxplot() + theme_light()
# result: Friday, Saturday, and Sunday are the most expensive. Monday and Tuesday are the cheapest during the week
# alternative plot: group means of median cpm_median for each hour, and 95 confidence intervals
# notice outlier for tuesday makes it seem to be cheaper than monday
plotmeans(med_mediancpm ~ wday, main="Group means of median cpm_median by day", data=bid3day)
# same, but by demographic
ggplot(data=bid3day,aes(x = wday , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: males are noticably more expensive than females for age groups 13-17 and 18-24, no notable difference between M or F for 35-44, and males are cheaper than females for age groups 25-34, 45-54, and 55-64. age groups 45-54 and 55-64 are substantially cheaper than all other age groups, for every day of the week. For almost all demographics, the trend of Friday, Saturday, and Sunday being the most expensive days of the week holds.
######### REPEAT FOR MONTH TO MONTH ##########
bid3month <- group_by(bid2, age_gender, month )
bid3month$hour <- as.factor(bid3month$hour)
bid3month$wday <- as.factor(bid3month$wday)
bid3month$month <- as.factor(bid3month$month)
# summarize audience reach and cost by demographic and month
bid3month <- summarise(bid3month, mean_audsize = mean(audience_size), med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min), mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3month,12)
## Source: local data frame [12 x 10]
## Groups: age_gender [1]
##
## age_gender month mean_audsize med_audsize mean_mincpm med_mincpm
## (fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 13-17 all 1 1507801 1480000 45.82156 43
## 2 13-17 all 2 1502012 1480000 52.67477 48
## 3 13-17 all 3 1503828 1500000 62.21723 52
## 4 13-17 all 4 1501672 1500000 67.10407 62
## 5 13-17 all 5 1459015 1500000 73.64584 71
## 6 13-17 all 6 1459786 1400000 66.32795 64
## 7 13-17 all 7 1595943 1580000 47.13144 44
## 8 13-17 all 8 1724046 1600000 28.92453 25
## 9 13-17 all 9 1667989 1580000 32.15698 29
## 10 13-17 all 10 1611864 1520000 37.21518 34
## 11 13-17 all 11 1566023 1480000 42.71605 39
## 12 13-17 all 12 1521861 1460000 58.11160 54
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
## (dbl), med_maxcpm (dbl)
bid3month$month <- as.factor(bid3month$month)
# plot median audience reach vs. month, grouped by demographic
ggplot(data=bid3month,aes(x = month , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: For most months, there's hardly any difference between median estimated audience reach within each demographic. It's clear once again that there are slightly more female users than male users for all age groups. Again, audience reach does not seem to be a good measure of active-ness
# plot median cpm_median vs. month (all demographics)
ggplot(data=bid3month,aes(x = month , y= med_mediancpm)) + geom_boxplot() + theme_light()
# result: May and June are the most expensive. August and January are the cheapest.
# alternative plot: group means of median cpm_median for each month
plotmeans(med_mediancpm ~ month, main="Group means of median cpm_median by month", data=bid3month)
# same, but by demographic
ggplot(data=bid3month,aes(x = month , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)
# result: males are noticably more expensive than females for age groups 13-17 and 18-24, no notable difference between M or F for 25-34 and 35-44, and males are cheaper than females for age groups 45-54, and 55-64. Age groups 45-54 and 55-64 are substantially cheaper than all other age groups.
Here Icreate a new variable that combines age and gender to identify the entities
bid2$entity <- paste(as.character(bid2$age),bid2$gender)
Miscellaneous plot 1: Group means of cpm_median for demographics over the entire time period data was observed. Confirms results in section 1.5
Miscellaneous plot 2: Group means of audience_size for demographics over the entire time period data was observed. Confirms results in section 1.7
Miscallaneous Plot 3: Average median cost per 1000 impressions for each hour in the entire time period the data was observed. Shows increasing trend over time, an outlier towards the end, and that times where any cpm_median value is 0, that time gets a 0 value since taking an average.
Resources: 1. Practical Guides To Panel Data Modeling