Load Libraries Used

library(ggplot2)
library(gridExtra)
library(lubridate)
library(dplyr)
library(gplots)

Load the Data

# load the data into R (saved as bid_data.txt in my working directory)
# read.table is easily used to import text file data into an R data frame
# header = TRUE since the data file contains variable names in its first line
bid <- read.table("bid_data.txt",header=TRUE,sep=",")

1 Exploratory Data Analysis

1.1 Raw Data Sructure

dim(bid)
## [1] 1452783       7
str(bid)
## 'data.frame':    1452783 obs. of  7 variables:
##  $ age          : int  13 14 15 16 17 18 19 20 21 22 ...
##  $ audience_size: int  360000 1000000 1580000 2400000 3400000 5000000 5800000 5800000 5800000 5800000 ...
##  $ cpm_max      : int  612 432 383 366 330 378 372 382 396 278 ...
##  $ cpm_median   : int  314 212 184 182 170 235 201 207 226 151 ...
##  $ cpm_min      : int  36 23 23 27 33 85 28 27 36 18 ...
##  $ gender       : Factor w/ 3 levels "all","female",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time         : Factor w/ 9137 levels "2014-06-25 23:02:27",..: 1 1 1 1 1 1 1 1 1 1 ...
# convert variables of class "integer" to appropriate class for analyis
# note: making age a factor variable 
bid$age <- as.factor(bid$age) 
bid$audience_size <- as.numeric(bid$audience_size)
bid$cpm_max <- as.numeric(bid$cpm_max)
bid$cpm_median <- as.numeric(bid$cpm_median)
bid$cpm_min <- as.numeric(bid$cpm_min)

head(bid,10)
##    age audience_size cpm_max cpm_median cpm_min gender                time
## 1   13        360000     612        314      36    all 2014-06-25 23:02:27
## 2   14       1000000     432        212      23    all 2014-06-25 23:02:27
## 3   15       1580000     383        184      23    all 2014-06-25 23:02:27
## 4   16       2400000     366        182      27    all 2014-06-25 23:02:27
## 5   17       3400000     330        170      33    all 2014-06-25 23:02:27
## 6   18       5000000     378        235      85    all 2014-06-25 23:02:27
## 7   19       5800000     372        201      28    all 2014-06-25 23:02:27
## 8   20       5800000     382        207      27    all 2014-06-25 23:02:27
## 9   21       5800000     396        226      36    all 2014-06-25 23:02:27
## 10  22       5800000     278        151      18    all 2014-06-25 23:02:27
tail(bid,10)
##         age audience_size cpm_max cpm_median cpm_min gender
## 1452774  56       1400000     380        176      35 female
## 1452775  57       1400000     396        158      43 female
## 1452776  58       1300000     455        206      51 female
## 1452777  59       1300000     394        152      40 female
## 1452778  60       1200000     536        210      44 female
## 1452779  61       1100000     405        149      43 female
## 1452780  62       1100000     411        120      35 female
## 1452781  63       1000000     432        166      39 female
## 1452782  64        980000     469        172      49 female
## 1452783  65       9000000     364        163      36 female
##                        time
## 1452774 2015-07-24 14:01:07
## 1452775 2015-07-24 14:01:07
## 1452776 2015-07-24 14:01:07
## 1452777 2015-07-24 14:01:07
## 1452778 2015-07-24 14:01:07
## 1452779 2015-07-24 14:01:07
## 1452780 2015-07-24 14:01:07
## 1452781 2015-07-24 14:01:07
## 1452782 2015-07-24 14:01:07
## 1452783 2015-07-24 14:01:07
levels(bid$gender) # all, female, male
## [1] "all"    "female" "male"
levels(bid$age) 
##  [1] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
## [15] "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40"
## [29] "41" "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54"
## [43] "55" "56" "57" "58" "59" "60" "61" "62" "63" "64" "65"
table(bid$age) # 27411 observations for each age in 13,14,...,65
## 
##    13    14    15    16    17    18    19    20    21    22    23    24 
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 
##    25    26    27    28    29    30    31    32    33    34    35    36 
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 
##    37    38    39    40    41    42    43    44    45    46    47    48 
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 
##    49    50    51    52    53    54    55    56    57    58    59    60 
## 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 27411 
##    61    62    63    64    65 
## 27411 27411 27411 27411 27411
length(unique(bid$time)) # 9137 unique hours
## [1] 9137
unique(table(bid$time)) # 159 demographics
## [1] 159
summary(bid)
##       age          audience_size         cpm_max        cpm_median   
##  13     :  27411   Min.   :      20   Min.   :  0.0   Min.   :  0.0  
##  14     :  27411   1st Qu.: 1380000   1st Qu.:233.0   1st Qu.:100.0  
##  15     :  27411   Median : 1980000   Median :289.0   Median :154.0  
##  16     :  27411   Mean   : 2352902   Mean   :284.7   Mean   :151.6  
##  17     :  27411   3rd Qu.: 2800000   3rd Qu.:340.0   3rd Qu.:200.0  
##  18     :  27411   Max.   :15000000   Max.   :964.0   Max.   :687.0  
##  (Other):1288317                                                     
##     cpm_min          gender                        time        
##  Min.   :  0.00   all   :484261   2014-06-25 23:02:27:    159  
##  1st Qu.: 25.00   female:484261   2014-06-26 00:02:19:    159  
##  Median : 37.00   male  :484261   2014-06-26 01:02:13:    159  
##  Mean   : 41.72                   2014-06-26 02:02:13:    159  
##  3rd Qu.: 55.00                   2014-06-26 03:02:14:    159  
##  Max.   :306.00                   2014-06-26 04:02:11:    159  
##                                   (Other)            :1451829
# any missing data?
apply(bid, 2, function(x) { sum(is.na(x)) })
##           age audience_size       cpm_max    cpm_median       cpm_min 
##             0             0             0             0             0 
##        gender          time 
##             0             0
# any duplicates?
nrow(bid) - nrow(unique(bid))
## [1] 0
### plot histograms for quantitative variables
# audience_size (using binwidth = 150000). note the outliers
hist1 <- ggplot(data=bid,aes(x=audience_size)) + geom_histogram(binwidth=150000) + theme_light() 
# cpm_max. left_skewwed distribution 
hist2 <- ggplot(data=bid,aes(x=cpm_max)) + geom_histogram(binwidth=15) + theme_light()
# cpm_median. left_skewed distribution
hist3 <- ggplot(data=bid,aes(x=cpm_median)) + geom_histogram(binwidth=7) + theme_light()
# cpm_min
hist4 <- ggplot(data=bid,aes(x=cpm_min)) + geom_histogram(binwidth=5) + theme_light()

# 2 x 2 panel plot
grid.arrange(hist1,hist2,hist3,hist4)

# density plots for continuous variables
dens1 <- ggplot(data=bid,aes(x=audience_size)) + geom_density(size=1)  + theme_light() 

# cpm_max. left_skewwed distribution 
dens2 <- ggplot(data=bid,aes(x=cpm_max)) + geom_density(size=1) + theme_light()
# cpm_median. left_skewed distribution
dens3 <- ggplot(data=bid,aes(x=cpm_median)) + geom_density(size=1) + theme_light()
# cpm_min
dens4 <- ggplot(data=bid,aes(x=cpm_min)) + geom_density(size=1) + theme_light()

# panel plot (2x2)
grid.arrange(dens1,dens2,dens3,dens4)

1.2 Defining Age Groups

# add new variable indicating age group
# age groups seem to be the advertising industry standard 
bid$agegrp <- 0
bid$agegrp[which(bid$age %in% 13:17)] <- c("13-17")
bid$agegrp[which(bid$age %in% 18:24)] <- c("18-24")
bid$agegrp[which(bid$age %in% 25:34)] <- c("25-34")
bid$agegrp[which(bid$age %in% 35:44)] <- c("35-44")
bid$agegrp[which(bid$age %in% 45:54)] <- c("45-54")
bid$agegrp[which(bid$age %in% 55:64)] <- c("55-64")
bid$agegrp[which(bid$age == 65)] <- c("65")
bid$agegrp <- as.factor(bid$agegrp)

1.3 Handling of Outliers

Based off the density and histogram, outliers appear to be present for the audience_size variable.

# by eye (approx. 0.63 percent)
length(which(bid$aud > 10000000))/length(bid$aud) # percentage of outliers
## [1] 0.006287932
# general rule of thumb: 3 or more standard deviations from the mean
outliers <- which(bid$aud >= (mean(bid$aud) + 3*sd(bid$aud)))
length(outliers) # 17939
## [1] 17939
# investigate demographics of outliers
table(bid[outliers,1]) # all outliers are age 65 (perhaps really 65+?)
## 
##    13    14    15    16    17    18    19    20    21    22    23    24 
##     0     0     0     0     0     0     0     0     0     0     0     0 
##    25    26    27    28    29    30    31    32    33    34    35    36 
##     0     0     0     0     0     0     0     0     0     0     0     0 
##    37    38    39    40    41    42    43    44    45    46    47    48 
##     0     0     0     0     0     0     0     0     0     0     0     0 
##    49    50    51    52    53    54    55    56    57    58    59    60 
##     0     0     0     0     0     0     0     0     0     0     0     0 
##    61    62    63    64    65 
##     0     0     0     0 17939
table(bid[outliers,6]) # all female
## 
##    all female   male 
##   9135   8804      0
# summary of audience_size values for 65 year olds appear illegitimately high
summary(bid$audience_size[which(bid$age == 65)])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       20  5700000  8200000  9078000 13000000 15000000
table(bid$audience_size[which(bid$age == 65)])
## 
##       20    5e+06  5200000  5400000  5600000  5700000  5800000  5900000 
##        6     1656     2271     1323     1220     1288      987      390 
##  7200000  7400000  7600000  7800000    8e+06  8200000  8400000  8500000 
##      331      478     1200     1916      538     1119      577      149 
##  8600000  8700000  8800000  8900000    9e+06 12400000 12600000 12800000 
##      870      716      685      384      172      480      546      982 
##  1.3e+07 13200000 13400000 13600000 13800000  1.4e+07 14200000 14400000 
##     1118      799      327      212      385     1708      408      312 
##  1.5e+07 
##     1858
# audience reach vs age groups: barplot
reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# audience reach vs. age groups: boxplot
reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
grid.arrange(reach.agebar,reach.agebox)

# remove outliers
bid <- bid[-outliers,]

# redo reach vs age groups barplot
reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# redo reach vs age groups boxplot
reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
grid.arrange(reach.agebar,reach.agebox)

# remove all 65 year olds (almost all appear to be outliers)
bid <- bid[-which(bid$age == 65),]

# redo histogram of audience size variable with outliers removed
hist1 <- ggplot(data=bid,aes(x=audience_size)) + geom_histogram(binwidth=150000) + theme_light() 

# redo density of audience size variable with outliers removed
dens1 <- ggplot(data=bid,aes(x=audience_size)) + geom_density(size=1)  + theme_light() 
grid.arrange(hist1,dens1)

The 17939 outliers above three standard deviations from the mean are all for observations with age 65. 8804 are female and 9135 are for gender “all” (I’m guessing female + other). Even if 65 was used to denote the 65+ population, it should still be the smallest age group of facebook users (see this link). This information and the above plots suggest that the outliers are illegitimate. They most likely are from data error or intentional misreporting. Thus, I chose to drop them from the analyis. Additionally, I made the decision to drop ALL observations of age 65 as even the observations who were within three standard deviations of the mean seemed illegitimately high.

1.4 Redefining Targetable Groups (i.e. Demographics)

# CREATE SINGLE VARIABLE FOR GENDER AND AGE GROUP (i.e. demographic)
bid$age_gender <- c("0")
bid$agegrp <- as.character(bid$agegrp)
bid$gender <- as.character(bid$gender)
bid$age_gender <- paste(bid$agegrp,bid$gender)
bid$age_gender <- as.factor(bid$age_gender)
levels(bid$age_gender)
##  [1] "13-17 all"    "13-17 female" "13-17 male"   "18-24 all"   
##  [5] "18-24 female" "18-24 male"   "25-34 all"    "25-34 female"
##  [9] "25-34 male"   "35-44 all"    "35-44 female" "35-44 male"  
## [13] "45-54 all"    "45-54 female" "45-54 male"   "55-64 all"   
## [17] "55-64 female" "55-64 male"

1.5 Most/Least Expensive Demographic

Next, EDA is performed to explore relationships between various demographics and the median_cpm

# # redo reach vs age groups barplot
# reach.agebar <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# # redo reach vs age groups boxplot
# reach.agebox <- ggplot(data=bid,aes(x=agegrp,y=audience_size)) + geom_boxplot() + theme_light()
# grid.arrange(reach.agebar,reach.agebox)
# 
# 
# 
# # audience reach vs. gender: barplot
# reach.genbar <- ggplot(data=bid,aes(x=gender,y=audience_size)) + geom_bar(stat="identity") + theme_light()
# # audience reach vs. gender: boxplot
# reach.genbox <- ggplot(data=bid,aes(x=gender,y=audience_size)) + geom_boxplot() + theme_light()
# grid.arrange(reach.genbar,reach.genbox)


# 
# # cpm min vs. age groups barplot
# cpmin.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# # cpm median vs. age groups barplot 
# cpmed.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# # cpm max vs. age groups barplot
# cpmax.agebar <- ggplot(data=bid,aes(x=agegrp,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# # cpm min vs. age groups boxplot
# cpmin.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_min)) + geom_boxplot() + theme_light()
# # cpm median vs. age groups boxplot
# cpmed.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_median)) + geom_boxplot() + theme_light()
# # cpm max vs. age groups boxplot
# cpmax.agebox <- ggplot(data=bid,aes(x=agegrp,y=cpm_max)) + geom_boxplot() + theme_light()
# # 2x3 panel plot 
# grid.arrange(cpmin.agebar,cpmed.agebar,cpmax.agebar,cpmin.agebox,cpmed.agebox,cpmax.agebox,ncol=3)
# 
# 
# 
# # cpm min vs gender groups barplot
# cpmin.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# # cpm median vs. gender groups barplot 
# cpmed.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# # cpm max vs. gender groups barplot
# cpmax.genbar <- ggplot(data=bid,aes(x=gender,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# # cpm min vs. gender groups boxplot
# cpmin.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_min)) + geom_boxplot() + theme_light()
# # cpm median vs. gender groups boxplot
# cpmed.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_median)) + geom_boxplot() + theme_light()
# # cpm max vs. gender groups boxplot
# cpmax.genbox <- ggplot(data=bid,aes(x=gender,y=cpm_max)) + geom_boxplot() + theme_light()
# # 2x3 panel plot 
# grid.arrange(cpmin.genbar,cpmed.genbar,cpmax.genbar,cpmin.genbox,cpmed.genbox,cpmax.genbox,ncol=3)


# cpm min vs gender and age groups barplot
cpmin.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_min)) + geom_bar(stat="identity") + theme_light()
# cpm median vs. gender and age groups barplot 
cpmed.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_median)) + geom_bar(stat="identity") + theme_light()
# cpm max vs. gender and age groups barplot
cpmax.genagebar <- ggplot(data=bid,aes(x=age_gender,y=cpm_max)) + geom_bar(stat="identity") + theme_light()
# cpm min vs. gender and age groups boxplot
cpmin.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_min)) + geom_boxplot() + theme_light()
# cpm median vs. gender and age groups boxplot
cpmed.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_median)) + geom_boxplot() + theme_light()
# cpm max vs. gender and age groups boxplot
cpmax.genagebox <- ggplot(data=bid,aes(x=age_gender,y=cpm_max)) + geom_boxplot() + theme_light()



grid.arrange(cpmin.genagebar,cpmin.genagebox)

grid.arrange(cpmed.genagebar,cpmed.genagebox)

grid.arrange(cpmax.genagebar,cpmax.genagebox)

These results sopport a hypothesis that the least expensive demographics are of ages 55-64, followed closely by ages 45-54. Within either of these age groups, the least expensive gender group is male, closely followed by all, then female. Females have notably higher cost than male or all within these demographics. These results are consistent when considering either cpm min, median or max with one exception in cpm max (55-64 is the only clear least expensive demographic… and 18-24 beats 45-54). An important takeaway is that the 18-24 age group (all genders) is a relatively inexpensive demographic compared to 25-34 and 35-44.

The results also support a hypothesis that the most expensive demographics are of ages 25-34 and 35-44. Within either of these age groups, gender has very little effect. 13-17 year olds also are relatively expensive and have the most outliers for cost of any demographic (and the range of outlier values is very large).

Also, 18-24 year olds and 25-34 year olds seem to be the most active, and females more active than males in all age groups, but this is explored in greater detail later on.

1.6 Exploring Volatility

See which demographics change the most quickly, and the most unpredictably by computing percentage changes for various time intervals.

bid2 <- bid
# time variable now in POSIXct format
bid2$time <- ymd_hms(as.character(bid2$time))

# break dataset into time of day component (hour)
bid2$hour <- sapply(bid2$time,hour)
bid2$hour <- as.numeric(bid2$hour)
# break dataset into weekday component
bid2$wday <- sapply(bid2$time,wday)
# break dataset into month  component
bid2$month <- sapply(bid2$time,month)
# break dataset into season  component
bid2$season <- c(0)
bid2$season[which(bid2$month %in% c(12,1,2))] <- c("Winter")
bid2$season[which(bid2$month %in% 3:5)] <- c("Spring") 
bid2$season[which(bid2$month %in% 6:8)] <- c("Summer")
bid2$season[which(bid2$month %in% 9:11)] <- c("Fall")
bid2$season <- as.factor(bid2$season)
bid2$year <- sapply(bid2$time,year)
bid2$day <- sapply(bid2$time,day)

# compute average audience_size and average cpm_median for each demographic group and for each hour
bid2hour <- group_by(bid2,age_gender,hour)
bid2hour <- summarise(bid2hour, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )


# initializations
bid2hour$time_int <- c("NA")   # for the time interval of the change
bid2hour$pct_chg_aud <- c(0)   # for percentage change in mean audience size
bid2hour$pct_chg_cpm <- c(0)   # for percentage change in mean cpm 

# compute percentage changes of average audience_size and average cpm_median between each hour, and for each demographic
for(i in 0:17){               # for each demographic (18 of them)
    for(j in 1:24){           # for each hour
        
        index <- (24*i) + j   # compute index into bid2hour dataset
       
        if(j==1){  # special case: 11pm to midnight change
            bid2hour$pct_chg_aud[index] <- ((bid2hour$mean_audsize[index] - bid2hour$mean_audsize[24*i+24])/(bid2hour$mean_audsize[24*i+24])) * 100
            
            bid2hour$pct_chg_cpm[index] <- ((bid2hour$mean_medcpm[index] - bid2hour$mean_medcpm[24*i+24])/(bid2hour$mean_medcpm[24*i+24])) * 100
            
            bid2hour$time_int[index] <- c("23-0")
            
        }
        else{ # all other cases are easy. use observation before current index
            bid2hour$pct_chg_aud[index] <- ((bid2hour$mean_audsize[index] - bid2hour$mean_audsize[index-1])/(bid2hour$mean_audsize[index-1])) * 100
            
            bid2hour$pct_chg_cpm[index] <- ((bid2hour$mean_medcpm[index] - bid2hour$mean_medcpm[index-1])/(bid2hour$mean_medcpm[index-1])) * 100
            
            bid2hour$time_int[index] <- paste(j-2,"-",j-1,sep="")
            
        } # end else  
    } # end inner loop
} # end outer loop
head(bid2hour)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
## 
##   age_gender  hour mean_audsize mean_medcpm time_int pct_chg_aud
##       (fctr) (dbl)        (dbl)       (dbl)    (chr)       (dbl)
## 1  13-17 all     0      1553147    188.1031     23-0  0.01975103
## 2  13-17 all     1      1553488    188.7775      0-1  0.02199752
## 3  13-17 all     2      1552956    188.8360      1-2 -0.03428652
## 4  13-17 all     3      1553286    189.1708      2-3  0.02130420
## 5  13-17 all     4      1553547    189.6823      3-4  0.01676553
## 6  13-17 all     5      1553268    189.4395      4-5 -0.01792376
## Variables not shown: pct_chg_cpm (dbl)
bid2hour$time_int <- as.factor(bid2hour$time_int)

# use boxplots to evaluate distribution of percentage changes of mean audience size hour to hour
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23")) 

# NOTE: X = i on the x-axis represents the percentage change from hour i to hour i+1

# same, but compare by demographic now
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23"))   + facet_wrap( ~ age_gender) 

# use boxplots to evaluate distribution of percentage changes of mean median_cpm hour to hour 
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()  + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23"))

# NOTE: X = i on the x-axis represents the percentage change from hour i to hour i+1

# Same, but compare by demographic now
ggplot(data=bid2hour,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()  + scale_x_discrete(limits=c("0-1","1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12", "12-13", "13-14", "14-15", "15-16", "16-17", "17-18","18-19", "19-20","20-21", "21-22", "22-23", "23-0"), labels = c("0-1"="0","1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-13"="12", "13-14"="13", "14-15"="14", "15-16"="15", "16-17"="16", "17-18"="17","18-19"="18", "19-20"="19","20-21"="20", "21-22"="21", "22-23"="22", "23-0"="23")) + facet_wrap( ~ age_gender) 

######## REPEAT FOR WEEKDAYS #########
# compute average audience_size and average cpm_median for each demographic group and for each weekday
bid2day <- group_by(bid2,age_gender,wday)
bid2day <- summarise(bid2day, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )


# initializations
bid2day$time_int <- c("NA")   # for the time interval of the change
bid2day$pct_chg_aud <- c(0)   # for percentage change in mean audience size
bid2day$pct_chg_cpm <- c(0)   # for percentage change in mean cpm 

# compute percentage changes of average audience_size and average cpm_median between each day, and for each demographic
for(i in 0:17){               # for each demographic (18 of them)
    for(j in 1:7){           # for each day
        
        index <- (7*i) + j   # compute index into bid2day dataset
       
        if(j==1){  # special case: Saturday to Sunday
            bid2day$pct_chg_aud[index] <- ((bid2day$mean_audsize[index] - bid2day$mean_audsize[7*i+7])/(bid2day$mean_audsize[7*i+7])) * 100
            
            bid2day$pct_chg_cpm[index] <- ((bid2day$mean_medcpm[index] - bid2day$mean_medcpm[7*i+7])/(bid2day$mean_medcpm[7*i+7])) * 100
            
            bid2day$time_int[index] <- c("7-1")
            
        }
        else{ # all other cases are easy. use previous day 
            bid2day$pct_chg_aud[index] <- ((bid2day$mean_audsize[index] - bid2day$mean_audsize[index-1])/(bid2day$mean_audsize[index-1])) * 100
            
            bid2day$pct_chg_cpm[index] <- ((bid2day$mean_medcpm[index] - bid2day$mean_medcpm[index-1])/(bid2day$mean_medcpm[index-1])) * 100
            
            bid2day$time_int[index] <- paste(j-1,"-",j,sep="")
            
        } # end else  
    } # end inner loop
} # end outer loop
head(bid2day)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
## 
##   age_gender  wday mean_audsize mean_medcpm time_int pct_chg_aud
##       (fctr) (dbl)        (dbl)       (dbl)    (chr)       (dbl)
## 1  13-17 all     1      1549491    189.6078      7-1 -0.14300722
## 2  13-17 all     2      1553701    172.5391      1-2  0.27175859
## 3  13-17 all     3      1552619    177.8483      2-3 -0.06967239
## 4  13-17 all     4      1553027    185.3371      3-4  0.02628457
## 5  13-17 all     5      1553727    189.8716      4-5  0.04505131
## 6  13-17 all     6      1552942    199.6174      5-6 -0.05050139
## Variables not shown: pct_chg_cpm (dbl)
bid2day$time_int <- as.factor(bid2day$time_int)


# use boxplots to evaluate distribution of percentage changes of mean audience size day to day 
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() 

# same, but compare by demographic now
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender) 

# use boxplots to evaluate distribution of percentage changes of mean median_cpm day to day
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()  

# Same, but compare by demographic now
ggplot(data=bid2day,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender) 

########### REPEAT FOR MONTHS ############
# compute average audience_size and average cpm_median for each demographic group and for each month
bid2month <- group_by(bid2,age_gender,month)
bid2month <- summarise(bid2month, mean_audsize = mean(audience_size), mean_medcpm = mean(cpm_median) )


# initializations
bid2month$time_int <- c("NA")   # for the time interval of the change
bid2month$pct_chg_aud <- c(0)   # for percentage change in mean audience size
bid2month$pct_chg_cpm <- c(0)   # for percentage change in mean cpm 

# compute percentage changes of average audience_size and average cpm_median between each hour, and for each demographic
for(i in 0:17){               # for each demographic (18 of them)
    for(j in 1:12){           # for each month
        
        index <- (12*i) + j   # compute index into bid2month dataset
       
        if(j==1){  # special case: 11pm to midnight change
            bid2month$pct_chg_aud[index] <- ((bid2month$mean_audsize[index] - bid2month$mean_audsize[12*i+12])/(bid2month$mean_audsize[12*i+12])) * 100
            
            bid2month$pct_chg_cpm[index] <- ((bid2month$mean_medcpm[index] - bid2month$mean_medcpm[12*i+12])/(bid2month$mean_medcpm[12*i+12])) * 100
            
            bid2month$time_int[index] <- c("12-1")
            
        }
        else{ # all other cases are easy. use previous month
            bid2month$pct_chg_aud[index] <- ((bid2month$mean_audsize[index] - bid2month$mean_audsize[index-1])/(bid2month$mean_audsize[index-1])) * 100
            
            bid2month$pct_chg_cpm[index] <- ((bid2month$mean_medcpm[index] - bid2month$mean_medcpm[index-1])/(bid2month$mean_medcpm[index-1])) * 100
            
            bid2month$time_int[index] <- paste(j-1,"-",j,sep="")
            
        } # end else  
    } # end inner loop
} # end outer loop
head(bid2month)
## Source: local data frame [6 x 7]
## Groups: age_gender [1]
## 
##   age_gender month mean_audsize mean_medcpm time_int pct_chg_aud
##       (fctr) (dbl)        (dbl)       (dbl)    (chr)       (dbl)
## 1  13-17 all     1      1507801    149.8935     12-1  -0.9238891
## 2  13-17 all     2      1502012    181.5802      1-2  -0.3839054
## 3  13-17 all     3      1503828    212.3731      2-3   0.1208854
## 4  13-17 all     4      1501672    215.4407      3-4  -0.1433619
## 5  13-17 all     5      1459015    223.5705      4-5  -2.8406208
## 6  13-17 all     6      1459786    224.8904      5-6   0.0528698
## Variables not shown: pct_chg_cpm (dbl)
bid2month$time_int <- as.factor(bid2month$time_int)

# use boxplots to evaluate distribution of percentage changes of mean audience size month to month 
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light()  + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12")) 

# NOTE: x = i on the x-axis represents the percentage change from month i to month i+1

# same, but compare by demographic now
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_aud)) + geom_boxplot() + theme_light() + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12")) + facet_wrap( ~ age_gender) 

# use boxplots to evaluate distribution of percentage changes of mean median_cpm month to month 
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()  + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12")) 

# NOTE: x = i on the x-axis represents the percentage change from month i to month i+1

# Same, but compare by demographic now
ggplot(data=bid2month,aes(x=time_int,y=pct_chg_cpm)) + geom_boxplot() + theme_light()  + scale_x_discrete(limits=c("1-2","2-3","3-4","4-5","5-6","6-7","7-8","8-9","9-10","10-11","11-12","12-1"), labels = c("1-2"="1","2-3"="2","3-4"="3","4-5"="4","5-6"="5","6-7"="6","7-8"="7","8-9"="8","9-10"="9","10-11"="10","11-12"="11", "12-1"="12"))  + facet_wrap( ~ age_gender) 

A good strategy might be to advertise on Monday, as it showed substantial average cpm percent change decrease from Sunday and has low unpredictability (variance). On the flip side, advertisers may want to stay away from advertising on Saturday, as there was a substantial average cpm percent change increase from Friday (but not as strong of a result since more people tend to be on FB on the weekend).

Looking at percentage change in mean audience size values from month to month might be helpful to an ad strategy.

A good strategy might be to advertise in July or January as these months saw drastic average cpm percent change decreases from their previous months and have low unpredictability (variance). On the flip side, advertisers may want to stay away from advertising in Oct. and Nov., as these months saw drastic average cpm percent change increases from their previous months and have high unpredictability (variance).

1.7 WHEN are the Least/Most Active and Least/Most Expensive Demographics?

by time of day, day of the week, month

# To analyze "when", considering by time of day, day of the week, and month separately. To analyze "active", I am using audience_size
# group by demographic and time of day 
bid3hour <- group_by(bid2, age_gender, hour )
bid3hour$hour <- as.factor(bid3hour$hour)
bid3hour$wday <- as.factor(bid3hour$wday)
bid3hour$month <- as.factor(bid3hour$month)

# PLOT "ACTIVE-NESS"
# Audience reach vs age and gender groups barplot 
ggplot(data = bid3hour, aes(x = age_gender, y =audience_size)) + geom_bar(stat="identity") + theme_light() + scale_x_discrete(labels=c("13-17 all" = "13-17 \n A" ,  "13-17 female" = "13-17 \n F", "13-17 male" = "13-17 \n M" , "18-24 all" = "18-24 \n A",  "18-24 female" = "18-24 \n F", "18-24 male" = "18-24 \n M" , "25-34 all" = "25-34 \n A",  "25-34 female" = "25-34 \n F", "25-34 male" = "25-34 \n M", "35-44 all" = "35-44 \n A",  "35-44 female" = "35-44 \n F", "35-44 male" = "35-44 \n M",   "45-54 all" = "45-54 \n A","45-54 female" = "45-54 \n F", "45-54 male" = "45-54 \n M", "55-64 all"  = "55-64 \n A",  "55-64 female" = "55-64 \n F", "55-64 male" = "55-64 \n M" ))

# Audience reach vs age and gender groups boxplot
ggplot(data=bid3hour,aes(x = age_gender , y= audience_size)) + geom_boxplot() + theme_light() + scale_x_discrete(labels=c("13-17 all" = "13-17 \n A" ,  "13-17 female" = "13-17 \n F", "13-17 male" = "13-17 \n M" , "18-24 all" = "18-24 \n A",  "18-24 female" = "18-24 \n F", "18-24 male" = "18-24 \n M" , "25-34 all" = "25-34 \n A",  "25-34 female" = "25-34 \n F", "25-34 male" = "25-34 \n M", "35-44 all" = "35-44 \n A",  "35-44 female" = "35-44 \n F", "35-44 male" = "35-44 \n M",   "45-54 all" = "45-54 \n A","45-54 female" = "45-54 \n F", "45-54 male" = "45-54 \n M", "55-64 all"  = "55-64 \n A",  "55-64 female" = "55-64 \n F", "55-64 male" = "55-64 \n M" ))

# Audience reach vs. time of day (and by demographic)
ggplot(data=bid3hour,aes(x = hour , y= audience_size)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)

# result: hour of day has no impact on estimated audience reach within each demographic from this plot


# summarize audience reach and cost by demographic and time of day
bid3hour <- summarise(bid3hour, mean_audsize = mean(audience_size),  med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min),   mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3hour,24)
## Source: local data frame [24 x 10]
## Groups: age_gender [1]
## 
##    age_gender  hour mean_audsize med_audsize mean_mincpm med_mincpm
##        (fctr) (dbl)        (dbl)       (dbl)       (dbl)      (dbl)
## 1   13-17 all     0      1553147     1500000    51.18010         45
## 2   13-17 all     1      1553488     1500000    51.48564         45
## 3   13-17 all     2      1552956     1500000    51.35144         45
## 4   13-17 all     3      1553286     1500000    51.52813         45
## 5   13-17 all     4      1553547     1500000    51.66667         45
## 6   13-17 all     5      1553268     1500000    51.46684         45
## 7   13-17 all     6      1548565     1500000    51.65288         46
## 8   13-17 all     7      1548665     1500000    51.43717         45
## 9   13-17 all     8      1551830     1500000    51.69628         46
## 10  13-17 all     9      1552534     1500000    51.53508         45
## ..        ...   ...          ...         ...         ...        ...
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
##   (dbl), med_maxcpm (dbl)
bid3hour$hour <- as.factor(bid3hour$hour)

# plot median audience reach vs. time of day, grouped by demographic
ggplot(data=bid3hour,aes(x = hour , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)

# result: hour of day has no impact on estimated audience reach within each demographic from this plot as well


# plot median cpm_median vs. time of day, grouped by demographic
ggplot(data=bid3hour,aes(x = hour , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender) 

# result: hour of day has little impact on the median of the estimated cpm_median within each demographic. There seems to be a slight decrease in the afternoon/evening for most demographics

# alternative plot: group means of median cpm_median for each hour and 95% confidence intervals (each group is a demographic)
plotmeans(med_mediancpm ~ hour, main="Group means of median cpm_median by hour", data=bid3hour)

######### REPEAT FOR DAY TO DAY ##########
bid3day <- group_by(bid2, age_gender, wday )
bid3day$hour <- as.factor(bid3day$hour)
bid3day$wday <- as.factor(bid3day$wday)
bid3day$month <- as.factor(bid3day$month)

# Audience reach vs. day of week (and by demographic)
ggplot(data=bid3day,aes(x = wday , y= audience_size)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)

# result: day of week has no effect on audience reach within demographics from this plot

# summarize audience reach and cost by demographic and day of week
bid3day <- summarise(bid3day, mean_audsize = mean(audience_size),  med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min),   mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3day,7)
## Source: local data frame [7 x 10]
## Groups: age_gender [1]
## 
##   age_gender  wday mean_audsize med_audsize mean_mincpm med_mincpm
##       (fctr) (dbl)        (dbl)       (dbl)       (dbl)      (dbl)
## 1  13-17 all     1      1549491     1500000    53.94028         48
## 2  13-17 all     2      1553701     1500000    46.90503         41
## 3  13-17 all     3      1552619     1500000    47.40123         40
## 4  13-17 all     4      1553027     1500000    48.68617         42
## 5  13-17 all     5      1553727     1500000    50.55901         45
## 6  13-17 all     6      1552942     1500000    53.99162         48
## 7  13-17 all     7      1551710     1500000    57.21622         51
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
##   (dbl), med_maxcpm (dbl)
bid3day$wday <- as.factor(bid3day$wday)

# plot median audience reach vs. day of week, grouped by demographic
ggplot(data=bid3day,aes(x = wday , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)

# result: wday has no impact on median estimated audience reach within each demographic from this plot. This does not seem reasonable... should greater activity on weekends. Starting to conclude that audience reach is not a good variable for measuring "active-ness of FB user demographics"


# plot median cpm_median vs. day of week (all demographics)
ggplot(data=bid3day,aes(x = wday , y= med_mediancpm)) + geom_boxplot() + theme_light() 

# result: Friday, Saturday, and Sunday are the most expensive. Monday and Tuesday are the cheapest during the week

# alternative plot: group means of median cpm_median for each hour, and 95 confidence intervals
# notice outlier for tuesday makes it seem to be cheaper than monday 
plotmeans(med_mediancpm ~ wday, main="Group means of median cpm_median by day", data=bid3day)

# same, but by demographic
ggplot(data=bid3day,aes(x = wday , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender) 

# result: males are noticably more expensive than females for age groups 13-17 and 18-24, no notable difference between M or F for 35-44, and males are cheaper than females for age groups 25-34, 45-54, and 55-64. age groups 45-54 and 55-64 are substantially cheaper than all other age groups, for every day of the week. For almost all demographics, the trend of Friday, Saturday, and Sunday being the most expensive days of the week holds.



######### REPEAT FOR MONTH TO MONTH ##########
bid3month <- group_by(bid2, age_gender, month )
bid3month$hour <- as.factor(bid3month$hour)
bid3month$wday <- as.factor(bid3month$wday)
bid3month$month <- as.factor(bid3month$month)


# summarize audience reach and cost by demographic and month
bid3month <- summarise(bid3month, mean_audsize = mean(audience_size),  med_audsize = median(audience_size),mean_mincpm = mean(cpm_min),med_mincpm = median(cpm_min),   mean_medcpm = mean(cpm_median),med_mediancpm = median(cpm_median), mean_maxcpm = mean(cpm_max),med_maxcpm = median(cpm_max))
head(bid3month,12)
## Source: local data frame [12 x 10]
## Groups: age_gender [1]
## 
##    age_gender month mean_audsize med_audsize mean_mincpm med_mincpm
##        (fctr) (dbl)        (dbl)       (dbl)       (dbl)      (dbl)
## 1   13-17 all     1      1507801     1480000    45.82156         43
## 2   13-17 all     2      1502012     1480000    52.67477         48
## 3   13-17 all     3      1503828     1500000    62.21723         52
## 4   13-17 all     4      1501672     1500000    67.10407         62
## 5   13-17 all     5      1459015     1500000    73.64584         71
## 6   13-17 all     6      1459786     1400000    66.32795         64
## 7   13-17 all     7      1595943     1580000    47.13144         44
## 8   13-17 all     8      1724046     1600000    28.92453         25
## 9   13-17 all     9      1667989     1580000    32.15698         29
## 10  13-17 all    10      1611864     1520000    37.21518         34
## 11  13-17 all    11      1566023     1480000    42.71605         39
## 12  13-17 all    12      1521861     1460000    58.11160         54
## Variables not shown: mean_medcpm (dbl), med_mediancpm (dbl), mean_maxcpm
##   (dbl), med_maxcpm (dbl)
bid3month$month <- as.factor(bid3month$month)

# plot median audience reach vs. month, grouped by demographic
ggplot(data=bid3month,aes(x = month , y= med_audsize)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender)

# result: For most months, there's hardly any difference between median estimated audience reach within each demographic. It's clear once again that there are slightly more female users than male users for all age groups. Again, audience reach does not seem to be a good measure of active-ness


# plot median cpm_median vs. month (all demographics)
ggplot(data=bid3month,aes(x = month , y= med_mediancpm)) + geom_boxplot() + theme_light() 

# result: May and June are the most expensive. August and January are the cheapest. 

# alternative plot: group means of median cpm_median for each month
plotmeans(med_mediancpm ~ month, main="Group means of median cpm_median by month", data=bid3month)

# same, but by demographic
ggplot(data=bid3month,aes(x = month , y= med_mediancpm)) + geom_boxplot() + theme_light() + facet_wrap( ~ age_gender) 

# result: males are noticably more expensive than females for age groups 13-17 and 18-24, no notable difference between M or F for 25-34 and 35-44, and males are cheaper than females for age groups  45-54, and 55-64. Age groups 45-54 and 55-64 are substantially cheaper than all other age groups. 

2 Statistical Panel Data Analysis

Here Icreate a new variable that combines age and gender to identify the entities

bid2$entity <- paste(as.character(bid2$age),bid2$gender)

Miscellaneous plot 1: Group means of cpm_median for demographics over the entire time period data was observed. Confirms results in section 1.5

Miscellaneous plot 2: Group means of audience_size for demographics over the entire time period data was observed. Confirms results in section 1.7

Miscallaneous Plot 3: Average median cost per 1000 impressions for each hour in the entire time period the data was observed. Shows increasing trend over time, an outlier towards the end, and that times where any cpm_median value is 0, that time gets a 0 value since taking an average.

Resources: 1. Practical Guides To Panel Data Modeling

  1. Panel Data Analysis Fixed and Random Effects[http://dss.princeton.edu/training/Panel101R.pdf]