Case study

Apply libraries

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(statsr)
library(BAS)
library(GGally)

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

library(devtools)
library(mvtnorm)
library(corrplot)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(readxl)

Load data

Test_Data <- read_excel("~/Desktop/Case_data/Test_Data.xlsx")

## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/America/Toronto'

soups<-na.omit(Test_Data)
summary(soups)

##       Year          Week #          Week              Device         
##  Min.   :2016   Min.   : 1.00   Length:17036       Length:17036      
##  1st Qu.:2016   1st Qu.:13.00   Class :character   Class :character  
##  Median :2016   Median :24.00   Mode  :character   Mode  :character  
##  Mean   :2016   Mean   :24.34                                        
##  3rd Qu.:2017   3rd Qu.:35.00                                        
##  Max.   :2017   Max.   :53.00                                        
##     Brand              Source             Sessions      Goal_Completions 
##  Length:17036       Length:17036       Min.   :     1   Min.   :    0.0  
##  Class :character   Class :character   1st Qu.:   218   1st Qu.:   32.0  
##  Mode  :character   Mode  :character   Median :  1062   Median :  195.0  
##                                        Mean   :  2564   Mean   :  932.7  
##                                        3rd Qu.:  3232   3rd Qu.:  950.0  
##                                        Max.   :140126   Max.   :22958.0  
##     Bounces           Tier                Date                    
##  Min.   :     0   Length:17036       Min.   :2015-12-27 00:00:00  
##  1st Qu.:    76   Class :character   1st Qu.:2016-06-05 00:00:00  
##  Median :   303   Mode  :character   Median :2016-11-13 00:00:00  
##  Mean   :  1011                      Mean   :2016-11-17 10:12:28  
##  3rd Qu.:   945                      3rd Qu.:2017-04-30 00:00:00  
##  Max.   :124244                      Max.   :2017-10-08 00:00:00

str(soups)

## Classes 'tbl_df', 'tbl' and 'data.frame':    17036 obs. of  11 variables:
##  $ Year            : num  2016 2016 2016 2016 2016 ...
##  $ Week #          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week            : chr  "27 December, 2015" "27 December, 2015" "27 December, 2015" "27 December, 2015" ...
##  $ Device          : chr  "desktop" "desktop" "desktop" "desktop" ...
##  $ Brand           : chr  "Brand 1" "Brand 1" "Brand 1" "Brand 1" ...
##  $ Source          : chr  "Direct" "Display" "Organic" "Paid Search" ...
##  $ Sessions        : num  409 27 719 573 481 ...
##  $ Goal_Completions: num  209 6 307 269 241 ...
##  $ Bounces         : num  145 15 284 183 137 589 72 1 802 448 ...
##  $ Tier            : chr  "Tier 1" "Tier 1" "Tier 1" "Tier 1" ...
##  $ Date            : POSIXct, format: "2015-12-27" "2015-12-27" ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:1768] 6 13 25 28 41 45 51 58 65 68 ...
##   .. ..- attr(*, "names")= chr [1:1768] "6" "13" "25" "28" ...

head(soups)

## # A tibble: 6 x 11
##    Year `Week #`              Week  Device   Brand      Source Sessions
##   <dbl>    <dbl>             <chr>   <chr>   <chr>       <chr>    <dbl>
## 1  2016        1 27 December, 2015 desktop Brand 1      Direct      409
## 2  2016        1 27 December, 2015 desktop Brand 1     Display       27
## 3  2016        1 27 December, 2015 desktop Brand 1     Organic      719
## 4  2016        1 27 December, 2015 desktop Brand 1 Paid Search      573
## 5  2016        1 27 December, 2015 desktop Brand 1    Referral      481
## 6  2016        1 27 December, 2015 desktop Brand 2      Direct     1677
## # ... with 4 more variables: Goal_Completions <dbl>, Bounces <dbl>,
## #   Tier <chr>, Date <dttm>

tail(soups)

## # A tibble: 6 x 11
##    Year `Week #`            Week Device   Brand      Source Sessions
##   <dbl>    <dbl>           <chr>  <chr>   <chr>       <chr>    <dbl>
## 1  2017       40 1 October, 2017 tablet Brand 4     Display      381
## 2  2017       40 1 October, 2017 tablet Brand 4     Organic       59
## 3  2017       40 1 October, 2017 tablet Brand 4 Paid Search     1284
## 4  2017       40 1 October, 2017 tablet Brand 4 Paid Social      204
## 5  2017       40 1 October, 2017 tablet Brand 4    Referral      354
## 6  2017       40 1 October, 2017 tablet Brand 4      Social        4
## # ... with 4 more variables: Goal_Completions <dbl>, Bounces <dbl>,
## #   Tier <chr>, Date <dttm>

describe(soups)

## Warning in describe(soups): NAs introduced by coercion

## Warning in describe(soups): NAs introduced by coercion

## Warning in describe(soups): NAs introduced by coercion

## Warning in describe(soups): NAs introduced by coercion

## Warning in describe(soups): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

##                  vars     n    mean      sd median trimmed     mad  min
## Year                1 17036 2016.44    0.50   2016 2016.42    0.00 2016
## Week #              2 17036   24.34   13.97     24   23.97   16.31    1
## Week*               3 17036     NaN      NA     NA     NaN      NA  Inf
## Device*             4 17036     NaN      NA     NA     NaN      NA  Inf
## Brand*              5 17036     NaN      NA     NA     NaN      NA  Inf
## Source*             6 17036     NaN      NA     NA     NaN      NA  Inf
## Sessions            7 17036 2564.49 4555.90   1062 1726.66 1460.36    1
## Goal_Completions    8 17036  932.69 1746.75    195  490.10  278.73    0
## Bounces             9 17036 1011.23 3212.07    303  529.71  413.65    0
## Tier*              10 17036     NaN      NA     NA     NaN      NA  Inf
## Date*              11 17036     NaN      NA     NA     NaN      NA  Inf
##                     max  range  skew kurtosis    se
## Year               2017      1  0.24    -1.94  0.00
## Week #               53     52  0.16    -0.99  0.11
## Week*              -Inf   -Inf    NA       NA    NA
## Device*            -Inf   -Inf    NA       NA    NA
## Brand*             -Inf   -Inf    NA       NA    NA
## Source*            -Inf   -Inf    NA       NA    NA
## Sessions         140126 140125  9.68   193.91 34.91
## Goal_Completions  22958  22958  3.23    14.05 13.38
## Bounces          124244 124244 18.70   509.06 24.61
## Tier*              -Inf   -Inf    NA       NA    NA
## Date*              -Inf   -Inf    NA       NA    NA

Finding the correlations between metrics

soup_cor <- soups[c( 7:9)]

cor(soup_cor)

##                   Sessions Goal_Completions   Bounces
## Sessions         1.0000000        0.6157141 0.8611959
## Goal_Completions 0.6157141        1.0000000 0.1680161
## Bounces          0.8611959        0.1680161 1.0000000

During the correlation analysis I found that Sessions are correlate to GC 0.62 and to Bounce as 0.86
GC correlates to Sessions as 0.62
Bounce correlates to Sessions as 0.86

The findings were expected.

Exploratory Data Analysis

Devices vs Sessions, mobile devices brings the most of the sessions

ggplot( data = soups, aes(x = Device, y = Sessions)) + geom_jitter()

Device vs Bounces, because mobile devices bring most of sessions, there will be more bounces

ggplot( data = soups, aes(x = Device, y = Bounces)) + geom_jitter()

Finding the highest Conversion Rate among the Sources. Direct wins. Unattributed, Email , Paid Social, Social have the most highest outlines and least median of the CR.

Adding new column

soups$ConversionRate <- soups$`Goal_Completions`/ soups$Sessions
View(soups)

## Warning: running command ''/usr/bin/otool' -L '/Library/Frameworks/
## R.framework/Resources/modules/R_de.so'' had status 1

ggplot( data = soups, aes(x = Source, y = ConversionRate)) + geom_boxplot()

RUnning a linear model placing Goal Completions ~ Sourceto get the idea of contribution of each Source to the Goal Competions

ggplot(data = soups, aes(x = Source, y = `Goal_Completions` )) +
  geom_jitter()

soup.GC.lm=lm(`Goal_Completions`~Source, data = soups)
summary(soup.GC.lm)

## 
## Call:
## lm(formula = Goal_Completions ~ Source, data = soups)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1823.8  -827.6  -117.5    26.6 21133.2 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1824.81      32.00  57.025  < 2e-16 ***
## SourceDisplay      -1485.50      45.52 -32.631  < 2e-16 ***
## SourceEmail        -1706.35      50.12 -34.046  < 2e-16 ***
## SourceOrganic       -400.34      45.33  -8.831  < 2e-16 ***
## SourcePaid Search   -176.49      45.25  -3.901 9.63e-05 ***
## SourcePaid Social  -1707.39      51.74 -33.002  < 2e-16 ***
## SourceReferral      -912.24      45.25 -20.162  < 2e-16 ***
## SourceSocial       -1809.46      57.25 -31.607  < 2e-16 ***
## SourceUnattributed -1822.64     256.20  -7.114 1.17e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1608 on 17027 degrees of freedom
## Multiple R-squared:  0.1533, Adjusted R-squared:  0.1529 
## F-statistic: 385.4 on 8 and 17027 DF,  p-value: < 2.2e-16

Running the model search, I found that the most contributed to the Goal_Completions are placed in the order Paid Search, Organic, Referral, Display, Email, Paid Social, Social, Unattributed, as we can clearly see on the chart. But this model is not precisely matching the graph which could mean the model should be investigated further

Running the linear model placing Sessions ~ Source

ggplot(data = soups, aes(x = Source, y = Sessions )) +
  geom_jitter()

soup.S.lm=lm(Sessions ~ Source, data = soups)
summary(soup.S.lm)

## 
## Call:
## lm(formula = Sessions ~ Source, data = soups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4512  -1988   -755    295 135450 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3312.0       86.8  38.157  < 2e-16 ***
## SourceDisplay        -132.0      123.5  -1.069    0.285    
## SourceEmail         -2783.9      135.9 -20.478  < 2e-16 ***
## SourceOrganic        -630.3      123.0  -5.126 2.99e-07 ***
## SourcePaid Search    1364.1      122.7  11.115  < 2e-16 ***
## SourcePaid Social   -1459.8      140.3 -10.403  < 2e-16 ***
## SourceReferral      -1312.3      122.7 -10.693  < 2e-16 ***
## SourceSocial        -3233.4      155.3 -20.822  < 2e-16 ***
## SourceUnattributed  -3300.4      694.9  -4.749 2.06e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4361 on 17027 degrees of freedom
## Multiple R-squared:  0.08426,    Adjusted R-squared:  0.08383 
## F-statistic: 195.8 on 8 and 17027 DF,  p-value: < 2.2e-16

Here is an interesting finding of the model: The variables impact order on numbers of sessions Paid Search, then Display, then Organic, Referral, Paid Social, Email, Social and Unattributed. The model’s results are not matching the chart. Model requires further investigation.

Placing Bounces ~ Source

ggplot( data = soups, aes(x = Source , y = Bounces)) + geom_jitter()

soup.B.lm=lm(Bounces ~ Source, data = soups)
summary(soup.B.lm)

## 
## Call:
## lm(formula = Bounces ~ Source, data = soups)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -2246   -772   -267     30 122458 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          845.99      62.22  13.598  < 2e-16 ***
## SourceDisplay       1401.31      88.51  15.832  < 2e-16 ***
## SourceEmail         -683.59      97.45  -7.015 2.38e-12 ***
## SourceOrganic       -296.99      88.14  -3.370 0.000754 ***
## SourcePaid Search    939.69      87.97  10.682  < 2e-16 ***
## SourcePaid Social    594.38     100.59   5.909 3.50e-09 ***
## SourceReferral      -417.30      87.97  -4.744 2.12e-06 ***
## SourceSocial        -804.58     111.31  -7.229 5.09e-13 ***
## SourceUnattributed  -840.29     498.12  -1.687 0.091636 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3126 on 17027 degrees of freedom
## Multiple R-squared:  0.05349,    Adjusted R-squared:  0.05305 
## F-statistic: 120.3 on 8 and 17027 DF,  p-value: < 2.2e-16

Here I found that if the source of the traffic was a Display ad, it has 1401.31 affect on Bounce, then it’s Paid Search (939.69), Paid Social (594.38). Same time Organic traffic source provide (-296.99) coefficient to the Bounce rate , Referral (-417.3), Email, Social. I can oversee that as conscious following of a recommendation prevalent advertisement. The Graph of the data set does not match the model’s findings.

Bayesian modeling of contribution of different variables of the dataset to Conversion Rate

model_bma = bas.lm(ConversionRate ~.-Week -Date -Year -`Week #`, data = soups,
                   prior = "BIC", 
                   modelprior = uniform(),
                   method="MCMC")
model_bma

## 
## Call:
## bas.lm(formula = ConversionRate ~ . - Week - Date - Year - `Week #`,     data = soups, prior = "BIC", modelprior = uniform(), method = "MCMC")
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##          Intercept        Devicemobile        Devicetablet  
##            1.00000             1.00000             0.23216  
##       BrandBrand 2        BrandBrand 3        BrandBrand 4  
##            0.99998             0.99999             1.00000  
##       BrandBrand 5       SourceDisplay         SourceEmail  
##            0.99993             1.00000             0.99999  
##      SourceOrganic   SourcePaid Search   SourcePaid Social  
##            0.04345             0.99998             1.00000  
##     SourceReferral        SourceSocial  SourceUnattributed  
##            0.99999             0.99999             0.01076  
##           Sessions    Goal_Completions             Bounces  
##            0.99999             1.00000             1.00000  
##         TierTier 2  
##            0.99999

summary(model_bma)

##                    P(B != 0 | Y)     model 1       model 2       model 3
## Intercept             1.00000000      1.0000  1.000000e+00  1.000000e+00
## Devicemobile          0.99999733      1.0000  1.000000e+00  1.000000e+00
## Devicetablet          0.23215981      0.0000  1.000000e+00  0.000000e+00
## BrandBrand 2          0.99998474      1.0000  1.000000e+00  1.000000e+00
## BrandBrand 3          0.99999065      1.0000  1.000000e+00  1.000000e+00
## BrandBrand 4          0.99999638      1.0000  1.000000e+00  1.000000e+00
## BrandBrand 5          0.99992943      1.0000  1.000000e+00  1.000000e+00
## SourceDisplay         0.99999676      1.0000  1.000000e+00  1.000000e+00
## SourceEmail           0.99999428      1.0000  1.000000e+00  1.000000e+00
## SourceOrganic         0.04345188      0.0000  0.000000e+00  1.000000e+00
## SourcePaid Search     0.99998302      1.0000  1.000000e+00  1.000000e+00
## SourcePaid Social     0.99999580      1.0000  1.000000e+00  1.000000e+00
## SourceReferral        0.99998856      1.0000  1.000000e+00  1.000000e+00
## SourceSocial          0.99999065      1.0000  1.000000e+00  1.000000e+00
## SourceUnattributed    0.01076469      0.0000  0.000000e+00  0.000000e+00
## Sessions              0.99998627      1.0000  1.000000e+00  1.000000e+00
## Goal_Completions      0.99999771      1.0000  1.000000e+00  1.000000e+00
## Bounces               1.00000000      1.0000  1.000000e+00  1.000000e+00
## TierTier 2            0.99999409      1.0000  1.000000e+00  1.000000e+00
## BF                            NA      1.0000  3.008047e-01  4.500783e-02
## PostProbs                     NA      0.7264  2.197000e-01  3.320000e-02
## R2                            NA      0.6011  6.013000e-01  6.012000e-01
## dim                           NA     16.0000  1.700000e+01  1.700000e+01
## logmarg                       NA -49604.9656 -4.960617e+04 -4.960807e+04
##                          model 4       model 5
## Intercept           1.000000e+00  1.000000e+00
## Devicemobile        1.000000e+00  1.000000e+00
## Devicetablet        1.000000e+00  0.000000e+00
## BrandBrand 2        1.000000e+00  1.000000e+00
## BrandBrand 3        1.000000e+00  1.000000e+00
## BrandBrand 4        1.000000e+00  1.000000e+00
## BrandBrand 5        1.000000e+00  1.000000e+00
## SourceDisplay       1.000000e+00  1.000000e+00
## SourceEmail         1.000000e+00  1.000000e+00
## SourceOrganic       1.000000e+00  0.000000e+00
## SourcePaid Search   1.000000e+00  1.000000e+00
## SourcePaid Social   1.000000e+00  1.000000e+00
## SourceReferral      1.000000e+00  1.000000e+00
## SourceSocial        1.000000e+00  1.000000e+00
## SourceUnattributed  0.000000e+00  1.000000e+00
## Sessions            1.000000e+00  1.000000e+00
## Goal_Completions    1.000000e+00  1.000000e+00
## Bounces             1.000000e+00  1.000000e+00
## TierTier 2          1.000000e+00  1.000000e+00
## BF                  1.396999e-02  1.119280e-02
## PostProbs           9.900000e-03  7.900000e-03
## R2                  6.014000e-01  6.011000e-01
## dim                 1.800000e+01  1.700000e+01
## logmarg            -4.960924e+04 -4.960946e+04

Comparing Tier1 and Tier 2

par(mfrow=c(2,2))
boxplot(ConversionRate ~ Tier , data = soups, col = "lightgray", main = "Compare Convertion Rate")
boxplot(Sessions ~ Tier , data = soups, col = "lightgray", main = "Compare Sessions")
boxplot(Bounces ~ Tier , data = soups, col = "lightgray", main = "Compare Bounces")
boxplot(Goal_Completions ~ Tier , data = soups, col = "lightgray", main = "Compare Goal Completions")