library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(statsr)
library(BAS)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(devtools)
library(mvtnorm)
library(corrplot)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(readxl)
Test_Data <- read_excel("~/Desktop/Case_data/Test_Data.xlsx")
## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/America/Toronto'
soups<-na.omit(Test_Data)
summary(soups)
## Year Week # Week Device
## Min. :2016 Min. : 1.00 Length:17036 Length:17036
## 1st Qu.:2016 1st Qu.:13.00 Class :character Class :character
## Median :2016 Median :24.00 Mode :character Mode :character
## Mean :2016 Mean :24.34
## 3rd Qu.:2017 3rd Qu.:35.00
## Max. :2017 Max. :53.00
## Brand Source Sessions Goal_Completions
## Length:17036 Length:17036 Min. : 1 Min. : 0.0
## Class :character Class :character 1st Qu.: 218 1st Qu.: 32.0
## Mode :character Mode :character Median : 1062 Median : 195.0
## Mean : 2564 Mean : 932.7
## 3rd Qu.: 3232 3rd Qu.: 950.0
## Max. :140126 Max. :22958.0
## Bounces Tier Date
## Min. : 0 Length:17036 Min. :2015-12-27 00:00:00
## 1st Qu.: 76 Class :character 1st Qu.:2016-06-05 00:00:00
## Median : 303 Mode :character Median :2016-11-13 00:00:00
## Mean : 1011 Mean :2016-11-17 10:12:28
## 3rd Qu.: 945 3rd Qu.:2017-04-30 00:00:00
## Max. :124244 Max. :2017-10-08 00:00:00
str(soups)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17036 obs. of 11 variables:
## $ Year : num 2016 2016 2016 2016 2016 ...
## $ Week # : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Week : chr "27 December, 2015" "27 December, 2015" "27 December, 2015" "27 December, 2015" ...
## $ Device : chr "desktop" "desktop" "desktop" "desktop" ...
## $ Brand : chr "Brand 1" "Brand 1" "Brand 1" "Brand 1" ...
## $ Source : chr "Direct" "Display" "Organic" "Paid Search" ...
## $ Sessions : num 409 27 719 573 481 ...
## $ Goal_Completions: num 209 6 307 269 241 ...
## $ Bounces : num 145 15 284 183 137 589 72 1 802 448 ...
## $ Tier : chr "Tier 1" "Tier 1" "Tier 1" "Tier 1" ...
## $ Date : POSIXct, format: "2015-12-27" "2015-12-27" ...
## - attr(*, "na.action")=Class 'omit' Named int [1:1768] 6 13 25 28 41 45 51 58 65 68 ...
## .. ..- attr(*, "names")= chr [1:1768] "6" "13" "25" "28" ...
head(soups)
## # A tibble: 6 x 11
## Year `Week #` Week Device Brand Source Sessions
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2016 1 27 December, 2015 desktop Brand 1 Direct 409
## 2 2016 1 27 December, 2015 desktop Brand 1 Display 27
## 3 2016 1 27 December, 2015 desktop Brand 1 Organic 719
## 4 2016 1 27 December, 2015 desktop Brand 1 Paid Search 573
## 5 2016 1 27 December, 2015 desktop Brand 1 Referral 481
## 6 2016 1 27 December, 2015 desktop Brand 2 Direct 1677
## # ... with 4 more variables: Goal_Completions <dbl>, Bounces <dbl>,
## # Tier <chr>, Date <dttm>
tail(soups)
## # A tibble: 6 x 11
## Year `Week #` Week Device Brand Source Sessions
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 2017 40 1 October, 2017 tablet Brand 4 Display 381
## 2 2017 40 1 October, 2017 tablet Brand 4 Organic 59
## 3 2017 40 1 October, 2017 tablet Brand 4 Paid Search 1284
## 4 2017 40 1 October, 2017 tablet Brand 4 Paid Social 204
## 5 2017 40 1 October, 2017 tablet Brand 4 Referral 354
## 6 2017 40 1 October, 2017 tablet Brand 4 Social 4
## # ... with 4 more variables: Goal_Completions <dbl>, Bounces <dbl>,
## # Tier <chr>, Date <dttm>
describe(soups)
## Warning in describe(soups): NAs introduced by coercion
## Warning in describe(soups): NAs introduced by coercion
## Warning in describe(soups): NAs introduced by coercion
## Warning in describe(soups): NAs introduced by coercion
## Warning in describe(soups): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd median trimmed mad min
## Year 1 17036 2016.44 0.50 2016 2016.42 0.00 2016
## Week # 2 17036 24.34 13.97 24 23.97 16.31 1
## Week* 3 17036 NaN NA NA NaN NA Inf
## Device* 4 17036 NaN NA NA NaN NA Inf
## Brand* 5 17036 NaN NA NA NaN NA Inf
## Source* 6 17036 NaN NA NA NaN NA Inf
## Sessions 7 17036 2564.49 4555.90 1062 1726.66 1460.36 1
## Goal_Completions 8 17036 932.69 1746.75 195 490.10 278.73 0
## Bounces 9 17036 1011.23 3212.07 303 529.71 413.65 0
## Tier* 10 17036 NaN NA NA NaN NA Inf
## Date* 11 17036 NaN NA NA NaN NA Inf
## max range skew kurtosis se
## Year 2017 1 0.24 -1.94 0.00
## Week # 53 52 0.16 -0.99 0.11
## Week* -Inf -Inf NA NA NA
## Device* -Inf -Inf NA NA NA
## Brand* -Inf -Inf NA NA NA
## Source* -Inf -Inf NA NA NA
## Sessions 140126 140125 9.68 193.91 34.91
## Goal_Completions 22958 22958 3.23 14.05 13.38
## Bounces 124244 124244 18.70 509.06 24.61
## Tier* -Inf -Inf NA NA NA
## Date* -Inf -Inf NA NA NA
soup_cor <- soups[c( 7:9)]
cor(soup_cor)
## Sessions Goal_Completions Bounces
## Sessions 1.0000000 0.6157141 0.8611959
## Goal_Completions 0.6157141 1.0000000 0.1680161
## Bounces 0.8611959 0.1680161 1.0000000
During the correlation analysis I found that Sessions are correlate to GC 0.62 and to Bounce as 0.86
GC correlates to Sessions as 0.62
Bounce correlates to Sessions as 0.86
The findings were expected.
Devices vs Sessions, mobile devices brings the most of the sessions
ggplot( data = soups, aes(x = Device, y = Sessions)) + geom_jitter()
Device vs Bounces, because mobile devices bring most of sessions, there will be more bounces
ggplot( data = soups, aes(x = Device, y = Bounces)) + geom_jitter()
Finding the highest Conversion Rate among the Sources. Direct wins. Unattributed, Email , Paid Social, Social have the most highest outlines and least median of the CR.
soups$ConversionRate <- soups$`Goal_Completions`/ soups$Sessions
View(soups)
## Warning: running command ''/usr/bin/otool' -L '/Library/Frameworks/
## R.framework/Resources/modules/R_de.so'' had status 1
ggplot( data = soups, aes(x = Source, y = ConversionRate)) + geom_boxplot()
ggplot(data = soups, aes(x = Source, y = `Goal_Completions` )) +
geom_jitter()
soup.GC.lm=lm(`Goal_Completions`~Source, data = soups)
summary(soup.GC.lm)
##
## Call:
## lm(formula = Goal_Completions ~ Source, data = soups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1823.8 -827.6 -117.5 26.6 21133.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1824.81 32.00 57.025 < 2e-16 ***
## SourceDisplay -1485.50 45.52 -32.631 < 2e-16 ***
## SourceEmail -1706.35 50.12 -34.046 < 2e-16 ***
## SourceOrganic -400.34 45.33 -8.831 < 2e-16 ***
## SourcePaid Search -176.49 45.25 -3.901 9.63e-05 ***
## SourcePaid Social -1707.39 51.74 -33.002 < 2e-16 ***
## SourceReferral -912.24 45.25 -20.162 < 2e-16 ***
## SourceSocial -1809.46 57.25 -31.607 < 2e-16 ***
## SourceUnattributed -1822.64 256.20 -7.114 1.17e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1608 on 17027 degrees of freedom
## Multiple R-squared: 0.1533, Adjusted R-squared: 0.1529
## F-statistic: 385.4 on 8 and 17027 DF, p-value: < 2.2e-16
Running the model search, I found that the most contributed to the Goal_Completions are placed in the order Paid Search, Organic, Referral, Display, Email, Paid Social, Social, Unattributed, as we can clearly see on the chart. But this model is not precisely matching the graph which could mean the model should be investigated further
ggplot(data = soups, aes(x = Source, y = Sessions )) +
geom_jitter()
soup.S.lm=lm(Sessions ~ Source, data = soups)
summary(soup.S.lm)
##
## Call:
## lm(formula = Sessions ~ Source, data = soups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4512 -1988 -755 295 135450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3312.0 86.8 38.157 < 2e-16 ***
## SourceDisplay -132.0 123.5 -1.069 0.285
## SourceEmail -2783.9 135.9 -20.478 < 2e-16 ***
## SourceOrganic -630.3 123.0 -5.126 2.99e-07 ***
## SourcePaid Search 1364.1 122.7 11.115 < 2e-16 ***
## SourcePaid Social -1459.8 140.3 -10.403 < 2e-16 ***
## SourceReferral -1312.3 122.7 -10.693 < 2e-16 ***
## SourceSocial -3233.4 155.3 -20.822 < 2e-16 ***
## SourceUnattributed -3300.4 694.9 -4.749 2.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4361 on 17027 degrees of freedom
## Multiple R-squared: 0.08426, Adjusted R-squared: 0.08383
## F-statistic: 195.8 on 8 and 17027 DF, p-value: < 2.2e-16
Here is an interesting finding of the model: The variables impact order on numbers of sessions Paid Search, then Display, then Organic, Referral, Paid Social, Email, Social and Unattributed. The model’s results are not matching the chart. Model requires further investigation.
ggplot( data = soups, aes(x = Source , y = Bounces)) + geom_jitter()
soup.B.lm=lm(Bounces ~ Source, data = soups)
summary(soup.B.lm)
##
## Call:
## lm(formula = Bounces ~ Source, data = soups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2246 -772 -267 30 122458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 845.99 62.22 13.598 < 2e-16 ***
## SourceDisplay 1401.31 88.51 15.832 < 2e-16 ***
## SourceEmail -683.59 97.45 -7.015 2.38e-12 ***
## SourceOrganic -296.99 88.14 -3.370 0.000754 ***
## SourcePaid Search 939.69 87.97 10.682 < 2e-16 ***
## SourcePaid Social 594.38 100.59 5.909 3.50e-09 ***
## SourceReferral -417.30 87.97 -4.744 2.12e-06 ***
## SourceSocial -804.58 111.31 -7.229 5.09e-13 ***
## SourceUnattributed -840.29 498.12 -1.687 0.091636 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3126 on 17027 degrees of freedom
## Multiple R-squared: 0.05349, Adjusted R-squared: 0.05305
## F-statistic: 120.3 on 8 and 17027 DF, p-value: < 2.2e-16
Here I found that if the source of the traffic was a Display ad, it has 1401.31 affect on Bounce, then it’s Paid Search (939.69), Paid Social (594.38). Same time Organic traffic source provide (-296.99) coefficient to the Bounce rate , Referral (-417.3), Email, Social. I can oversee that as conscious following of a recommendation prevalent advertisement. The Graph of the data set does not match the model’s findings.
model_bma = bas.lm(ConversionRate ~.-Week -Date -Year -`Week #`, data = soups,
prior = "BIC",
modelprior = uniform(),
method="MCMC")
model_bma
##
## Call:
## bas.lm(formula = ConversionRate ~ . - Week - Date - Year - `Week #`, data = soups, prior = "BIC", modelprior = uniform(), method = "MCMC")
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Devicemobile Devicetablet
## 1.00000 1.00000 0.23216
## BrandBrand 2 BrandBrand 3 BrandBrand 4
## 0.99998 0.99999 1.00000
## BrandBrand 5 SourceDisplay SourceEmail
## 0.99993 1.00000 0.99999
## SourceOrganic SourcePaid Search SourcePaid Social
## 0.04345 0.99998 1.00000
## SourceReferral SourceSocial SourceUnattributed
## 0.99999 0.99999 0.01076
## Sessions Goal_Completions Bounces
## 0.99999 1.00000 1.00000
## TierTier 2
## 0.99999
summary(model_bma)
## P(B != 0 | Y) model 1 model 2 model 3
## Intercept 1.00000000 1.0000 1.000000e+00 1.000000e+00
## Devicemobile 0.99999733 1.0000 1.000000e+00 1.000000e+00
## Devicetablet 0.23215981 0.0000 1.000000e+00 0.000000e+00
## BrandBrand 2 0.99998474 1.0000 1.000000e+00 1.000000e+00
## BrandBrand 3 0.99999065 1.0000 1.000000e+00 1.000000e+00
## BrandBrand 4 0.99999638 1.0000 1.000000e+00 1.000000e+00
## BrandBrand 5 0.99992943 1.0000 1.000000e+00 1.000000e+00
## SourceDisplay 0.99999676 1.0000 1.000000e+00 1.000000e+00
## SourceEmail 0.99999428 1.0000 1.000000e+00 1.000000e+00
## SourceOrganic 0.04345188 0.0000 0.000000e+00 1.000000e+00
## SourcePaid Search 0.99998302 1.0000 1.000000e+00 1.000000e+00
## SourcePaid Social 0.99999580 1.0000 1.000000e+00 1.000000e+00
## SourceReferral 0.99998856 1.0000 1.000000e+00 1.000000e+00
## SourceSocial 0.99999065 1.0000 1.000000e+00 1.000000e+00
## SourceUnattributed 0.01076469 0.0000 0.000000e+00 0.000000e+00
## Sessions 0.99998627 1.0000 1.000000e+00 1.000000e+00
## Goal_Completions 0.99999771 1.0000 1.000000e+00 1.000000e+00
## Bounces 1.00000000 1.0000 1.000000e+00 1.000000e+00
## TierTier 2 0.99999409 1.0000 1.000000e+00 1.000000e+00
## BF NA 1.0000 3.008047e-01 4.500783e-02
## PostProbs NA 0.7264 2.197000e-01 3.320000e-02
## R2 NA 0.6011 6.013000e-01 6.012000e-01
## dim NA 16.0000 1.700000e+01 1.700000e+01
## logmarg NA -49604.9656 -4.960617e+04 -4.960807e+04
## model 4 model 5
## Intercept 1.000000e+00 1.000000e+00
## Devicemobile 1.000000e+00 1.000000e+00
## Devicetablet 1.000000e+00 0.000000e+00
## BrandBrand 2 1.000000e+00 1.000000e+00
## BrandBrand 3 1.000000e+00 1.000000e+00
## BrandBrand 4 1.000000e+00 1.000000e+00
## BrandBrand 5 1.000000e+00 1.000000e+00
## SourceDisplay 1.000000e+00 1.000000e+00
## SourceEmail 1.000000e+00 1.000000e+00
## SourceOrganic 1.000000e+00 0.000000e+00
## SourcePaid Search 1.000000e+00 1.000000e+00
## SourcePaid Social 1.000000e+00 1.000000e+00
## SourceReferral 1.000000e+00 1.000000e+00
## SourceSocial 1.000000e+00 1.000000e+00
## SourceUnattributed 0.000000e+00 1.000000e+00
## Sessions 1.000000e+00 1.000000e+00
## Goal_Completions 1.000000e+00 1.000000e+00
## Bounces 1.000000e+00 1.000000e+00
## TierTier 2 1.000000e+00 1.000000e+00
## BF 1.396999e-02 1.119280e-02
## PostProbs 9.900000e-03 7.900000e-03
## R2 6.014000e-01 6.011000e-01
## dim 1.800000e+01 1.700000e+01
## logmarg -4.960924e+04 -4.960946e+04
par(mfrow=c(2,2))
boxplot(ConversionRate ~ Tier , data = soups, col = "lightgray", main = "Compare Convertion Rate")
boxplot(Sessions ~ Tier , data = soups, col = "lightgray", main = "Compare Sessions")
boxplot(Bounces ~ Tier , data = soups, col = "lightgray", main = "Compare Bounces")
boxplot(Goal_Completions ~ Tier , data = soups, col = "lightgray", main = "Compare Goal Completions")