COLLEGE / COMPANY: Delhi Technological University Date: December 29, 2017 EMAIL: deepankvarshney14@gmail.com NAME: Deepank Varshney Project Title: Email Marketing Campaign management

SYNOPSIS: Initial analysis of the effect of various factors on the effectiveness of an email marketing campaign.

# Read the campaign data set. The path is spelled out with file.path() instead
# of calling setwd(), so the session's working directory is left unchanged.
eml <- read.csv(file.path("~/winter internship", "email_campaign.csv"))
# View() opens a spreadsheet-style viewer, so only call it interactively.
if (interactive()) {
  View(eml)
}

Dimensions

# Number of rows and number of columns of the data set.
c(nrow(eml), ncol(eml))
## [1] 68353    12

Summary Statistics

# Per-column descriptive statistics (n, mean, sd, median, ..., skew, kurtosis, se).
library(psych)
print(describe(eml))
##                           vars     n     mean       sd  median  trimmed
## Email_ID*                    1 68353 34177.00 19731.96 34177.0 34177.00
## Email_Type                   2 68353     1.29     0.45     1.0     1.23
## Subject_Hotness_Score        3 68353     1.10     1.00     0.8     0.97
## Email_Source_Type            4 68353     1.46     0.50     1.0     1.45
## Customer_Location*           5 68353     5.34     2.57     6.0     5.55
## Email_Campaign_Type          6 68353     2.27     0.47     2.0     2.23
## Total_Past_Communications    7 61528    28.93    12.54    28.0    28.60
## Time_Email_sent_Category     8 68353     2.00     0.63     2.0     2.00
## Word_Count                   9 68353   699.93   271.72   694.0   699.05
## Total_Links                 10 66152    10.43     6.38     9.0     9.59
## Total_Images                11 66676     3.55     5.60     0.0     2.31
## Email_Status                12 68353     0.23     0.50     0.0     0.12
##                                mad min   max range  skew kurtosis    se
## Email_ID*                 25334.67   1 68353 68352  0.00    -1.20 75.47
## Email_Type                    0.00   1     2     1  0.95    -1.09  0.00
## Subject_Hotness_Score         1.04   0     5     5  0.90     0.08  0.00
## Email_Source_Type             0.00   1     2     1  0.17    -1.97  0.00
## Customer_Location*            2.97   1     8     7 -0.53    -1.10  0.01
## Email_Campaign_Type           0.00   1     3     2  0.71    -0.69  0.00
## Total_Past_Communications    13.34   0    67    67  0.21    -0.44  0.05
## Time_Email_sent_Category      0.00   1     3     2  0.00    -0.49  0.00
## Word_Count                  265.39  40  1316  1276  0.01    -0.35  1.04
## Total_Links                   4.45   1    49    48  1.39     2.50  0.02
## Total_Images                  0.00   0    45    45  2.09     5.03  0.02
## Email_Status                  0.00   0     2     2  2.08     3.53  0.00

1Way Contingency Tables

# Frequency of each Email_Type value.
table(eml[["Email_Type"]])
## 
##     1     2 
## 48866 19487

Type 1 email is used more by the company

# Frequency of each Email_Source_Type value.
table(eml[["Email_Source_Type"]])
## 
##     1     2 
## 37149 31204

Type 1 email source is used more by the company

# Frequency of each Email_Campaign_Type value.
table(eml[["Email_Campaign_Type"]])
## 
##     1     2     3 
##   736 48273 19344

Type 2 email campaign is used more by the company

# Frequency of each Email_Status value (the outcome variable).
table(eml[["Email_Status"]])
## 
##     0     1     2 
## 54941 11039  2373

0 represents mails which weren’t read, 1 represents mails which were read, and 2 represents mails which were read and replied to. Clearly, most people don’t open their mails.

# Frequency of each Customer_Location value; the first, unlabelled level in the
# output is the empty string (rows with no location recorded).
table(eml[["Customer_Location"]])
## 
##           A     B     C     D     E     F     G 
## 11595  1454  4341  5758  7406 10193  4433 23173

the company has maximum customers in G area

# Frequency of each Time_Email_sent_Category value.
table(eml[["Time_Email_sent_Category"]])
## 
##     1     2     3 
## 13636 41129 13588

1 represents morning, 2 represents afternoon, and 3 represents night.

2 way contingency tables

# Cross-tabulate Email_Status (rows) against Time_Email_sent_Category (columns)
# and print the table with row and column totals.
mytable1 <- xtabs(
  ~ Email_Status + Time_Email_sent_Category,
  data = eml
)
addmargins(mytable1)
##             Time_Email_sent_Category
## Email_Status     1     2     3   Sum
##          0   10970 33062 10909 54941
##          1    2186  6631  2222 11039
##          2     480  1436   457  2373
##          Sum 13636 41129 13588 68353

Most mails are sent in the afternoon, and email status doesn’t depend much on the time of sending.

# Cross-tabulate Email_Status (rows) against Customer_Location (columns)
# and print the table with row and column totals.
mytable1 <- xtabs(
  ~ Email_Status + Customer_Location,
  data = eml
)
addmargins(mytable1)
##             Customer_Location
## Email_Status           A     B     C     D     E     F     G   Sum
##          0    9351  1160  3502  4654  5950  8136  3579 18609 54941
##          1    1848   245   714   890  1206  1693   698  3745 11039
##          2     396    49   125   214   250   364   156   819  2373
##          Sum 11595  1454  4341  5758  7406 10193  4433 23173 68353

Maximum mails are sent in G area

# Cross-tabulate Email_Status (rows) against Email_Campaign_Type (columns)
# and print the table with row and column totals.
mytable1 <- xtabs(
  ~ Email_Status + Email_Campaign_Type,
  data = eml
)
addmargins(mytable1)
##             Email_Campaign_Type
## Email_Status     1     2     3   Sum
##          0      77 42115 12749 54941
##          1     486  5446  5107 11039
##          2     173   712  1488  2373
##          Sum   736 48273 19344 68353

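Campaign 3 yields the most read or replied-to mails in absolute terms (6,595 of 19,344, about 34%), but campaign 1 has by far the highest response rate (659 of 736, about 90%), although it was used for very few mails.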

# Cross-tabulate Email_Status (rows) against Email_Source_Type (columns)
# and print the table with row and column totals.
mytable1 <- xtabs(
  ~ Email_Status + Email_Source_Type,
  data = eml
)
addmargins(mytable1)
##             Email_Source_Type
## Email_Status     1     2   Sum
##          0   29577 25364 54941
##          1    6150  4889 11039
##          2    1422   951  2373
##          Sum 37149 31204 68353
# Cross-tabulate Email_Status (rows) against Email_Type (columns)
# and print the table with row and column totals.
mytable1 <- xtabs(
  ~ Email_Status + Email_Type,
  data = eml
)
addmargins(mytable1)
##             Email_Type
## Email_Status     1     2   Sum
##          0   39004 15937 54941
##          1    8208  2831 11039
##          2    1654   719  2373
##          Sum 48866 19487 68353

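For both email source and email type, type 1 seems slightly more promising: about 20% of its mails are read or replied to, versus roughly 18–19% for type 2.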

Comparative averages of various factors by email status

# Mean Word_Count within each Email_Status group.
aggregate(
  x = eml$Word_Count,
  by = list(eml$Email_Status),
  FUN = mean
)
##   Group.1        x
## 1       0 725.2568
## 2       1 590.9396
## 3       2 620.6153

For mails that are read and replied to, the average word count is about 621.

# Mean Total_Images within each Email_Status group, ignoring missing values.
aggregate(
  x = eml$Total_Images,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
##   Group.1        x
## 1       0 3.617575
## 2       1 3.187361
## 3       2 3.690415

For people to read and reply, average images used in mail is 3.7

# Mean Total_Links within each Email_Status group, ignoring missing values.
aggregate(
  x = eml$Total_Links,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
##   Group.1         x
## 1       0 10.543199
## 2       1  9.854375
## 3       2 10.473638

For people to read and reply, average links used in a mail is 10.47

# Mean Total_Past_Communications within each Email_Status group, ignoring
# missing values.
aggregate(
  x = eml$Total_Past_Communications,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
##   Group.1        x
## 1       0 27.40013
## 2       1 34.70782
## 3       2 37.59680

For mails that are read and replied to, the average number of past communications is about 37.6 (it is 34.7 for mails that are only read).

# Mean Subject_Hotness_Score within each Email_Status group.
aggregate(
  x = eml$Subject_Hotness_Score,
  by = list(eml$Email_Status),
  FUN = mean
)
##   Group.1         x
## 1       0 1.1595584
## 2       1 0.9075188
## 3       2 0.4863043

For people to read and reply, average subject hotness score is 0.486

Boxplots

# Horizontal box plots of Total_Images, one box per Email_Status level.
boxplot(
  Total_Images ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  las = 1,
  xlab = "Number of images in the mail",
  ylab = "Email Status"
)

# Horizontal box plots of Total_Past_Communications, one box per Email_Status level.
boxplot(
  Total_Past_Communications ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  las = 1,
  xlab = "Number of past communications with the recipient in the mail",
  ylab = "Email Status"
)

For people to read and reply, past communications should be higher

# Horizontal box plots of Total_Links, one box per Email_Status level.
boxplot(
  Total_Links ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  las = 1,
  xlab = "Number of links in the mail",
  ylab = "Email Status"
)

# Horizontal box plots of Word_Count, one box per Email_Status level.
boxplot(
  Word_Count ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  las = 1,
  xlab = "Number of words in the mail",
  ylab = "Email Status"
)

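Mails that are read, or read and replied to, tend to have fewer words than unread mails (mean word counts of about 591 and 621 versus 725), so a lower word count appears preferable.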

# Horizontal box plots of Subject_Hotness_Score, one box per Email_Status level.
boxplot(
  Subject_Hotness_Score ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  las = 1,
  xlab = "Subject hotness score",
  ylab = "Email Status"
)

For people to read and reply, subject hotness should be lower

Histograms

# Histogram of Total_Past_Communications.
# The plot title previously misspelled "Communications".
hist(eml$Total_Past_Communications,
     main = "Total Past Communications frequency",
     col = "lightblue")

# Histogram of Total_Images.
hist(
  x = eml$Total_Images,
  main = "Total Images frequency",
  col = "lightblue"
)

# Histogram of Total_Links.
hist(
  x = eml$Total_Links,
  main = "Total links frequency",
  col = "lightblue"
)

# Histogram of Subject_Hotness_Score.
hist(
  x = eml$Subject_Hotness_Score,
  main = "subject hotness score frequency",
  col = "lightblue"
)

# Histogram of Word_Count.
hist(
  x = eml$Word_Count,
  main = "word count frequency",
  col = "lightblue"
)

plots

# Scatter plot of Subject_Hotness_Score against Email_Status; both axes are
# jittered so that overlapping points are spread apart.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Subject_Hotness_Score)
)

# Scatter plot of Word_Count against Email_Status; both axes are jittered so
# that overlapping points are spread apart.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Word_Count)
)

# Scatter plot of Total_Past_Communications against Email_Status; both axes
# are jittered so that overlapping points are spread apart.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Past_Communications)
)

# Scatter plot of Total_Links against Email_Status; both axes are jittered so
# that overlapping points are spread apart.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Links)
)

# Scatter plot of Total_Images against Email_Status; both axes are jittered so
# that overlapping points are spread apart.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Images)
)

# Correlation matrix of the numeric columns (columns 1 and 5, Email_ID and
# Customer_Location, are left out), rounded to two decimals.
# Total_Past_Communications, Total_Links and Total_Images contain missing
# values, so cor()'s default use = "everything" returns NA for every pair that
# involves them; pairwise-complete observations are used instead.
# NOTE: the printed matrix below was produced before this change, so its NA
# cells are stale until the report is re-run.
round(cor(eml[, c(2:4, 6:12)], use = "pairwise.complete.obs"), 2)
##                           Email_Type Subject_Hotness_Score
## Email_Type                      1.00                 -0.24
## Subject_Hotness_Score          -0.24                  1.00
## Email_Source_Type              -0.28                  0.01
## Email_Campaign_Type             0.23                 -0.55
## Total_Past_Communications         NA                    NA
## Time_Email_sent_Category        0.00                  0.00
## Word_Count                      0.08                 -0.24
## Total_Links                       NA                    NA
## Total_Images                      NA                    NA
## Email_Status                   -0.02                 -0.15
##                           Email_Source_Type Email_Campaign_Type
## Email_Type                            -0.28                0.23
## Subject_Hotness_Score                  0.01               -0.55
## Email_Source_Type                      1.00                0.04
## Email_Campaign_Type                    0.04                1.00
## Total_Past_Communications                NA                  NA
## Time_Email_sent_Category               0.00                0.00
## Word_Count                             0.05                0.06
## Total_Links                              NA                  NA
## Total_Images                             NA                  NA
## Email_Status                          -0.02                0.19
##                           Total_Past_Communications
## Email_Type                                       NA
## Subject_Hotness_Score                            NA
## Email_Source_Type                                NA
## Email_Campaign_Type                              NA
## Total_Past_Communications                         1
## Time_Email_sent_Category                         NA
## Word_Count                                       NA
## Total_Links                                      NA
## Total_Images                                     NA
## Email_Status                                     NA
##                           Time_Email_sent_Category Word_Count Total_Links
## Email_Type                                       0       0.08          NA
## Subject_Hotness_Score                            0      -0.24          NA
## Email_Source_Type                                0       0.05          NA
## Email_Campaign_Type                              0       0.06          NA
## Total_Past_Communications                       NA         NA          NA
## Time_Email_sent_Category                         1       0.00          NA
## Word_Count                                       0       1.00          NA
## Total_Links                                     NA         NA           1
## Total_Images                                    NA         NA          NA
## Email_Status                                     0      -0.17          NA
##                           Total_Images Email_Status
## Email_Type                          NA        -0.02
## Subject_Hotness_Score               NA        -0.15
## Email_Source_Type                   NA        -0.02
## Email_Campaign_Type                 NA         0.19
## Total_Past_Communications           NA           NA
## Time_Email_sent_Category            NA         0.00
## Word_Count                          NA        -0.17
## Total_Links                         NA           NA
## Total_Images                         1           NA
## Email_Status                        NA         1.00
# Correlogram of the data set: shaded cells in the lower triangle, pie glyphs
# in the upper triangle, variable names on the diagonal.
library(corrgram)
corrgram(
  eml,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

Hypothesis: various factors affect how effective an email marketing campaign is, where an effective campaign is one in which most of the mails sent are read, or read and replied to.

Chi Square Tests

# Pearson chi-squared test of independence between Email_Status and Email_Type.
chisq.test(
  x = eml$Email_Status,
  y = eml$Email_Type
)
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Type
## X-squared = 54.842, df = 2, p-value = 1.234e-12

The null hypothesis of independence is rejected: Email_Status and Email_Type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Email_Campaign_Type.
chisq.test(
  x = eml$Email_Status,
  y = eml$Email_Campaign_Type
)
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Campaign_Type
## X-squared = 6733.5, df = 4, p-value < 2.2e-16

The null hypothesis of independence is rejected: Email_Status and Email_Campaign_Type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Email_Source_Type.
chisq.test(
  x = eml$Email_Status,
  y = eml$Email_Source_Type
)
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Source_Type
## X-squared = 43.859, df = 2, p-value = 2.993e-10

The null hypothesis of independence is rejected: Email_Status and Email_Source_Type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Total_Past_Communications.
# NOTE(review): Total_Past_Communications takes many distinct values (df = 126
# in the recorded output), which is the likely cause of the approximation
# warning; consider binning it or using a rank-based test instead (confirm).
chisq.test(eml$Email_Status, eml$Total_Past_Communications)
## Warning in chisq.test(eml$Email_Status, eml$Total_Past_Communications):
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Past_Communications
## X-squared = 8778.7, df = 126, p-value < 2.2e-16

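The null hypothesis of independence is rejected, suggesting that Email_Status and Total_Past_Communications are associated; however, given the chi-squared approximation warning above, this result should be treated with caution.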

# Pearson chi-squared test of independence between Email_Status and Total_Links.
# NOTE(review): Total_Links takes many distinct values (df = 72 in the recorded
# output), which is the likely cause of the approximation warning; consider
# binning it or using a rank-based test instead (confirm).
chisq.test(eml$Email_Status, eml$Total_Links)
## Warning in chisq.test(eml$Email_Status, eml$Total_Links): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Links
## X-squared = 3092.2, df = 72, p-value < 2.2e-16

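The null hypothesis of independence is rejected, suggesting that Email_Status and Total_Links are associated; however, given the chi-squared approximation warning above, this result should be treated with caution.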

# Pearson chi-squared test of independence between Email_Status and Total_Images.
# NOTE(review): Total_Images takes many distinct values (df = 88 in the recorded
# output), which is the likely cause of the approximation warning; consider
# binning it or using a rank-based test instead (confirm).
chisq.test(eml$Email_Status, eml$Total_Images)
## Warning in chisq.test(eml$Email_Status, eml$Total_Images): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Images
## X-squared = 134.77, df = 88, p-value = 0.0009949

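The null hypothesis of independence is rejected at the 5% level (p ≈ 0.001), suggesting that Email_Status and Total_Images are associated; however, given the chi-squared approximation warning above, this result should be treated with caution.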

# Pearson chi-squared test of independence between Email_Status and Word_Count.
# NOTE(review): Word_Count takes many distinct values (df = 300 in the recorded
# output), which is the likely cause of the approximation warning; consider
# binning it or using a rank-based test instead (confirm).
chisq.test(eml$Email_Status, eml$Word_Count)
## Warning in chisq.test(eml$Email_Status, eml$Word_Count): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Word_Count
## X-squared = 3960.9, df = 300, p-value < 2.2e-16

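The null hypothesis of independence is rejected, suggesting that Email_Status and Word_Count are associated; however, given the chi-squared approximation warning above, this result should be treated with caution.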

# Pearson chi-squared test of independence between Email_Status and
# Subject_Hotness_Score.
# NOTE(review): Subject_Hotness_Score takes many distinct values (df = 100 in
# the recorded output), which is the likely cause of the approximation warning;
# consider binning it or using a rank-based test instead (confirm).
chisq.test(eml$Email_Status, eml$Subject_Hotness_Score)
## Warning in chisq.test(eml$Email_Status, eml$Subject_Hotness_Score): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Subject_Hotness_Score
## X-squared = 3579.8, df = 100, p-value < 2.2e-16

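The null hypothesis of independence is rejected, suggesting that Email_Status and Subject_Hotness_Score are associated; however, given the chi-squared approximation warning above, this result should be treated with caution.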

# Pearson chi-squared test of independence between Email_Status and
# Time_Email_sent_Category.
chisq.test(
  x = eml$Email_Status,
  y = eml$Time_Email_sent_Category
)
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Time_Email_sent_Category
## X-squared = 1.1065, df = 4, p-value = 0.8932

The null hypothesis of independence is not rejected: there is no evidence that Email_Status and Time_Email_sent_Category are associated.

# Pearson chi-squared test of independence between Email_Status and
# Customer_Location. NOTE(review): rows with an empty location appear to be
# counted as a category of their own (df = 14 implies 8 levels) - confirm.
chisq.test(
  x = eml$Email_Status,
  y = eml$Customer_Location
)
## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Customer_Location
## X-squared = 11.73, df = 14, p-value = 0.6279

The null hypothesis of independence is not rejected: there is no evidence that Email_Status and Customer_Location are associated.

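T tests

Note: each test below compares the overall mean of Email_Status with the overall mean of another column. Because the two columns are measured on different scales, a significant difference in means does not by itself show that the factor affects Email_Status.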

# Welch two-sample t-test on the Email_Status and Email_Type columns.
# NOTE(review): this compares the two columns' overall means, not Email_Status
# across the levels of Email_Type - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Email_Type
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Email_Type
## t = -410.45, df = 135460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.059194 -1.049126
## sample estimates:
## mean of x mean of y 
## 0.2309335 1.2850936

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Email_Campaign_Type columns.
# NOTE(review): this compares the two columns' overall means, not Email_Status
# across the levels of Email_Campaign_Type - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Email_Campaign_Type
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Email_Campaign_Type
## t = -781.21, df = 136240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.046422 -2.036179
## sample estimates:
## mean of x mean of y 
## 0.2309335 2.2722338

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Email_Source_Type columns.
# NOTE(review): this compares the two columns' overall means, not Email_Status
# across the levels of Email_Source_Type - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Email_Source_Type
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Email_Source_Type
## t = -455.36, df = 136700, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.230854 -1.220304
## sample estimates:
## mean of x mean of y 
## 0.2309335 1.4565125

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Total_Past_Communications
# columns.
# NOTE(review): this compares the two columns' overall means, not
# Total_Past_Communications across Email_Status groups - confirm this is the
# intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Total_Past_Communications
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Total_Past_Communications
## t = -567.5, df = 61701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -28.80145 -28.60319
## sample estimates:
##  mean of x  mean of y 
##  0.2309335 28.9332499

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Total_Links columns.
# NOTE(review): this compares the two columns' overall means, not Total_Links
# across Email_Status groups - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Total_Links
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Total_Links
## t = -409.73, df = 66927, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.24738 -10.14981
## sample estimates:
##  mean of x  mean of y 
##  0.2309335 10.4295259

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Total_Images columns.
# NOTE(review): this compares the two columns' overall means, not Total_Images
# across Email_Status groups - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Total_Images
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Total_Images
## t = -152.57, df = 67701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.362391 -3.277097
## sample estimates:
## mean of x mean of y 
## 0.2309335 3.5506779

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Word_Count columns.
# NOTE(review): this compares the two columns' overall means, not Word_Count
# across Email_Status groups - confirm this is the intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Word_Count
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Word_Count
## t = -673.24, df = 68352, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -701.7379 -697.6638
## sample estimates:
##   mean of x   mean of y 
##   0.2309335 699.9317513

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Subject_Hotness_Score columns.
# NOTE(review): this compares the two columns' overall means, not
# Subject_Hotness_Score across Email_Status groups - confirm this is the
# intended test.
t.test(
  x = eml$Email_Status,
  y = eml$Subject_Hotness_Score
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Subject_Hotness_Score
## t = -202.8, df = 100320, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8729027 -0.8561918
## sample estimates:
## mean of x mean of y 
## 0.2309335 1.0954808

Null hypothesis is rejected, there is significant difference in their means

# Welch two-sample t-test on the Email_Status and Time_Email_sent_Category
# columns.
# NOTE(review): this compares the two columns' overall means, not Email_Status
# across the levels of Time_Email_sent_Category - confirm this is the intended
# test.
t.test(
  x = eml$Email_Status,
  y = eml$Time_Email_sent_Category
)
## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Time_Email_sent_Category
## t = -575.52, df = 129590, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.774387 -1.762342
## sample estimates:
## mean of x mean of y 
## 0.2309335 1.9992978

Null hypothesis is rejected, there is significant difference in their means