Deepank_Varshney_My_Project

#COLLEGE / COMPANY: Delhi Technological University

#Date: December 29, 2017

#NAME: Deepank Varshney

#Project Title: Email Marketing Campaign management

SYNOPSIS: Analysis of the effect of various factors on the effectiveness of an email marketing campaign

# Load the campaign data set into `eml`.
# NOTE(review): setwd() ties the script to one machine's directory layout. It
# is kept so any later relative paths keep working, but prefer starting R from
# the project directory instead.
setwd("~/winter internship")
# stringsAsFactors = TRUE keeps the text columns (Email_ID, Customer_Location)
# as factors on R >= 4.0, where the read.csv() default changed to FALSE. This
# matches the factor-style summaries recorded in this report.
eml <- read.csv("email_campaign.csv", stringsAsFactors = TRUE)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(eml)
}

#Dimensions

# Number of rows followed by number of columns of the data set.
c(nrow(eml), ncol(eml))

## [1] 68353    12

#Head

# Show the first six rows to get a feel for the columns.
head(eml, n = 6L)

##            Email_ID Email_Type Subject_Hotness_Score Email_Source_Type
## 1 EMA00081000034500          1                   2.2                 2
## 2 EMA00081000045360          2                   2.1                 1
## 3 EMA00081000066290          2                   0.1                 1
## 4 EMA00081000076560          1                   3.0                 2
## 5 EMA00081000109720          1                   0.0                 2
## 6 EMA00081000131660          1                   1.5                 1
##   Customer_Location Email_Campaign_Type Total_Past_Communications
## 1                 E                   2                        33
## 2                                     2                        15
## 3                 B                   3                        36
## 4                 E                   2                        25
## 5                 C                   3                        18
## 6                 G                   2                        NA
##   Time_Email_sent_Category Word_Count Total_Links Total_Images
## 1                        1        440           8            0
## 2                        2        504           5            0
## 3                        2        962           5            0
## 4                        2        610          16            0
## 5                        2        947           4            0
## 6                        2        416          11            0
##   Email_Status
## 1            0
## 2            0
## 3            1
## 4            0
## 5            0
## 6            0

#Summary Statistics

# Per-column descriptive statistics (n, mean, sd, median, skew, ...) using
# describe() from the psych package.
library(psych)
describe(x = eml)

##                           vars     n     mean       sd  median  trimmed
## Email_ID*                    1 68353 34177.00 19731.96 34177.0 34177.00
## Email_Type                   2 68353     1.29     0.45     1.0     1.23
## Subject_Hotness_Score        3 68353     1.10     1.00     0.8     0.97
## Email_Source_Type            4 68353     1.46     0.50     1.0     1.45
## Customer_Location*           5 68353     5.34     2.57     6.0     5.55
## Email_Campaign_Type          6 68353     2.27     0.47     2.0     2.23
## Total_Past_Communications    7 61528    28.93    12.54    28.0    28.60
## Time_Email_sent_Category     8 68353     2.00     0.63     2.0     2.00
## Word_Count                   9 68353   699.93   271.72   694.0   699.05
## Total_Links                 10 66152    10.43     6.38     9.0     9.59
## Total_Images                11 66676     3.55     5.60     0.0     2.31
## Email_Status                12 68353     0.23     0.50     0.0     0.12
##                                mad min   max range  skew kurtosis    se
## Email_ID*                 25334.67   1 68353 68352  0.00    -1.20 75.47
## Email_Type                    0.00   1     2     1  0.95    -1.09  0.00
## Subject_Hotness_Score         1.04   0     5     5  0.90     0.08  0.00
## Email_Source_Type             0.00   1     2     1  0.17    -1.97  0.00
## Customer_Location*            2.97   1     8     7 -0.53    -1.10  0.01
## Email_Campaign_Type           0.00   1     3     2  0.71    -0.69  0.00
## Total_Past_Communications    13.34   0    67    67  0.21    -0.44  0.05
## Time_Email_sent_Category      0.00   1     3     2  0.00    -0.49  0.00
## Word_Count                  265.39  40  1316  1276  0.01    -0.35  1.04
## Total_Links                   4.45   1    49    48  1.39     2.50  0.02
## Total_Images                  0.00   0    45    45  2.09     5.03  0.02
## Email_Status                  0.00   0     2     2  2.08     3.53  0.00

#1Way Contingency Tables

# Frequency of each Email_Type code.
table(eml[["Email_Type"]])

## 
##     1     2 
## 48866 19487

Type 1 email is used more by the company

# Frequency of each Email_Source_Type code.
table(eml[["Email_Source_Type"]])

## 
##     1     2 
## 37149 31204

Type 1 email source is used more by the company

# Frequency of each Email_Campaign_Type code.
table(eml[["Email_Campaign_Type"]])

## 
##     1     2     3 
##   736 48273 19344

Type 2 email campaign is used more by the company

# Frequency of each Email_Status code (the outcome variable of this analysis).
table(eml[["Email_Status"]])

## 
##     0     1     2 
## 54941 11039  2373

Email_Status codes: 0 represents mails which weren't read, 1 represents mails which were read, and 2 represents mails which were read and replied to. Clearly, most people don't open their mails.

# Frequency of each Customer_Location value (including the blank one).
table(eml[["Customer_Location"]])

## 
##           A     B     C     D     E     F     G 
## 11595  1454  4341  5758  7406 10193  4433 23173

The company has the most customers in area G (23,173); note that 11,595 records have no location recorded.

# Frequency of each Time_Email_sent_Category code.
table(eml[["Time_Email_sent_Category"]])

## 
##     1     2     3 
## 13636 41129 13588

1- represents morning 2- represents afternoon 3- represents night

#2 way contingency tables

# Cross-tabulate Email_Status (rows) against Time_Email_sent_Category
# (columns), then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Time_Email_sent_Category,
  data = eml
)
addmargins(A = mytable1)

##             Time_Email_sent_Category
## Email_Status     1     2     3   Sum
##          0   10970 33062 10909 54941
##          1    2186  6631  2222 11039
##          2     480  1436   457  2373
##          Sum 13636 41129 13588 68353

Most mails are sent in the afternoon, and email status does not depend much on the time at which the mail is sent.

# Cross-tabulate Email_Status (rows) against Customer_Location (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Customer_Location,
  data = eml
)
addmargins(A = mytable1)

##             Customer_Location
## Email_Status           A     B     C     D     E     F     G   Sum
##          0    9351  1160  3502  4654  5950  8136  3579 18609 54941
##          1    1848   245   714   890  1206  1693   698  3745 11039
##          2     396    49   125   214   250   364   156   819  2373
##          Sum 11595  1454  4341  5758  7406 10193  4433 23173 68353

Maximum mails are sent in G area

# Cross-tabulate Email_Status (rows) against Email_Campaign_Type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Campaign_Type,
  data = eml
)
addmargins(A = mytable1)

##             Email_Campaign_Type
## Email_Status     1     2     3   Sum
##          0      77 42115 12749 54941
##          1     486  5446  5107 11039
##          2     173   712  1488  2373
##          Sum   736 48273 19344 68353

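Campaign 3 seems to be the most promising in absolute numbers (6,595 of its mails were read or replied to, versus 6,158 for campaign 2), although campaign 1 has by far the highest read rate (659 of its 736 mails were read or replied to).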

# Cross-tabulate Email_Status (rows) against Email_Source_Type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Source_Type,
  data = eml
)
addmargins(A = mytable1)

##             Email_Source_Type
## Email_Status     1     2   Sum
##          0   29577 25364 54941
##          1    6150  4889 11039
##          2    1422   951  2373
##          Sum 37149 31204 68353

# Cross-tabulate Email_Status (rows) against Email_Type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Type,
  data = eml
)
addmargins(A = mytable1)

##             Email_Type
## Email_Status     1     2   Sum
##          0   39004 15937 54941
##          1    8208  2831 11039
##          2    1654   719  2373
##          Sum 48866 19487 68353

Type 1 seems more promising

#Comparative averages of various factors by email status

# Mean Word_Count within each Email_Status group; the result has one row per
# status code (column Group.1) and the mean in column x.
mean1 <- aggregate(
  x = eml$Word_Count,
  by = list(eml$Email_Status),
  FUN = mean
)
print(mean1)

##   Group.1        x
## 1       0 725.2568
## 2       1 590.9396
## 3       2 620.6153

# Bar chart of the per-status mean word counts held in mean1.
# NOTE(review): the title numbers the statuses 1-3 while the data codes are
# 0-2; presumably this follows the positions lattice draws on the axis -- confirm.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean1,
  main = "averages of Word Count by email Status(1.UnRead;2.read;3.Read And Replied"
)

Mails that were read and replied to contain about 620 words on average. Read-only mails are shorter on average (about 591 words) and unread mails are longer (about 725 words).

# Mean Total_Images within each Email_Status group. na.rm = TRUE is forwarded
# to mean() so rows with a missing image count are ignored.
mean2 <- aggregate(
  x = eml$Total_Images,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean2)

##   Group.1        x
## 1       0 3.617575
## 2       1 3.187361
## 3       2 3.690415

# Bar chart of the per-status mean image counts held in mean2.
# NOTE(review): the title numbers the statuses 1-3 while the data codes are
# 0-2; presumably this follows the positions lattice draws on the axis -- confirm.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean2,
  main = "averages of Images by email Status(1.UnRead;2.read;3.Read And Replied"
)

Mails that were read and replied to contain about 3.7 images on average, compared with about 3.6 for unread mails and about 3.2 for read-only mails.

# Mean Total_Links within each Email_Status group. na.rm = TRUE is forwarded
# to mean() so rows with a missing link count are ignored.
mean3 <- aggregate(
  x = eml$Total_Links,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean3)

##   Group.1         x
## 1       0 10.543199
## 2       1  9.854375
## 3       2 10.473638

# Bar chart of the per-status mean link counts held in mean3.
# NOTE(review): the title numbers the statuses 1-3 while the data codes are
# 0-2; presumably this follows the positions lattice draws on the axis -- confirm.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean3,
  main = "averages of links by email Status(1.UnRead;2.read;3.Read And Replied"
)

Mails that were read and replied to contain about 10.47 links on average, compared with about 10.54 for unread mails and about 9.85 for read-only mails.

# Mean Total_Past_Communications within each Email_Status group. na.rm = TRUE
# is forwarded to mean() so rows with a missing value are ignored.
mean4 <- aggregate(
  x = eml$Total_Past_Communications,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean4)

##   Group.1        x
## 1       0 27.40013
## 2       1 34.70782
## 3       2 37.59680

# Bar chart of the per-status mean past-communication counts held in mean4.
# NOTE(review): the title numbers the statuses 1-3 while the data codes are
# 0-2; presumably this follows the positions lattice draws on the axis -- confirm.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean4,
  main = "averages of past comm. by email Status(1.UnRead;2.read;3.Read And Replied"
)

Mails that were read and replied to had about 37.6 past communications on average, compared with about 34.7 for read-only mails and about 27.4 for unread mails. Fewer past communications go with less engagement.

# Mean Subject_Hotness_Score within each Email_Status group.
mean5 <- aggregate(
  x = eml$Subject_Hotness_Score,
  by = list(eml$Email_Status),
  FUN = mean
)
print(mean5)

##   Group.1         x
## 1       0 1.1595584
## 2       1 0.9075188
## 3       2 0.4863043

# Bar chart of the per-status mean subject hotness scores held in mean5.
# NOTE(review): the title numbers the statuses 1-3 while the data codes are
# 0-2; presumably this follows the positions lattice draws on the axis -- confirm.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean5,
  main = "averages of subject score by email Status(1.UnRead;2.read;3.Read And Replied"
)

Mails that were read and replied to have an average subject hotness score of about 0.486. Higher average scores go with less engagement (about 0.91 for read-only mails and about 1.16 for unread mails).

#Boxplots

# Horizontal box plot of the number of images per mail, one box per
# Email_Status code. The formula + `data =` form replaces the repeated eml$
# prefixes, and the stray trailing comma (an empty argument) is removed.
boxplot(Total_Images ~ Email_Status, data = eml, horizontal = TRUE,
        xlab = "Number of images in the mail", ylab = "Email Status",
        las = 1)

# Horizontal box plot of past communications with the recipient, one box per
# Email_Status code. The formula + `data =` form replaces the repeated eml$
# prefixes, and the stray trailing comma (an empty argument) is removed.
boxplot(Total_Past_Communications ~ Email_Status, data = eml,
        horizontal = TRUE,
        xlab = "Number of past communications with the recipient in the mail",
        ylab = "Email Status", las = 1)

For people to read and reply, the number of past communications should be higher.

# Horizontal box plot of the number of links per mail, one box per
# Email_Status code. The formula + `data =` form replaces the repeated eml$
# prefixes, and the stray trailing comma (an empty argument) is removed.
boxplot(Total_Links ~ Email_Status, data = eml, horizontal = TRUE,
        xlab = "Number of links in the mail", ylab = "Email Status",
        las = 1)

# Horizontal box plot of the word count per mail, one box per Email_Status
# code. The formula + `data =` form replaces the repeated eml$ prefixes, and
# the stray trailing comma (an empty argument) is removed.
boxplot(Word_Count ~ Email_Status, data = eml, horizontal = TRUE,
        xlab = "Number of words in the mail", ylab = "Email Status",
        las = 1)

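For people to read and reply, the number of words should be lower: unread mails have the highest average word count (about 725 words, versus about 591 for read-only mails and about 620 for mails that were read and replied to).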

# Horizontal box plot of the subject hotness score, one box per Email_Status
# code. The formula + `data =` form replaces the repeated eml$ prefixes, and
# the stray trailing comma (an empty argument) is removed.
boxplot(Subject_Hotness_Score ~ Email_Status, data = eml, horizontal = TRUE,
        xlab = "Subject hotness score", ylab = "Email Status",
        las = 1)

For people to read and reply, subject hotness should be lower

#Histograms

# Frequency histograms of the five numeric mail attributes, drawn with base
# graphics. Each call is spelled out (rather than looped) so the default axis
# label still shows the column expression.
hist(
  x = eml$Total_Past_Communications,
  col = "lightblue",
  main = "Total Past Comunnications frequency"
)

hist(
  x = eml$Total_Images,
  col = "lightblue",
  main = "Total Images frequency"
)

hist(
  x = eml$Total_Links,
  col = "lightblue",
  main = "Total links frequency"
)

hist(
  x = eml$Subject_Hotness_Score,
  col = "lightblue",
  main = "subject hotness score frequency"
)

hist(
  x = eml$Word_Count,
  col = "lightblue",
  main = "word count frequency"
)

#histogram using library lattice

# Lattice histogram of Customer_Location, with one panel per Email_Status value.
library(lattice)
histogram(
  x = ~ Customer_Location | Email_Status,
  data = eml,
  col = "brown"
)

percent of total emails by location is nearly same in all the 3 cases of email status(read;unread;read and replied)

# Lattice histogram of Email_Type, with one panel per Email_Status value.
library(lattice)
histogram(
  x = ~ Email_Type | Email_Status,
  data = eml,
  col = "brown"
)

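The percentage of emails by email type differs somewhat across the 3 cases of email status (read; unread; read and replied). Among mails that were read and replied to, the share of type 2 emails is slightly higher than for the other statuses, although type 1 is still the majority.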

# Lattice histogram of Email_Campaign_Type, with one panel per Email_Status value.
library(lattice)
histogram(
  x = ~ Email_Campaign_Type | Email_Status,
  data = eml,
  col = "brown"
)

percent of total emails by Email Campaign type is not same in all the 3 cases of email status(read;unread;read and replied).For status read and replied, type 3 email campaign is used more

# Lattice histogram of Email_Source_Type, with one panel per Email_Status value.
library(lattice)
histogram(
  x = ~ Email_Source_Type | Email_Status,
  data = eml,
  col = "brown"
)

percent of total emails by email source type is not same in all the 3 cases of email status(read;unread;read and replied).For status read and replied, type 1 email source type is used more

# Lattice histogram of Time_Email_sent_Category, with one panel per
# Email_Status value.
library(lattice)
histogram(
  x = ~ Time_Email_sent_Category | Email_Status,
  data = eml,
  col = "brown"
)

percent of total emails by time at which it is sent is nearly same in all the 3 cases of email status(read;unread;read and replied)

#plots

# Scatter plots of Email_Status against each numeric mail attribute. Both
# axes are jittered so that points sharing the same values do not overlap.
plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Subject_Hotness_Score)
)

plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Word_Count)
)

plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Past_Communications)
)

plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Links)
)

plot(
  x = jitter(eml$Email_Status),
  y = jitter(eml$Total_Images)
)

# Pairwise correlations, rounded to two decimals. Columns 1 (Email_ID) and 5
# (Customer_Location) are left out of the selection.
# NOTE(review): Total_Past_Communications, Total_Links and Total_Images
# contain missing values, so every correlation involving them is NA; passing
# use = "pairwise.complete.obs" to cor() would fill those cells.
round(cor(x = eml[, c(2:4, 6:12)]), digits = 2)

##                           Email_Type Subject_Hotness_Score
## Email_Type                      1.00                 -0.24
## Subject_Hotness_Score          -0.24                  1.00
## Email_Source_Type              -0.28                  0.01
## Email_Campaign_Type             0.23                 -0.55
## Total_Past_Communications         NA                    NA
## Time_Email_sent_Category        0.00                  0.00
## Word_Count                      0.08                 -0.24
## Total_Links                       NA                    NA
## Total_Images                      NA                    NA
## Email_Status                   -0.02                 -0.15
##                           Email_Source_Type Email_Campaign_Type
## Email_Type                            -0.28                0.23
## Subject_Hotness_Score                  0.01               -0.55
## Email_Source_Type                      1.00                0.04
## Email_Campaign_Type                    0.04                1.00
## Total_Past_Communications                NA                  NA
## Time_Email_sent_Category               0.00                0.00
## Word_Count                             0.05                0.06
## Total_Links                              NA                  NA
## Total_Images                             NA                  NA
## Email_Status                          -0.02                0.19
##                           Total_Past_Communications
## Email_Type                                       NA
## Subject_Hotness_Score                            NA
## Email_Source_Type                                NA
## Email_Campaign_Type                              NA
## Total_Past_Communications                         1
## Time_Email_sent_Category                         NA
## Word_Count                                       NA
## Total_Links                                      NA
## Total_Images                                     NA
## Email_Status                                     NA
##                           Time_Email_sent_Category Word_Count Total_Links
## Email_Type                                       0       0.08          NA
## Subject_Hotness_Score                            0      -0.24          NA
## Email_Source_Type                                0       0.05          NA
## Email_Campaign_Type                              0       0.06          NA
## Total_Past_Communications                       NA         NA          NA
## Time_Email_sent_Category                         1       0.00          NA
## Word_Count                                       0       1.00          NA
## Total_Links                                     NA         NA           1
## Total_Images                                    NA         NA          NA
## Email_Status                                     0      -0.17          NA
##                           Total_Images Email_Status
## Email_Type                          NA        -0.02
## Subject_Hotness_Score               NA        -0.15
## Email_Source_Type                   NA        -0.02
## Email_Campaign_Type                 NA         0.19
## Total_Past_Communications           NA           NA
## Time_Email_sent_Category            NA         0.00
## Word_Count                          NA        -0.17
## Total_Links                         NA           NA
## Total_Images                         1           NA
## Email_Status                        NA         1.00

# Correlogram of the data set using the corrgram package: shaded cells in the
# lower triangle, pie glyphs in the upper triangle, variable names on the
# diagonal, with the variables reordered (order = TRUE).
library(corrgram)
corrgram(
  eml,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

#Hypothesis: various factors affect how effective an email marketing campaign is, where an effective campaign is one in which most of the mails sent are read, or read and replied to.

#Chi-Square Tests

# Pearson chi-squared test of independence between Email_Status and Email_Type.
chisq.test(x = eml$Email_Status, y = eml$Email_Type)

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Type
## X-squared = 54.842, df = 2, p-value = 1.234e-12

The null hypothesis of independence is rejected (p-value < 0.05): email status and email type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Email_Campaign_Type.
chisq.test(x = eml$Email_Status, y = eml$Email_Campaign_Type)

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Campaign_Type
## X-squared = 6733.5, df = 4, p-value < 2.2e-16

The null hypothesis of independence is rejected (p-value < 0.05): email status and email campaign type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Email_Source_Type.
chisq.test(x = eml$Email_Status, y = eml$Email_Source_Type)

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Email_Source_Type
## X-squared = 43.859, df = 2, p-value = 2.993e-10

The null hypothesis of independence is rejected (p-value < 0.05): email status and email source type are associated.

# Pearson chi-squared test of independence between Email_Status and
# Total_Past_Communications.
# NOTE(review): Total_Past_Communications is a count with many distinct
# values, so the contingency table is sparse and R warns that the chi-squared
# approximation may be incorrect; a rank-based test such as kruskal.test()
# may suit this variable better.
chisq.test(x = eml$Email_Status, y = eml$Total_Past_Communications)

## Warning in chisq.test(eml$Email_Status, eml$Total_Past_Communications):
## Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Past_Communications
## X-squared = 8778.7, df = 126, p-value < 2.2e-16

The null hypothesis of independence is rejected (p-value < 0.05): email status and the number of past communications are associated.

# Pearson chi-squared test of independence between Email_Status and
# Total_Links.
# NOTE(review): Total_Links is a count with many distinct values, so the
# contingency table is sparse and R warns that the chi-squared approximation
# may be incorrect; a rank-based test such as kruskal.test() may suit this
# variable better.
chisq.test(x = eml$Email_Status, y = eml$Total_Links)

## Warning in chisq.test(eml$Email_Status, eml$Total_Links): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Links
## X-squared = 3092.2, df = 72, p-value < 2.2e-16

The null hypothesis of independence is rejected (p-value < 0.05): email status and the number of links are associated.

# Pearson chi-squared test of independence between Email_Status and
# Total_Images.
# NOTE(review): Total_Images is a count with many distinct values, so the
# contingency table is sparse and R warns that the chi-squared approximation
# may be incorrect; a rank-based test such as kruskal.test() may suit this
# variable better.
chisq.test(x = eml$Email_Status, y = eml$Total_Images)

## Warning in chisq.test(eml$Email_Status, eml$Total_Images): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Total_Images
## X-squared = 134.77, df = 88, p-value = 0.0009949

The null hypothesis of independence is rejected (p-value < 0.05): email status and the number of images are associated.

# Pearson chi-squared test of independence between Email_Status and
# Word_Count.
# NOTE(review): Word_Count is a count with many distinct values, so the
# contingency table is sparse and R warns that the chi-squared approximation
# may be incorrect; a rank-based test such as kruskal.test() may suit this
# variable better.
chisq.test(x = eml$Email_Status, y = eml$Word_Count)

## Warning in chisq.test(eml$Email_Status, eml$Word_Count): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Word_Count
## X-squared = 3960.9, df = 300, p-value < 2.2e-16

The null hypothesis of independence is rejected (p-value < 0.05): email status and word count are associated.

# Pearson chi-squared test of independence between Email_Status and
# Subject_Hotness_Score.
# NOTE(review): Subject_Hotness_Score takes many distinct numeric values, so
# the contingency table is sparse and R warns that the chi-squared
# approximation may be incorrect; a rank-based test such as kruskal.test()
# may suit this variable better.
chisq.test(x = eml$Email_Status, y = eml$Subject_Hotness_Score)

## Warning in chisq.test(eml$Email_Status, eml$Subject_Hotness_Score): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Subject_Hotness_Score
## X-squared = 3579.8, df = 100, p-value < 2.2e-16

The null hypothesis of independence is rejected (p-value < 0.05): email status and subject hotness score are associated.

# Pearson chi-squared test of independence between Email_Status and
# Time_Email_sent_Category.
chisq.test(x = eml$Email_Status, y = eml$Time_Email_sent_Category)

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Time_Email_sent_Category
## X-squared = 1.1065, df = 4, p-value = 0.8932

The null hypothesis of independence cannot be rejected since the p-value is > 0.05: there is no evidence that email status depends on the time at which the mail is sent.

# Pearson chi-squared test of independence between Email_Status and
# Customer_Location.
chisq.test(x = eml$Email_Status, y = eml$Customer_Location)

## 
##  Pearson's Chi-squared test
## 
## data:  eml$Email_Status and eml$Customer_Location
## X-squared = 11.73, df = 14, p-value = 0.6279

The null hypothesis of independence cannot be rejected since the p-value is > 0.05: there is no evidence that email status depends on customer location.

#T tests

# Two-sample t-test comparing the mean of the Email_Status codes with the
# mean of the Email_Type codes.
# NOTE(review): the two columns are different categorical codings (0-2 and
# 1-2), so a difference in their means is hard to interpret -- confirm that
# this is the intended comparison.
t.test(x = eml$Email_Status, y = eml$Email_Type)

## 
##  Welch Two Sample t-test
## 
## data:  eml$Email_Status and eml$Email_Type
## t = -410.45, df = 135460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.059194 -1.049126
## sample estimates:
## mean of x mean of y 
## 0.2309335 1.2850936