COLLEGE / COMPANY: Delhi Technological University Date: December 29, 2017 EMAIL: deepankvarshney14@gmail.com NAME: Deepank Varshney Project Title: Email Marketing Campaign management
SYNOPSIS Initial analysis of the effect of various factors on the effectiveness of an email marketing campaign
# Load the campaign data set. The working directory is still switched to the
# project folder so that the relative file name resolves exactly as before.
setwd("~/winter internship")
# The file name is a constant, so it is passed directly instead of being
# wrapped in a no-op paste(..., sep = "").
eml <- read.csv("email_campaign.csv")
# View() opens a GUI data viewer; only call it in an interactive session so
# the script can also be run with Rscript or knitted without a display.
if (interactive()) {
  View(eml)
}
# Number of rows and columns that were read.
dim(eml)
## [1] 68353 12
# Preview the first six records to check how the columns were parsed.
head(eml, n = 6L)
## Email_ID Email_Type Subject_Hotness_Score Email_Source_Type
## 1 EMA00081000034500 1 2.2 2
## 2 EMA00081000045360 2 2.1 1
## 3 EMA00081000066290 2 0.1 1
## 4 EMA00081000076560 1 3.0 2
## 5 EMA00081000109720 1 0.0 2
## 6 EMA00081000131660 1 1.5 1
## Customer_Location Email_Campaign_Type Total_Past_Communications
## 1 E 2 33
## 2 2 15
## 3 B 3 36
## 4 E 2 25
## 5 C 3 18
## 6 G 2 NA
## Time_Email_sent_Category Word_Count Total_Links Total_Images
## 1 1 440 8 0
## 2 2 504 5 0
## 3 2 962 5 0
## 4 2 610 16 0
## 5 2 947 4 0
## 6 2 416 11 0
## Email_Status
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 0
library(psych)
# Per-column descriptive statistics (n, mean, sd, median, skew, ...) from psych.
psych::describe(eml)
## vars n mean sd median trimmed
## Email_ID* 1 68353 34177.00 19731.96 34177.0 34177.00
## Email_Type 2 68353 1.29 0.45 1.0 1.23
## Subject_Hotness_Score 3 68353 1.10 1.00 0.8 0.97
## Email_Source_Type 4 68353 1.46 0.50 1.0 1.45
## Customer_Location* 5 68353 5.34 2.57 6.0 5.55
## Email_Campaign_Type 6 68353 2.27 0.47 2.0 2.23
## Total_Past_Communications 7 61528 28.93 12.54 28.0 28.60
## Time_Email_sent_Category 8 68353 2.00 0.63 2.0 2.00
## Word_Count 9 68353 699.93 271.72 694.0 699.05
## Total_Links 10 66152 10.43 6.38 9.0 9.59
## Total_Images 11 66676 3.55 5.60 0.0 2.31
## Email_Status 12 68353 0.23 0.50 0.0 0.12
## mad min max range skew kurtosis se
## Email_ID* 25334.67 1 68353 68352 0.00 -1.20 75.47
## Email_Type 0.00 1 2 1 0.95 -1.09 0.00
## Subject_Hotness_Score 1.04 0 5 5 0.90 0.08 0.00
## Email_Source_Type 0.00 1 2 1 0.17 -1.97 0.00
## Customer_Location* 2.97 1 8 7 -0.53 -1.10 0.01
## Email_Campaign_Type 0.00 1 3 2 0.71 -0.69 0.00
## Total_Past_Communications 13.34 0 67 67 0.21 -0.44 0.05
## Time_Email_sent_Category 0.00 1 3 2 0.00 -0.49 0.00
## Word_Count 265.39 40 1316 1276 0.01 -0.35 1.04
## Total_Links 4.45 1 49 48 1.39 2.50 0.02
## Total_Images 0.00 0 45 45 2.09 5.03 0.02
## Email_Status 0.00 0 2 2 2.08 3.53 0.00
# Number of mails per email type.
table(eml[["Email_Type"]])
##
## 1 2
## 48866 19487
Type 1 email is used more by the company
# Number of mails per email source type.
table(eml[["Email_Source_Type"]])
##
## 1 2
## 37149 31204
Type 1 email source is used more by the company
# Number of mails per campaign type.
table(eml[["Email_Campaign_Type"]])
##
## 1 2 3
## 736 48273 19344
Type 2 email campaign is used more by the company
# Number of mails per outcome code (Email_Status).
table(eml[["Email_Status"]])
##
## 0 1 2
## 54941 11039 2373
0 - represents mails which weren’t read; 1 - represents mails which were read; 2 - represents mails which were read and replied to. Clearly, most recipients do not open their mails (54,941 of the 68,353 mails were not read).
# Number of mails per customer location.
table(eml[["Customer_Location"]])
##
## A B C D E F G
## 11595 1454 4341 5758 7406 10193 4433 23173
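The company has the most customers in area G (23,173). Note that the table has eight counts for seven labelled areas: the first, unlabelled count (11,595 records) appears to belong to customers whose location is blank.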
# Number of mails per send-time category.
table(eml[["Time_Email_sent_Category"]])
##
## 1 2 3
## 13636 41129 13588
1- represents morning 2- represents afternoon 3- represents night
# Cross-tabulate email status (rows) against send-time category (columns)
# and print the table with row and column totals appended.
mytable1 <- xtabs(
  formula = ~ Email_Status + Time_Email_sent_Category,
  data = eml
)
addmargins(A = mytable1)
## Time_Email_sent_Category
## Email_Status 1 2 3 Sum
## 0 10970 33062 10909 54941
## 1 2186 6631 2222 11039
## 2 480 1436 457 2373
## Sum 13636 41129 13588 68353
Maximum mails are sent in afternoon and email status doesn’t much depend on time
# Cross-tabulate email status (rows) against customer location (columns)
# and print the table with row and column totals appended.
mytable1 <- xtabs(
  formula = ~ Email_Status + Customer_Location,
  data = eml
)
addmargins(A = mytable1)
## Customer_Location
## Email_Status A B C D E F G Sum
## 0 9351 1160 3502 4654 5950 8136 3579 18609 54941
## 1 1848 245 714 890 1206 1693 698 3745 11039
## 2 396 49 125 214 250 364 156 819 2373
## Sum 11595 1454 4341 5758 7406 10193 4433 23173 68353
Maximum mails are sent in G area
# Cross-tabulate email status (rows) against campaign type (columns)
# and print the table with row and column totals appended.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Campaign_Type,
  data = eml
)
addmargins(A = mytable1)
## Email_Campaign_Type
## Email_Status 1 2 3 Sum
## 0 77 42115 12749 54941
## 1 486 5446 5107 11039
## 2 173 712 1488 2373
## Sum 736 48273 19344 68353
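Campaign type 1 has by far the highest response rate (about 90% of its mails were read or replied to), but it is very small (736 mails). Among the two large campaigns, campaign 3 seems to be the most promising: about 34% of its mails were read or replied to, versus about 13% for campaign 2.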
# Cross-tabulate email status (rows) against email source type (columns)
# and print the table with row and column totals appended.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Source_Type,
  data = eml
)
addmargins(A = mytable1)
## Email_Source_Type
## Email_Status 1 2 Sum
## 0 29577 25364 54941
## 1 6150 4889 11039
## 2 1422 951 2373
## Sum 37149 31204 68353
# Cross-tabulate email status (rows) against email type (columns)
# and print the table with row and column totals appended.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Type,
  data = eml
)
addmargins(A = mytable1)
## Email_Type
## Email_Status 1 2 Sum
## 0 39004 15937 54941
## 1 8208 2831 11039
## 2 1654 719 2373
## Sum 48866 19487 68353
Type 1 seems slightly more promising: about 20% of type 1 mails were read or replied to, versus about 18% of type 2 mails.
# Mean word count within each email status group.
aggregate(
  x = eml$Word_Count,
  by = list(eml$Email_Status),
  FUN = mean
)
## Group.1 x
## 1 0 725.2568
## 2 1 590.9396
## 3 2 620.6153
Mails that were read and replied to contain about 620 words on average; mails that were not read are longer (about 725 words on average).
# Mean number of images within each email status group, ignoring missing values.
aggregate(
  x = eml$Total_Images,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 x
## 1 0 3.617575
## 2 1 3.187361
## 3 2 3.690415
For people to read and reply, average images used in mail is 3.7
# Mean number of links within each email status group, ignoring missing values.
aggregate(
  x = eml$Total_Links,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 x
## 1 0 10.543199
## 2 1 9.854375
## 3 2 10.473638
For people to read and reply, average links used in a mail is 10.47
# Mean number of past communications within each email status group,
# ignoring missing values.
aggregate(
  x = eml$Total_Past_Communications,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 x
## 1 0 27.40013
## 2 1 34.70782
## 3 2 37.59680
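For mails that were read and replied to, the average number of past communications is 37.6 (compared with 34.7 for mails that were only read and 27.4 for mails that were not read).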
# Mean subject hotness score within each email status group.
aggregate(
  x = eml$Subject_Hotness_Score,
  by = list(eml$Email_Status),
  FUN = mean
)
## Group.1 x
## 1 0 1.1595584
## 2 1 0.9075188
## 3 2 0.4863043
For people to read and reply, average subject hotness score is 0.486
# Horizontal box plots of a numeric predictor split by email status.
# The stray trailing comma (an empty last argument) that each call carried
# has been removed; it only worked because the empty argument happened to be
# absorbed by the formula method's unused `data` parameter.

# Number of images per mail, by status.
boxplot(
  eml$Total_Images ~ eml$Email_Status,
  horizontal = TRUE,
  xlab = "Number of images in the mail",
  ylab = "Email Status",
  las = 1
)

# Number of past communications with the recipient, by status.
boxplot(
  eml$Total_Past_Communications ~ eml$Email_Status,
  horizontal = TRUE,
  xlab = "Number of past communications with the recipient in the mail",
  ylab = "Email Status",
  las = 1
)
For people to read and reply, past communications should be higher
# Horizontal box plots of a numeric predictor split by email status.
# The stray trailing comma (an empty last argument) that each call carried
# has been removed; it only worked because the empty argument happened to be
# absorbed by the formula method's unused `data` parameter.

# Number of links per mail, by status.
boxplot(
  eml$Total_Links ~ eml$Email_Status,
  horizontal = TRUE,
  xlab = "Number of links in the mail",
  ylab = "Email Status",
  las = 1
)

# Number of words per mail, by status.
boxplot(
  eml$Word_Count ~ eml$Email_Status,
  horizontal = TRUE,
  xlab = "Number of words in the mail",
  ylab = "Email Status",
  las = 1
)
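For people to read and reply, the number of words should be lower rather than higher: mails that were read (about 591 words on average) or read and replied to (about 621 words) are shorter than mails that were not read (about 725 words).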
# Horizontal box plot of the subject hotness score split by email status.
# The stray trailing comma (an empty last argument) has been removed; it only
# worked because the empty argument happened to be absorbed by the formula
# method's unused `data` parameter.
boxplot(
  eml$Subject_Hotness_Score ~ eml$Email_Status,
  horizontal = TRUE,
  xlab = "Subject hotness score",
  ylab = "Email Status",
  las = 1
)
For people to read and reply, subject hotness should be lower
# Frequency histograms of the five numeric predictors.
# The first plot title previously read "Comunnications"; the spelling of this
# user-visible title is corrected to "Communications".
hist(
  eml$Total_Past_Communications,
  main = "Total Past Communications frequency",
  col = "lightblue"
)
hist(
  eml$Total_Images,
  main = "Total Images frequency",
  col = "lightblue"
)
hist(
  eml$Total_Links,
  main = "Total links frequency",
  col = "lightblue"
)
hist(
  eml$Subject_Hotness_Score,
  main = "subject hotness score frequency",
  col = "lightblue"
)
hist(
  eml$Word_Count,
  main = "word count frequency",
  col = "lightblue"
)
# Scatter plots of email status (x axis) against each numeric predictor
# (y axis). Both coordinates are jittered so that overlapping points on the
# three status levels are spread out.
plot(x = jitter(eml$Email_Status), y = jitter(eml$Subject_Hotness_Score))
plot(x = jitter(eml$Email_Status), y = jitter(eml$Word_Count))
plot(x = jitter(eml$Email_Status), y = jitter(eml$Total_Past_Communications))
plot(x = jitter(eml$Email_Status), y = jitter(eml$Total_Links))
plot(x = jitter(eml$Email_Status), y = jitter(eml$Total_Images))
# Correlation matrix of the numeric columns, rounded to two decimals.
# The two text columns (Email_ID and Customer_Location) are excluded by name
# rather than by position, which selects the same ten columns as before.
# Total_Past_Communications, Total_Links and Total_Images contain missing
# values, so without `use =` every coefficient involving them came out as NA.
# "pairwise.complete.obs" computes each coefficient from the rows in which
# both variables are present.
# NOTE: the output recorded below predates this change and still shows the NA
# entries; re-run the script to refresh it.
round(
  cor(
    eml[, !(names(eml) %in% c("Email_ID", "Customer_Location"))],
    use = "pairwise.complete.obs"
  ),
  2
)
## Email_Type Subject_Hotness_Score
## Email_Type 1.00 -0.24
## Subject_Hotness_Score -0.24 1.00
## Email_Source_Type -0.28 0.01
## Email_Campaign_Type 0.23 -0.55
## Total_Past_Communications NA NA
## Time_Email_sent_Category 0.00 0.00
## Word_Count 0.08 -0.24
## Total_Links NA NA
## Total_Images NA NA
## Email_Status -0.02 -0.15
## Email_Source_Type Email_Campaign_Type
## Email_Type -0.28 0.23
## Subject_Hotness_Score 0.01 -0.55
## Email_Source_Type 1.00 0.04
## Email_Campaign_Type 0.04 1.00
## Total_Past_Communications NA NA
## Time_Email_sent_Category 0.00 0.00
## Word_Count 0.05 0.06
## Total_Links NA NA
## Total_Images NA NA
## Email_Status -0.02 0.19
## Total_Past_Communications
## Email_Type NA
## Subject_Hotness_Score NA
## Email_Source_Type NA
## Email_Campaign_Type NA
## Total_Past_Communications 1
## Time_Email_sent_Category NA
## Word_Count NA
## Total_Links NA
## Total_Images NA
## Email_Status NA
## Time_Email_sent_Category Word_Count Total_Links
## Email_Type 0 0.08 NA
## Subject_Hotness_Score 0 -0.24 NA
## Email_Source_Type 0 0.05 NA
## Email_Campaign_Type 0 0.06 NA
## Total_Past_Communications NA NA NA
## Time_Email_sent_Category 1 0.00 NA
## Word_Count 0 1.00 NA
## Total_Links NA NA 1
## Total_Images NA NA NA
## Email_Status 0 -0.17 NA
## Total_Images Email_Status
## Email_Type NA -0.02
## Subject_Hotness_Score NA -0.15
## Email_Source_Type NA -0.02
## Email_Campaign_Type NA 0.19
## Total_Past_Communications NA NA
## Time_Email_sent_Category NA 0.00
## Word_Count NA -0.17
## Total_Links NA NA
## Total_Images 1 NA
## Email_Status NA 1.00
library(corrgram)
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal, with variables reordered.
corrgram(
  x = eml,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)
# Chi-squared test of independence between email status and email type.
chisq.test(x = eml$Email_Status, y = eml$Email_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Type
## X-squared = 54.842, df = 2, p-value = 1.234e-12
The null hypothesis of independence is rejected: Email_Status and Email_Type are associated.
# Chi-squared test of independence between email status and campaign type.
chisq.test(x = eml$Email_Status, y = eml$Email_Campaign_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Campaign_Type
## X-squared = 6733.5, df = 4, p-value < 2.2e-16
The null hypothesis of independence is rejected: Email_Status and Email_Campaign_Type are associated.
# Chi-squared test of independence between email status and email source type.
chisq.test(x = eml$Email_Status, y = eml$Email_Source_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Source_Type
## X-squared = 43.859, df = 2, p-value = 2.993e-10
The null hypothesis of independence is rejected: Email_Status and Email_Source_Type are associated.
# Chi-squared test of independence between email status and the number of
# past communications.
# NOTE(review): Total_Past_Communications is a count with many distinct
# values, so the contingency table is large (df = 126 in the recorded output)
# and R warns that the chi-squared approximation may be incorrect. Consider
# binning the variable or comparing it across status groups instead
# (e.g. kruskal.test) - TODO confirm intent.
chisq.test(eml$Email_Status, eml$Total_Past_Communications)
## Warning in chisq.test(eml$Email_Status, eml$Total_Past_Communications):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Past_Communications
## X-squared = 8778.7, df = 126, p-value < 2.2e-16
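The null hypothesis of independence is rejected: Email_Status and Total_Past_Communications are associated (but note the warning above that the chi-squared approximation may be incorrect).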
# Chi-squared test of independence between email status and the number of
# links.
# NOTE(review): Total_Links is a count with many distinct values, so the
# contingency table is large (df = 72 in the recorded output) and R warns
# that the chi-squared approximation may be incorrect. Consider binning the
# variable or comparing it across status groups instead (e.g. kruskal.test)
# - TODO confirm intent.
chisq.test(eml$Email_Status, eml$Total_Links)
## Warning in chisq.test(eml$Email_Status, eml$Total_Links): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Links
## X-squared = 3092.2, df = 72, p-value < 2.2e-16
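The null hypothesis of independence is rejected: Email_Status and Total_Links are associated (but note the warning above that the chi-squared approximation may be incorrect).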
# Chi-squared test of independence between email status and the number of
# images.
# NOTE(review): Total_Images is a count with many distinct values, so the
# contingency table is large (df = 88 in the recorded output) and R warns
# that the chi-squared approximation may be incorrect. Consider binning the
# variable or comparing it across status groups instead (e.g. kruskal.test)
# - TODO confirm intent.
chisq.test(eml$Email_Status, eml$Total_Images)
## Warning in chisq.test(eml$Email_Status, eml$Total_Images): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Images
## X-squared = 134.77, df = 88, p-value = 0.0009949
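The null hypothesis of independence is rejected: Email_Status and Total_Images are associated (but note the warning above that the chi-squared approximation may be incorrect).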
# Chi-squared test of independence between email status and the word count.
# NOTE(review): Word_Count takes many distinct values, so the contingency
# table is large (df = 300 in the recorded output) and R warns that the
# chi-squared approximation may be incorrect. Consider binning the variable
# or comparing it across status groups instead (e.g. kruskal.test)
# - TODO confirm intent.
chisq.test(eml$Email_Status, eml$Word_Count)
## Warning in chisq.test(eml$Email_Status, eml$Word_Count): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Word_Count
## X-squared = 3960.9, df = 300, p-value < 2.2e-16
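The null hypothesis of independence is rejected: Email_Status and Word_Count are associated (but note the warning above that the chi-squared approximation may be incorrect).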
# Chi-squared test of independence between email status and the subject
# hotness score.
# NOTE(review): Subject_Hotness_Score takes many distinct values, so the
# contingency table is large (df = 100 in the recorded output) and R warns
# that the chi-squared approximation may be incorrect. Consider binning the
# variable or comparing it across status groups instead (e.g. kruskal.test)
# - TODO confirm intent.
chisq.test(eml$Email_Status, eml$Subject_Hotness_Score)
## Warning in chisq.test(eml$Email_Status, eml$Subject_Hotness_Score): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Subject_Hotness_Score
## X-squared = 3579.8, df = 100, p-value < 2.2e-16
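The null hypothesis of independence is rejected: Email_Status and Subject_Hotness_Score are associated (but note the warning above that the chi-squared approximation may be incorrect).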
# Chi-squared test of independence between email status and send-time category.
chisq.test(x = eml$Email_Status, y = eml$Time_Email_sent_Category)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Time_Email_sent_Category
## X-squared = 1.1065, df = 4, p-value = 0.8932
The null hypothesis of independence cannot be rejected: there is no evidence that Email_Status depends on Time_Email_sent_Category.
# Chi-squared test of independence between email status and customer location.
chisq.test(x = eml$Email_Status, y = eml$Customer_Location)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Customer_Location
## X-squared = 11.73, df = 14, p-value = 0.6279
The null hypothesis of independence cannot be rejected: there is no evidence that Email_Status depends on Customer_Location.
# Welch two-sample t-test between the Email_Status column and the Email_Type
# column.
# NOTE(review): this compares the mean of the status codes with the mean of
# the Email_Type category codes; it does not test whether status differs
# between email types. The chi-squared test of these two variables earlier in
# this script already addresses that question - TODO confirm whether this
# test is needed.
t.test(eml$Email_Status, eml$Email_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Type
## t = -410.45, df = 135460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.059194 -1.049126
## sample estimates:
## mean of x mean of y
## 0.2309335 1.2850936
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Email_Campaign_Type column.
# NOTE(review): this compares the mean of the status codes with the mean of
# the campaign-type category codes; it does not test whether status differs
# between campaign types. The chi-squared test of these two variables earlier
# in this script already addresses that question - TODO confirm whether this
# test is needed.
t.test(eml$Email_Status, eml$Email_Campaign_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Campaign_Type
## t = -781.21, df = 136240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.046422 -2.036179
## sample estimates:
## mean of x mean of y
## 0.2309335 2.2722338
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Email_Source_Type column.
# NOTE(review): this compares the mean of the status codes with the mean of
# the source-type category codes; it does not test whether status differs
# between source types. The chi-squared test of these two variables earlier
# in this script already addresses that question - TODO confirm whether this
# test is needed.
t.test(eml$Email_Status, eml$Email_Source_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Source_Type
## t = -455.36, df = 136700, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.230854 -1.220304
## sample estimates:
## mean of x mean of y
## 0.2309335 1.4565125
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Total_Past_Communications column.
# NOTE(review): this compares the mean of the status codes with the mean of
# Total_Past_Communications, two quantities on different scales; it does not
# test whether past communications differ between status groups. Consider
# comparing Total_Past_Communications across Email_Status groups instead
# (e.g. a one-way ANOVA) - TODO confirm intent.
t.test(eml$Email_Status, eml$Total_Past_Communications)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Past_Communications
## t = -567.5, df = 61701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.80145 -28.60319
## sample estimates:
## mean of x mean of y
## 0.2309335 28.9332499
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Total_Links column.
# NOTE(review): this compares the mean of the status codes with the mean of
# Total_Links, two quantities on different scales; it does not test whether
# the number of links differs between status groups. Consider comparing
# Total_Links across Email_Status groups instead (e.g. a one-way ANOVA)
# - TODO confirm intent.
t.test(eml$Email_Status, eml$Total_Links)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Links
## t = -409.73, df = 66927, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.24738 -10.14981
## sample estimates:
## mean of x mean of y
## 0.2309335 10.4295259
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Total_Images column.
# NOTE(review): this compares the mean of the status codes with the mean of
# Total_Images, two quantities on different scales; it does not test whether
# the number of images differs between status groups. Consider comparing
# Total_Images across Email_Status groups instead (e.g. a one-way ANOVA)
# - TODO confirm intent.
t.test(eml$Email_Status, eml$Total_Images)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Images
## t = -152.57, df = 67701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.362391 -3.277097
## sample estimates:
## mean of x mean of y
## 0.2309335 3.5506779
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Word_Count column.
# NOTE(review): this compares the mean of the status codes with the mean of
# Word_Count, two quantities on different scales; it does not test whether
# the word count differs between status groups. Consider comparing
# Word_Count across Email_Status groups instead (e.g. a one-way ANOVA)
# - TODO confirm intent.
t.test(eml$Email_Status, eml$Word_Count)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Word_Count
## t = -673.24, df = 68352, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -701.7379 -697.6638
## sample estimates:
## mean of x mean of y
## 0.2309335 699.9317513
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Subject_Hotness_Score column.
# NOTE(review): this compares the mean of the status codes with the mean of
# Subject_Hotness_Score, two quantities on different scales; it does not test
# whether the hotness score differs between status groups. Consider comparing
# Subject_Hotness_Score across Email_Status groups instead (e.g. a one-way
# ANOVA) - TODO confirm intent.
t.test(eml$Email_Status, eml$Subject_Hotness_Score)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Subject_Hotness_Score
## t = -202.8, df = 100320, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8729027 -0.8561918
## sample estimates:
## mean of x mean of y
## 0.2309335 1.0954808
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test between the Email_Status column and the
# Time_Email_sent_Category column.
# NOTE(review): this compares the mean of the status codes with the mean of
# the send-time category codes; it does not test whether status differs
# between send times. The chi-squared test of these two variables earlier in
# this script already addresses that question (and found no association)
# - TODO confirm whether this test is needed.
t.test(eml$Email_Status, eml$Time_Email_sent_Category)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Time_Email_sent_Category
## t = -575.52, df = 129590, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.774387 -1.762342
## sample estimates:
## mean of x mean of y
## 0.2309335 1.9992978
Null hypothesis is rejected, there is significant difference in their means