#COLLEGE / COMPANY: Delhi Technological University
#Date: December 29, 2017
#EMAIL: deepankvarshney14@gmail.com
#NAME: Deepank Varshney
#Project Title: Email Marketing Campaign management
SYNOPSIS: Analysis of the effect of various factors on the effectiveness of an email marketing campaign
# Load the campaign data from the project directory.
# NOTE(review): setwd() ties the script to one machine's folder layout; it is
# kept because a later read in this file uses the same relative path.
setwd("~/winter internship")
# The file name is a plain literal, so no paste() wrapper is needed.
eml <- read.csv("email_campaign.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(eml)
}
#Dimensions
# Row count followed by column count of the data set
c(nrow(eml), ncol(eml))
## [1] 68353 12
#Head
# Show the first six rows of the data set
head(x = eml, n = 6L)
## Email_ID Email_Type Subject_Hotness_Score Email_Source_Type
## 1 EMA00081000034500 1 2.2 2
## 2 EMA00081000045360 2 2.1 1
## 3 EMA00081000066290 2 0.1 1
## 4 EMA00081000076560 1 3.0 2
## 5 EMA00081000109720 1 0.0 2
## 6 EMA00081000131660 1 1.5 1
## Customer_Location Email_Campaign_Type Total_Past_Communications
## 1 E 2 33
## 2 2 15
## 3 B 3 36
## 4 E 2 25
## 5 C 3 18
## 6 G 2 NA
## Time_Email_sent_Category Word_Count Total_Links Total_Images
## 1 1 440 8 0
## 2 2 504 5 0
## 3 2 962 5 0
## 4 2 610 16 0
## 5 2 947 4 0
## 6 2 416 11 0
## Email_Status
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 0
#Summary Statistics
# Per-column summary statistics (n, mean, sd, median, ...) via psych
library("psych")
describe(x = eml)
## vars n mean sd median trimmed
## Email_ID* 1 68353 34177.00 19731.96 34177.0 34177.00
## Email_Type 2 68353 1.29 0.45 1.0 1.23
## Subject_Hotness_Score 3 68353 1.10 1.00 0.8 0.97
## Email_Source_Type 4 68353 1.46 0.50 1.0 1.45
## Customer_Location* 5 68353 5.34 2.57 6.0 5.55
## Email_Campaign_Type 6 68353 2.27 0.47 2.0 2.23
## Total_Past_Communications 7 61528 28.93 12.54 28.0 28.60
## Time_Email_sent_Category 8 68353 2.00 0.63 2.0 2.00
## Word_Count 9 68353 699.93 271.72 694.0 699.05
## Total_Links 10 66152 10.43 6.38 9.0 9.59
## Total_Images 11 66676 3.55 5.60 0.0 2.31
## Email_Status 12 68353 0.23 0.50 0.0 0.12
## mad min max range skew kurtosis se
## Email_ID* 25334.67 1 68353 68352 0.00 -1.20 75.47
## Email_Type 0.00 1 2 1 0.95 -1.09 0.00
## Subject_Hotness_Score 1.04 0 5 5 0.90 0.08 0.00
## Email_Source_Type 0.00 1 2 1 0.17 -1.97 0.00
## Customer_Location* 2.97 1 8 7 -0.53 -1.10 0.01
## Email_Campaign_Type 0.00 1 3 2 0.71 -0.69 0.00
## Total_Past_Communications 13.34 0 67 67 0.21 -0.44 0.05
## Time_Email_sent_Category 0.00 1 3 2 0.00 -0.49 0.00
## Word_Count 265.39 40 1316 1276 0.01 -0.35 1.04
## Total_Links 4.45 1 49 48 1.39 2.50 0.02
## Total_Images 0.00 0 45 45 2.09 5.03 0.02
## Email_Status 0.00 0 2 2 2.08 3.53 0.00
#1Way Contingency Tables
# Frequency of each email type code
table(eml[["Email_Type"]])
##
## 1 2
## 48866 19487
Type 1 email is used more by the company
# Frequency of each email source type code
table(eml[["Email_Source_Type"]])
##
## 1 2
## 37149 31204
Type 1 email source is used more by the company
# Frequency of each email campaign type code
table(eml[["Email_Campaign_Type"]])
##
## 1 2 3
## 736 48273 19344
Type 2 email campaign is used more by the company
# Frequency of each email status code
table(eml[["Email_Status"]])
##
## 0 1 2
## 54941 11039 2373
0 represents mails that were not read; 1 represents mails that were read; 2 represents mails that were read and replied to. Clearly, most recipients do not open their mails.
# Frequency of each customer location value
table(eml[["Customer_Location"]])
##
## A B C D E F G
## 11595 1454 4341 5758 7406 10193 4433 23173
the company has maximum customers in G area
# Frequency of each time-of-sending category
table(eml[["Time_Email_sent_Category"]])
##
## 1 2 3
## 13636 41129 13588
1 represents morning; 2 represents afternoon; 3 represents night.
#2 way contingency tables
# Cross-tabulate email status (rows) against time-of-sending category
# (columns), then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Time_Email_sent_Category,
  data = eml
)
print(addmargins(mytable1))
## Time_Email_sent_Category
## Email_Status 1 2 3 Sum
## 0 10970 33062 10909 54941
## 1 2186 6631 2222 11039
## 2 480 1436 457 2373
## Sum 13636 41129 13588 68353
Most mails are sent in the afternoon, and email status does not depend much on the time of sending.
# Cross-tabulate email status (rows) against customer location (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Customer_Location,
  data = eml
)
print(addmargins(mytable1))
## Customer_Location
## Email_Status A B C D E F G Sum
## 0 9351 1160 3502 4654 5950 8136 3579 18609 54941
## 1 1848 245 714 890 1206 1693 698 3745 11039
## 2 396 49 125 214 250 364 156 819 2373
## Sum 11595 1454 4341 5758 7406 10193 4433 23173 68353
Maximum mails are sent in G area
# Cross-tabulate email status (rows) against campaign type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Campaign_Type,
  data = eml
)
print(addmargins(mytable1))
## Email_Campaign_Type
## Email_Status 1 2 3 Sum
## 0 77 42115 12749 54941
## 1 486 5446 5107 11039
## 2 173 712 1488 2373
## Sum 736 48273 19344 68353
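Campaign 3 seems to be the most promising in absolute terms (5107 mails read and 1488 read and replied to, more than any other campaign), although campaign 1 has the highest share of opened mails (659 of 736).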
# Cross-tabulate email status (rows) against email source type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Source_Type,
  data = eml
)
print(addmargins(mytable1))
## Email_Source_Type
## Email_Status 1 2 Sum
## 0 29577 25364 54941
## 1 6150 4889 11039
## 2 1422 951 2373
## Sum 37149 31204 68353
# Cross-tabulate email status (rows) against email type (columns),
# then print the table with row and column totals.
mytable1 <- xtabs(
  formula = ~ Email_Status + Email_Type,
  data = eml
)
print(addmargins(mytable1))
## Email_Type
## Email_Status 1 2 Sum
## 0 39004 15937 54941
## 1 8208 2831 11039
## 2 1654 719 2373
## Sum 48866 19487 68353
Type 1 seems more promising
#Comparative averages of various factors by email status
# Mean word count for each email status code (Group.1 = status, x = mean)
mean1 <- aggregate(
  x = eml$Word_Count,
  by = list(eml$Email_Status),
  FUN = mean
)
print(mean1)
## Group.1 x
## 1 0 725.2568
## 2 1 590.9396
## 3 2 620.6153
# Bar chart of the mean word count (x) per email status code (Group.1).
# The title legend now uses the 0/1/2 status codes stored in Group.1 (it
# previously numbered them 1/2/3) and its closing parenthesis is restored.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean1,
  main = "averages of Word Count by email Status (0.Unread;1.Read;2.Read And Replied)"
)
Mails that were read and replied to average about 620 words and mails that were only read about 591, while unread mails average about 725 words; shorter mails therefore appear to be more attractive.
# Mean number of images for each email status code, ignoring missing values
mean2 <- aggregate(
  x = eml$Total_Images,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean2)
## Group.1 x
## 1 0 3.617575
## 2 1 3.187361
## 3 2 3.690415
# Bar chart of the mean image count (x) per email status code (Group.1).
# The title legend now uses the 0/1/2 status codes stored in Group.1 (it
# previously numbered them 1/2/3) and its closing parenthesis is restored.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean2,
  main = "averages of Images by email Status (0.Unread;1.Read;2.Read And Replied)"
)
For people to read and reply, average images used in mail is 3.7.Less than this results in less attractive email
# Mean number of links for each email status code, ignoring missing values
mean3 <- aggregate(
  x = eml$Total_Links,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean3)
## Group.1 x
## 1 0 10.543199
## 2 1 9.854375
## 3 2 10.473638
# Bar chart of the mean link count (x) per email status code (Group.1).
# The title legend now uses the 0/1/2 status codes stored in Group.1 (it
# previously numbered them 1/2/3) and its closing parenthesis is restored.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean3,
  main = "averages of links by email Status (0.Unread;1.Read;2.Read And Replied)"
)
Mails that were read and replied to contain about 10.47 links on average, close to the 10.54 of unread mails, while mails that were only read contain about 9.85; the differences between the groups are small.
# Mean number of past communications for each email status code,
# ignoring missing values
mean4 <- aggregate(
  x = eml$Total_Past_Communications,
  by = list(eml$Email_Status),
  FUN = mean,
  na.rm = TRUE
)
print(mean4)
## Group.1 x
## 1 0 27.40013
## 2 1 34.70782
## 3 2 37.59680
# Bar chart of the mean past-communication count (x) per email status code
# (Group.1). The title legend now uses the 0/1/2 status codes stored in
# Group.1 (it previously numbered them 1/2/3) and its closing parenthesis is
# restored.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean4,
  main = "averages of past comm. by email Status (0.Unread;1.Read;2.Read And Replied)"
)
Mails that were read and replied to went to recipients with about 37.6 past communications on average (34.7 for mails that were only read, 27.4 for unread mails). Fewer past communications go with less attractive emails.
# Mean subject hotness score for each email status code
mean5 <- aggregate(
  x = eml$Subject_Hotness_Score,
  by = list(eml$Email_Status),
  FUN = mean
)
print(mean5)
## Group.1 x
## 1 0 1.1595584
## 2 1 0.9075188
## 3 2 0.4863043
# Bar chart of the mean subject hotness score (x) per email status code
# (Group.1). The title legend now uses the 0/1/2 status codes stored in
# Group.1 (it previously numbered them 1/2/3) and its closing parenthesis is
# restored.
library(lattice)
barchart(
  Group.1 ~ x,
  data = mean5,
  main = "averages of subject score by email Status (0.Unread;1.Read;2.Read And Replied)"
)
For people to read and reply, average subject hotness score is 0.486. More than this results in less attractive email
#Boxplots
# Horizontal boxplots of the image count, one box per email status.
# The dangling trailing comma of the original call (an empty argument) is
# removed, and the formula now takes its columns from `data = eml`.
boxplot(
  Total_Images ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  xlab = "Number of images in the mail",
  ylab = "Email Status",
  las = 1
)
# Horizontal boxplots of the past-communication count, one box per email
# status. The dangling trailing comma of the original call (an empty
# argument) is removed, and the formula now takes its columns from
# `data = eml`.
boxplot(
  Total_Past_Communications ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  xlab = "Number of past communications with the recipient in the mail",
  ylab = "Email Status",
  las = 1
)
For people to read and reply, the number of past communications should be higher.
# Horizontal boxplots of the link count, one box per email status.
# The dangling trailing comma of the original call (an empty argument) is
# removed, and the formula now takes its columns from `data = eml`.
boxplot(
  Total_Links ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  xlab = "Number of links in the mail",
  ylab = "Email Status",
  las = 1
)
# Horizontal boxplots of the word count, one box per email status.
# The dangling trailing comma of the original call (an empty argument) is
# removed, and the formula now takes its columns from `data = eml`.
boxplot(
  Word_Count ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  xlab = "Number of words in the mail",
  ylab = "Email Status",
  las = 1
)
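For people to read and reply, the number of words should be lower (unread mails have the highest average word count, about 725 against about 591 and 621 for read and replied mails).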
# Horizontal boxplots of the subject hotness score, one box per email status.
# The dangling trailing comma of the original call (an empty argument) is
# removed, and the formula now takes its columns from `data = eml`.
boxplot(
  Subject_Hotness_Score ~ Email_Status,
  data = eml,
  horizontal = TRUE,
  xlab = "Subject hotness score",
  ylab = "Email Status",
  las = 1
)
For people to read and reply, subject hotness should be lower
#Histograms
# Base-graphics histograms of the five numeric columns, each filled in light
# blue and given its own title.
# Past communications per recipient
hist(eml$Total_Past_Communications,main="Total Past Comunnications frequency",col="lightblue")
# Images per mail
hist(eml$Total_Images,main="Total Images frequency",col="lightblue")
# Links per mail
hist(eml$Total_Links,main="Total links frequency",col="lightblue")
# Subject hotness score
hist(eml$Subject_Hotness_Score,main="subject hotness score frequency",col="lightblue")
# Words per mail
hist(eml$Word_Count,main="word count frequency",col="lightblue")
#histogram using library lattice
# Lattice histogram of customer location, conditioned on email status
library(lattice)
histogram(
  x = ~ Customer_Location | Email_Status,
  data = eml,
  col = "brown"
)
percent of total emails by location is nearly same in all the 3 cases of email status(read;unread;read and replied)
# Lattice histogram of email type, conditioned on email status
library(lattice)
histogram(
  x = ~ Email_Type | Email_Status,
  data = eml,
  col = "brown"
)
percent of total emails by email type is not same in all the 3 cases of email status(read;unread;read and replied). For status read and replied, type 2 email type is used more
# Lattice histogram of email campaign type, conditioned on email status
library(lattice)
histogram(
  x = ~ Email_Campaign_Type | Email_Status,
  data = eml,
  col = "brown"
)
percent of total emails by Email Campaign type is not same in all the 3 cases of email status(read;unread;read and replied).For status read and replied, type 3 email campaign is used more
# Lattice histogram of email source type, conditioned on email status
library(lattice)
histogram(
  x = ~ Email_Source_Type | Email_Status,
  data = eml,
  col = "brown"
)
percent of total emails by email source type is not same in all the 3 cases of email status(read;unread;read and replied).For status read and replied, type 1 email source type is used more
# Lattice histogram of time-of-sending category, conditioned on email status
library(lattice)
histogram(
  x = ~ Time_Email_sent_Category | Email_Status,
  data = eml,
  col = "brown"
)
percent of total emails by time at which it is sent is nearly same in all the 3 cases of email status(read;unread;read and replied)
#plots
# Scatter plots of each numeric column (y) against email status (x).
# Both coordinates are passed through jitter(), which adds a small amount of
# random noise, so the plots differ slightly from run to run.
# Subject hotness score vs. status
plot(jitter(eml$Email_Status), jitter(eml$Subject_Hotness_Score) )
# Word count vs. status
plot(jitter(eml$Email_Status), jitter(eml$Word_Count) )
# Past communications vs. status
plot(jitter(eml$Email_Status), jitter(eml$Total_Past_Communications) )
# Link count vs. status
plot(jitter(eml$Email_Status), jitter(eml$Total_Links) )
# Image count vs. status
plot(jitter(eml$Email_Status), jitter(eml$Total_Images) )
# Correlation matrix of the numeric columns, rounded to two decimals.
# Columns 1 (Email_ID) and 5 (Customer_Location) are left out.
# Total_Past_Communications, Total_Links and Total_Images contain missing
# values, so without `use =` every correlation involving them comes out as
# NA; "pairwise.complete.obs" uses all rows where both variables are observed.
# NOTE: the recorded output that follows was produced before this fix.
round(cor(eml[, c(2:4, 6:12)], use = "pairwise.complete.obs"), 2)
## Email_Type Subject_Hotness_Score
## Email_Type 1.00 -0.24
## Subject_Hotness_Score -0.24 1.00
## Email_Source_Type -0.28 0.01
## Email_Campaign_Type 0.23 -0.55
## Total_Past_Communications NA NA
## Time_Email_sent_Category 0.00 0.00
## Word_Count 0.08 -0.24
## Total_Links NA NA
## Total_Images NA NA
## Email_Status -0.02 -0.15
## Email_Source_Type Email_Campaign_Type
## Email_Type -0.28 0.23
## Subject_Hotness_Score 0.01 -0.55
## Email_Source_Type 1.00 0.04
## Email_Campaign_Type 0.04 1.00
## Total_Past_Communications NA NA
## Time_Email_sent_Category 0.00 0.00
## Word_Count 0.05 0.06
## Total_Links NA NA
## Total_Images NA NA
## Email_Status -0.02 0.19
## Total_Past_Communications
## Email_Type NA
## Subject_Hotness_Score NA
## Email_Source_Type NA
## Email_Campaign_Type NA
## Total_Past_Communications 1
## Time_Email_sent_Category NA
## Word_Count NA
## Total_Links NA
## Total_Images NA
## Email_Status NA
## Time_Email_sent_Category Word_Count Total_Links
## Email_Type 0 0.08 NA
## Subject_Hotness_Score 0 -0.24 NA
## Email_Source_Type 0 0.05 NA
## Email_Campaign_Type 0 0.06 NA
## Total_Past_Communications NA NA NA
## Time_Email_sent_Category 1 0.00 NA
## Word_Count 0 1.00 NA
## Total_Links NA NA 1
## Total_Images NA NA NA
## Email_Status 0 -0.17 NA
## Total_Images Email_Status
## Email_Type NA -0.02
## Subject_Hotness_Score NA -0.15
## Email_Source_Type NA -0.02
## Email_Campaign_Type NA 0.19
## Total_Past_Communications NA NA
## Time_Email_sent_Category NA 0.00
## Word_Count NA -0.17
## Total_Links NA NA
## Total_Images 1 NA
## Email_Status NA 1.00
# Correlogram of the data set with reordered variables: shaded cells in the
# lower triangle, pie glyphs in the upper triangle, text labels via panel.txt.
library(corrgram)
corrgram(
  x = eml,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)
#Hypothesis- Effect of various factors on effective email marketing campaign where maximum mails sent are read or read and replied to. #Chi Square Tests
# Pearson chi-squared test between email status and email type
chisq.test(eml$Email_Status, eml$Email_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Type
## X-squared = 54.842, df = 2, p-value = 1.234e-12
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and email campaign type
chisq.test(eml$Email_Status, eml$Email_Campaign_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Campaign_Type
## X-squared = 6733.5, df = 4, p-value < 2.2e-16
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and email source type
chisq.test(eml$Email_Status, eml$Email_Source_Type)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Email_Source_Type
## X-squared = 43.859, df = 2, p-value = 2.993e-10
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and past-communication count.
# NOTE(review): Total_Past_Communications is a count with many distinct
# values, so each value becomes its own category (df = 126 in the recorded
# output) and R warns that the approximation may be incorrect; a test that
# treats the column as numeric may be more appropriate — confirm.
chisq.test(eml$Email_Status, eml$Total_Past_Communications)
## Warning in chisq.test(eml$Email_Status, eml$Total_Past_Communications):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Past_Communications
## X-squared = 8778.7, df = 126, p-value < 2.2e-16
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and link count.
# NOTE(review): Total_Links is a count with many distinct values, so each
# value becomes its own category (df = 72 in the recorded output) and R warns
# that the approximation may be incorrect; a test that treats the column as
# numeric may be more appropriate — confirm.
chisq.test(eml$Email_Status, eml$Total_Links)
## Warning in chisq.test(eml$Email_Status, eml$Total_Links): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Links
## X-squared = 3092.2, df = 72, p-value < 2.2e-16
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and image count.
# NOTE(review): Total_Images is a count with many distinct values, so each
# value becomes its own category (df = 88 in the recorded output) and R warns
# that the approximation may be incorrect; a test that treats the column as
# numeric may be more appropriate — confirm.
chisq.test(eml$Email_Status, eml$Total_Images)
## Warning in chisq.test(eml$Email_Status, eml$Total_Images): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Total_Images
## X-squared = 134.77, df = 88, p-value = 0.0009949
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and word count.
# NOTE(review): Word_Count has many distinct values, so each value becomes
# its own category (df = 300 in the recorded output) and R warns that the
# approximation may be incorrect; a test that treats the column as numeric
# may be more appropriate — confirm.
chisq.test(eml$Email_Status, eml$Word_Count)
## Warning in chisq.test(eml$Email_Status, eml$Word_Count): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Word_Count
## X-squared = 3960.9, df = 300, p-value < 2.2e-16
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and subject hotness score.
# NOTE(review): Subject_Hotness_Score is a numeric score with many distinct
# values, so each value becomes its own category (df = 100 in the recorded
# output) and R warns that the approximation may be incorrect; a test that
# treats the column as numeric may be more appropriate — confirm.
chisq.test(eml$Email_Status, eml$Subject_Hotness_Score)
## Warning in chisq.test(eml$Email_Status, eml$Subject_Hotness_Score): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Subject_Hotness_Score
## X-squared = 3579.8, df = 100, p-value < 2.2e-16
Null hypothesis is rejected, they are correlated
# Pearson chi-squared test between email status and time-of-sending category
chisq.test(eml$Email_Status, eml$Time_Email_sent_Category)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Time_Email_sent_Category
## X-squared = 1.1065, df = 4, p-value = 0.8932
The null hypothesis is not rejected since the p-value is > 0.05; there is no evidence that they are associated.
# Pearson chi-squared test between email status and customer location.
# In `eml` the blank location is counted as a category of its own (eight
# location values in the one-way table, df = 14 in the recorded output).
chisq.test(eml$Email_Status, eml$Customer_Location)
##
## Pearson's Chi-squared test
##
## data: eml$Email_Status and eml$Customer_Location
## X-squared = 11.73, df = 14, p-value = 0.6279
The null hypothesis is not rejected since the p-value is > 0.05; there is no evidence that they are associated.
#T tests
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of the Email_Type codes, i.e. two different variables — confirm that
# comparing a variable across status groups was not what was intended.
t.test(eml$Email_Status, eml$Email_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Type
## t = -410.45, df = 135460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.059194 -1.049126
## sample estimates:
## mean of x mean of y
## 0.2309335 1.2850936
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of the Email_Campaign_Type codes, i.e. two different variables —
# confirm that comparing a variable across status groups was not intended.
t.test(eml$Email_Status, eml$Email_Campaign_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Campaign_Type
## t = -781.21, df = 136240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.046422 -2.036179
## sample estimates:
## mean of x mean of y
## 0.2309335 2.2722338
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of the Email_Source_Type codes, i.e. two different variables —
# confirm that comparing a variable across status groups was not intended.
t.test(eml$Email_Status, eml$Email_Source_Type)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Email_Source_Type
## t = -455.36, df = 136700, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.230854 -1.220304
## sample estimates:
## mean of x mean of y
## 0.2309335 1.4565125
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of Total_Past_Communications, i.e. two different variables — confirm
# that comparing past communications across status groups was not intended.
t.test(eml$Email_Status, eml$Total_Past_Communications)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Past_Communications
## t = -567.5, df = 61701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.80145 -28.60319
## sample estimates:
## mean of x mean of y
## 0.2309335 28.9332499
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of Total_Links, i.e. two different variables — confirm that comparing
# link counts across status groups was not intended.
t.test(eml$Email_Status, eml$Total_Links)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Links
## t = -409.73, df = 66927, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.24738 -10.14981
## sample estimates:
## mean of x mean of y
## 0.2309335 10.4295259
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of Total_Images, i.e. two different variables — confirm that comparing
# image counts across status groups was not intended.
t.test(eml$Email_Status, eml$Total_Images)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Total_Images
## t = -152.57, df = 67701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.362391 -3.277097
## sample estimates:
## mean of x mean of y
## 0.2309335 3.5506779
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of Word_Count, i.e. two different variables — confirm that comparing
# word counts across status groups was not intended.
t.test(eml$Email_Status, eml$Word_Count)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Word_Count
## t = -673.24, df = 68352, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -701.7379 -697.6638
## sample estimates:
## mean of x mean of y
## 0.2309335 699.9317513
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of Subject_Hotness_Score, i.e. two different variables — confirm that
# comparing the score across status groups was not intended.
t.test(eml$Email_Status, eml$Subject_Hotness_Score)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Subject_Hotness_Score
## t = -202.8, df = 100320, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8729027 -0.8561918
## sample estimates:
## mean of x mean of y
## 0.2309335 1.0954808
Null hypothesis is rejected, there is significant difference in their means
# Welch two-sample t-test with the two columns passed as separate samples.
# NOTE(review): this compares the mean of the Email_Status codes with the
# mean of the Time_Email_sent_Category codes, i.e. two different variables —
# confirm that comparing a variable across status groups was not intended.
t.test(eml$Email_Status, eml$Time_Email_sent_Category)
##
## Welch Two Sample t-test
##
## data: eml$Email_Status and eml$Time_Email_sent_Category
## t = -575.52, df = 129590, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.774387 -1.762342
## sample estimates:
## mean of x mean of y
## 0.2309335 1.9992978
Null hypothesis is rejected, there is significant difference in their means
# Re-read the raw file, this time treating empty strings as missing (NA).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
x1 <- read.csv("email_campaign.csv", header = TRUE, na.strings = "")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(x1)
}
#Number of missing values in each column
# Count the NA entries in every column. vapply() is used instead of sapply()
# so the result is always a named integer vector.
vapply(x1, function(column) sum(is.na(column)), integer(1))
## Email_ID Email_Type
## 0 0
## Subject_Hotness_Score Email_Source_Type
## 0 0
## Customer_Location Email_Campaign_Type
## 11595 0
## Total_Past_Communications Time_Email_sent_Category
## 6825 0
## Word_Count Total_Links
## 0 2201
## Total_Images Email_Status
## 1677 0
#Number of unique values in each column
# Count the distinct values (NA included) in every column. vapply() is used
# instead of sapply() so the result is always a named integer vector.
vapply(x1, function(column) length(unique(column)), integer(1))
## Email_ID Email_Type
## 68353 2
## Subject_Hotness_Score Email_Source_Type
## 51 2
## Customer_Location Email_Campaign_Type
## 8 3
## Total_Past_Communications Time_Email_sent_Category
## 65 3
## Word_Count Total_Links
## 151 38
## Total_Images Email_Status
## 46 3
#Missing Values vs Observed
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Map of missing versus observed cells in x1 (Amelia package)
missmap(
  obj = x1,
  main = "Missing values vs observed"
)
#Adjusting for the missing values
# Mean imputation: every NA in the three numeric columns that have missing
# values is replaced by the mean of the observed values of that column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
x1$Total_Past_Communications[is.na(x1$Total_Past_Communications)] <-
  mean(x1$Total_Past_Communications, na.rm = TRUE)
x1$Total_Images[is.na(x1$Total_Images)] <-
  mean(x1$Total_Images, na.rm = TRUE)
x1$Total_Links[is.na(x1$Total_Links)] <-
  mean(x1$Total_Links, na.rm = TRUE)
#Multinomial Logistic Regression
#Model1
# Model 1: multinomial model of email status on all eight predictors
library(nnet)
model1 <- multinom(
  Email_Status ~ Email_Type + Email_Source_Type + Subject_Hotness_Score +
    Total_Past_Communications + Total_Images + Total_Links + Word_Count +
    Email_Campaign_Type,
  data = x1
)
## # weights: 30 (18 variable)
## initial value 75093.445767
## iter 10 value 55666.211481
## iter 20 value 48219.812614
## iter 30 value 37664.590816
## final value 36700.424898
## converged
# Coefficients, standard errors, residual deviance and AIC of model 1
print(summary(model1))
## Call:
## multinom(formula = Email_Status ~ Email_Type + Email_Source_Type +
## Subject_Hotness_Score + Total_Past_Communications + Total_Images +
## Total_Links + Word_Count + Email_Campaign_Type, data = x1)
##
## Coefficients:
## (Intercept) Email_Type Email_Source_Type Subject_Hotness_Score
## 1 -1.744351 -0.3990973 -0.09244969 -0.1116074
## 2 -2.357175 -0.3788657 -0.26763771 -0.8572034
## Total_Past_Communications Total_Images Total_Links Word_Count
## 1 0.01988197 0.004920025 -0.02341395 -0.001721480
## 2 0.02199097 0.011561206 -0.01076142 -0.001586621
## Email_Campaign_Type
## 1 0.7094060
## 2 0.5025526
##
## Std. Errors:
## (Intercept) Email_Type Email_Source_Type Subject_Hotness_Score
## 1 0.11537644 0.02700582 0.02245943 0.01654177
## 2 0.01142859 0.04598975 0.04208503 0.03404208
## Total_Past_Communications Total_Images Total_Links Word_Count
## 1 0.001209576 0.003099949 0.002779562 5.098161e-05
## 2 0.001902322 0.005822980 0.005272876 8.675143e-05
## Email_Campaign_Type
## 1 0.02878372
## 2 0.04373899
##
## Residual Deviance: 73400.85
## AIC: 73436.85
#model2
# Model 2: model 1 without Email_Type and Email_Source_Type
library(nnet)
model2 <- multinom(
  Email_Status ~ Subject_Hotness_Score + Total_Past_Communications +
    Total_Images + Total_Links + Word_Count + Email_Campaign_Type,
  data = x1
)
## # weights: 24 (14 variable)
## initial value 75093.445767
## iter 10 value 55916.752578
## iter 20 value 39695.126815
## iter 30 value 36847.544934
## iter 40 value 36841.948101
## final value 36838.518566
## converged
# Coefficients, standard errors, residual deviance and AIC of model 2
print(summary(model2))
## Call:
## multinom(formula = Email_Status ~ Subject_Hotness_Score + Total_Past_Communications +
## Total_Images + Total_Links + Word_Count + Email_Campaign_Type,
## data = x1)
##
## Coefficients:
## (Intercept) Subject_Hotness_Score Total_Past_Communications Total_Images
## 1 -2.426955 -0.06898217 0.02335415 0.004469296
## 2 -3.260972 -0.81295725 0.02599598 0.011863567
## Total_Links Word_Count Email_Campaign_Type
## 1 -0.019787692 -0.001672989 0.6326252
## 2 -0.008730013 -0.001535253 0.4216581
##
## Std. Errors:
## (Intercept) Subject_Hotness_Score Total_Past_Communications Total_Images
## 1 0.10274726 0.01625189 0.001184220 0.003096449
## 2 0.01174025 0.03204970 0.001906138 0.005828354
## Total_Links Word_Count Email_Campaign_Type
## 1 0.002759127 5.065930e-05 0.02811320
## 2 0.005270137 8.488129e-05 0.03515953
##
## Residual Deviance: 73677.04
## AIC: 73705.04
#model3
# Model 3: model 2 without Total_Links
library(nnet)
model3 <- multinom(
  Email_Status ~ Subject_Hotness_Score + Total_Past_Communications +
    Word_Count + Total_Images + Email_Campaign_Type,
  data = x1
)
## # weights: 21 (12 variable)
## initial value 75093.445767
## iter 10 value 55776.419192
## iter 20 value 39923.210209
## iter 30 value 36905.816124
## iter 40 value 36864.892535
## iter 40 value 36864.892204
## iter 40 value 36864.892204
## final value 36864.892204
## converged
# Coefficients, standard errors, residual deviance and AIC of model 3
print(summary(model3))
## Call:
## multinom(formula = Email_Status ~ Subject_Hotness_Score + Total_Past_Communications +
## Word_Count + Total_Images + Email_Campaign_Type, data = x1)
##
## Coefficients:
## (Intercept) Subject_Hotness_Score Total_Past_Communications Word_Count
## 1 -2.528275 -0.07320525 0.02238641 -0.001711529
## 2 -3.298855 -0.81623065 0.02551313 -0.001552492
## Total_Images Email_Campaign_Type
## 1 -0.012027182 0.6388005
## 2 0.004518791 0.4225487
##
## Std. Errors:
## (Intercept) Subject_Hotness_Score Total_Past_Communications Word_Count
## 1 0.10243115 0.01631211 0.001179262 5.049971e-05
## 2 0.01171327 0.03151871 0.001868996 8.307023e-05
## Total_Images Email_Campaign_Type
## 1 0.002084386 0.02816958
## 2 0.003814353 0.03494645
##
## Residual Deviance: 73729.78
## AIC: 73753.78
#We can see that model 1 is the best-fitting model: it has the lowest AIC (73436.85 against 73705.04 and 73753.78) as well as the lowest residual deviance
#Value prediction of the model
# Predicted email status class for the first 1000 rows of x1
predict(model1, newdata = x1[1:1000, ])
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [35] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [69] 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [103] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [137] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [171] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [205] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [239] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [273] 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [307] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [341] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [375] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
## [409] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [443] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [477] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [511] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [545] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [579] 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [613] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [647] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [681] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [715] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [749] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [783] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [817] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [851] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [885] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [919] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [953] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [987] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1 2
#Probability prediction of the model
# Predicted probability of each email status (0, 1, 2) for the first 250
# rows of x1. The type is spelled "probs", the value documented for multinom
# predictions, instead of relying on partial matching of "prob"; the
# redundant c() around the row range is dropped.
predict(model1, newdata = x1[1:250, ], type = "probs")
## 0 1 2
## 1 0.7994347 0.18870321 0.011862040
## 2 0.8872405 0.10469920 0.008060293
## 3 0.7868262 0.16282683 0.050346964
## 4 0.8941847 0.10188840 0.003926873
## 5 0.7916789 0.16534014 0.042981009
## 6 0.7792162 0.19545938 0.025324439
## 7 0.6812883 0.30870792 0.010003753
## 8 0.9339780 0.05472462 0.011297329
## 9 0.8506947 0.13639569 0.012909652
## 10 0.8325265 0.12936454 0.038108988
## 11 0.7629825 0.18561695 0.051400557
## 12 0.9040086 0.07668014 0.019311292
## 13 0.6242989 0.28900289 0.086698165
## 14 0.9334508 0.06524469 0.001304550
## 15 0.8996868 0.09587990 0.004433333
## 16 0.8976367 0.09373305 0.008630229
## 17 0.8709431 0.11904924 0.010007623
## 18 0.9164345 0.06745793 0.016107523
## 19 0.8641325 0.10738471 0.028482749
## 20 0.7962893 0.18548533 0.018225381
## 21 0.8007478 0.15181595 0.047436202
## 22 0.8355161 0.14080658 0.023677328
## 23 0.5903011 0.32846626 0.081232619
## 24 0.9505582 0.04673992 0.002701897
## 25 0.9186483 0.07438351 0.006968180
## 26 0.8965194 0.09124814 0.012232476
## 27 0.7510600 0.18899379 0.059946226
## 28 0.8991881 0.09013886 0.010673033
## 29 0.7850925 0.20428789 0.010619591
## 30 0.8937606 0.09075157 0.015487848
## 31 0.7342064 0.22608943 0.039704146
## 32 0.5336437 0.36918157 0.097174760
## 33 0.8398262 0.13785382 0.022319961
## 34 0.8712404 0.09964071 0.029118930
## 35 0.9052508 0.07861932 0.016129896
## 36 0.8317971 0.16610496 0.002097907
## 37 0.7646116 0.22086966 0.014518720
## 38 0.5233914 0.37170759 0.104901026
## 39 0.8898010 0.09968804 0.010511004
## 40 0.4772321 0.39411555 0.128652390
## 41 0.8105773 0.17383037 0.015592375
## 42 0.8845984 0.10315328 0.012248314
## 43 0.9217975 0.07431488 0.003887642
## 44 0.9395194 0.05329981 0.007180820
## 45 0.8597951 0.11516282 0.025042067
## 46 0.7175675 0.23399930 0.048433169
## 47 0.6168754 0.27657401 0.106550620
## 48 0.8058703 0.17901518 0.015114521
## 49 0.8031730 0.16315189 0.033675139
## 50 0.8739108 0.10168296 0.024406283
## 51 0.7888110 0.18844690 0.022742119
## 52 0.7764843 0.19316454 0.030351121
## 53 0.6715543 0.23207308 0.096372608
## 54 0.8597942 0.10816467 0.032041174
## 55 0.9258159 0.06705464 0.007129481
## 56 0.8996437 0.09740757 0.002948721
## 57 0.8171831 0.17642617 0.006390758
## 58 0.7935609 0.19733884 0.009100218
## 59 0.8625735 0.09476753 0.042658997
## 60 0.7978934 0.16214559 0.039961054
## 61 0.8816845 0.09943326 0.018882218
## 62 0.8991296 0.09183396 0.009036434
## 63 0.8520174 0.12558841 0.022394157
## 64 0.8745080 0.11785080 0.007641182
## 65 0.9376944 0.06029867 0.002006912
## 66 0.8248725 0.15250051 0.022626956
## 67 0.7079917 0.25742734 0.034580980
## 68 0.8303002 0.13869852 0.031001251
## 69 0.8286654 0.13684028 0.034494359
## 70 0.8220704 0.15726280 0.020666795
## 71 0.4118437 0.45312278 0.135033537
## 72 0.9322045 0.06110630 0.006689217
## 73 0.9015989 0.09018898 0.008212087
## 74 0.3550399 0.50666982 0.138290282
## 75 0.5581604 0.32499753 0.116842112
## 76 0.9163870 0.07543874 0.008174302
## 77 0.8516384 0.12934823 0.019013384
## 78 0.8503985 0.13874528 0.010856238
## 79 0.9083472 0.08659881 0.005054022
## 80 0.8882492 0.10997127 0.001779568
## 81 0.7831074 0.16730592 0.049586660
## 82 0.7829502 0.18121489 0.035834943
## 83 0.8181711 0.15512594 0.026702944
## 84 0.9111626 0.08569785 0.003139536
## 85 0.9273157 0.06388122 0.008803113
## 86 0.6925191 0.28175212 0.025728816
## 87 0.8839191 0.10744889 0.008631974
## 88 0.8656274 0.12566431 0.008708323
## 89 0.8585198 0.11970326 0.021776933
## 90 0.3750867 0.50311413 0.121799158
## 91 0.8336228 0.14464907 0.021728165
## 92 0.9406155 0.05367000 0.005714551
## 93 0.6565558 0.26721791 0.076226303
## 94 0.9013341 0.09709491 0.001570994
## 95 0.9188008 0.07078413 0.010415047
## 96 0.6691154 0.25365491 0.077229682
## 97 0.7589752 0.19452290 0.046501905
## 98 0.8642685 0.11102342 0.024708111
## 99 0.8639685 0.10779094 0.028240537
## 100 0.8868225 0.09987918 0.013298320
## 101 0.6029462 0.32304337 0.074010467
## 102 0.9175720 0.07157018 0.010857860
## 103 0.8226498 0.15661078 0.020739371
## 104 0.8110053 0.18298506 0.006009673
## 105 0.7368868 0.19520900 0.067904220
## 106 0.5414560 0.35790952 0.100634457
## 107 0.7168210 0.23072587 0.052453177
## 108 0.7313510 0.18701919 0.081629789
## 109 0.6077993 0.30947421 0.082726453
## 110 0.7681067 0.18082240 0.051070859
## 111 0.6061836 0.29081037 0.103006036
## 112 0.9098594 0.08598020 0.004160452
## 113 0.8509468 0.13204283 0.017010342
## 114 0.8304453 0.13753522 0.032019524
## 115 0.8738766 0.11022549 0.015897891
## 116 0.8140457 0.14851128 0.037443042
## 117 0.7694220 0.20015447 0.030423527
## 118 0.8844747 0.10459343 0.010931898
## 119 0.7032862 0.24501647 0.051697288
## 120 0.8563424 0.12993280 0.013724829
## 121 0.8557625 0.11327153 0.030965958
## 122 0.8761564 0.10297225 0.020871332
## 123 0.7156842 0.23888519 0.045430567
## 124 0.8771888 0.11996511 0.002846064
## 125 0.7078937 0.23784946 0.054256827
## 126 0.8608856 0.11372559 0.025388830
## 127 0.7432554 0.19417238 0.062572254
## 128 0.9110823 0.07999770 0.008919961
## 129 0.6793869 0.26755026 0.053062877
## 130 0.8909160 0.08854778 0.020536257
## 131 0.7470816 0.19863522 0.054283186
## 132 0.8818443 0.10234806 0.015807623
## 133 0.7317886 0.23221497 0.035996472
## 134 0.9015272 0.08351457 0.014958231
## 135 0.5572867 0.35108079 0.091632464
## 136 0.9277982 0.06354196 0.008659830
## 137 0.9042865 0.07397748 0.021735974
## 138 0.6129477 0.29435629 0.092696039
## 139 0.6186020 0.29612395 0.085274008
## 140 0.9027243 0.07237283 0.024902897
## 141 0.7065439 0.23098264 0.062473439
## 142 0.8556328 0.11194532 0.032421838
## 143 0.8900613 0.10079316 0.009145508
## 144 0.5256850 0.37884625 0.095468728
## 145 0.6054727 0.30346206 0.091065284
## 146 0.9171321 0.06999101 0.012876852
## 147 0.7781250 0.17511472 0.046760266
## 148 0.9067645 0.07719690 0.016038607
## 149 0.5733338 0.34885580 0.077810425
## 150 0.8110802 0.16763753 0.021282298
## 151 0.7900728 0.18370984 0.026217384
## 152 0.6136275 0.31697214 0.069400348
## 153 0.8246090 0.15371789 0.021673071
## 154 0.6589650 0.25745025 0.083584721
## 155 0.8601830 0.12926644 0.010550513
## 156 0.8353377 0.12769507 0.036967225
## 157 0.8231643 0.13821903 0.038616643
## 158 0.8114634 0.16778580 0.020750796
## 159 0.5550993 0.35381949 0.091081199
## 160 0.8830565 0.11166628 0.005277229
## 161 0.5476790 0.34639033 0.105930638
## 162 0.9629018 0.03176017 0.005338017
## 163 0.9553495 0.04070324 0.003947262
## 164 0.6562856 0.26143815 0.082276222
## 165 0.9190037 0.05830522 0.022691100
## 166 0.5311385 0.37468213 0.094179368
## 167 0.9255814 0.06519092 0.009227677
## 168 0.9108564 0.06558863 0.023555011
## 169 0.8294097 0.14593953 0.024650729
## 170 0.9247922 0.06869228 0.006515487
## 171 0.7211574 0.21941031 0.059432328
## 172 0.8350814 0.12831836 0.036600191
## 173 0.8733258 0.11130409 0.015370150
## 174 0.5994711 0.28820726 0.112321633
## 175 0.8804894 0.10252088 0.016989691
## 176 0.8988317 0.08290761 0.018260666
## 177 0.7661628 0.18695831 0.046878917
## 178 0.9481249 0.04397490 0.007900178
## 179 0.8450440 0.15154153 0.003414477
## 180 0.8721303 0.09222892 0.035640798
## 181 0.4414992 0.42022707 0.138273679
## 182 0.7587850 0.18254316 0.058671813
## 183 0.8009278 0.14947575 0.049596417
## 184 0.8804995 0.11064005 0.008860471
## 185 0.9302343 0.05749523 0.012270465
## 186 0.7095972 0.21892924 0.071473564
## 187 0.7478191 0.19288501 0.059295840
## 188 0.6056358 0.31176858 0.082595654
## 189 0.7669533 0.16491930 0.068127385
## 190 0.9504421 0.03601293 0.013544992
## 191 0.6335416 0.27818028 0.088278119
## 192 0.7549031 0.18992951 0.055167390
## 193 0.8142402 0.14063260 0.045127160
## 194 0.6465204 0.29909295 0.054386626
## 195 0.8670788 0.12512084 0.007800351
## 196 0.9280393 0.06572036 0.006240295
## 197 0.7103166 0.23381420 0.055869248
## 198 0.9499509 0.04530185 0.004747285
## 199 0.8879090 0.09711760 0.014973391
## 200 0.8447750 0.15194745 0.003277596
## 201 0.9595555 0.03683425 0.003610250
## 202 0.8352167 0.12463081 0.040152501
## 203 0.9000328 0.08886075 0.011106486
## 204 0.8920527 0.09600793 0.011939404
## 205 0.3708325 0.48773241 0.141435074
## 206 0.8470597 0.11269609 0.040244249
## 207 0.9300680 0.05323318 0.016698847
## 208 0.9231273 0.05877929 0.018093413
## 209 0.8557700 0.10396241 0.040267628
## 210 0.8133273 0.15347002 0.033202687
## 211 0.5876095 0.32635584 0.086034630
## 212 0.7808546 0.17398164 0.045163778
## 213 0.7875909 0.16683506 0.045574080
## 214 0.9266210 0.06171750 0.011661496
## 215 0.7303928 0.20888832 0.060718890
## 216 0.8882928 0.10684629 0.004860949
## 217 0.8051601 0.17507984 0.019760088
## 218 0.8288521 0.13370663 0.037441221
## 219 0.8738252 0.10866440 0.017510431
## 220 0.9513406 0.04383167 0.004827732
## 221 0.7711366 0.19719873 0.031664631
## 222 0.8819245 0.08640262 0.031672911
## 223 0.8878947 0.10627873 0.005826605
## 224 0.9499919 0.04260709 0.007401007
## 225 0.7016932 0.28951365 0.008793193
## 226 0.7756352 0.17475891 0.049605931
## 227 0.8180010 0.15363377 0.028365258
## 228 0.8776260 0.09178730 0.030586745
## 229 0.6304827 0.28787916 0.081638165
## 230 0.7255362 0.21831263 0.056151201
## 231 0.9292908 0.06417201 0.006537209
## 232 0.6978738 0.26551930 0.036606860
## 233 0.7703275 0.21282769 0.016844820
## 234 0.9093400 0.06667031 0.023989681
## 235 0.8210865 0.14225589 0.036657649
## 236 0.7082252 0.25205919 0.039715650
## 237 0.8162781 0.12745115 0.056270775
## 238 0.8072259 0.13935522 0.053418918
## 239 0.9029757 0.09210324 0.004921074
## 240 0.8471856 0.11471311 0.038101250
## 241 0.8808709 0.09948099 0.019648095
## 242 0.8036361 0.15226930 0.044094598
## 243 0.8200968 0.16705286 0.012850346
## 244 0.9468003 0.04917718 0.004022506
## 245 0.8624065 0.12904168 0.008551844
## 246 0.7609966 0.17939806 0.059605327
## 247 0.8621722 0.10666506 0.031162767
## 248 0.6394304 0.26724466 0.093324908
## 249 0.8954822 0.09645938 0.008058391
## 250 0.8322736 0.14189734 0.025829045
#Misclassification error
# Confusion matrix: the result of predict(model1) forms the rows and the
# Email_Status column of x1 forms the columns.
cm <- table(
  predict(model1),
  x1[["Email_Status"]]
)
print(cm)
##
## 0 1 2
## 0 54409 10449 2227
## 1 532 590 146
## 2 0 0 0
Correctly classified observations are the counts on the main diagonal of the table (rows are predicted classes, columns are observed classes).
# Misclassification rate: one minus the share of all counts in cm that lie on
# its diagonal (where predicted and observed class agree).
1 - sum(diag(cm)) / sum(cm)
## [1] 0.1953682
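The model therefore misclassifies about 19.5% of the emails. This is only marginally better than the roughly 19.6% error obtained by always predicting the majority class (Email_Status 0), and the model never predicts class 2, so the error rate on its own is weak evidence of a good fit.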
#Test to study significance of coefficients using p-values
# summary(model1) is evaluated once and reused; the original evaluated it
# separately for the coefficients and for the standard errors.
# NOTE(review): model1 is presumably an nnet::multinom fit, whose summary can
# be costly to compute on a data set of this size - confirm.
model1_summary <- summary(model1)
# Wald z statistic: each coefficient divided by its standard error.
z <- model1_summary$coefficients / model1_summary$standard.errors
# Two-sided p-value from the standard normal distribution. When pnorm(abs(z))
# rounds to 1 in double precision, the p-value is printed as exactly 0.
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
## (Intercept) Email_Type Email_Source_Type Subject_Hotness_Score
## 1 0 0.000000e+00 3.850088e-05 1.509304e-11
## 2 0 2.220446e-16 2.024756e-10 0.000000e+00
## Total_Past_Communications Total_Images Total_Links Word_Count
## 1 0 0.1124829 0.00000000 0
## 2 0 0.0470950 0.04126058 0
## Email_Campaign_Type
## 1 0
## 2 0
#CONCLUSION: Email Status (read; unread; read and replied) depends on the following factors: Email Type, Email Source Type, Subject Hotness Score, Total Past Communications, Total Images, Total Links, Word Count and Email Campaign Type.
1) For an email to be read and replied to, the statistically significant factors (p-value < 0.05) are: Email Type, Email Source Type, Subject Hotness Score, Total Past Communications, Total Images, Total Links, Word Count and Email Campaign Type. Total Images (p about 0.047) and Total Links (p about 0.041) are only marginally significant.
Statistically insignificant factors:
None
2) For an email to be read, the statistically significant factors (p-value < 0.05) are: Email Type, Email Source Type, Subject Hotness Score, Total Past Communications, Total Links, Word Count and Email Campaign Type.
Statistically insignificant factors:
- Total Images (p about 0.112)
Email Status (read; unread; read and replied) does not depend on the following factors: Customer Location and Time Email Sent.