Data source: Pet Food Customer Orders Online
Reference notebook: Pet Food Customer Orders Data Insights
Dataset description: Dataset containing customer orders from a subscription business, where each order is a dry food order along with pet characteristics and some of the orders contain wet food and treats purchase.
Research questions:
What are the key features for explaining:
Notebook contents:
library(tidyverse)
library(Hmisc)
library(skimr)
library(lubridate)
library(ggsci)
library(viridis)
library(hrbrthemes)
library(corrplot)
library(ggpubr)
library(factoextra)
library(pscl)
library(rpart)
library(rpart.plot)
library(rattle)
library(randomForest)
library(caret)
library(pROC)
petfood = read.csv("pet_food_customer_orders.csv",header=TRUE)
dim(petfood)
[1] 49042 36
skim(petfood)
── Data Summary ────────────────────────
Values
Name petfood
Number of rows 49042
Number of columns 36
_______________________
Column type frequency:
character 18
numeric 18
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 pet_has_active_subscription 0 1 4 5 0 2 0
2 pet_food_tier 0 1 3 12 0 3 0
3 pet_signup_datetime 0 1 29 29 0 12508 0
4 pet_allergen_list 0 1 0 47 38284 201 0
5 pet_fav_flavour_list 0 1 0 27 28020 17 0
6 pet_health_issue_list 0 1 0 46 25326 16 0
7 neutered 0 1 4 5 0 2 0
8 gender 0 1 4 6 0 2 0
9 pet_breed_size 0 1 3 6 0 5 0
10 signup_promo 0 1 3 16 0 13 0
11 ate_wet_food_pre_tails 0 1 4 5 0 2 0
12 dry_food_brand_pre_tails 0 1 0 33 6372 137 0
13 pet_life_stage_at_order 0 1 6 13 0 4 0
14 order_payment_date 0 1 29 29 0 435 0
15 wet_tray_size 0 1 4 4 0 4 0
16 wet_food_textures_in_order 0 1 0 16 36254 8 0
17 last_customer_support_ticket_date 0 1 0 25 38762 3494 0
18 customer_support_ticket_category 0 1 0 18 38801 28 0
── Variable type: numeric ──────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
1 customer_id 0 1 9.24e+18 5.27e+18 1.97e15 4.75e18 9.20e18 1.38e+19 1.84e19
2 pet_id 0 1 9.25e+18 5.30e+18 3.63e14 4.65e18 9.32e18 1.38e+19 1.84e19
3 pet_order_number 0 1 3.51e+ 0 2.86e+ 0 1.00e 0 1.00e 0 3.00e 0 5.00e+ 0 2.00e 1
4 wet_food_order_number 36254 0.261 2.91e+ 0 2.31e+ 0 1.00e 0 1.00e 0 2.00e 0 4.00e+ 0 2.00e 1
5 orders_since_first_wet_trays_order 34670 0.293 3.16e+ 0 2.49e+ 0 1.00e 0 1.00e 0 2.00e 0 4.00e+ 0 2.00e 1
6 kibble_kcal 0 1 1.98e+ 4 1.33e+ 4 5.97e 2 1.01e 4 1.64e 4 2.64e+ 4 1.69e 5
7 wet_kcal 0 1 8.61e+ 2 1.90e+ 3 0. 0. 0. 9.73e+ 2 5.59e 4
8 total_order_kcal 0 1 2.08e+ 4 1.34e+ 4 1.48e 3 1.11e 4 1.75e 4 2.76e+ 4 1.69e 5
9 wet_trays 0 1 5.43e+ 0 1.09e+ 1 0. 0. 0. 8.00e+ 0 2.48e 2
10 wet_food_discount_percent 36254 0.261 1.50e- 1 2.52e- 1 0. 0. 0. 5.00e- 1 2.00e 0
11 premium_treat_packs 0 1 8.74e- 2 4.71e- 1 0. 0. 0. 0. 2.30e 1
12 dental_treat_packs 0 1 2.76e- 1 9.52e- 1 0. 0. 0. 0. 2.00e 1
13 total_web_sessions 0 1 7.96e+ 0 9.73e+ 0 0. 1.00e 0 5.00e 0 1.10e+ 1 1.24e 2
14 total_web_sessions_since_last_order 0 1 2.04e+ 0 2.62e+ 0 0. 0. 1.00e 0 3.00e+ 0 5.10e 1
15 total_minutes_on_website 0 1 3.84e+ 2 8.22e+ 2 0. 1.80e 1 5.90e 1 4.33e+ 2 2.37e 4
16 total_minutes_on_website_since_last_order 0 1 9.27e+ 1 2.91e+ 2 0. 0. 2.00e 0 1.70e+ 1 8.20e 3
17 total_wet_food_updates 0 1 4.54e- 2 3.17e- 1 0. 0. 0. 0. 9.00e 0
18 total_wet_food_updates_since_last_order 0 1 3.02e- 2 2.47e- 1 0. 0. 0. 0. 9.00e 0
hist
1 ▇▇▇▇▇
2 ▇▇▇▇▇
3 ▇▂▁▁▁
4 ▇▂▁▁▁
5 ▇▂▁▁▁
6 ▇▁▁▁▁
7 ▇▁▁▁▁
8 ▇▂▁▁▁
9 ▇▁▁▁▁
10 ▇▃▁▁▁
11 ▇▁▁▁▁
12 ▇▁▁▁▁
13 ▇▁▁▁▁
14 ▇▁▁▁▁
15 ▇▁▁▁▁
16 ▇▁▁▁▁
17 ▇▁▁▁▁
18 ▇▁▁▁▁
df = petfood
df$order_payment_date = as_date(df$order_payment_date)
df$last_customer_support_ticket_date = as_date(df$last_customer_support_ticket_date)
df %>% summarise (min_date = min(order_payment_date), max_date = max(order_payment_date))
#get year month of order payment date
df = df %>%
mutate(
date = ymd(order_payment_date),
ym = format_ISO8601(date, precision = "ym")
)
df %>% group_by(ym) %>% tally() %>% mutate(prop=round(n/sum(n),4))
#count of unique customer ID
length(unique(df$customer_id))
[1] 11168
#count of unique pets ID
length(unique(df$pet_id))
[1] 13087
#no of orders by customer ID
orders_c = df %>% group_by(customer_id) %>% tally(sort=TRUE) %>% as.data.frame
summary(orders_c$n)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 4.391 6.000 51.000
#no of orders by pet ID
orders_p= df %>% group_by(pet_id) %>% tally(sort=TRUE) %>% as.data.frame
summary(orders_p$n)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 3.747 5.000 18.000
df %>% group_by(pet_has_active_subscription) %>% tally(sort=TRUE) %>% mutate(prop=round(n/sum(n),3))
df = df %>% group_by(customer_id) %>% mutate(pets_household = n_distinct(pet_id))
summary(df$pets_household)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 1.000 1.367 2.000 10.000
df %>% group_by(customer_id) %>% filter(!duplicated(customer_id)) %>% group_by(pets_household) %>% tally() %>% mutate(prop= round(n/sum(n),4))
#communication gap (days between order_payment_date & last_support_ticket_date)
df$comm_gap = as.numeric(df$date-df$last_customer_support_ticket_date)
summary(df$comm_gap)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-7.0 12.0 45.0 105.7 117.0 1814.0 38762
#has communication
df = df %>% mutate_at(vars(comm_gap), ~replace(., is.na(.), 0))
df$has_comm = ifelse(df$comm_gap >0,"1","0")
Hmisc::describe(df$has_comm)
df$has_comm
n missing distinct
49042 0 2
Value 0 1
Frequency 39756 9286
Proportion 0.811 0.189
#comm gap by customer support ticket category
df %>% group_by(customer_support_ticket_category) %>% summarise(mean_comm_gap = mean(comm_gap)) %>% arrange(mean_comm_gap)
`summarise()` ungrouping output (override with `.groups` argument)
#customer support ticket category
df %>% filter(has_comm==1) %>% group_by(customer_support_ticket_category) %>% tally(sort=TRUE) %>% mutate(prop=n/sum(n)) %>% as.data.frame()
#days between order_payment_date and max order_payment date
df$days_before_closing = as.numeric(max(df$date) - df$order_payment_date)
summary(df$days_before_closing)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 62.0 116.0 132.3 180.0 456.0
df = df %>% mutate(ratio_kcal = wet_kcal/kibble_kcal) %>% mutate(ratio_kcal= round(ratio_kcal,2))
summary(df$ratio_kcal)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.07236 0.06000 15.70000
#drop customer_id
df2 = df %>% ungroup %>% select(-customer_id, -comm_gap, -ym, -date)
#change na to 0
df2 = df2 %>% mutate_at(vars(wet_food_discount_percent, wet_food_order_number), ~replace(., is.na(.), 0))
df2 = df2 %>% group_by(pet_id) %>% mutate (
wet_food_order_number_max = max(wet_food_order_number),
pet_order_number_max = max(pet_order_number),
kibble_kcal_mean= mean(kibble_kcal),
wet_food_discount_percent_mean = mean(wet_food_discount_percent),
total_minutes_on_website_since_last_order_mean = mean(total_minutes_on_website_since_last_order),
pets_household_mean = mean(pets_household),
has_comm_max = max(has_comm),
days_before_closing_max = max(days_before_closing),
ratio_kcal_mean = mean(ratio_kcal),
premium_treat_packs_sum = sum(premium_treat_packs),
dental_treat_packs_sum = sum(dental_treat_packs)
) %>% as.data.frame()
df2a = df2
df2a$allergen_specified = ifelse(df2a$pet_allergen_list=="","0","1")
Hmisc::describe(df2a$allergen_specified)
df2a$allergen_specified
n missing distinct
49042 0 2
Value 0 1
Frequency 38284 10758
Proportion 0.781 0.219
df2a$fav_flavour_specified = ifelse(df2a$pet_fav_flavour_list=="","0","1")
Hmisc::describe(df2a$fav_flavour_specified)
df2a$fav_flavour_specified
n missing distinct
49042 0 2
Value 0 1
Frequency 28020 21022
Proportion 0.571 0.429
df2a$health_issue_specified = ifelse(df2a$pet_health_issue_list=="","0","1")
Hmisc::describe(df2a$health_issue_specified)
df2a$health_issue_specified
n missing distinct
49042 0 2
Value 0 1
Frequency 25326 23716
Proportion 0.516 0.484
df2a$dry_food_brand_specified = ifelse(df2a$dry_food_brand_pre_tails=="","0","1")
Hmisc::describe(df2a$dry_food_brand_specified)
df2a$dry_food_brand_specified
n missing distinct
49042 0 2
Value 0 1
Frequency 6372 42670
Proportion 0.13 0.87
df2a = df2a %>% mutate_at(vars(pet_has_active_subscription, pet_food_tier, allergen_specified, fav_flavour_specified, health_issue_specified, neutered, gender, pet_breed_size, signup_promo, ate_wet_food_pre_tails, dry_food_brand_specified, pet_life_stage_at_order, has_comm_max), list(factor))
df3 = df2a %>% select (pet_id, wet_food_order_number_max,pet_order_number_max,kibble_kcal_mean,wet_food_discount_percent_mean,total_minutes_on_website_since_last_order_mean,pets_household_mean, days_before_closing_max, ratio_kcal_mean,premium_treat_packs_sum, dental_treat_packs_sum, pet_has_active_subscription, pet_food_tier, allergen_specified, fav_flavour_specified, health_issue_specified, neutered, gender, pet_breed_size, signup_promo, ate_wet_food_pre_tails, dry_food_brand_specified, pet_life_stage_at_order, has_comm_max,)
df3 = df3[!duplicated(df3$pet_id,),]
df3 = df3 %>% ungroup %>% select(-pet_id) #drop pet id
dim(df3)
[1] 13087 23
summary(df3)
wet_food_order_number_max pet_order_number_max kibble_kcal_mean wet_food_discount_percent_mean
Min. : 0.000 Min. : 1.000 Min. : 1482 Min. :0.00000
1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 9435 1st Qu.:0.00000
Median : 0.000 Median : 3.000 Median : 15225 Median :0.00000
Mean : 1.012 Mean : 3.852 Mean : 18230 Mean :0.06205
3rd Qu.: 1.000 3rd Qu.: 5.000 3rd Qu.: 24406 3rd Qu.:0.07143
Max. :20.000 Max. :20.000 Max. :108098 Max. :1.50000
total_minutes_on_website_since_last_order_mean pets_household_mean days_before_closing_max ratio_kcal_mean
Min. : 0.000 Min. : 1.00 Min. : 1 Min. :0.00000
1st Qu.: 0.134 1st Qu.: 1.00 1st Qu.: 75 1st Qu.:0.00000
Median : 6.400 Median : 1.00 Median :135 Median :0.00000
Mean : 82.721 Mean : 1.34 Mean :151 Mean :0.07097
3rd Qu.: 79.450 3rd Qu.: 2.00 3rd Qu.:197 3rd Qu.:0.10155
Max. :4821.000 Max. :10.00 Max. :456 Max. :5.08250
premium_treat_packs_sum dental_treat_packs_sum pet_has_active_subscription pet_food_tier allergen_specified
Min. : 0.0000 Min. : 0.000 False:5178 mid :3131 0:10477
1st Qu.: 0.0000 1st Qu.: 0.000 True :7909 premium :2019 1: 2610
Median : 0.0000 Median : 0.000 superpremium:7937
Mean : 0.3273 Mean : 1.033
3rd Qu.: 0.0000 3rd Qu.: 0.000
Max. :56.0000 Max. :80.000
fav_flavour_specified health_issue_specified neutered gender pet_breed_size signup_promo
0:7560 0:6877 False:6113 female:6060 giant : 312 Null & Default :3192
1:5527 1:6210 True :6974 male :7027 large :2917 Search Generic :2255
medium:3778 Digital Display:1809
small :4160 Refer a Friend :1358
toy :1920 Incompletes :1300
Search Brand :1196
(Other) :1977
ate_wet_food_pre_tails dry_food_brand_specified pet_life_stage_at_order has_comm_max
False:7947 0: 1988 half_maturity:2347 0:10391
True :5140 1:11099 mature :7585 1: 2696
senior :2077
weaning :1078
#check correlation
df3_numeric = select_if(df3,is.numeric)
res=cor(df3_numeric)
corrplot(res, method="color", type="upper", tl.col="#636363", tl.cex=0.5 )
#has wet food order
df3$wetfood = ifelse(df3$wet_food_order_number_max >0,"1","0")
Hmisc::describe(df3$wetfood)
#wet food: follow up order
df3 = df3 %>% mutate(wetfood2 = case_when(wet_food_order_number_max == 0 ~ "0", wet_food_order_number_max == 1 ~ "1", wet_food_order_number_max >= 2 ~ "2")) %>% as.data.frame
Hmisc::describe(df3$wetfood2)
df3$wetfood2
n missing distinct
13087 0 3
Value 0 1 2
Frequency 8824 1131 3132
Proportion 0.674 0.086 0.239
#has premium treats
df3$premium_t = ifelse(df3$premium_treat_packs_sum >0, "1","0")
Hmisc::describe(df3$premium_t)
df3$premium_t
n missing distinct
13087 0 2
Value 0 1
Frequency 11755 1332
Proportion 0.898 0.102
#has dental treats
df3$dental_t = ifelse(df3$dental_treat_packs_sum >0, "1","0")
Hmisc::describe(df3$dental_t)
df3$dental_t
n missing distinct
13087 0 2
Value 0 1
Frequency 10852 2235
Proportion 0.829 0.171
#has both premium and dental treats
df3$both_treats = ifelse(df3$premium_treat_packs_sum >0 & df3$dental_treat_packs_sum >0, "1","0")
Hmisc::describe(df3$both_treats)
df3$both_treats
n missing distinct
13087 0 2
Value 0 1
Frequency 12654 433
Proportion 0.967 0.033
#has either premium or dental treat pack
df3$total_treat_packs = df3$premium_treat_packs_sum + df3$dental_treat_packs_sum
df3$treats = ifelse(df3$total_treat_packs >0,"1","0")
Hmisc::describe(df3$treats)
df3$treats
n missing distinct
13087 0 2
Value 0 1
Frequency 9953 3134
Proportion 0.761 0.239
# kibble_kcal_mean
kk_treats = ggplot(df3, aes(x=treats, y= kibble_kcal_mean)) + geom_boxplot(color="#606c38") + coord_flip() + theme(axis.title=element_text(size=10))
kk_wetfood = ggplot(df3, aes(x=wetfood, y= kibble_kcal_mean)) + geom_boxplot(color="#457b9d") + coord_flip() + theme(axis.title=element_text(size=10))
kk_wetfood2 = ggplot(df3, aes(x=wetfood2, y= kibble_kcal_mean)) + geom_boxplot(color="#e09f3e") + coord_flip() + theme(axis.title=element_text(size=10))
ggarrange(kk_treats, kk_wetfood,kk_wetfood2, ncol=1, nrow=3)
#total_minutes_on_website_since_last_order_mean
wm_treats= ggplot(df3, aes(x=treats, y=total_minutes_on_website_since_last_order_mean)) + geom_jitter(alpha=0.5, size=0.8, width=0.2, color="#606c38") + theme(legend.position="none", axis.title=element_text(size=9)) + coord_flip() + labs(x="active_sub")
wm_wetfood= ggplot(df3, aes(x=wetfood, y=total_minutes_on_website_since_last_order_mean)) + geom_jitter(alpha=0.5, size=0.8, width=0.2, color="#457b9d") + theme(legend.position="none", axis.title=element_text(size=9)) + coord_flip() + labs(x="active_sub")
wm_wetfood2= ggplot(df3, aes(x=wetfood2, y=total_minutes_on_website_since_last_order_mean)) + geom_jitter(alpha=0.5, size=0.8, width=0.2, color="#e09f3e") + theme(legend.position="none", axis.title=element_text(size=9)) + coord_flip() + labs(x="active_sub")
ggarrange(wm_treats, wm_wetfood,wm_wetfood2, ncol=1, nrow=3)
#days_before_closing_max
bc_treats = ggplot(df3, aes(x=treats, y= days_before_closing_max)) + geom_boxplot(color="#606c38") + coord_flip() + theme(axis.title=element_text(size=10))
bc_wetfood = ggplot(df3, aes(x=wetfood, y= days_before_closing_max)) + geom_boxplot(color="#457b9d") + coord_flip() + theme(axis.title=element_text(size=10))
bc_wetfood2 = ggplot(df3, aes(x=wetfood2, y= days_before_closing_max)) + geom_boxplot(color="#e09f3e") + coord_flip() + theme(axis.title=element_text(size=10))
ggarrange(bc_treats, bc_wetfood,bc_wetfood2, ncol=1, nrow=3)
#proportion: pet_has_active_subscription
as_order = df %>% group_by(pet_has_active_subscription) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=pet_has_active_subscription, fill = pet_has_active_subscription, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Transactions") + scale_fill_uchicago() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
as_treats = df3 %>% group_by(treats, pet_has_active_subscription) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=treats, fill = pet_has_active_subscription, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: treats") + scale_fill_uchicago() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
as_wetfood = df3 %>% group_by(wetfood,pet_has_active_subscription) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood, fill = pet_has_active_subscription, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood") + scale_fill_uchicago() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
as_wetfood2 = df3 %>% group_by(wetfood2,pet_has_active_subscription) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood2, fill = pet_has_active_subscription, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion",title="Label: wetfood2") + scale_fill_uchicago() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
ggarrange(as_order, as_treats, as_wetfood, as_wetfood2, ncol=2, nrow=2, common.legend = TRUE, legend = "bottom")
#proportion: pet_life_stage_at_order
s_order = df %>% group_by(pet_life_stage_at_order) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=pet_life_stage_at_order, fill = pet_life_stage_at_order, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Transactions") + scale_fill_brewer(palette = "Set1") + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
s_wetfood = df3 %>% group_by(wetfood,pet_life_stage_at_order) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood, fill = pet_life_stage_at_order, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood") + scale_fill_brewer(palette = "Set1") + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
s_wetfood2 = df3 %>% group_by(wetfood2,pet_life_stage_at_order) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood2, fill = pet_life_stage_at_order, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood2") + scale_fill_brewer(palette = "Set1") + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
s_treats = df3 %>% group_by(treats,pet_life_stage_at_order) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=treats, fill = pet_life_stage_at_order, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: treats") + scale_fill_brewer(palette = "Set1") + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
p = ggarrange(s_order, s_treats, s_wetfood, s_wetfood2, ncol=2, nrow=2, common.legend = TRUE, legend = "bottom")
p
#proportion: pet_food_tier
t_order = df %>% group_by(pet_food_tier) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=pet_food_tier, fill = pet_food_tier, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Transactions") + scale_fill_jco() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
t_treats = df3 %>% group_by(treats, pet_food_tier) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=treats, fill = pet_food_tier, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: treats") + scale_fill_jco() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
t_wetfood = df3 %>% group_by(wetfood,pet_food_tier) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood, fill = pet_food_tier, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood") + scale_fill_jco() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
t_wetfood2 = df3 %>% group_by(wetfood2,pet_food_tier) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=wetfood2, fill = pet_food_tier, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion",title="Label: wetfood2") + scale_fill_jco() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11))
ggarrange(t_order, t_treats, t_wetfood, t_wetfood2, ncol=2, nrow=2, common.legend = TRUE, legend = "bottom")
#proportion: signup_promo
sp_order = df %>% group_by(signup_promo) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot( aes(x=signup_promo, y=prop) ) +
geom_segment( aes(x=signup_promo,xend=signup_promo, y=0, yend=prop), color="grey") +
geom_point(size=2, color="#294c60") + coord_flip() + theme_light() + theme(panel.grid.major.x = element_blank(), panel.border = element_blank(), axis.ticks.x = element_blank(),axis.ticks.y = element_blank(),panel.grid.major.y = element_blank(), panel.grid.minor.x = element_blank(),axis.title = element_text(size=9),plot.title = element_text(size=11)
) + labs(x="signup_promo", y="proportion", title="Transactions") + ylim(0.00,0.30)
sp_treats = df3 %>% group_by(treats, signup_promo) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=signup_promo, fill = treats, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: treats") + scale_fill_jama() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11)) + coord_flip() + ylim(0.00,0.30)
sp_wetfood = df3 %>% group_by(wetfood, signup_promo) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=signup_promo, fill = wetfood, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood") + scale_fill_jama() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11)) + coord_flip() + ylim(0.00,0.30)
sp_wetfood2 = df3 %>% group_by(wetfood2, signup_promo) %>% tally() %>% mutate (prop = n/sum(n)) %>% ggplot(aes(x=signup_promo, fill = wetfood2, y= prop)) + geom_bar(position="dodge", stat="identity") + labs(y="proportion", title="Label: wetfood2") + scale_fill_jama() + theme(axis.title = element_text(size=9), plot.title = element_text(size=11)) + coord_flip() + ylim(0.00,0.30)
#scale
cdf = df3 %>% select(pet_order_number_max,wet_food_order_number_max, total_treat_packs)
cdfscaled = scale(cdf)
head(cdfscaled)
#hierarchical clustering dendrogram
set.seed(1234)
h2= hclust(dist(cdfscaled))
plot(h2)
library(factoextra)
set.seed(123)
fviz_nbclust(cdfscaled,kmeans,method="wss")
set.seed(123)
km4= kmeans(cdfscaled,centers=4,nstart=50)
km4
K-means clustering with 4 clusters of sizes 1927, 335, 1306, 9519
Cluster means:
pet_order_number_max wet_food_order_number_max total_treat_packs
1 1.4867859 -0.4373495 -0.02144086
2 1.5157599 1.0995342 4.89349693
3 0.8372811 2.2804515 0.05756638
4 -0.4691990 -0.2630362 -0.17577336
Clustering vector:
[1] 3 1 4 4 4 4 4 4 1 1 4 3 4 1 1 4 4 3 1 3 1 4 1 1 4 1 1 1 4 4 4 1 1 3 1 4 1 4 4 4 1 1 4 4 4 1 1 4 4 1 4 4 4 1 1 1 1 4 1
[60] 4 4 3 1 4 3 3 3 1 4 1 4 2 4 4 2 3 4 3 3 4 4 4 3 4 4 4 1 3 2 1 2 2 4 3 1 4 4 3 4 4 1 4 1 1 1 4 3 4 4 3 1 1 1 4 4 1 1 4
[119] 4 4 3 4 4 4 3 2 1 4 4 4 1 4 1 3 4 4 4 4 1 1 4 4 3 3 4 3 2 1 3 4 4 4 4 3 2 4 1 4 4 3 1 3 1 1 4 4 2 1 1 1 1 4 4 1 1 4 4
[178] 1 1 1 4 3 4 3 2 1 4 4 4 1 1 1 4 1 1 1 4 1 1 1 1 4 1 3 4 4 4 3 4 1 4 4 4 4 4 4 4 3 3 3 1 4 3 3 2 1 4 2 4 3 4 4 1 4 4 1
[237] 4 1 4 3 3 4 4 1 4 4 1 3 2 1 4 4 4 4 4 4 3 1 4 1 4 1 1 4 1 2 4 4 4 1 4 3 4 1 1 4 1 4 4 1 4 1 1 4 4 4 4 3 3 4 4 1 4 1 1
[296] 4 1 4 1 1 3 4 1 4 4 4 1 4 4 4 3 4 4 1 4 3 4 1 4 1 1 3 4 4 4 3 3 3 4 4 1 4 3 3 4 3 4 4 3 1 4 4 1 4 1 3 1 1 4 4 1 4 4 4
[355] 3 1 4 4 1 4 4 4 4 1 1 3 4 1 2 4 1 4 4 4 4 4 4 1 4 4 4 4 4 4 3 4 3 1 4 3 4 1 1 1 1 4 1 1 3 3 1 1 4 1 3 1 4 4 1 1 1 4 1
[414] 4 4 4 4 4 4 1 4 1 4 1 4 4 3 3 4 4 3 1 3 4 4 4 4 3 4 4 3 4 4 1 4 3 4 1 1 2 4 3 4 4 4 1 3 4 4 4 4 4 3 1 4 1 4 4 4 4 1 4
[473] 3 4 1 4 4 4 4 3 1 4 4 4 4 2 1 4 1 3 4 4 2 1 3 1 4 3 4 2 4 1 1 1 4 1 4 4 1 4 4 3 2 1 4 4 4 1 4 4 3 4 3 1 1 1 3 4 1 4 2
[532] 4 4 4 4 1 1 4 4 4 1 3 4 1 3 3 3 4 3 1 1 4 3 1 4 3 4 3 4 4 4 1 3 1 4 4 1 3 4 4 3 4 4 1 1 4 1 4 4 1 3 1 1 2 1 4 4 1 2 4
[591] 1 1 1 3 1 4 1 4 1 4 1 1 1 4 1 1 2 1 4 4 3 1 4 4 1 1 2 1 1 3 1 4 4 3 1 4 4 4 4 1 4 2 3 4 4 2 3 4 3 3 1 4 4 4 4 4 3 1 4
[650] 3 4 4 3 4 3 1 4 4 4 4 3 4 1 4 4 4 4 4 4 4 4 4 3 1 1 4 1 4 1 4 2 1 4 4 2 3 4 1 1 4 2 1 1 3 1 1 4 1 4 4 1 1 2 1 3 4 4 4
[709] 4 4 4 1 4 1 2 4 3 4 4 1 1 4 3 4 1 4 2 3 1 3 4 3 1 4 4 4 4 1 1 4 1 4 4 1 1 2 4 4 3 2 1 4 4 4 1 4 2 4 3 3 1 4 4 4 3 4 3
[768] 4 4 4 4 1 3 1 4 1 4 1 4 4 4 4 2 4 4 3 4 4 4 4 4 4 4 4 4 4 3 4 4 4 1 1 4 2 3 4 1 2 4 1 4 4 4 3 1 4 4 4 2 4 4 4 4 4 4 3
[827] 4 4 1 4 4 3 4 3 4 4 1 2 3 4 2 4 3 4 4 3 1 1 4 1 4 1 4 4 4 4 1 1 4 4 1 4 3 2 3 4 4 4 4 3 3 4 4 3 3 4 1 4 1 2 3 4 1 4 3
[886] 1 4 4 2 1 3 2 2 4 4 3 3 4 4 4 3 1 4 1 3 4 1 3 1 4 1 4 4 4 4 4 4 4 3 4 1 4 4 1 1 4 1 3 4 3 1 4 4 1 4 1 4 4 4 3 4 4 1 2
[945] 4 4 4 3 4 4 1 1 3 4 4 4 4 4 2 3 4 1 1 4 4 4 1 4 4 4 4 4 4 4 4 3 4 3 4 4 4 1 1 4 3 1 4 4 4 3 4 4 1 4 4 4 4 4 4 1
[ reached getOption("max.print") -- omitted 12087 entries ]
Within cluster sum of squares by cluster:
[1] 2750.710 4006.152 3126.080 4789.224
(between_SS / total_SS = 62.6 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
#cluster plot
fviz_cluster(km4, data=cdfscaled, labelsize=0)
#pair plot
with(cdf,pairs(cdfscaled,col=(1:4)[km4$cluster]))
cdf1 = df3 %>% select(pet_order_number_max,wet_food_order_number_max, total_treat_packs, pet_has_active_subscription, pets_household_mean)
#summary by km4 cluster id
cdf1$clusterid=km4$cluster
cdf1$clusterid=as.factor(cdf1$clusterid)
by(cdf1,cdf1$clusterid,summary)
cdf1$clusterid: 1
pet_order_number_max wet_food_order_number_max total_treat_packs pet_has_active_subscription pets_household_mean clusterid
Min. : 5.000 Min. :0.0000 Min. : 0.000 False: 492 Min. :1.000 1:1927
1st Qu.: 6.000 1st Qu.:0.0000 1st Qu.: 0.000 True :1435 1st Qu.:1.000 2: 0
Median : 7.000 Median :0.0000 Median : 0.000 Median :1.000 3: 0
Mean : 8.225 Mean :0.1463 Mean : 1.265 Mean :1.403 4: 0
3rd Qu.: 9.000 3rd Qu.:0.0000 3rd Qu.: 0.000 3rd Qu.:2.000
Max. :20.000 Max. :3.0000 Max. :13.000 Max. :5.000
---------------------------------------------------------------------------------------------
cdf1$clusterid: 2
pet_order_number_max wet_food_order_number_max total_treat_packs pet_has_active_subscription pets_household_mean clusterid
Min. : 2.00 Min. : 0.000 Min. :12.00 False: 83 Min. :1.000 1: 0
1st Qu.: 6.00 1st Qu.: 0.000 1st Qu.:16.00 True :252 1st Qu.:1.000 2:335
Median : 7.00 Median : 1.000 Median :20.00 Median :1.000 3: 0
Mean : 8.31 Mean : 3.188 Mean :23.07 Mean :1.337 4: 0
3rd Qu.:11.00 3rd Qu.: 5.000 3rd Qu.:26.00 3rd Qu.:2.000
Max. :20.00 Max. :20.000 Max. :92.00 Max. :4.000
---------------------------------------------------------------------------------------------
cdf1$clusterid: 3
pet_order_number_max wet_food_order_number_max total_treat_packs pet_has_active_subscription pets_household_mean clusterid
Min. : 4.000 Min. : 3.000 Min. : 0.000 False:371 Min. :1.000 1: 0
1st Qu.: 5.000 1st Qu.: 4.000 1st Qu.: 0.000 True :935 1st Qu.:1.000 2: 0
Median : 6.000 Median : 5.000 Median : 0.000 Median :1.000 3:1306
Mean : 6.315 Mean : 5.525 Mean : 1.616 Mean :1.337 4: 0
3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.: 2.000 3rd Qu.:2.000
Max. :20.000 Max. :20.000 Max. :13.000 Max. :5.000
---------------------------------------------------------------------------------------------
cdf1$clusterid: 4
pet_order_number_max wet_food_order_number_max total_treat_packs pet_has_active_subscription pets_household_mean clusterid
Min. :1.000 Min. :0.0000 Min. : 0.0000 False:4232 Min. : 1.000 1: 0
1st Qu.:2.000 1st Qu.:0.0000 1st Qu.: 0.0000 True :5287 1st Qu.: 1.000 2: 0
Median :2.000 Median :0.0000 Median : 0.0000 Median : 1.000 3: 0
Mean :2.472 Mean :0.4913 Mean : 0.5806 Mean : 1.328 4:9519
3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.: 0.0000 3rd Qu.: 2.000
Max. :5.000 Max. :3.0000 Max. :14.0000 Max. :10.000
library(pscl)
library(tree)
library(rpart)
library(rpart.plot)
library(rattle)
library(randomForest)
library(caret)
library(pROC)
library(sjPlot)
library(corrplot)
library(viridis)
dim(df3)
[1] 13087 30
13087 * 0.8
[1] 10469.6
set.seed(123)
y1= sample(1:13087,10470)
xtrain=d1[y1,]
xtest=d1[-y1,]
Hmisc:: describe(xtrain$treats)
xtrain$treats
n missing distinct
10470 0 2
Value 0 1
Frequency 7960 2510
Proportion 0.76 0.24
Hmisc:: describe(xtest$treats)
xtest$treats
n missing distinct
2617 0 2
Value 0 1
Frequency 1993 624
Proportion 0.762 0.238
mt = rpart(treats ~., data = xtrain, method = "class", control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.01))
fancyRpartPlot(mt)
printcp(mt)
Classification tree:
rpart(formula = treats ~ ., data = xtrain, method = "class",
control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.01))
Variables actually used in tree construction:
[1] days_before_closing_max pet_life_stage_at_order signup_promo
Root node error: 2510/10470 = 0.23973
n= 10470
CP nsplit rel error xerror xstd
1 0.014641 0 1.00000 1.00000 0.017404
2 0.010000 6 0.90319 0.94422 0.017060
mt$variable.importance
signup_promo days_before_closing_max
125.487528 105.515338
pet_life_stage_at_order neutered
61.124696 11.711849
pet_order_number_max total_minutes_on_website_since_last_order_mean
7.494945 6.345367
wet_food_discount_percent_mean kibble_kcal_mean
4.420096 4.206598
ratio_kcal_mean
1.441336
#visualize variable importance
v1 = data.frame(imp = mt$variable.importance)
v2 <- v1 %>%
tibble::rownames_to_column() %>%
dplyr::rename("variable" = rowname) %>%
dplyr::arrange(imp) %>%
dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v2) +
geom_col(aes(x = variable, y = imp, fill= imp),
col = "white", show.legend = F) +
coord_flip() +
scale_fill_viridis() +
theme_minimal() + labs(x="Variable", y="Importance")
#prediction
tree.p = predict(mt, xtest, type = "class")
cmt = confusionMatrix(tree.p, xtest$treats)
cmt
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1936 516
1 57 108
Accuracy : 0.781
95% CI : (0.7647, 0.7968)
No Information Rate : 0.7616
P-Value [Acc > NIR] : 0.009775
Kappa : 0.1933
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9714
Specificity : 0.1731
Pos Pred Value : 0.7896
Neg Pred Value : 0.6545
Prevalence : 0.7616
Detection Rate : 0.7398
Detection Prevalence : 0.9370
Balanced Accuracy : 0.5722
'Positive' Class : 0
round(cmt$byClass["F1"], 4)
F1
0.8711
xtest$tp1= tree.p
roc_t1= roc(response= xtest$treats, predictor = factor(xtest$tp1, ordered=TRUE), plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
model1= glm(treats ~., data=xtrain, family = "binomial")
summary(model1)
Call:
glm(formula = treats ~ ., family = "binomial", data = xtrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0994 -0.7397 -0.5691 -0.1957 2.8116
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.339e+00 2.941e-01 -7.952 1.84e-15 ***
wet_food_order_number_max 1.083e-01 1.551e-02 6.981 2.92e-12 ***
pet_order_number_max 1.028e-01 1.364e-02 7.535 4.88e-14 ***
kibble_kcal_mean 2.967e-06 3.788e-06 0.783 0.433586
wet_food_discount_percent_mean 9.963e-01 2.287e-01 4.357 1.32e-05 ***
total_minutes_on_website_since_last_order_mean 1.051e-03 1.242e-04 8.462 < 2e-16 ***
pets_household_mean -2.810e-01 4.451e-02 -6.314 2.72e-10 ***
has_comm_max1 1.923e-02 6.270e-02 0.307 0.759020
days_before_closing_max -8.824e-04 3.880e-04 -2.274 0.022942 *
ratio_kcal_mean -8.926e-03 1.900e-01 -0.047 0.962531
pet_has_active_subscriptionTrue 2.031e-01 5.330e-02 3.811 0.000138 ***
pet_food_tierpremium 5.331e-02 8.520e-02 0.626 0.531501
pet_food_tiersuperpremium 1.623e-01 6.302e-02 2.575 0.010027 *
allergen_specified1 -1.111e-01 6.606e-02 -1.682 0.092629 .
fav_flavour_specified1 6.843e-02 4.997e-02 1.370 0.170841
health_issue_specified1 -2.532e-02 5.433e-02 -0.466 0.641146
dry_food_brand_specified1 1.487e-01 7.499e-02 1.983 0.047388 *
neuteredTrue -1.047e-02 5.594e-02 -0.187 0.851562
gendermale -1.742e-02 4.993e-02 -0.349 0.727178
pet_breed_sizelarge -2.731e-01 1.793e-01 -1.523 0.127658
pet_breed_sizemedium -1.090e-01 1.931e-01 -0.565 0.572343
pet_breed_sizesmall 5.786e-02 2.088e-01 0.277 0.781702
pet_breed_sizetoy 4.891e-02 2.236e-01 0.219 0.826842
signup_promoDigital Display 1.284e+00 1.366e-01 9.399 < 2e-16 ***
signup_promoEvents -2.982e-01 1.791e-01 -1.665 0.095990 .
signup_promoIncompletes 4.869e-01 1.442e-01 3.377 0.000733 ***
signup_promoInserts -4.413e-01 2.496e-01 -1.768 0.076989 .
signup_promoNull & Default 2.014e-01 1.337e-01 1.506 0.132016
signup_promoOther 4.213e-02 2.442e-01 0.173 0.863009
signup_promoRefer a Friend -2.645e-01 1.514e-01 -1.747 0.080625 .
signup_promoSearch Brand 1.239e-01 1.493e-01 0.830 0.406650
signup_promoSearch Generic 6.007e-02 1.386e-01 0.433 0.664704
signup_promoShopping Centres -2.119e+00 7.291e-01 -2.907 0.003653 **
signup_promoSocial Marketing 1.318e+00 2.056e-01 6.409 1.47e-10 ***
signup_promoVet 5.850e-01 5.499e-01 1.064 0.287431
ate_wet_food_pre_tailsTrue -2.308e-01 6.144e-02 -3.757 0.000172 ***
pet_life_stage_at_ordermature 6.184e-01 7.736e-02 7.993 1.32e-15 ***
pet_life_stage_at_ordersenior 6.743e-01 9.843e-02 6.850 7.38e-12 ***
pet_life_stage_at_orderweaning -1.385e+00 1.731e-01 -8.005 1.19e-15 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 11533 on 10469 degrees of freedom
Residual deviance: 10285 on 10431 degrees of freedom
AIC: 10363
Number of Fisher Scoring iterations: 6
pR2(model1)
fitting null model for pseudo-r2
llh llhNull G2 McFadden r2ML r2CU
-5142.5662033 -5766.5772913 1248.0221759 0.1082117 0.1123696 0.1683081
anova(model1, test= "Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: treats
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 10469 11533
wet_food_order_number_max 1 267.84 10468 11265 < 2.2e-16 ***
pet_order_number_max 1 81.11 10467 11184 < 2.2e-16 ***
kibble_kcal_mean 1 1.03 10466 11183 0.311188
wet_food_discount_percent_mean 1 34.98 10465 11148 3.336e-09 ***
total_minutes_on_website_since_last_order_mean 1 54.00 10464 11094 2.008e-13 ***
pets_household_mean 1 34.44 10463 11060 4.385e-09 ***
has_comm_max 1 0.19 10462 11060 0.663820
days_before_closing_max 1 2.04 10461 11058 0.153005
ratio_kcal_mean 1 0.01 10460 11058 0.933400
pet_has_active_subscription 1 18.64 10459 11039 1.576e-05 ***
pet_food_tier 2 3.13 10457 11036 0.209357
allergen_specified 1 0.01 10456 11036 0.936897
fav_flavour_specified 1 6.73 10455 11029 0.009484 **
health_issue_specified 1 2.39 10454 11027 0.122282
dry_food_brand_specified 1 2.59 10453 11024 0.107842
neutered 1 29.65 10452 10994 5.166e-08 ***
gender 1 0.01 10451 10994 0.914509
pet_breed_size 4 28.15 10447 10966 1.161e-05 ***
signup_promo 12 442.34 10435 10524 < 2.2e-16 ***
ate_wet_food_pre_tails 1 10.53 10434 10513 0.001177 **
pet_life_stage_at_order 3 228.22 10431 10285 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#probablity 0.5
prob=predict(model1,xtest,type="response")
prob1=rep(0,2617)
prob1[prob>0.5]=1
cmlr = confusionMatrix(as.factor(prob1), xtest$treats)
cmlr
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1906 529
1 87 95
Accuracy : 0.7646
95% CI : (0.7479, 0.7808)
No Information Rate : 0.7616
P-Value [Acc > NIR] : 0.3667
Kappa : 0.1435
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9563
Specificity : 0.1522
Pos Pred Value : 0.7828
Neg Pred Value : 0.5220
Prevalence : 0.7616
Detection Rate : 0.7283
Detection Prevalence : 0.9305
Balanced Accuracy : 0.5543
'Positive' Class : 0
round(cmlr$byClass["F1"], 4)
F1
0.8609
roc_lr2 = roc(xtest$treats, prob1, plot=TRUE, print.auc=TRUE)
set.seed(4543)
rf <- randomForest(treats ~ ., data=xtrain)
importance(rf)
MeanDecreaseGini
wet_food_order_number_max 110.57888
pet_order_number_max 232.31183
kibble_kcal_mean 477.30825
wet_food_discount_percent_mean 107.14150
total_minutes_on_website_since_last_order_mean 444.72561
pets_household_mean 97.57354
has_comm_max 56.50148
days_before_closing_max 536.57563
ratio_kcal_mean 205.76744
pet_has_active_subscription 62.15384
pet_food_tier 116.47059
allergen_specified 57.96103
fav_flavour_specified 73.44539
health_issue_specified 73.03605
dry_food_brand_specified 45.56526
neutered 66.28429
gender 76.86991
pet_breed_size 179.81701
signup_promo 448.91025
ate_wet_food_pre_tails 57.97552
pet_life_stage_at_order 158.54733
varUsed(rf, by.tree=FALSE, count =TRUE)
[1] 22242 67356 120907 26053 102272 33236 21208 120356 42449 23625 40594 22528 29475 29894 17864 22679 31939
[18] 54890 74705 21933 37360
varImpPlot(rf)
#prediction
rfp = predict(rf, xtest)
cmrf = confusionMatrix(rfp, xtest$treats)
cmrf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1922 499
1 71 125
Accuracy : 0.7822
95% CI : (0.7659, 0.7979)
No Information Rate : 0.7616
P-Value [Acc > NIR] : 0.006657
Kappa : 0.2155
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9644
Specificity : 0.2003
Pos Pred Value : 0.7939
Neg Pred Value : 0.6378
Prevalence : 0.7616
Detection Rate : 0.7344
Detection Prevalence : 0.9251
Balanced Accuracy : 0.5823
'Positive' Class : 0
round(cmrf$byClass["F1"], 4)
F1
0.8709
xtest$rfp= rfp
roc_rf= roc(response= xtest$treats, predictor = factor(xtest$rfp, ordered=TRUE), plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
df3$wetfood = as.factor(df3$wetfood)
#select variables
d2 = df3 %>% select (wetfood,pet_order_number_max,kibble_kcal_mean,total_minutes_on_website_since_last_order_mean,pets_household_mean,has_comm_max,days_before_closing_max, premium_treat_packs_sum, dental_treat_packs_sum, pet_has_active_subscription, pet_food_tier, allergen_specified, fav_flavour_specified, health_issue_specified, dry_food_brand_specified, neutered, gender, pet_breed_size, signup_promo, pet_life_stage_at_order, ate_wet_food_pre_tails)
dim(d2)
[1] 13087 21
set.seed(1234)
y1= sample(1:13087,10470)
xtrain2=d2[y1,]
xtest2=d2[-y1,]
Hmisc:: describe(xtrain2$wetfood)
xtrain2$wetfood
n missing distinct
10470 0 2
Value 0 1
Frequency 7058 3412
Proportion 0.674 0.326
Hmisc:: describe(xtest2$wetfood)
xtest2$wetfood
n missing distinct
2617 0 2
Value 0 1
Frequency 1766 851
Proportion 0.675 0.325
mt2 = rpart(wetfood ~., data = xtrain2, method = "class")
fancyRpartPlot(mt2)
printcp(mt2)
Classification tree:
rpart(formula = wetfood ~ ., data = xtrain2, method = "class")
Variables actually used in tree construction:
[1] allergen_specified ate_wet_food_pre_tails
[3] pet_life_stage_at_order signup_promo
[5] total_minutes_on_website_since_last_order_mean
Root node error: 3412/10470 = 0.32588
n= 10470
CP nsplit rel error xerror xstd
1 0.290445 0 1.00000 1.00000 0.014056
2 0.068581 1 0.70955 0.70955 0.012644
3 0.039859 2 0.64097 0.64097 0.012191
4 0.010258 3 0.60111 0.60873 0.011959
5 0.010000 5 0.58060 0.59906 0.011887
mt2$variable.importance
ate_wet_food_pre_tails dry_food_brand_specified
1179.0746826 149.9913274
kibble_kcal_mean allergen_specified
142.0592801 139.8842868
pet_life_stage_at_order signup_promo
137.6351724 44.5428619
total_minutes_on_website_since_last_order_mean days_before_closing_max
33.2480849 13.5432123
pet_order_number_max has_comm_max
12.2928756 7.2153835
dental_treat_packs_sum premium_treat_packs_sum
6.4136742 1.9998844
pets_household_mean
0.3681165
#visualize variable importance
v3 = data.frame(imp = mt2$variable.importance)
v4 <- v3 %>%
tibble::rownames_to_column() %>%
dplyr::rename("variable" = rowname) %>%
dplyr::arrange(imp) %>%
dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v4) +
geom_col(aes(x = variable, y = imp, fill= imp),
col = "white", show.legend = F) +
coord_flip() +
scale_fill_viridis() +
theme_minimal() + labs(x="Variable", y="Importance")
#prediction
tree.p2 = predict(mt2, xtest2, type = "class")
cmt2 = confusionMatrix(tree.p2, xtest2$wetfood)
cmt2
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1635 359
1 131 492
Accuracy : 0.8128
95% CI : (0.7973, 0.8275)
No Information Rate : 0.6748
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5416
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9258
Specificity : 0.5781
Pos Pred Value : 0.8200
Neg Pred Value : 0.7897
Prevalence : 0.6748
Detection Rate : 0.6248
Detection Prevalence : 0.7619
Balanced Accuracy : 0.7520
'Positive' Class : 0
round(cmt2$byClass["F1"], 4)
F1
0.8697
xtest2$tp2= tree.p2
roc_t2= roc(response= xtest2$wetfood, predictor = factor(xtest2$tp2, ordered=TRUE), plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
model2= glm(wetfood ~., data=xtrain2, family = "binomial")
summary(model2)
Call:
glm(formula = wetfood ~ ., family = "binomial", data = xtrain2)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0579 -0.6448 -0.3377 0.6763 3.4249
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.818e+00 3.167e-01 -5.743 9.32e-09 ***
pet_order_number_max 8.550e-02 1.476e-02 5.791 6.99e-09 ***
kibble_kcal_mean -1.479e-05 4.175e-06 -3.543 0.000395 ***
total_minutes_on_website_since_last_order_mean 8.688e-04 1.311e-04 6.627 3.42e-11 ***
pets_household_mean -1.674e-01 4.322e-02 -3.873 0.000107 ***
has_comm_max1 5.184e-01 6.862e-02 7.554 4.21e-14 ***
days_before_closing_max -1.910e-03 4.248e-04 -4.495 6.95e-06 ***
premium_treat_packs_sum 8.565e-02 1.715e-02 4.994 5.91e-07 ***
dental_treat_packs_sum 4.832e-02 7.162e-03 6.746 1.52e-11 ***
pet_has_active_subscriptionTrue 1.660e-02 5.689e-02 0.292 0.770425
pet_food_tierpremium -1.537e-01 9.143e-02 -1.681 0.092805 .
pet_food_tiersuperpremium -2.939e-02 6.786e-02 -0.433 0.664973
allergen_specified1 -1.650e+00 7.934e-02 -20.804 < 2e-16 ***
fav_flavour_specified1 6.377e-02 5.425e-02 1.175 0.239845
health_issue_specified1 -5.294e-02 5.893e-02 -0.898 0.369031
dry_food_brand_specified1 -4.173e-01 7.699e-02 -5.420 5.96e-08 ***
neuteredTrue 3.639e-02 6.055e-02 0.601 0.547820
gendermale -9.449e-02 5.377e-02 -1.757 0.078837 .
pet_breed_sizelarge -2.658e-01 2.082e-01 -1.277 0.201711
pet_breed_sizemedium -3.001e-01 2.200e-01 -1.364 0.172486
pet_breed_sizesmall -1.241e-01 2.346e-01 -0.529 0.596905
pet_breed_sizetoy -3.057e-01 2.489e-01 -1.228 0.219444
signup_promoDigital Display 2.832e-01 1.423e-01 1.990 0.046592 *
signup_promoEvents -4.783e-01 1.779e-01 -2.688 0.007182 **
signup_promoIncompletes 2.696e-01 1.498e-01 1.800 0.071818 .
signup_promoInserts 5.672e-01 2.188e-01 2.593 0.009511 **
signup_promoNull & Default 2.655e-01 1.351e-01 1.966 0.049335 *
signup_promoOther -4.580e-01 2.558e-01 -1.790 0.073440 .
signup_promoRefer a Friend 2.790e-01 1.495e-01 1.867 0.061943 .
signup_promoSearch Brand 1.630e-01 1.524e-01 1.070 0.284773
signup_promoSearch Generic 5.781e-01 1.395e-01 4.144 3.41e-05 ***
signup_promoShopping Centres -2.082e+00 4.173e-01 -4.988 6.10e-07 ***
signup_promoSocial Marketing 1.579e-01 2.444e-01 0.646 0.518061
signup_promoVet -1.352e+01 1.852e+02 -0.073 0.941801
pet_life_stage_at_ordermature 1.082e+00 8.335e-02 12.982 < 2e-16 ***
pet_life_stage_at_ordersenior 9.618e-01 1.060e-01 9.073 < 2e-16 ***
pet_life_stage_at_orderweaning -1.676e+00 1.746e-01 -9.600 < 2e-16 ***
ate_wet_food_pre_tailsTrue 2.442e+00 5.760e-02 42.390 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 13217.9 on 10469 degrees of freedom
Residual deviance: 8996.3 on 10432 degrees of freedom
AIC: 9072.3
Number of Fisher Scoring iterations: 13
pR2(model2)
fitting null model for pseudo-r2
llh llhNull G2 McFadden r2ML r2CU
-4498.1599482 -6608.9254594 4221.5310224 0.3193810 0.3318233 0.4627681
anova(model2, test= "Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: wetfood
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 10469 13217.9
pet_order_number_max 1 10.32 10468 13207.5 0.001313 **
kibble_kcal_mean 1 274.12 10467 12933.4 < 2.2e-16 ***
total_minutes_on_website_since_last_order_mean 1 18.87 10466 12914.5 1.397e-05 ***
pets_household_mean 1 7.62 10465 12906.9 0.005766 **
has_comm_max 1 44.76 10464 12862.2 2.228e-11 ***
days_before_closing_max 1 21.63 10463 12840.5 3.310e-06 ***
premium_treat_packs_sum 1 54.71 10462 12785.8 1.395e-13 ***
dental_treat_packs_sum 1 68.73 10461 12717.1 < 2.2e-16 ***
pet_has_active_subscription 1 0.22 10460 12716.9 0.638885
pet_food_tier 2 6.85 10458 12710.0 0.032504 *
allergen_specified 1 349.29 10457 12360.7 < 2.2e-16 ***
fav_flavour_specified 1 7.05 10456 12353.7 0.007913 **
health_issue_specified 1 21.34 10455 12332.3 3.836e-06 ***
dry_food_brand_specified 1 257.43 10454 12074.9 < 2.2e-16 ***
neutered 1 125.78 10453 11949.1 < 2.2e-16 ***
gender 1 2.52 10452 11946.6 0.112740
pet_breed_size 4 9.06 10448 11937.5 0.059523 .
signup_promo 12 83.93 10436 11853.6 7.291e-13 ***
pet_life_stage_at_order 3 698.25 10433 11155.4 < 2.2e-16 ***
ate_wet_food_pre_tails 1 2159.03 10432 8996.3 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#prediction
prob=predict(model2,xtest2,type="response")
prob1=rep(0,2617)
prob1[prob>0.5]=1
cmlr = confusionMatrix(as.factor(prob1), xtest2$wetfood)
cmlr
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1576 290
1 190 561
Accuracy : 0.8166
95% CI : (0.8012, 0.8312)
No Information Rate : 0.6748
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.569
Mcnemar's Test P-Value : 6.222e-06
Sensitivity : 0.8924
Specificity : 0.6592
Pos Pred Value : 0.8446
Neg Pred Value : 0.7470
Prevalence : 0.6748
Detection Rate : 0.6022
Detection Prevalence : 0.7130
Balanced Accuracy : 0.7758
'Positive' Class : 0
round(cmlr$byClass["F1"], 4)
F1
0.8678
roc_lr2 = roc(xtest2$wetfood, prob1, plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
set.seed(4543)
rf2 <- randomForest(wetfood ~ ., data=xtrain2)
importance(rf2)
MeanDecreaseGini
pet_order_number_max 221.65142
kibble_kcal_mean 520.15827
total_minutes_on_website_since_last_order_mean 396.70662
pets_household_mean 96.43917
has_comm_max 54.99431
days_before_closing_max 425.92952
premium_treat_packs_sum 78.23994
dental_treat_packs_sum 140.58992
pet_has_active_subscription 67.94073
pet_food_tier 99.77331
allergen_specified 177.60944
fav_flavour_specified 68.47868
health_issue_specified 64.12641
dry_food_brand_specified 108.49394
neutered 65.45760
gender 67.60289
pet_breed_size 160.95901
signup_promo 388.86597
pet_life_stage_at_order 257.75683
ate_wet_food_pre_tails 1018.68005
varUsed(rf2, by.tree=FALSE, count =TRUE)
[1] 68710 111779 91783 34291 18603 107247 18749 28558 27865 34591 10146 28206 26624 11023 21250 29338 45389
[18] 79079 31347 11268
varImpPlot(rf2)
#prediction
rfp2 = predict(rf2, xtest2)
cmrf2 = confusionMatrix(rfp2, xtest2$wetfood)
cmrf2
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1590 267
1 176 584
Accuracy : 0.8307
95% CI : (0.8158, 0.8449)
No Information Rate : 0.6748
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6033
Mcnemar's Test P-Value : 1.903e-05
Sensitivity : 0.9003
Specificity : 0.6863
Pos Pred Value : 0.8562
Neg Pred Value : 0.7684
Prevalence : 0.6748
Detection Rate : 0.6076
Detection Prevalence : 0.7096
Balanced Accuracy : 0.7933
'Positive' Class : 0
round(cmrf2$byClass["F1"], 4)
F1
0.8777
xtest2$rfp2= rfp2
roc_rf= roc(response= xtest2$wetfood, predictor = factor(xtest2$rfp2, ordered=TRUE), plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
#select variables
d3 = df3 %>% select (wetfood2, pet_order_number_max,kibble_kcal_mean,total_minutes_on_website_since_last_order_mean,pets_household_mean,has_comm_max,days_before_closing_max, premium_treat_packs_sum, dental_treat_packs_sum, pet_has_active_subscription, pet_food_tier, allergen_specified, fav_flavour_specified, health_issue_specified, dry_food_brand_specified, neutered, gender, pet_breed_size, signup_promo, pet_life_stage_at_order, ate_wet_food_pre_tails, wet_food_discount_percent_mean, ratio_kcal_mean)
dim(d3)
[1] 13087 23
#drop obs that do not have any wet food orders
d3 = d3 %>% filter(!(wetfood2==0)) %>% droplevels()
dim(d3)
[1] 4263 23
d3 = d3 %>% mutate(wetfood2= recode(wetfood2,`1`="0", `2` ="1" ))
Hmisc::describe(d3$wetfood2)
d3$wetfood2
n missing distinct
4263 0 2
Value 0 1
Frequency 1131 3132
Proportion 0.265 0.735
d3$wetfood2 = as.factor(d3$wetfood2)
#test and train set
set.seed(2345)
y1= sample(1:4263,3410)
xtrain3=d3[y1,]
xtest3=d3[-y1,]
Hmisc:: describe(xtrain3$wetfood2)
xtrain3$wetfood2
n missing distinct
3410 0 2
Value 0 1
Frequency 903 2507
Proportion 0.265 0.735
Hmisc:: describe(xtest3$wetfood2)
xtest3$wetfood2
n missing distinct
853 0 2
Value 0 1
Frequency 228 625
Proportion 0.267 0.733
mt3 = rpart(wetfood2 ~., data = xtrain3, method = "class", control=rpart.control(cp=0, maxdepth = 3, minbucket = 100, minsplit = 100))
fancyRpartPlot(mt3)
printcp(mt3)
Classification tree:
rpart(formula = wetfood2 ~ ., data = xtrain3, method = "class",
control = rpart.control(cp = 0, maxdepth = 3, minbucket = 100,
minsplit = 100))
Variables actually used in tree construction:
[1] pet_order_number_max ratio_kcal_mean wet_food_discount_percent_mean
Root node error: 903/3410 = 0.26481
n= 3410
CP nsplit rel error xerror xstd
1 0.576966 0 1.00000 1.00000 0.028534
2 0.028239 1 0.42303 0.42303 0.020396
3 0.000000 3 0.36656 0.40089 0.019920
mt3$variable.importance
wet_food_discount_percent_mean pet_order_number_max ratio_kcal_mean
666.145743 578.038513 138.166504
total_minutes_on_website_since_last_order_mean kibble_kcal_mean days_before_closing_max
102.588374 16.835457 8.915154
signup_promo pet_life_stage_at_order
3.130860 2.739797
#visualize variable importance
v3 = data.frame(imp = mt3$variable.importance)
v4 <- v3 %>%
tibble::rownames_to_column() %>%
dplyr::rename("variable" = rowname) %>%
dplyr::arrange(imp) %>%
dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v4) +
geom_col(aes(x = variable, y = imp, fill= imp),
col = "white", show.legend = F) +
coord_flip() +
scale_fill_viridis() +
theme_minimal() + labs(x="Variable", y="Importance")
#prediction
tree.p3 = predict(mt3, xtest3, type = "class")
cmt3 = confusionMatrix(tree.p3, xtest3$wetfood2)
cmt3
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 157 18
1 71 607
Accuracy : 0.8957
95% CI : (0.8732, 0.9154)
No Information Rate : 0.7327
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7124
Mcnemar's Test P-Value : 3.548e-08
Sensitivity : 0.6886
Specificity : 0.9712
Pos Pred Value : 0.8971
Neg Pred Value : 0.8953
Prevalence : 0.2673
Detection Rate : 0.1841
Detection Prevalence : 0.2052
Balanced Accuracy : 0.8299
'Positive' Class : 0
round(cmt3$byClass["F1"], 4)
F1
0.7792
xtest3$tp3= tree.p3
roc_t3= roc(response= xtest3$wetfood2, predictor = factor(xtest3$tp3, ordered=TRUE), plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
model3= glm(wetfood2 ~., data=xtrain3, family = "binomial")
summary(model3)
Call:
glm(formula = wetfood2 ~ ., family = "binomial", data = xtrain3)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.6905 -0.3336 0.3080 0.5883 2.7424
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.830e+00 7.094e-01 -5.398 6.72e-08 ***
pet_order_number_max 2.548e-01 4.069e-02 6.263 3.77e-10 ***
kibble_kcal_mean 9.904e-05 1.073e-05 9.234 < 2e-16 ***
total_minutes_on_website_since_last_order_mean 2.462e-05 2.373e-04 0.104 0.91734
pets_household_mean -6.564e-02 8.705e-02 -0.754 0.45084
has_comm_max1 1.419e-01 1.415e-01 1.003 0.31589
days_before_closing_max -4.927e-03 8.523e-04 -5.781 7.41e-09 ***
premium_treat_packs_sum -4.502e-02 2.989e-02 -1.506 0.13201
dental_treat_packs_sum 2.487e-02 1.857e-02 1.339 0.18053
pet_has_active_subscriptionTrue -7.986e-02 1.104e-01 -0.723 0.46964
pet_food_tierpremium 8.637e-02 1.834e-01 0.471 0.63758
pet_food_tiersuperpremium 3.851e-01 1.345e-01 2.864 0.00419 **
allergen_specified1 -6.954e-01 1.755e-01 -3.963 7.39e-05 ***
fav_flavour_specified1 5.742e-02 1.056e-01 0.543 0.58682
health_issue_specified1 -1.671e-02 1.138e-01 -0.147 0.88324
dry_food_brand_specified1 6.248e-02 1.378e-01 0.453 0.65022
neuteredTrue -4.884e-02 1.148e-01 -0.425 0.67050
gendermale -3.405e-01 1.058e-01 -3.219 0.00128 **
pet_breed_sizelarge 1.638e+00 4.259e-01 3.846 0.00012 ***
pet_breed_sizemedium 2.166e+00 4.596e-01 4.712 2.45e-06 ***
pet_breed_sizesmall 2.671e+00 4.983e-01 5.360 8.33e-08 ***
pet_breed_sizetoy 2.371e+00 5.267e-01 4.502 6.73e-06 ***
signup_promoDigital Display -2.851e-01 2.768e-01 -1.030 0.30286
signup_promoEvents -8.780e-01 3.666e-01 -2.395 0.01661 *
signup_promoIncompletes -5.250e-01 2.907e-01 -1.806 0.07095 .
signup_promoInserts -8.418e-01 4.099e-01 -2.054 0.04002 *
signup_promoNull & Default 2.658e-02 2.653e-01 0.100 0.92019
signup_promoOther -2.219e-01 5.380e-01 -0.412 0.68004
signup_promoRefer a Friend -7.371e-01 2.953e-01 -2.496 0.01257 *
signup_promoSearch Brand 9.079e-03 3.028e-01 0.030 0.97608
signup_promoSearch Generic -2.259e-01 2.708e-01 -0.834 0.40416
signup_promoShopping Centres -1.014e+01 1.105e+01 -0.918 0.35874
signup_promoSocial Marketing -8.044e-04 5.514e-01 -0.001 0.99884
pet_life_stage_at_ordermature 1.303e+00 1.738e-01 7.499 6.42e-14 ***
pet_life_stage_at_ordersenior 1.333e+00 2.170e-01 6.143 8.10e-10 ***
pet_life_stage_at_orderweaning -9.261e-01 3.584e-01 -2.584 0.00976 **
ate_wet_food_pre_tailsTrue 5.526e-01 1.206e-01 4.581 4.64e-06 ***
wet_food_discount_percent_mean -8.331e+00 4.662e-01 -17.871 < 2e-16 ***
ratio_kcal_mean 8.537e+00 5.939e-01 14.373 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3942.1 on 3409 degrees of freedom
Residual deviance: 2469.8 on 3371 degrees of freedom
AIC: 2547.8
Number of Fisher Scoring iterations: 8
pR2(model3)
fitting null model for pseudo-r2
llh llhNull G2 McFadden r2ML r2CU
-1234.9043012 -1971.0738088 1472.3390153 0.3734865 0.3506419 0.5116808
anova(model3, test= "Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: wetfood2
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 3409 3942.1
pet_order_number_max 1 549.72 3408 3392.4 < 2.2e-16 ***
kibble_kcal_mean 1 7.31 3407 3385.1 0.0068552 **
total_minutes_on_website_since_last_order_mean 1 0.96 3406 3384.2 0.3276353
pets_household_mean 1 0.00 3405 3384.2 0.9869140
has_comm_max 1 2.44 3404 3381.7 0.1182804
days_before_closing_max 1 55.07 3403 3326.6 1.161e-13 ***
premium_treat_packs_sum 1 1.35 3402 3325.3 0.2450491
dental_treat_packs_sum 1 6.33 3401 3319.0 0.0118908 *
pet_has_active_subscription 1 0.80 3400 3318.2 0.3724465
pet_food_tier 2 11.29 3398 3306.9 0.0035377 **
allergen_specified 1 13.41 3397 3293.5 0.0002504 ***
fav_flavour_specified 1 0.07 3396 3293.4 0.7937317
health_issue_specified 1 0.00 3395 3293.4 0.9545521
dry_food_brand_specified 1 2.14 3394 3291.3 0.1432804
neutered 1 3.93 3393 3287.3 0.0475603 *
gender 1 7.71 3392 3279.6 0.0054794 **
pet_breed_size 4 38.64 3388 3241.0 8.253e-08 ***
signup_promo 11 36.14 3377 3204.8 0.0001602 ***
pet_life_stage_at_order 3 60.89 3374 3143.9 3.794e-13 ***
ate_wet_food_pre_tails 1 25.55 3373 3118.4 4.301e-07 ***
wet_food_discount_percent_mean 1 344.42 3372 2774.0 < 2.2e-16 ***
ratio_kcal_mean 1 304.16 3371 2469.8 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#prediction
prob=predict(model3,xtest3,type="response")
prob1=rep(0,853)
prob1[prob>0.5]=1
cmlr = confusionMatrix(as.factor(prob1), xtest3$wetfood2)
cmlr
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 129 36
1 99 589
Accuracy : 0.8417
95% CI : (0.8155, 0.8656)
No Information Rate : 0.7327
P-Value [Acc > NIR] : 2.256e-14
Kappa : 0.5571
Mcnemar's Test P-Value : 9.496e-08
Sensitivity : 0.5658
Specificity : 0.9424
Pos Pred Value : 0.7818
Neg Pred Value : 0.8561
Prevalence : 0.2673
Detection Rate : 0.1512
Detection Prevalence : 0.1934
Balanced Accuracy : 0.7541
'Positive' Class : 0
round(cmlr$byClass["F1"], 4)
F1
0.6565
roc_lr2 = roc(xtest3$wetfood2, prob1, plot=TRUE, print.auc=TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
Target variable: treats (has treats pack purchase)
Target variable: wetfood (has wet food order)
Target variable: wetfood2 (has follow up wet food order)