##Capstone Project on Shipment Failures
##Date : 14-Nov-2017
##Setting the folder path and reading the file
# Point the session at the folder that holds the shipment extract.
# NOTE(review): this absolute path is machine-specific; the script only runs
# unchanged on a machine that has this folder.
setwd("C:/Users/v-vyupad/Desktop/Analytics")
# Read the raw shipment data. stringsAsFactors is set explicitly because its
# default became FALSE in R 4.0, while the structure recorded further down
# lists the text columns as factors.
data <- read.csv("shipmentdata.csv", sep = ",", header = TRUE,
                 stringsAsFactors = TRUE)
# View(data)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
# Number of rows and columns in the raw data.
dim(data)
## [1] 104530 40
# Column names of the raw data, in file order.
names(data)
## [1] "tracking_id" "access_code"
## [3] "da_employee_id" "da_affinity_index"
## [5] "shipment_id" "value_of_goods"
## [7] "warehouse" "item_quantity"
## [9] "ship_method" "city"
## [11] "delivery_note" "geocode_quality"
## [13] "fulfillment_center_id" "pkg_ship_method"
## [15] "pkg_ship_cost" "pkg_width"
## [17] "pkg_length" "pkg_height"
## [19] "pkg_scale_weight" "shipment_contents_type"
## [21] "delivery_date" "pdd"
## [23] "is_promise_met" "star_rating"
## [25] "is_mon_open" "is_tue_open"
## [27] "is_wed_open" "is_thu_open"
## [29] "is_fri_open" "is_sat_open"
## [31] "is_sun_open" "gate_code"
## [33] "res_com_flag" "is_po_box"
## [35] "is_campus" "is_freight_forwarder"
## [37] "is_street" "is_apt"
## [39] "package_class" "Success"
# Storage type and the first few values of every column.
str(data)
## 'data.frame': 104530 obs. of 40 variables:
## $ tracking_id : Factor w/ 104530 levels "TBA424087593000",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ access_code : Factor w/ 5250 levels "\\Gate Code: \\ #3250\\\\\"\\\"\\\"\\\"\\\"\"",..: NA NA NA NA NA NA 2505 NA NA NA ...
## $ da_employee_id : int NA 1706161 2399331 2089381 2723771 1899591 1454641 2539801 1706161 2667961 ...
## $ da_affinity_index : num 0 0.189 0 0 0 ...
## $ shipment_id : num 5.94e+09 5.94e+09 5.95e+09 5.95e+09 5.95e+09 ...
## $ value_of_goods : int 9359 34999 2926 7598 29925 8999 8499 6305 3999 4899 ...
## $ warehouse : Factor w/ 83 levels "ABE2","ABE3",..: 26 26 30 10 66 14 83 66 83 83 ...
## $ item_quantity : int 2 1 1 2 19 1 1 8 1 1 ...
## $ ship_method : Factor w/ 7 levels "AMZL_US_BULK",..: 7 1 5 6 7 7 5 5 1 5 ...
## $ city : Factor w/ 215 levels "aventura","Aventura",..: 3 45 153 3 167 211 190 136 3 126 ...
## $ delivery_note : Factor w/ 289 levels "#001","*1379",..: NA NA NA NA NA NA NA NA NA NA ...
## $ geocode_quality : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fulfillment_center_id : Factor w/ 83 levels "ABE2","ABE3",..: 26 26 30 10 66 14 83 66 83 83 ...
## $ pkg_ship_method : Factor w/ 7 levels "AMZL_US_BULK",..: 7 1 5 6 7 7 5 5 1 5 ...
## $ pkg_ship_cost : num 6.08 2.83 5.76 10.79 5.82 ...
## $ pkg_width : num 16.2 15 8 21 14.5 ...
## $ pkg_length : num 24.2 21 13.5 26 19 ...
## $ pkg_height : num 13 6 4.5 16 12 2 8.5 3 4.25 12 ...
## $ pkg_scale_weight : num 14.33 8.16 1.54 13.03 9.48 ...
## $ shipment_contents_type: int 536871168 536871168 256 512 256 4195328 256 1 256 256 ...
## $ delivery_date : int NA 42983 42981 42981 42982 42981 42980 42982 42980 42980 ...
## $ pdd : int 42982 42980 42983 42981 42987 42985 42980 42983 42980 42980 ...
## $ is_promise_met : Factor w/ 2 levels "N","Y": 1 1 2 2 2 2 2 2 2 2 ...
## $ star_rating : num 5 5 5 5 5 5 5 5 5 5 ...
## $ is_mon_open : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ is_tue_open : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ is_wed_open : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ is_thu_open : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ is_fri_open : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ is_sat_open : Factor w/ 2 levels "N","Y": 2 1 2 1 1 1 1 1 1 1 ...
## $ is_sun_open : Factor w/ 2 levels "N","Y": 2 1 2 1 1 1 1 1 1 1 ...
## $ gate_code : Factor w/ 3789 levels "\t#3250","\t*1010",..: NA NA NA NA 1600 NA NA NA NA NA ...
## $ res_com_flag : Factor w/ 2 levels "C","R": 2 1 2 2 2 2 2 2 2 2 ...
## $ is_po_box : Factor w/ 1 level "Y": NA 1 NA NA NA NA NA NA NA NA ...
## $ is_campus : Factor w/ 1 level "Y": NA 1 NA NA NA NA NA NA NA NA ...
## $ is_freight_forwarder : Factor w/ 1 level "Y": NA NA NA NA NA NA NA NA NA NA ...
## $ is_street : Factor w/ 1 level "Y": NA 1 1 NA NA 1 NA 1 NA 1 ...
## $ is_apt : Factor w/ 2 levels "N","Y": 2 1 1 2 2 1 2 1 2 2 ...
## $ package_class : Factor w/ 3 levels "BPM","PARCEL",..: 3 2 2 2 3 3 2 1 2 2 ...
## $ Success : int 1 1 0 0 0 0 0 0 0 0 ...
# Per-column summary: quantiles for numeric columns, level counts for factors,
# and the number of NAs where there are any.
summary(data)
## tracking_id access_code
## TBA424087593000: 1 Gate Code: 3015 : 366
## TBA424182187000: 1 Gate Code: 122215 - Mail Room: 221
## TBA424455805000: 1 Gate Code: LU2AOK : 171
## TBA424543917000: 1 Gate Code: 109 : 150
## TBA424562474000: 1 Gate Code: 1234 : 128
## TBA424588332000: 1 (Other) :12719
## (Other) :104524 NA's :90775
## da_employee_id da_affinity_index shipment_id value_of_goods
## Min. :1340721 Min. :0.000 Min. :5.943e+09 Min. : 0
## 1st Qu.:1492401 1st Qu.:0.000 1st Qu.:6.488e+09 1st Qu.: 1898
## Median :2081271 Median :0.008 Median :6.863e+09 Median : 3392
## Mean :2074708 Mean :0.140 Mean :6.872e+09 Mean : 5938
## 3rd Qu.:2474271 3rd Qu.:0.284 3rd Qu.:7.264e+09 3rd Qu.: 6446
## Max. :3201121 Max. :0.888 Max. :7.694e+09 Max. :129987
## NA's :6763 NA's :6860
## warehouse item_quantity ship_method
## TPA1 :44822 Min. : 1.000 AMZL_US_BULK : 5191
## JAX2 :13377 1st Qu.: 1.000 AMZL_US_KEY : 16
## CAE1 : 7299 Median : 1.000 AMZL_US_LMA :14597
## SDF8 : 4680 Mean : 2.247 AMZL_US_LMA_AIR : 1765
## BNA3 : 3922 3rd Qu.: 3.000 AMZL_US_PREMIUM :70895
## TPA2 : 2298 Max. :100.000 AMZL_US_PREMIUM_AIR:11277
## (Other):28132 AMZL_US_STD : 789
## city
## FORT LAUDERDALE:12085
## MIAMI BEACH : 9616
## MIAMI : 8470
## HOLLYWOOD : 7714
## PLANTATION : 4663
## PEMBROKE PINES : 4244
## (Other) :57738
## delivery_note geocode_quality
## package left at the front desk : 45 Min. : 1.000
## package left at the front door : 27 1st Qu.: 1.000
## 171 : 20 Median : 1.000
## customer request package be left at the door.: 13 Mean : 1.054
## left under security camera : 13 3rd Qu.: 1.000
## (Other) : 574 Max. :10.000
## NA's :103838
## fulfillment_center_id pkg_ship_method pkg_ship_cost
## TPA1 :44822 AMZL_US_BULK : 5191 Min. : 0.590
## JAX2 :13377 AMZL_US_KEY : 16 1st Qu.: 3.550
## CAE1 : 7299 AMZL_US_LMA :14597 Median : 4.040
## SDF8 : 4680 AMZL_US_LMA_AIR : 1765 Mean : 4.599
## BNA3 : 3922 AMZL_US_PREMIUM :70895 3rd Qu.: 5.440
## TPA2 : 2298 AMZL_US_PREMIUM_AIR:11277 Max. :54.330
## (Other):28132 AMZL_US_STD : 789
## pkg_width pkg_length pkg_height pkg_scale_weight
## Min. : 0.20 Min. : 0.30 Min. : 0.000 Min. : 0.010
## 1st Qu.: 7.00 1st Qu.:10.25 1st Qu.: 3.000 1st Qu.: 0.990
## Median : 9.99 Median :13.50 Median : 5.000 Median : 2.050
## Mean :10.18 Mean :14.10 Mean : 5.029 Mean : 3.992
## 3rd Qu.:12.50 3rd Qu.:16.00 3rd Qu.: 7.000 3rd Qu.: 4.430
## Max. :21.00 Max. :26.02 Max. :16.000 Max. :49.960
##
## shipment_contents_type delivery_date pdd is_promise_met
## Min. :1.000e+00 Min. :42979 Min. :40543 N:12157
## 1st Qu.:2.560e+02 1st Qu.:43012 1st Qu.:43014 Y:92373
## Median :2.560e+02 Median :43032 Median :43035
## Mean :1.375e+07 Mean :43031 Mean :43034
## 3rd Qu.:2.560e+02 3rd Qu.:43051 3rd Qu.:43055
## Max. :1.074e+09 Max. :43071 Max. :43189
## NA's :14916
## star_rating is_mon_open is_tue_open is_wed_open is_thu_open
## Min. :1.900 N: 8 Y:104530 Y:104530 N: 2
## 1st Qu.:5.000 Y:104522 Y:104528
## Median :5.000
## Mean :4.861
## 3rd Qu.:5.000
## Max. :5.000
##
## is_fri_open is_sat_open is_sun_open gate_code
## Y:104530 N:67701 N:68457 3015 : 456
## Y:36829 Y:36073 122215 - Mail Room: 218
## LU2AOK : 217
## 109 : 154
## 664 : 113
## (Other) : 8743
## NA's :94629
## res_com_flag is_po_box is_campus is_freight_forwarder
## C:18739 Y : 133 Y : 789 Y : 1145
## R:85791 NA's:104397 NA's:103741 NA's:103385
##
##
##
##
##
## is_street is_apt package_class Success
## Y :61505 N :49212 BPM : 908 Min. :0.0000
## NA's:43025 Y :47706 PARCEL:76949 1st Qu.:0.0000
## NA's: 7612 STD :26673 Median :0.0000
## Mean :0.2524
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Class balance of the target: row counts for Success == 0 and Success == 1.
table(data$Success)
##
## 0 1
## 78151 26379
######### Removing identifier, location, date and opening-day columns ###########
# Columns are dropped by name rather than by position, so a change in the CSV
# column order cannot silently remove the wrong fields.
drop_cols <- c(
  "tracking_id", "da_employee_id", "shipment_id", "warehouse", "city",
  "delivery_note", "fulfillment_center_id", "pkg_ship_method",
  "delivery_date", "pdd", "is_mon_open", "is_tue_open", "is_wed_open",
  "is_thu_open", "is_fri_open", "is_sat_open", "is_sun_open"
)
Shipment <- data[, !(names(data) %in% drop_cols), drop = FALSE]
############Summary of the Data###############################3
# data <-data.frame(data[1:2],datatrain[5:13],datatrain[16],datatrain[18:31])
######### Treating Nulls in data ##################
# Count the missing values in each remaining column.
vapply(Shipment, function(column) sum(is.na(column)), integer(1))
## access_code da_affinity_index value_of_goods
## 90775 6860 0
## item_quantity ship_method geocode_quality
## 0 0 0
## pkg_ship_cost pkg_width pkg_length
## 0 0 0
## pkg_height pkg_scale_weight shipment_contents_type
## 0 0 0
## is_promise_met star_rating gate_code
## 0 0 94629
## res_com_flag is_po_box is_campus
## 0 104397 103741
## is_freight_forwarder is_street is_apt
## 103385 43025 7612
## package_class Success
## 0 0
# Mean-impute the missing affinity-index values.
# NOTE(review): the mean is taken over the whole data set, before the
# train/test split further down, so held-out rows contribute to the fill value.
affinity_missing <- is.na(Shipment$da_affinity_index)
Shipment$da_affinity_index[affinity_missing] <-
  mean(Shipment$da_affinity_index, na.rm = TRUE)

# access_code and gate_code are free text; only the presence of a code is kept:
# 1 when a value was supplied, 0 when it is missing. Missingness is tested with
# is.na() because comparing a missing entry with the string "NA" yields NA, not
# TRUE.
for (code_col in c("access_code", "gate_code")) {
  Shipment[[code_col]] <- as.factor(as.integer(!is.na(Shipment[[code_col]])))
}

# Address-type flags: 1 when the raw value is "Y", 0 for anything else
# (including missing), stored as a factor.
for (flag_col in c("is_po_box", "is_campus", "is_freight_forwarder",
                   "is_street", "is_apt")) {
  Shipment[[flag_col]] <- as.factor(as.integer(Shipment[[flag_col]] %in% "Y"))
}
#Shipment=data
#install.packages("devtools")
library(devtools)
#install.packages("httr")
# library(httr)
# #install.packages("DBI")
# library(DBI)
# #install.packages("bit")
# library(bit)
# #install.packages("RSQLite")
# library(RSQLite)
# #install.packages("chron")
# library(chron)
#install_github("tomasgreif/riv")
library(woe)
###################################Information Value####################################
#iv.mult(Shipment,"Success",TRUE)
# Compute the information value of each predictor against Success and plot the
# summary. Both functions come from the woe package.
# NOTE(review): the third argument (TRUE) presumably asks iv.mult() for the
# summary table rather than the per-bin detail -- confirm against the package.
iv.plot.summary(iv.mult(Shipment,"Success",TRUE))
## Information Value 0.01
## Information Value 0.58
## Information Value 0
## Information Value 0
## Information Value 0.08
## Information Value 0
## Information Value 0.08
## Information Value 0
## Information Value 0
## Information Value 0
## Information Value 0
## Information Value 0
## Information Value 0.24
## Information Value 0.02
## Information Value 0.06
## Information Value 0.08
## Information Value 0
## Information Value 0
## Information Value 0
## Information Value 0.03
## Information Value 0.01
## Information Value 0

####################Exploratory Data Analysis#####################
# Histogram and boxplot for every continuous predictor in Shipment.
# Names are the columns to plot, values are the plot titles.
eda_titles <- c(
  value_of_goods = "Shipment$value_of_goods",
  pkg_ship_cost = "Package Ship Cost",
  pkg_width = "Package Width",
  pkg_length = "Package Length",
  pkg_height = "Package Height",
  pkg_scale_weight = "Package Weight",
  geocode_quality = "Geocode Quality",
  da_affinity_index = "da_affinity_index"
)
for (eda_col in names(eda_titles)) {
  # xlab is spelled out so the histogram keeps the axis label that a direct
  # hist(Shipment$<column>) call would derive from its argument.
  hist(Shipment[[eda_col]], main = eda_titles[[eda_col]], col = "darkgreen",
       xlab = paste0("Shipment$", eda_col))

  boxplot(Shipment[[eda_col]], main = eda_titles[[eda_col]], col = "pink")
}

################Chi-Square test for all the attributes##########################3
# Independence test of Success against the access-code flag. Both are binary,
# so this is a 2x2 table and chisq.test applies Yates' correction by default.
chisq.test(Shipment$Success,Shipment$access_code)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$access_code
## X-squared = 311.83, df = 1, p-value < 2.2e-16
# da_affinity_index is numeric, so every distinct value becomes its own table
# column; the resulting sparse cells trigger the approximation warning.
# NOTE(review): a test meant for continuous predictors may be more appropriate.
chisq.test(Shipment$Success,Shipment$da_affinity_index)
## Warning in chisq.test(Shipment$Success, Shipment$da_affinity_index): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$da_affinity_index
## X-squared = 23462, df = 128, p-value < 2.2e-16
# item_quantity is a count; each distinct quantity is treated as a category,
# and the rare large quantities give sparse cells (approximation warning).
chisq.test(Shipment$Success,Shipment$item_quantity)
## Warning in chisq.test(Shipment$Success, Shipment$item_quantity): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$item_quantity
## X-squared = 134.04, df = 54, p-value = 9.466e-09
# geocode_quality is an integer code; each distinct code is one category.
chisq.test(Shipment$Success,Shipment$geocode_quality)
## Warning in chisq.test(Shipment$Success, Shipment$geocode_quality): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$geocode_quality
## X-squared = 942.84, df = 4, p-value < 2.2e-16
# pkg_ship_cost is continuous; each distinct cost becomes a category, so most
# cells are sparse (approximation warning).
chisq.test(Shipment$Success,Shipment$pkg_ship_cost)
## Warning in chisq.test(Shipment$Success, Shipment$pkg_ship_cost): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$pkg_ship_cost
## X-squared = 7450.1, df = 1273, p-value < 2.2e-16
# pkg_width is continuous; each distinct width is treated as a category.
chisq.test(Shipment$Success,Shipment$pkg_width)
## Warning in chisq.test(Shipment$Success, Shipment$pkg_width): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$pkg_width
## X-squared = 966.31, df = 765, p-value = 9.188e-07
# pkg_length is continuous; each distinct length is treated as a category.
chisq.test(Shipment$Success,Shipment$pkg_length)
## Warning in chisq.test(Shipment$Success, Shipment$pkg_length): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$pkg_length
## X-squared = 1053.9, df = 815, p-value = 2.691e-08
# pkg_height is continuous; each distinct height is treated as a category.
chisq.test(Shipment$Success,Shipment$pkg_height)
## Warning in chisq.test(Shipment$Success, Shipment$pkg_height): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$pkg_height
## X-squared = 677.95, df = 532, p-value = 1.734e-05
# pkg_scale_weight is continuous; each distinct weight is treated as a
# category, so most cells are sparse (approximation warning).
chisq.test(Shipment$Success,Shipment$pkg_scale_weight)
## Warning in chisq.test(Shipment$Success, Shipment$pkg_scale_weight): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$pkg_scale_weight
## X-squared = 3549.9, df = 3081, p-value = 5.896e-09
# shipment_contents_type is stored as an integer; each distinct value is one
# category here.
# NOTE(review): the values look like codes rather than quantities -- confirm.
chisq.test(Shipment$Success,Shipment$shipment_contents_type)
## Warning in chisq.test(Shipment$Success, Shipment$shipment_contents_type):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$shipment_contents_type
## X-squared = 239.18, df = 169, p-value = 0.0003088
# Success against the two-level is_promise_met factor: 2x2 table, Yates'
# correction applied by default.
chisq.test(Shipment$Success,Shipment$is_promise_met)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_promise_met
## X-squared = 5578.3, df = 1, p-value < 2.2e-16
# star_rating is numeric; each distinct rating is treated as a category.
chisq.test(Shipment$Success,Shipment$star_rating)
## Warning in chisq.test(Shipment$Success, Shipment$star_rating): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$star_rating
## X-squared = 463.46, df = 10, p-value < 2.2e-16
# Success against the gate-code flag: 2x2 table, Yates' correction by default.
chisq.test(Shipment$Success,Shipment$gate_code)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$gate_code
## X-squared = 1303.8, df = 1, p-value < 2.2e-16
# Success against res_com_flag (levels "C" and "R"): 2x2 table.
chisq.test(Shipment$Success,Shipment$res_com_flag)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$res_com_flag
## X-squared = 1816, df = 1, p-value < 2.2e-16
# Success against the PO-box flag: 2x2 table.
chisq.test(Shipment$Success,Shipment$is_po_box)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_po_box
## X-squared = 11.445, df = 1, p-value = 0.0007167
# Success against the campus flag: 2x2 table.
chisq.test(Shipment$Success,Shipment$is_campus)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_campus
## X-squared = 25.509, df = 1, p-value = 4.404e-07
# Success against the freight-forwarder flag: 2x2 table.
chisq.test(Shipment$Success,Shipment$is_freight_forwarder)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_freight_forwarder
## X-squared = 54.136, df = 1, p-value = 1.871e-13
# Success against the street-address flag: 2x2 table.
chisq.test(Shipment$Success,Shipment$is_street)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_street
## X-squared = 530.48, df = 1, p-value < 2.2e-16
# Success against the apartment flag: 2x2 table.
chisq.test(Shipment$Success,Shipment$is_apt)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Shipment$Success and Shipment$is_apt
## X-squared = 136.92, df = 1, p-value < 2.2e-16
# Success against the three-level package_class factor: a 2x3 table, so no
# continuity correction is applied.
chisq.test(Shipment$Success,Shipment$package_class)
##
## Pearson's Chi-squared test
##
## data: Shipment$Success and Shipment$package_class
## X-squared = 31.757, df = 2, p-value = 1.271e-07
####################################Stratified Sampling####################################
library(caTools)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# First rows of the cleaned data, to eyeball the recoded 0/1 flag columns.
head(Shipment)
## access_code da_affinity_index value_of_goods item_quantity
## 1 0 0.0000000 9359 2
## 2 0 0.1893939 34999 1
## 3 0 0.0000000 2926 1
## 4 0 0.0000000 7598 2
## 5 0 0.0000000 29925 19
## 6 0 0.3888889 8999 1
## ship_method geocode_quality pkg_ship_cost pkg_width pkg_length
## 1 AMZL_US_STD 1 6.08 16.25 24.25
## 2 AMZL_US_BULK 1 2.83 15.00 21.00
## 3 AMZL_US_PREMIUM 1 5.76 8.00 13.50
## 4 AMZL_US_PREMIUM_AIR 1 10.79 21.00 26.00
## 5 AMZL_US_STD 1 5.82 14.50 19.00
## 6 AMZL_US_STD 1 4.92 7.00 10.00
## pkg_height pkg_scale_weight shipment_contents_type is_promise_met
## 1 13.0 14.33 536871168 N
## 2 6.0 8.16 536871168 N
## 3 4.5 1.54 256 Y
## 4 16.0 13.03 512 Y
## 5 12.0 9.48 256 Y
## 6 2.0 1.37 4195328 Y
## star_rating gate_code res_com_flag is_po_box is_campus
## 1 5 0 R 0 0
## 2 5 0 C 1 1
## 3 5 0 R 0 0
## 4 5 0 R 0 0
## 5 5 1 R 0 0
## 6 5 0 R 0 0
## is_freight_forwarder is_street is_apt package_class Success
## 1 0 0 1 STD 1
## 2 0 1 0 PARCEL 1
## 3 0 1 0 PARCEL 0
## 4 0 0 1 PARCEL 0
## 5 0 0 1 STD 0
## 6 0 1 0 STD 0
# Fix the RNG seed so the split is reproducible, then draw a logical mask with
# caTools::sample.split(): TRUE rows (70%) go to training, FALSE rows (30%) to
# test.
# NOTE(review): the mask is named `split`, which shadows base::split(); the
# name is kept because later statements subset with it.
set.seed(80)
split <- sample.split(Shipment$Success, SplitRatio = 0.7)
head(split, 20)
## [1] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [12] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
# Reference class counts for the full data set, used to check that the
# train/test subsets created next keep the same 0/1 proportions.
table(Shipment$Success)
##
## 0 1
## 78151 26379
# Training set: the rows whose split mask is TRUE, followed by its class counts.
datatrain <- Shipment[which(split), , drop = FALSE]
table(datatrain$Success)
##
## 0 1
## 54706 18465
# Test set: the rows whose split mask is FALSE, followed by its class counts.
datatest <- Shipment[which(!split), , drop = FALSE]
table(datatest$Success)
##
## 0 1
## 23445 7914
# Class counts of the training set (repeats the tabulation made right after
# datatrain was created).
table(datatrain$Success)
##
## 0 1
## 54706 18465
# Most frequent value of `x`, returned as a character label.
#
# x     : vector to tabulate; table() leaves NA out of the counts by default.
# na.rm : accepted for call compatibility but never read.
# Returns the label of the single most frequent value, or the string
# ">1 mode" when several values tie for the highest count.
Mode <- function (x, na.rm) {
  counts <- table(x)
  is_top <- counts == max(counts)
  top_values <- names(which(is_top))
  if (length(top_values) > 1) {
    return(">1 mode")
  }
  top_values
}
# Cap (winsorise) a numeric vector at two of its own percentiles.
#
# x               : numeric vector; missing values are allowed and stay NA.
# upperPercentile : probability in [0, 1]; values above this quantile of `x`
#                   are replaced by the quantile.
# lowerPercentile : probability in [0, 1]; values below this quantile of `x`
#                   are replaced by the quantile.
# Returns `x` with the out-of-range values replaced; other values are unchanged.
outlier <- function(x, upperPercentile, lowerPercentile) {
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
  bounds <- quantile(x, probs = c(lowerPercentile, upperPercentile),
                     na.rm = TRUE, names = FALSE)
  low <- bounds[1]
  high <- bounds[2]
  # Both masks are computed on the untouched input; the upper check takes
  # precedence over the lower one.
  above <- !is.na(x) & x > high
  below <- !is.na(x) & !above & x < low
  x[above] <- high
  x[below] <- low
  x
}
# Print the mean and then the standard deviation of `x`, ignoring NAs.
# The value of the function is that of the final print() call, i.e. the
# standard deviation, returned invisibly.
MeSt <- function(x) {
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  print(center)
  print(spread)
}
# Cap each continuous training column at its own percentile limits with
# outlier(). Each entry is c(lower percentile, upper percentile).
# NOTE(review): only datatrain is capped in this section; datatest is not
# passed through outlier() here.
cap_limits <- list(
  value_of_goods    = c(.0, .85),
  pkg_ship_cost     = c(.09, .85),
  pkg_width         = c(.0, .99),
  pkg_length        = c(.0, .95),
  pkg_height        = c(.0, .95),
  pkg_scale_weight  = c(.0, .99),
  geocode_quality   = c(.0, .99),
  da_affinity_index = c(.0, .99)
)
for (cap_col in names(cap_limits)) {
  limits <- cap_limits[[cap_col]]
  datatrain[[cap_col]] <- outlier(datatrain[[cap_col]], limits[2], limits[1])
}
data_clean <- datatrain
# x<-quantile(datatrain$value_of_goods,c(0.01,0.80))
# data_clean <- datatrain[datatrain$value_of_goods >=x[1] & datatrain$value_of_goods<=x[2],]
# Confirm that no column of the capped training set contains missing values.
vapply(datatrain, function(column) sum(is.na(column)), integer(1))
## access_code da_affinity_index value_of_goods
## 0 0 0
## item_quantity ship_method geocode_quality
## 0 0 0
## pkg_ship_cost pkg_width pkg_length
## 0 0 0
## pkg_height pkg_scale_weight shipment_contents_type
## 0 0 0
## is_promise_met star_rating gate_code
## 0 0 0
## res_com_flag is_po_box is_campus
## 0 0 0
## is_freight_forwarder is_street is_apt
## 0 0 0
## package_class Success
## 0 0
#
# x<-quantile(data_clean$pkg_ship_cost,c(0.20,0.80))
# data_clean <- data_clean[data_clean$pkg_ship_cost >=x[1] & data_clean$pkg_ship_cost<=x[2],]
#
# x<-quantile(data_clean$pkg_width,c(0.02,0.98))
# data_clean <- data_clean[data_clean$pkg_width >=x[1] & data_clean$pkg_width<=x[2],]
#
# x<-quantile(data_clean$pkg_length,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_length >=x[1] & data_clean$pkg_length<=x[2],]
#
# x<-quantile(data_clean$pkg_height,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_height >=x[1] & data_clean$pkg_height<=x[2],]
#
# x<-quantile(data_clean$pkg_scale_weight,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_scale_weight >=x[1] & data_clean$pkg_scale_weight<=x[2],]
#
# x<-quantile(data_clean$geocode_quality,c(0.02,0.95))
# data_clean <- data_clean[data_clean$geocode_quality >=x[1] & data_clean$geocode_quality<=x[2],]
#
# x<-quantile(data_clean$da_affinity_index,c(0.02,0.95))
# data_clean <- data_clean[data_clean$da_affinity_index >=x[1] & data_clean$da_affinity_index<=x[2],]
# Histogram and boxplot of every capped continuous column in data_clean.
# Names are the columns to plot, values are the plot titles.
clean_titles <- c(
  value_of_goods = "data_clean$value_of_goods",
  pkg_ship_cost = "Package Ship Cost",
  pkg_width = "Package Width",
  pkg_length = "Package Length",
  pkg_height = "Package Height",
  pkg_scale_weight = "Package Weight",
  geocode_quality = "Geocode Quality",
  da_affinity_index = "da_affinity_index"
)
for (clean_col in names(clean_titles)) {
  # xlab is spelled out so the histogram keeps the axis label that a direct
  # hist(data_clean$<column>) call would derive from its argument.
  hist(data_clean[[clean_col]], main = clean_titles[[clean_col]],
       col = "darkgreen", xlab = paste0("data_clean$", clean_col))

  boxplot(data_clean[[clean_col]], main = clean_titles[[clean_col]],
          col = "pink")
}

# Class counts after capping.
table(data_clean$Success)
##
## 0 1
## 54706 18465
##############Calculating the IV######################################3
# #data<-Shipment
# ######### Treating NA in data ##################
# apply(data, 2, function(x) sum(is.na(x)))
#
# data$da_affinity_index <- ifelse(is.na(data$da_affinity_index), mean(data$da_affinity_index, na.rm=TRUE), data$da_affinity_index)
# #data$da_affinity_index=round(mean(data$da_affinity_index, na.rm = TRUE))
datatrain <- data_clean
########################Scaling Train and Test Data#############################
# Standardise the continuous predictors. The centre and spread are estimated on
# the training rows only and then applied to both sets, so the test rows are on
# the scale the model coefficients were fitted on. Scaling the test set with
# its own mean/sd would shift and stretch its features relative to the model.
# Results are stored as plain numeric vectors rather than the one-column
# matrices that scale() returns.
# NOTE(review): datatest is scaled here without the percentile capping that
# was applied to datatrain.
scale_cols <- c(
  "value_of_goods", "pkg_ship_cost", "pkg_width", "pkg_length",
  "pkg_height", "pkg_scale_weight", "geocode_quality", "da_affinity_index"
)
for (scale_col in scale_cols) {
  col_center <- mean(datatrain[[scale_col]], na.rm = TRUE)
  col_spread <- sd(datatrain[[scale_col]], na.rm = TRUE)
  datatrain[[scale_col]] <- (datatrain[[scale_col]] - col_center) / col_spread
  datatest[[scale_col]] <- (datatest[[scale_col]] - col_center) / col_spread
}
data_clean <- datatrain
######### Removing Id columns from the dataset ###########
# data <-data.frame(data[1:2],datatrain[5:13],datatrain[16],datatrain[18:31])
#dataR <-data[,-c(1,3,5,7,10,11,13,14,21,22,26,27,29)]
# str(data_clean)
# names(data_clean)
#summary
# Class counts of the scaled training data.
table(data_clean$Success)
##
## 0 1
## 54706 18465
# Confirm that scaling introduced no missing values in any column.
vapply(data_clean, function(column) sum(is.na(column)), integer(1))
## access_code da_affinity_index value_of_goods
## 0 0 0
## item_quantity ship_method geocode_quality
## 0 0 0
## pkg_ship_cost pkg_width pkg_length
## 0 0 0
## pkg_height pkg_scale_weight shipment_contents_type
## 0 0 0
## is_promise_met star_rating gate_code
## 0 0 0
## res_com_flag is_po_box is_campus
## 0 0 0
## is_freight_forwarder is_street is_apt
## 0 0 0
## package_class Success
## 0 0
# Working copy of the scaled training data, plus its per-column summary.
dataR <- data_clean
summary(dataR)
## access_code da_affinity_index.V1 value_of_goods.V1 item_quantity
## 0:63550 Min. :-0.739981 Min. :-1.4532204 Min. : 1.000
## 1: 9621 1st Qu.:-0.739981 1st Qu.:-0.8181835 1st Qu.: 1.000
## Median :-0.638170 Median :-0.3182466 Median : 1.000
## Mean : 0.000000 Mean : 0.0000000 Mean : 2.243
## 3rd Qu.: 0.763013 3rd Qu.: 0.7117237 3rd Qu.: 3.000
## Max. : 3.429885 Max. : 1.7627616 Max. :100.000
##
## ship_method geocode_quality.V1 pkg_ship_cost.V1
## AMZL_US_BULK : 3635 Min. :-0.164574 Min. :-1.0829243
## AMZL_US_KEY : 11 1st Qu.:-0.164574 1st Qu.:-0.8416553
## AMZL_US_LMA :10285 Median :-0.164574 Median :-0.3869559
## AMZL_US_LMA_AIR : 1248 Mean : 0.000000 Mean : 0.0000000
## AMZL_US_PREMIUM :49519 3rd Qu.:-0.164574 3rd Qu.: 0.9121851
## AMZL_US_PREMIUM_AIR: 7909 Max. : 6.117155 Max. : 1.6731105
## AMZL_US_STD : 564
## pkg_width.V1 pkg_length.V1 pkg_height.V1
## Min. :-3.228429 Min. :-3.817250 Min. :-1.6539987
## 1st Qu.:-1.025677 1st Qu.:-1.031912 1st Qu.:-0.6592924
## Median :-0.057114 Median :-0.140604 Median : 0.0038452
## Mean : 0.000000 Mean : 0.000000 Mean : 0.0000000
## 3rd Qu.: 0.755960 3rd Qu.: 0.555730 3rd Qu.: 0.6669827
## Max. : 2.548287 Max. : 2.018032 Max. : 2.3248266
##
## pkg_scale_weight.V1 shipment_contents_type is_promise_met star_rating
## Min. :-0.734786 Min. :1.000e+00 N: 8494 Min. :1.900
## 1st Qu.:-0.551183 1st Qu.:2.560e+02 Y:64677 1st Qu.:5.000
## Median :-0.348846 Median :2.560e+02 Median :5.000
## Mean : 0.000000 Mean :1.369e+07 Mean :4.862
## 3rd Qu.: 0.095173 3rd Qu.:2.560e+02 3rd Qu.:5.000
## Max. : 5.061809 Max. :1.074e+09 Max. :5.000
##
## gate_code res_com_flag is_po_box is_campus is_freight_forwarder is_street
## 0:66183 C:13208 0:73078 0:72609 0:72362 0:30094
## 1: 6988 R:59963 1: 93 1: 562 1: 809 1:43077
##
##
##
##
##
## is_apt package_class Success
## 0:39916 BPM : 650 Min. :0.0000
## 1:33255 PARCEL:53994 1st Qu.:0.0000
## STD :18527 Median :0.0000
## Mean :0.2524
## 3rd Qu.:1.0000
## Max. :1.0000
##
# names(dataR)
#Correlation Plot
library(corrplot)
## corrplot 0.84 loaded
# Upper-triangle correlation plot on the unscaled Shipment data. The negative
# index removes columns by position; going by the head(Shipment) listing, that
# leaves da_affinity_index, value_of_goods, item_quantity, pkg_ship_cost, the
# four package measurements, star_rating and Success.
# NOTE(review): geocode_quality and shipment_contents_type are numeric but are
# excluded as well -- confirm that is intended.
corr_plot <- corrplot(cor(Shipment[,-c(1,5,6,12,13,15:22)]), method = "circle",type = "upper")

# str(dataR)
# Distribution of geocode_quality in the scaled training data.
summary(dataR$geocode_quality)
## V1
## Min. :-0.1646
## 1st Qu.:-0.1646
## Median :-0.1646
## Mean : 0.0000
## 3rd Qu.:-0.1646
## Max. : 6.1172
# The same column in the raw data, for comparison with the scaled version.
summary(data$geocode_quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.054 1.000 10.000
#dataR=data.frame(dataR[,-c(6)])
# Model 1: logistic regression of Success on every remaining predictor,
# followed by the coefficient table.
datatrain <- dataR
ft <- glm(Success ~ ., family = "binomial", data = datatrain)
summary(ft)
##
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4751 -0.7326 -0.5945 0.6550 2.3561
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.688e+00 1.536e-01 10.993 < 2e-16 ***
## access_code1 -2.117e-01 3.881e-02 -5.453 4.96e-08 ***
## da_affinity_index -1.132e-03 9.065e-03 -0.125 0.90064
## value_of_goods 7.783e-03 1.144e-02 0.680 0.49621
## item_quantity 1.202e-02 4.184e-03 2.872 0.00407 **
## ship_methodAMZL_US_KEY 2.753e-01 6.775e-01 0.406 0.68445
## ship_methodAMZL_US_LMA -1.364e+00 5.715e-02 -23.862 < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR -1.707e+00 8.549e-02 -19.968 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM -5.876e-01 4.508e-02 -13.035 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.889e-01 5.951e-02 -16.616 < 2e-16 ***
## ship_methodAMZL_US_STD -6.748e-01 1.129e-01 -5.978 2.26e-09 ***
## geocode_quality 1.495e-01 7.965e-03 18.773 < 2e-16 ***
## pkg_ship_cost 3.460e-01 1.242e-02 27.853 < 2e-16 ***
## pkg_width -3.059e-02 1.858e-02 -1.647 0.09959 .
## pkg_length -5.676e-03 1.977e-02 -0.287 0.77403
## pkg_height -1.174e-02 1.410e-02 -0.832 0.40519
## pkg_scale_weight -4.126e-02 1.339e-02 -3.081 0.00206 **
## shipment_contents_type -5.394e-10 1.136e-10 -4.747 2.07e-06 ***
## is_promise_metY -1.275e+00 2.464e-02 -51.756 < 2e-16 ***
## star_rating -6.260e-02 2.036e-02 -3.075 0.00210 **
## gate_code1 8.087e-01 4.182e-02 19.340 < 2e-16 ***
## res_com_flagR -1.286e+00 5.670e-02 -22.687 < 2e-16 ***
## is_po_box1 -1.357e-01 2.367e-01 -0.573 0.56657
## is_campus1 -3.599e-02 9.828e-02 -0.366 0.71419
## is_freight_forwarder1 9.584e-02 8.074e-02 1.187 0.23520
## is_street1 -3.314e-01 2.406e-02 -13.772 < 2e-16 ***
## is_apt1 3.770e-03 2.396e-02 0.157 0.87497
## package_classPARCEL 4.575e-01 1.102e-01 4.153 3.28e-05 ***
## package_classSTD 5.153e-01 1.098e-01 4.693 2.70e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82670 on 73170 degrees of freedom
## Residual deviance: 75679 on 73142 degrees of freedom
## AIC: 75737
##
## Number of Fisher Scoring iterations: 4
# Sequential analysis of deviance for the full model, with a chi-squared
# p-value for each added term. The test has to be requested through the named
# `test` argument: an unnamed string is absorbed by `...` and ignored, which
# leaves the table without a p-value column.
anova(ft, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Success
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 73170 82670
## access_code 1 212.11 73169 82457
## da_affinity_index 1 0.04 73168 82457
## value_of_goods 1 51.47 73167 82406
## item_quantity 1 24.96 73166 82381
## ship_method 6 1188.15 73160 81193
## geocode_quality 1 447.61 73159 80745
## pkg_ship_cost 1 746.49 73158 79999
## pkg_width 1 72.09 73157 79927
## pkg_length 1 3.84 73156 79923
## pkg_height 1 12.53 73155 79910
## pkg_scale_weight 1 13.02 73154 79897
## shipment_contents_type 1 17.27 73153 79880
## is_promise_met 1 2943.06 73152 76937
## star_rating 1 39.72 73151 76897
## gate_code 1 484.18 73150 76413
## res_com_flag 1 401.03 73149 76012
## is_po_box 1 0.53 73148 76011
## is_campus 1 0.12 73147 76011
## is_freight_forwarder 1 0.44 73146 76011
## is_street 1 305.16 73145 75706
## is_apt 1 0.01 73144 75706
## package_class 2 26.79 73142 75679
# Print the fitted model: call, coefficients, deviances and AIC.
ft
##
## Call: glm(formula = Success ~ ., family = "binomial", data = datatrain)
##
## Coefficients:
## (Intercept) access_code1
## 1.688e+00 -2.117e-01
## da_affinity_index value_of_goods
## -1.132e-03 7.783e-03
## item_quantity ship_methodAMZL_US_KEY
## 1.202e-02 2.753e-01
## ship_methodAMZL_US_LMA ship_methodAMZL_US_LMA_AIR
## -1.364e+00 -1.707e+00
## ship_methodAMZL_US_PREMIUM ship_methodAMZL_US_PREMIUM_AIR
## -5.876e-01 -9.889e-01
## ship_methodAMZL_US_STD geocode_quality
## -6.748e-01 1.495e-01
## pkg_ship_cost pkg_width
## 3.460e-01 -3.059e-02
## pkg_length pkg_height
## -5.676e-03 -1.174e-02
## pkg_scale_weight shipment_contents_type
## -4.126e-02 -5.394e-10
## is_promise_metY star_rating
## -1.275e+00 -6.260e-02
## gate_code1 res_com_flagR
## 8.087e-01 -1.286e+00
## is_po_box1 is_campus1
## -1.357e-01 -3.599e-02
## is_freight_forwarder1 is_street1
## 9.584e-02 -3.314e-01
## is_apt1 package_classPARCEL
## 3.770e-03 4.575e-01
## package_classSTD
## 5.153e-01
##
## Degrees of Freedom: 73170 Total (i.e. Null); 73142 Residual
## Null Deviance: 82670
## Residual Deviance: 75680 AIC: 75740
# datatrain$pkg_scaler_weight=datatrain$pkg_outlier_weight
# datatest$pkg_scaler_weight=datatest$pkg_outlier_weight
# Score the held-out rows with model 1; type = "response" gives predicted
# probabilities instead of values on the link scale.
p <- predict(ft, datatest, type = "response")
library(ROCR)
# ROC curve (true-positive rate against false-positive rate), with the cutoff
# value printed along the curve every 0.1.
pred <- prediction(p, datatest$Success)
pref <- ROCR::performance(pred, "tpr", "fpr")
plot(pref, print.cutoffs.at = seq(0, 1, .1))

# Accuracy as a function of the probability cutoff.
pref1 <- performance(pred, measure = "acc")
plot(pref1)

# Lift as a function of the probability cutoff.
pref2 <- performance(pred, measure = "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
ifelse(p>=.3,1,0)->cq
#table(datatest$Success,cq)
a=table(datatest$Success,cq)
a
## cq
## 0 1
## 0 19438 4007
## 1 4598 3316
# Overall accuracy in percent: diagonal of `a` over all four cells.
Accuracy <- (a[1, 1] + a[2, 2]) /
  (a[1, 1] + a[1, 2] + a[2, 1] + a[2, 2]) * 100
Accuracy
## [1] 72.55971
# NOTE(review): `a` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# a[1,1]/sum(a[,1]), a[2,2]/sum(a[,2]) and a[1,1]/sum(a[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(a)
## [1] 0.8087036
caret::specificity(a)
## [1] 0.4528199
caret::precision(a)
## [1] 0.8290894
######################################## Model 2 ####################################
# Model 2: logistic regression refit after removing five columns of datatrain
# by position. Relative to the full model's coefficient listing, the terms no
# longer present are da_affinity_index, is_po_box, is_campus,
# is_freight_forwarder and is_apt.
ft1 <- glm(
  formula = Success ~ .,
  family = "binomial",
  data = datatrain[, -c(2, 17, 18, 19, 21)]
)
# Coefficient table with Wald z-tests, deviances and AIC.
summary(ft1)
##
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain[,
## -c(2, 17, 18, 19, 21)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4753 -0.7326 -0.5946 0.6586 2.3559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.686e+00 1.535e-01 10.982 < 2e-16 ***
## access_code1 -2.115e-01 3.879e-02 -5.453 4.95e-08 ***
## value_of_goods 7.865e-03 1.143e-02 0.688 0.49147
## item_quantity 1.197e-02 4.184e-03 2.860 0.00424 **
## ship_methodAMZL_US_KEY 2.737e-01 6.774e-01 0.404 0.68619
## ship_methodAMZL_US_LMA -1.360e+00 5.673e-02 -23.977 < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR -1.704e+00 8.531e-02 -19.968 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM -5.883e-01 4.497e-02 -13.083 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.899e-01 5.943e-02 -16.658 < 2e-16 ***
## ship_methodAMZL_US_STD -6.753e-01 1.128e-01 -5.986 2.15e-09 ***
## geocode_quality 1.492e-01 7.958e-03 18.750 < 2e-16 ***
## pkg_ship_cost 3.461e-01 1.242e-02 27.865 < 2e-16 ***
## pkg_width -3.077e-02 1.858e-02 -1.656 0.09768 .
## pkg_length -5.559e-03 1.977e-02 -0.281 0.77856
## pkg_height -1.169e-02 1.410e-02 -0.829 0.40729
## pkg_scale_weight -4.143e-02 1.339e-02 -3.095 0.00197 **
## shipment_contents_type -5.405e-10 1.136e-10 -4.756 1.98e-06 ***
## is_promise_metY -1.276e+00 2.463e-02 -51.784 < 2e-16 ***
## star_rating -6.164e-02 1.985e-02 -3.105 0.00190 **
## gate_code1 8.087e-01 4.178e-02 19.359 < 2e-16 ***
## res_com_flagR -1.286e+00 5.649e-02 -22.760 < 2e-16 ***
## is_street1 -3.330e-01 1.905e-02 -17.480 < 2e-16 ***
## package_classPARCEL 4.587e-01 1.102e-01 4.163 3.14e-05 ***
## package_classSTD 5.165e-01 1.098e-01 4.703 2.56e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82670 on 73170 degrees of freedom
## Residual deviance: 75681 on 73147 degrees of freedom
## AIC: 75729
##
## Number of Fisher Scoring iterations: 4
# Sequential analysis of deviance for Model 2 with chi-square tests.
# `test` has to be passed by name; the former positional "Chisqr" string was
# never matched to it, which is why the table recorded below (produced before
# this fix) carries no p-value column. "Chisq" is the accepted spelling.
anova(ft1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Success
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 73170 82670
## access_code 1 212.11 73169 82457
## value_of_goods 1 51.49 73168 82406
## item_quantity 1 24.96 73167 82381
## ship_method 6 1186.85 73161 81194
## geocode_quality 1 448.01 73160 80746
## pkg_ship_cost 1 746.66 73159 79999
## pkg_width 1 72.10 73158 79927
## pkg_length 1 3.83 73157 79924
## pkg_height 1 12.52 73156 79911
## pkg_scale_weight 1 13.04 73155 79898
## shipment_contents_type 1 17.24 73154 79881
## is_promise_met 1 2943.84 73153 76937
## star_rating 1 39.71 73152 76897
## gate_code 1 484.19 73151 76413
## res_com_flag 1 400.78 73150 76012
## is_street 1 304.37 73149 75708
## package_class 2 26.91 73147 75681
#datatest$pkg_outlier_weight=datatest$pkg_scale_weight
# Predicted probabilities of Success on the held-out set, Model 2 (ft1).
p <- predict(ft1, newdata = datatest, type = "response")

# Wrap scores and observed labels for ROCR's performance measures.
library(ROCR)
pred <- ROCR::prediction(p, datatest$Success)

# ROC curve (TPR against FPR) with cutoffs printed at 0, 0.1, ..., 1.
pref <- ROCR::performance(pred, measure = "tpr", x.measure = "fpr")
plot(pref, print.cutoffs.at = seq(0, 1, by = 0.1))

# Accuracy as a function of the cutoff.
pref1 <- ROCR::performance(pred, measure = "acc")
plot(pref1)

# Lift as a function of the cutoff.
pref2 <- ROCR::performance(pred, measure = "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
# Predict class 1 once the probability reaches the 0.3 cutoff.
cq <- ifelse(p >= 0.3, 1, 0)
#table(datatest$Success,cq)
# Confusion table: rows = observed Success, columns = predicted class (cq).
a <- table(datatest$Success, cq)
a
## cq
## 0 1
## 0 19455 3990
## 1 4608 3306
# Overall accuracy in percent: diagonal of `a` over all four cells.
Accuracy <- (a[1, 1] + a[2, 2]) /
  (a[1, 1] + a[1, 2] + a[2, 1] + a[2, 2]) * 100
Accuracy
## [1] 72.58203
# NOTE(review): `a` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# a[1,1]/sum(a[,1]), a[2,2]/sum(a[,2]) and a[1,1]/sum(a[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(a)
## [1] 0.8085027
caret::specificity(a)
## [1] 0.453125
caret::precision(a)
## [1] 0.8298145
####################################Model 3 ###################################
# Model 3: logistic regression refit after removing eight columns of datatrain
# by position. Relative to the full model's coefficient listing, the terms no
# longer present are da_affinity_index, pkg_length, pkg_height,
# pkg_scale_weight, is_po_box, is_campus, is_freight_forwarder and is_apt.
ft2 <- glm(
  formula = Success ~ .,
  family = "binomial",
  data = datatrain[, -c(2, 9, 10, 11, 17, 18, 19, 21)]
)
# Coefficient table with Wald z-tests, deviances and AIC.
summary(ft2)
##
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain[,
## -c(2, 9, 10, 11, 17, 18, 19, 21)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4588 -0.7328 -0.5958 0.6564 2.3504
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.702e+00 1.529e-01 11.128 < 2e-16 ***
## access_code1 -2.098e-01 3.879e-02 -5.410 6.32e-08 ***
## value_of_goods 1.875e-04 1.112e-02 0.017 0.98654
## item_quantity 9.693e-03 4.142e-03 2.340 0.01927 *
## ship_methodAMZL_US_KEY 2.850e-01 6.773e-01 0.421 0.67395
## ship_methodAMZL_US_LMA -1.359e+00 5.673e-02 -23.953 < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR -1.674e+00 8.499e-02 -19.691 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM -5.816e-01 4.493e-02 -12.944 < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.539e-01 5.876e-02 -16.235 < 2e-16 ***
## ship_methodAMZL_US_STD -6.887e-01 1.126e-01 -6.116 9.57e-10 ***
## geocode_quality 1.494e-01 7.957e-03 18.775 < 2e-16 ***
## pkg_ship_cost 3.331e-01 1.200e-02 27.765 < 2e-16 ***
## pkg_width -5.494e-02 1.038e-02 -5.291 1.21e-07 ***
## shipment_contents_type -6.154e-10 1.121e-10 -5.490 4.02e-08 ***
## is_promise_metY -1.276e+00 2.463e-02 -51.794 < 2e-16 ***
## star_rating -6.326e-02 1.984e-02 -3.189 0.00143 **
## gate_code1 8.074e-01 4.177e-02 19.328 < 2e-16 ***
## res_com_flagR -1.288e+00 5.648e-02 -22.810 < 2e-16 ***
## is_street1 -3.322e-01 1.905e-02 -17.441 < 2e-16 ***
## package_classPARCEL 4.451e-01 1.093e-01 4.074 4.63e-05 ***
## package_classSTD 5.211e-01 1.096e-01 4.754 2.00e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82670 on 73170 degrees of freedom
## Residual deviance: 75697 on 73150 degrees of freedom
## AIC: 75739
##
## Number of Fisher Scoring iterations: 4
# Sequential analysis of deviance for Model 3 with chi-square tests.
# `test` has to be passed by name; the former positional "Chisqr" string was
# never matched to it, which is why the table recorded below (produced before
# this fix) carries no p-value column. "Chisq" is the accepted spelling.
anova(ft2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Success
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 73170 82670
## access_code 1 212.11 73169 82457
## value_of_goods 1 51.49 73168 82406
## item_quantity 1 24.96 73167 82381
## ship_method 6 1186.85 73161 81194
## geocode_quality 1 448.01 73160 80746
## pkg_ship_cost 1 746.66 73159 79999
## pkg_width 1 72.10 73158 79927
## shipment_contents_type 1 24.93 73157 79902
## is_promise_met 1 2944.02 73156 76958
## star_rating 1 40.95 73155 76917
## gate_code 1 482.69 73154 76435
## res_com_flag 1 403.50 73153 76031
## is_street 1 303.21 73152 75728
## package_class 2 30.82 73150 75697
#datatest$pkg_outlier_weight=datatest$pkg_scale_weight
# Predicted probabilities of Success on the held-out set, Model 3 (ft2).
p <- predict(ft2, newdata = datatest, type = "response")

# Wrap scores and observed labels for ROCR's performance measures.
library(ROCR)
pred <- ROCR::prediction(p, datatest$Success)

# ROC curve (TPR against FPR) with cutoffs printed at 0, 0.1, ..., 1.
pref <- ROCR::performance(pred, measure = "tpr", x.measure = "fpr")
plot(pref, print.cutoffs.at = seq(0, 1, by = 0.1))

# Accuracy as a function of the cutoff.
pref1 <- ROCR::performance(pred, measure = "acc")
plot(pref1)

# Lift as a function of the cutoff.
pref2 <- ROCR::performance(pred, measure = "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
# Predict class 1 once the probability reaches the 0.3 cutoff.
cq <- ifelse(p >= 0.3, 1, 0)
#table(datatest$Success,cq)
# Confusion table: rows = observed Success, columns = predicted class (cq).
a <- table(datatest$Success, cq)
a
## cq
## 0 1
## 0 19452 3993
## 1 4605 3309
# Overall accuracy in percent: diagonal of `a` over all four cells.
Accuracy <- (a[1, 1] + a[2, 2]) /
  (a[1, 1] + a[1, 2] + a[2, 1] + a[2, 2]) * 100
Accuracy
## [1] 72.58203
# NOTE(review): `a` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# a[1,1]/sum(a[,1]), a[2,2]/sum(a[,2]) and a[1,1]/sum(a[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(a)
## [1] 0.8085796
caret::specificity(a)
## [1] 0.4531635
caret::precision(a)
## [1] 0.8296865
#####################################################################################################
###################### LASSO Regularization ####################################
#####################################################################################################
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
# Design matrix for glmnet built from every predictor in datatrain.
# model.matrix() also contributes an "(Intercept)" column, which is why the
# coefficient listing further down shows two intercept rows.
x <- model.matrix(Success ~ ., data = datatrain)
# Response vector, handed to glmnet exactly as stored (no conversion here).
y <- datatrain$Success
# Cross-validated fit over glmnet's lambda path.
# family = "binomial" => logistic regression; alpha = 1 => lasso penalty.
# type.measure = "mse" is the CV criterion; see ?cv.glmnet for the others.
cv.out <- cv.glmnet(
  x, y,
  family = "binomial",
  alpha = 1,
  type.measure = "mse"
)
# Plot the cross-validation result.
plot(cv.out)

# List the components stored in the cv.glmnet object.
summary(cv.out)
## Length Class Mode
## lambda 67 -none- numeric
## cvm 67 -none- numeric
## cvsd 67 -none- numeric
## cvup 67 -none- numeric
## cvlo 67 -none- numeric
## nzero 67 -none- numeric
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
# lambda.min component: lambda with the lowest cross-validated error.
lambda_min <- cv.out[["lambda.min"]]
# lambda.1se component: largest lambda whose CV error stays within one
# standard error of that minimum; this is the value used for the final model.
lambda_1se <- cv.out[["lambda.1se"]]
# Coefficients at lambda_1se ("." in the listing denotes an exact zero).
coef(cv.out, s = lambda_1se)
## 30 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.266378e+00
## (Intercept) .
## access_code1 -4.679346e-03
## da_affinity_index .
## value_of_goods .
## item_quantity 2.972170e-03
## ship_methodAMZL_US_KEY .
## ship_methodAMZL_US_LMA -6.129354e-01
## ship_methodAMZL_US_LMA_AIR -6.988854e-01
## ship_methodAMZL_US_PREMIUM -1.405857e-01
## ship_methodAMZL_US_PREMIUM_AIR -2.871593e-01
## ship_methodAMZL_US_STD .
## geocode_quality 1.331367e-01
## pkg_ship_cost 2.242150e-01
## pkg_width -2.133735e-02
## pkg_length .
## pkg_height .
## pkg_scale_weight -7.978273e-03
## shipment_contents_type -2.956555e-10
## is_promise_metY -1.224937e+00
## star_rating -3.312618e-02
## gate_code1 6.309146e-01
## res_com_flagR -1.046855e+00
## is_po_box1 .
## is_campus1 .
## is_freight_forwarder1 .
## is_street1 -3.001632e-01
## is_apt1 .
## package_classPARCEL .
## package_classSTD 3.654807e-02
# Test-set design matrix, built with the same formula as the training one.
datatest1 <- model.matrix(Success ~ ., data = datatest)
# Predicted probabilities (type = "response", not class labels) at lambda_1se.
lasso_prob <- predict(
  cv.out,
  newx = datatest1,
  s = lambda_1se,
  type = "response"
)
# Start from all-zero labels and flag as 1 every case whose probability
# exceeds 0.8.
# NOTE(review): the logistic models earlier in this script are cut at 0.3;
# at 0.8 only 47 test cases are predicted positive. Confirm the cutoff.
lasso_predict <- numeric(nrow(datatest))
lasso_predict[lasso_prob > 0.8] <- 1
# Confusion table: rows = predicted class, columns = observed Success.
table(pred = lasso_predict, true = datatest$Success)
## true
## pred 0 1
## 0 23434 7878
## 1 11 36
# Share of test cases whose thresholded LASSO prediction equals Success.
# (The former `lasso_predict[lasso_prob]` statement was dropped: subscripting
# with probabilities below 1 selects nothing, hence its `numeric(0)` result.)
mean(lasso_predict == datatest$Success)
## [1] 0.7484295
# Confusion table: rows = observed Success, columns = predicted class.
table(datatest$Success, lasso_predict)
## lasso_predict
## 0 1
## 0 23434 11
## 1 7878 36
# ROC curve from the predicted probabilities. The thresholded 0/1 labels used
# previously offer ROCR only two distinct scores, which collapses the curve to
# a single interior point; the continuous scores trace the full curve.
# as.vector() drops the one-column matrix shape returned by predict().
pred1 <- ROCR::prediction(as.vector(lasso_prob), datatest$Success)
pref3 <- ROCR::performance(pred1, measure = "tpr", x.measure = "fpr")
plot(pref3, print.cutoffs.at = seq(0, 1, by = 0.1))

############# Removing Columns Whose coefficient were 0 in LASSO Regularization #############
#####################################################################################################
###################### CART ####################################
#####################################################################################################
library(rpart)
library(rpart.plot)
library(ROCR)
# Classification tree on every predictor in datatrain, with rpart's default
# control settings.
cart <- rpart(
  formula = Success ~ .,
  data = datatrain,
  method = "class"
)
# CP table, variable importance and per-node split details.
summary(cart)
## Call:
## rpart(formula = Success ~ ., data = datatrain, method = "class")
## n= 73171
##
## CP nsplit rel error xerror xstd
## 1 0.08932214 0 1.0000000 1.0000000 0.006363167
## 2 0.04375846 3 0.7320336 0.7320336 0.005685140
## 3 0.02312483 4 0.6882751 0.6882751 0.005549810
## 4 0.01000000 5 0.6651503 0.6651503 0.005475013
##
## Variable importance
## da_affinity_index is_promise_met res_com_flag ship_method
## 68 17 5 4
## is_street is_apt
## 3 2
##
## Node number 1: 73171 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.2523541 P(node) =1
## class counts: 54706 18465
## probabilities: 0.748 0.252
## left son=2 (64677 obs) right son=3 (8494 obs)
## Primary splits:
## is_promise_met splits as RL, improve=1455.4990, (0 missing)
## da_affinity_index < -0.03456654 to the left, improve= 648.6074, (0 missing)
## res_com_flag splits as RL, improve= 495.7095, (0 missing)
## ship_method splits as RRRRLLL, improve= 400.1945, (0 missing)
## pkg_ship_cost < 1.009621 to the left, improve= 356.3170, (0 missing)
##
## Node number 2: 64677 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.2162129 P(node) =0.8839158
## class counts: 50693 13984
## probabilities: 0.784 0.216
## left son=4 (37785 obs) right son=5 (26892 obs)
## Primary splits:
## da_affinity_index < -0.03456654 to the left, improve=728.7503, (0 missing)
## pkg_ship_cost < 1.009621 to the left, improve=428.5726, (0 missing)
## ship_method splits as RRRRLRR, improve=207.1263, (0 missing)
## gate_code splits as LR, improve=186.1909, (0 missing)
## res_com_flag splits as RL, improve=150.3159, (0 missing)
## Surrogate splits:
## ship_method splits as LRLLLLR, agree=0.585, adj=0.002, (0 split)
## pkg_width < -2.936888 to the right, agree=0.584, adj=0.000, (0 split)
## item_quantity < 36.5 to the left, agree=0.584, adj=0.000, (0 split)
## pkg_length < -3.343742 to the right, agree=0.584, adj=0.000, (0 split)
## pkg_scale_weight < -0.7282287 to the right, agree=0.584, adj=0.000, (0 split)
##
## Node number 3: 8494 observations, complexity param=0.04375846
## predicted class=1 expected loss=0.4724511 P(node) =0.1160842
## class counts: 4013 4481
## probabilities: 0.472 0.528
## left son=6 (6198 obs) right son=7 (2296 obs)
## Primary splits:
## res_com_flag splits as RL, improve=394.34240, (0 missing)
## ship_method splits as LLRRLLL, improve=280.74210, (0 missing)
## gate_code splits as LR, improve=116.83970, (0 missing)
## is_street splits as RL, improve= 95.83463, (0 missing)
## geocode_quality < 1.405858 to the left, improve= 85.44774, (0 missing)
## Surrogate splits:
## ship_method splits as LLRRLLL, agree=0.951, adj=0.820, (0 split)
## is_freight_forwarder splits as LR, agree=0.743, adj=0.049, (0 split)
## is_campus splits as LR, agree=0.737, adj=0.026, (0 split)
## item_quantity < 8.5 to the left, agree=0.734, adj=0.016, (0 split)
## pkg_scale_weight < 4.461354 to the left, agree=0.734, adj=0.016, (0 split)
##
## Node number 4: 37785 observations
## predicted class=0 expected loss=0.1528914 P(node) =0.5163931
## class counts: 32008 5777
## probabilities: 0.847 0.153
##
## Node number 5: 26892 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.3051837 P(node) =0.3675227
## class counts: 18685 8207
## probabilities: 0.695 0.305
## left son=10 (21736 obs) right son=11 (5156 obs)
## Primary splits:
## da_affinity_index < 0.04859362 to the right, improve=5051.8310, (0 missing)
## pkg_ship_cost < 1.000341 to the left, improve= 216.0812, (0 missing)
## ship_method splits as RRRRLRR, improve= 181.2585, (0 missing)
## gate_code splits as LR, improve= 113.0765, (0 missing)
## res_com_flag splits as RL, improve= 102.7381, (0 missing)
## Surrogate splits:
## ship_method splits as LRLLLLL, agree=0.808, adj=0.001, (0 split)
## pkg_length < -2.076414 to the right, agree=0.808, adj=0.001, (0 split)
## pkg_width < -3.163642 to the right, agree=0.808, adj=0.000, (0 split)
##
## Node number 6: 6198 observations, complexity param=0.02312483
## predicted class=0 expected loss=0.4348177 P(node) =0.08470569
## class counts: 3503 2695
## probabilities: 0.565 0.435
## left son=12 (2921 obs) right son=13 (3277 obs)
## Primary splits:
## is_street splits as RL, improve=236.23140, (0 missing)
## gate_code splits as LR, improve=146.55410, (0 missing)
## pkg_ship_cost < -0.6235852 to the right, improve= 71.37108, (0 missing)
## geocode_quality < 1.405858 to the left, improve= 63.61190, (0 missing)
## is_apt splits as LR, improve= 46.27264, (0 missing)
## Surrogate splits:
## is_apt splits as LR, agree=0.820, adj=0.619, (0 split)
## gate_code splits as LR, agree=0.567, adj=0.082, (0 split)
## access_code splits as LR, agree=0.558, adj=0.061, (0 split)
## pkg_ship_cost < 0.05382405 to the right, agree=0.543, adj=0.031, (0 split)
## geocode_quality < 1.405858 to the right, agree=0.543, adj=0.030, (0 split)
##
## Node number 7: 2296 observations
## predicted class=1 expected loss=0.2221254 P(node) =0.03137855
## class counts: 510 1786
## probabilities: 0.222 0.778
##
## Node number 10: 21736 observations
## predicted class=0 expected loss=0.1559165 P(node) =0.2970576
## class counts: 18347 3389
## probabilities: 0.844 0.156
##
## Node number 11: 5156 observations
## predicted class=1 expected loss=0.06555469 P(node) =0.07046507
## class counts: 338 4818
## probabilities: 0.066 0.934
##
## Node number 12: 2921 observations
## predicted class=0 expected loss=0.2885998 P(node) =0.03992019
## class counts: 2078 843
## probabilities: 0.711 0.289
##
## Node number 13: 3277 observations
## predicted class=1 expected loss=0.4348489 P(node) =0.0447855
## class counts: 1425 1852
## probabilities: 0.435 0.565
# Draw the fitted tree.
rpart.plot(
  x = cart,
  type = 2,
  extra = 103,
  under = TRUE,
  fallen.leaves = TRUE,
  digits = 2,
  faclen = 0,
  cex = NULL,
  tweak = 1,
  snip = FALSE,
  box.palette = "GnRd",
  shadow.col = 0
)
# Training-set confusion table: rows = observed Success, columns = predicted.
a <- table(
  datatrain$Success,
  predict(cart, newdata = datatrain, type = "class")
)
a
##
## 0 1
## 0 52433 2273
## 1 10009 8456
# Overall training accuracy in percent.
Accuracy <- (a[1, 1] + a[2, 2]) /
  (a[1, 1] + a[1, 2] + a[2, 1] + a[2, 2]) * 100
Accuracy
## [1] 83.21466
# NOTE(review): `a` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# a[1,1]/sum(a[,1]), a[2,2]/sum(a[,2]) and a[1,1]/sum(a[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(a)
## [1] 0.8397072
caret::specificity(a)
## [1] 0.7881443
caret::precision(a)
## [1] 0.9584506
# Test-set confusion table: rows = observed Success, columns = predicted.
b <- table(
  datatest$Success,
  predict(cart, newdata = datatest, type = "class")
)
b
##
## 0 1
## 0 22452 993
## 1 4339 3575
# Overall test accuracy in percent.
Accuracy <- (b[1, 1] + b[2, 2]) /
  (b[1, 1] + b[1, 2] + b[2, 1] + b[2, 2]) * 100
Accuracy
## [1] 82.99691
# NOTE(review): `b` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# b[1,1]/sum(b[,1]), b[2,2]/sum(b[,2]) and b[1,1]/sum(b[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(b)
## [1] 0.8380426
caret::specificity(b)
## [1] 0.7826182
caret::precision(b)
## [1] 0.9576456
#######################################
###### Cross Validation ######
#######################################
library(caret)
# Work on a copy so the response can be turned into a factor without
# altering datatrain; train() reports the task as a 2-class problem.
datatrain_cart <- datatrain
datatrain_cart$Success <- as.factor(datatrain_cart$Success)
# 10-fold cross-validation. The former `repeats = 3` was removed: it has no
# meaning for method = "cv" and only raised a warning. Switch to
# method = "repeatedcv" if repeated CV was actually intended.
# The object keeps its original name in case later code refers to it.
trainControl <- caret::trainControl(method = "cv", number = 10)
# Seed the RNG so the resampling is reproducible.
set.seed(7)
# Tune rpart's complexity parameter (cp), selecting on resampled accuracy.
fit.cart <- caret::train(
  Success ~ .,
  data = datatrain_cart,
  method = "rpart",
  metric = "Accuracy",
  trControl = trainControl
)
# Resampling results per candidate cp and the selected value.
fit.cart
## CART
##
## 73171 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 65854, 65854, 65854, 65853, 65855, 65853, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.02312483 0.8274317 0.4449825
## 0.04375846 0.8182203 0.4439334
## 0.08932214 0.7804600 0.2284839
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02312483.
# Refit the tree with the cp value reported as optimal by fit.cart.
# NOTE(review): the literal is that cp as printed (rounded). The resulting
# tree has the same 5 splits and the same train/test confusion tables as the
# default-cp fit, so the tuning step does not change the model. Confirm
# whether a differently pruned tree was expected.
cart <- rpart(
  formula = Success ~ .,
  data = datatrain_cart,
  method = "class",
  cp = 0.02312483
)
# CP table, variable importance and per-node split details.
summary(cart)
## Call:
## rpart(formula = Success ~ ., data = datatrain_cart, method = "class",
## cp = 0.02312483)
## n= 73171
##
## CP nsplit rel error xerror xstd
## 1 0.08932214 0 1.0000000 1.0000000 0.006363167
## 2 0.04375846 3 0.7320336 0.7320336 0.005685140
## 3 0.02312483 4 0.6882751 0.6882751 0.005549810
## 4 0.02312483 5 0.6651503 0.6782562 0.005517691
##
## Variable importance
## da_affinity_index is_promise_met res_com_flag ship_method
## 68 17 5 4
## is_street is_apt
## 3 2
##
## Node number 1: 73171 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.2523541 P(node) =1
## class counts: 54706 18465
## probabilities: 0.748 0.252
## left son=2 (64677 obs) right son=3 (8494 obs)
## Primary splits:
## is_promise_met splits as RL, improve=1455.4990, (0 missing)
## da_affinity_index < -0.03456654 to the left, improve= 648.6074, (0 missing)
## res_com_flag splits as RL, improve= 495.7095, (0 missing)
## ship_method splits as RRRRLLL, improve= 400.1945, (0 missing)
## pkg_ship_cost < 1.009621 to the left, improve= 356.3170, (0 missing)
##
## Node number 2: 64677 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.2162129 P(node) =0.8839158
## class counts: 50693 13984
## probabilities: 0.784 0.216
## left son=4 (37785 obs) right son=5 (26892 obs)
## Primary splits:
## da_affinity_index < -0.03456654 to the left, improve=728.7503, (0 missing)
## pkg_ship_cost < 1.009621 to the left, improve=428.5726, (0 missing)
## ship_method splits as RRRRLRR, improve=207.1263, (0 missing)
## gate_code splits as LR, improve=186.1909, (0 missing)
## res_com_flag splits as RL, improve=150.3159, (0 missing)
## Surrogate splits:
## ship_method splits as LRLLLLR, agree=0.585, adj=0.002, (0 split)
## pkg_width < -2.936888 to the right, agree=0.584, adj=0.000, (0 split)
## item_quantity < 36.5 to the left, agree=0.584, adj=0.000, (0 split)
## pkg_length < -3.343742 to the right, agree=0.584, adj=0.000, (0 split)
## pkg_scale_weight < -0.7282287 to the right, agree=0.584, adj=0.000, (0 split)
##
## Node number 3: 8494 observations, complexity param=0.04375846
## predicted class=1 expected loss=0.4724511 P(node) =0.1160842
## class counts: 4013 4481
## probabilities: 0.472 0.528
## left son=6 (6198 obs) right son=7 (2296 obs)
## Primary splits:
## res_com_flag splits as RL, improve=394.34240, (0 missing)
## ship_method splits as LLRRLLL, improve=280.74210, (0 missing)
## gate_code splits as LR, improve=116.83970, (0 missing)
## is_street splits as RL, improve= 95.83463, (0 missing)
## geocode_quality < 1.405858 to the left, improve= 85.44774, (0 missing)
## Surrogate splits:
## ship_method splits as LLRRLLL, agree=0.951, adj=0.820, (0 split)
## is_freight_forwarder splits as LR, agree=0.743, adj=0.049, (0 split)
## is_campus splits as LR, agree=0.737, adj=0.026, (0 split)
## item_quantity < 8.5 to the left, agree=0.734, adj=0.016, (0 split)
## pkg_scale_weight < 4.461354 to the left, agree=0.734, adj=0.016, (0 split)
##
## Node number 4: 37785 observations
## predicted class=0 expected loss=0.1528914 P(node) =0.5163931
## class counts: 32008 5777
## probabilities: 0.847 0.153
##
## Node number 5: 26892 observations, complexity param=0.08932214
## predicted class=0 expected loss=0.3051837 P(node) =0.3675227
## class counts: 18685 8207
## probabilities: 0.695 0.305
## left son=10 (21736 obs) right son=11 (5156 obs)
## Primary splits:
## da_affinity_index < 0.04859362 to the right, improve=5051.8310, (0 missing)
## pkg_ship_cost < 1.000341 to the left, improve= 216.0812, (0 missing)
## ship_method splits as RRRRLRR, improve= 181.2585, (0 missing)
## gate_code splits as LR, improve= 113.0765, (0 missing)
## res_com_flag splits as RL, improve= 102.7381, (0 missing)
## Surrogate splits:
## ship_method splits as LRLLLLL, agree=0.808, adj=0.001, (0 split)
## pkg_length < -2.076414 to the right, agree=0.808, adj=0.001, (0 split)
## pkg_width < -3.163642 to the right, agree=0.808, adj=0.000, (0 split)
##
## Node number 6: 6198 observations, complexity param=0.02312483
## predicted class=0 expected loss=0.4348177 P(node) =0.08470569
## class counts: 3503 2695
## probabilities: 0.565 0.435
## left son=12 (2921 obs) right son=13 (3277 obs)
## Primary splits:
## is_street splits as RL, improve=236.23140, (0 missing)
## gate_code splits as LR, improve=146.55410, (0 missing)
## pkg_ship_cost < -0.6235852 to the right, improve= 71.37108, (0 missing)
## geocode_quality < 1.405858 to the left, improve= 63.61190, (0 missing)
## is_apt splits as LR, improve= 46.27264, (0 missing)
## Surrogate splits:
## is_apt splits as LR, agree=0.820, adj=0.619, (0 split)
## gate_code splits as LR, agree=0.567, adj=0.082, (0 split)
## access_code splits as LR, agree=0.558, adj=0.061, (0 split)
## pkg_ship_cost < 0.05382405 to the right, agree=0.543, adj=0.031, (0 split)
## geocode_quality < 1.405858 to the right, agree=0.543, adj=0.030, (0 split)
##
## Node number 7: 2296 observations
## predicted class=1 expected loss=0.2221254 P(node) =0.03137855
## class counts: 510 1786
## probabilities: 0.222 0.778
##
## Node number 10: 21736 observations
## predicted class=0 expected loss=0.1559165 P(node) =0.2970576
## class counts: 18347 3389
## probabilities: 0.844 0.156
##
## Node number 11: 5156 observations
## predicted class=1 expected loss=0.06555469 P(node) =0.07046507
## class counts: 338 4818
## probabilities: 0.066 0.934
##
## Node number 12: 2921 observations
## predicted class=0 expected loss=0.2885998 P(node) =0.03992019
## class counts: 2078 843
## probabilities: 0.711 0.289
##
## Node number 13: 3277 observations
## predicted class=1 expected loss=0.4348489 P(node) =0.0447855
## class counts: 1425 1852
## probabilities: 0.435 0.565
# Draw the refitted tree.
rpart.plot(
  x = cart,
  type = 2,
  extra = 103,
  under = TRUE,
  fallen.leaves = TRUE,
  digits = 2,
  faclen = 0,
  cex = NULL,
  tweak = 1,
  snip = FALSE,
  box.palette = "GnRd",
  shadow.col = 0
)

# Training-set confusion table: rows = observed Success, columns = predicted.
a <- table(
  datatrain$Success,
  predict(cart, newdata = datatrain, type = "class")
)
a
##
## 0 1
## 0 52433 2273
## 1 10009 8456
# Overall training accuracy in percent.
Accuracy <- (a[1, 1] + a[2, 2]) /
  (a[1, 1] + a[1, 2] + a[2, 1] + a[2, 2]) * 100
Accuracy
## [1] 83.21466
# NOTE(review): `a` holds observed values in rows and predictions in columns,
# and no `positive` level is given. As called, the figures below equal
# a[1,1]/sum(a[,1]), a[2,2]/sum(a[,2]) and a[1,1]/sum(a[1,]) respectively,
# i.e. they describe class "0". Confirm this is the intended reading.
caret::sensitivity(a)
## [1] 0.8397072
caret::specificity(a)
## [1] 0.7881443
caret::precision(a)
## [1] 0.9584506
# Test-set confusion table: rows = observed Success, columns = predicted.
b <- table(
  datatest$Success,
  predict(cart, newdata = datatest, type = "class")
)
b
##
## 0 1
## 0 22452 993
## 1 4339 3575
# Overall test accuracy in percent.
Accuracy <- (b[1, 1] + b[2, 2]) /
  (b[1, 1] + b[1, 2] + b[2, 1] + b[2, 2]) * 100
Accuracy
## [1] 82.99691
# Test-set metrics. These calls previously received the training table `a`
# and therefore repeated the training figures; they now use `b`. The recorded
# values follow from the table above: 22452/26791, 3575/4568, 22452/23445.
# NOTE(review): `b` holds observed values in rows and predictions in columns,
# and no `positive` level is given, so these three figures describe class
# "0"; the class-1 true-positive rate is the TPRTest value computed below.
caret::sensitivity(b)
## [1] 0.8380426
caret::specificity(b)
## [1] 0.7826182
caret::precision(b)
## [1] 0.9576456
# True-positive rate for class 1 on the test set, in percent:
# correctly predicted 1s over all observed 1s.
TPRTest <- b[2, 2] / (b[2, 1] + b[2, 2]) * 100
TPRTest
## [1] 45.17311