Shipment Delivery Failure Prediction

##Capstone Project on Shipment Failures
##Date : 14-Nov-2017
##Setting the folder path and reading the file
# NOTE(review): machine-specific absolute path; kept so that any later relative
# paths in this script keep resolving against the same folder.
setwd("C:/Users/v-vyupad/Desktop/Analytics")
# stringsAsFactors = TRUE makes text columns load as factors on every R
# version (the default became FALSE in R 4.0.0), which is the structure the
# str()/summary() output recorded below was produced with.
data <- read.csv(
  "shipmentdata.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# View(data)

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(e1071)
dim(data)

## [1] 104530     40

names(data)

##  [1] "tracking_id"            "access_code"           
##  [3] "da_employee_id"         "da_affinity_index"     
##  [5] "shipment_id"            "value_of_goods"        
##  [7] "warehouse"              "item_quantity"         
##  [9] "ship_method"            "city"                  
## [11] "delivery_note"          "geocode_quality"       
## [13] "fulfillment_center_id"  "pkg_ship_method"       
## [15] "pkg_ship_cost"          "pkg_width"             
## [17] "pkg_length"             "pkg_height"            
## [19] "pkg_scale_weight"       "shipment_contents_type"
## [21] "delivery_date"          "pdd"                   
## [23] "is_promise_met"         "star_rating"           
## [25] "is_mon_open"            "is_tue_open"           
## [27] "is_wed_open"            "is_thu_open"           
## [29] "is_fri_open"            "is_sat_open"           
## [31] "is_sun_open"            "gate_code"             
## [33] "res_com_flag"           "is_po_box"             
## [35] "is_campus"              "is_freight_forwarder"  
## [37] "is_street"              "is_apt"                
## [39] "package_class"          "Success"

str(data)

## 'data.frame':    104530 obs. of  40 variables:
##  $ tracking_id           : Factor w/ 104530 levels "TBA424087593000",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ access_code           : Factor w/ 5250 levels "\\Gate Code: \\ #3250\\\\\"\\\"\\\"\\\"\\\"\"",..: NA NA NA NA NA NA 2505 NA NA NA ...
##  $ da_employee_id        : int  NA 1706161 2399331 2089381 2723771 1899591 1454641 2539801 1706161 2667961 ...
##  $ da_affinity_index     : num  0 0.189 0 0 0 ...
##  $ shipment_id           : num  5.94e+09 5.94e+09 5.95e+09 5.95e+09 5.95e+09 ...
##  $ value_of_goods        : int  9359 34999 2926 7598 29925 8999 8499 6305 3999 4899 ...
##  $ warehouse             : Factor w/ 83 levels "ABE2","ABE3",..: 26 26 30 10 66 14 83 66 83 83 ...
##  $ item_quantity         : int  2 1 1 2 19 1 1 8 1 1 ...
##  $ ship_method           : Factor w/ 7 levels "AMZL_US_BULK",..: 7 1 5 6 7 7 5 5 1 5 ...
##  $ city                  : Factor w/ 215 levels "aventura","Aventura",..: 3 45 153 3 167 211 190 136 3 126 ...
##  $ delivery_note         : Factor w/ 289 levels "#001","*1379",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ geocode_quality       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ fulfillment_center_id : Factor w/ 83 levels "ABE2","ABE3",..: 26 26 30 10 66 14 83 66 83 83 ...
##  $ pkg_ship_method       : Factor w/ 7 levels "AMZL_US_BULK",..: 7 1 5 6 7 7 5 5 1 5 ...
##  $ pkg_ship_cost         : num  6.08 2.83 5.76 10.79 5.82 ...
##  $ pkg_width             : num  16.2 15 8 21 14.5 ...
##  $ pkg_length            : num  24.2 21 13.5 26 19 ...
##  $ pkg_height            : num  13 6 4.5 16 12 2 8.5 3 4.25 12 ...
##  $ pkg_scale_weight      : num  14.33 8.16 1.54 13.03 9.48 ...
##  $ shipment_contents_type: int  536871168 536871168 256 512 256 4195328 256 1 256 256 ...
##  $ delivery_date         : int  NA 42983 42981 42981 42982 42981 42980 42982 42980 42980 ...
##  $ pdd                   : int  42982 42980 42983 42981 42987 42985 42980 42983 42980 42980 ...
##  $ is_promise_met        : Factor w/ 2 levels "N","Y": 1 1 2 2 2 2 2 2 2 2 ...
##  $ star_rating           : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ is_mon_open           : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ is_tue_open           : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ is_wed_open           : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ is_thu_open           : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ is_fri_open           : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ is_sat_open           : Factor w/ 2 levels "N","Y": 2 1 2 1 1 1 1 1 1 1 ...
##  $ is_sun_open           : Factor w/ 2 levels "N","Y": 2 1 2 1 1 1 1 1 1 1 ...
##  $ gate_code             : Factor w/ 3789 levels "\t#3250","\t*1010",..: NA NA NA NA 1600 NA NA NA NA NA ...
##  $ res_com_flag          : Factor w/ 2 levels "C","R": 2 1 2 2 2 2 2 2 2 2 ...
##  $ is_po_box             : Factor w/ 1 level "Y": NA 1 NA NA NA NA NA NA NA NA ...
##  $ is_campus             : Factor w/ 1 level "Y": NA 1 NA NA NA NA NA NA NA NA ...
##  $ is_freight_forwarder  : Factor w/ 1 level "Y": NA NA NA NA NA NA NA NA NA NA ...
##  $ is_street             : Factor w/ 1 level "Y": NA 1 1 NA NA 1 NA 1 NA 1 ...
##  $ is_apt                : Factor w/ 2 levels "N","Y": 2 1 1 2 2 1 2 1 2 2 ...
##  $ package_class         : Factor w/ 3 levels "BPM","PARCEL",..: 3 2 2 2 3 3 2 1 2 2 ...
##  $ Success               : int  1 1 0 0 0 0 0 0 0 0 ...

summary(data)

##           tracking_id                            access_code   
##  TBA424087593000:     1   Gate Code: 3015              :  366  
##  TBA424182187000:     1   Gate Code: 122215 - Mail Room:  221  
##  TBA424455805000:     1   Gate Code: LU2AOK            :  171  
##  TBA424543917000:     1   Gate Code: 109               :  150  
##  TBA424562474000:     1   Gate Code: 1234              :  128  
##  TBA424588332000:     1   (Other)                      :12719  
##  (Other)        :104524   NA's                         :90775  
##  da_employee_id    da_affinity_index  shipment_id        value_of_goods  
##  Min.   :1340721   Min.   :0.000     Min.   :5.943e+09   Min.   :     0  
##  1st Qu.:1492401   1st Qu.:0.000     1st Qu.:6.488e+09   1st Qu.:  1898  
##  Median :2081271   Median :0.008     Median :6.863e+09   Median :  3392  
##  Mean   :2074708   Mean   :0.140     Mean   :6.872e+09   Mean   :  5938  
##  3rd Qu.:2474271   3rd Qu.:0.284     3rd Qu.:7.264e+09   3rd Qu.:  6446  
##  Max.   :3201121   Max.   :0.888     Max.   :7.694e+09   Max.   :129987  
##  NA's   :6763      NA's   :6860                                          
##    warehouse     item_quantity                  ship_method   
##  TPA1   :44822   Min.   :  1.000   AMZL_US_BULK       : 5191  
##  JAX2   :13377   1st Qu.:  1.000   AMZL_US_KEY        :   16  
##  CAE1   : 7299   Median :  1.000   AMZL_US_LMA        :14597  
##  SDF8   : 4680   Mean   :  2.247   AMZL_US_LMA_AIR    : 1765  
##  BNA3   : 3922   3rd Qu.:  3.000   AMZL_US_PREMIUM    :70895  
##  TPA2   : 2298   Max.   :100.000   AMZL_US_PREMIUM_AIR:11277  
##  (Other):28132                     AMZL_US_STD        :  789  
##               city      
##  FORT LAUDERDALE:12085  
##  MIAMI BEACH    : 9616  
##  MIAMI          : 8470  
##  HOLLYWOOD      : 7714  
##  PLANTATION     : 4663  
##  PEMBROKE PINES : 4244  
##  (Other)        :57738  
##                                        delivery_note    geocode_quality 
##  package left at the front desk               :    45   Min.   : 1.000  
##  package left at the front door               :    27   1st Qu.: 1.000  
##  171                                          :    20   Median : 1.000  
##  customer request package be left at the door.:    13   Mean   : 1.054  
##  left under security camera                   :    13   3rd Qu.: 1.000  
##  (Other)                                      :   574   Max.   :10.000  
##  NA's                                         :103838                   
##  fulfillment_center_id            pkg_ship_method  pkg_ship_cost   
##  TPA1   :44822         AMZL_US_BULK       : 5191   Min.   : 0.590  
##  JAX2   :13377         AMZL_US_KEY        :   16   1st Qu.: 3.550  
##  CAE1   : 7299         AMZL_US_LMA        :14597   Median : 4.040  
##  SDF8   : 4680         AMZL_US_LMA_AIR    : 1765   Mean   : 4.599  
##  BNA3   : 3922         AMZL_US_PREMIUM    :70895   3rd Qu.: 5.440  
##  TPA2   : 2298         AMZL_US_PREMIUM_AIR:11277   Max.   :54.330  
##  (Other):28132         AMZL_US_STD        :  789                   
##    pkg_width       pkg_length      pkg_height     pkg_scale_weight
##  Min.   : 0.20   Min.   : 0.30   Min.   : 0.000   Min.   : 0.010  
##  1st Qu.: 7.00   1st Qu.:10.25   1st Qu.: 3.000   1st Qu.: 0.990  
##  Median : 9.99   Median :13.50   Median : 5.000   Median : 2.050  
##  Mean   :10.18   Mean   :14.10   Mean   : 5.029   Mean   : 3.992  
##  3rd Qu.:12.50   3rd Qu.:16.00   3rd Qu.: 7.000   3rd Qu.: 4.430  
##  Max.   :21.00   Max.   :26.02   Max.   :16.000   Max.   :49.960  
##                                                                   
##  shipment_contents_type delivery_date        pdd        is_promise_met
##  Min.   :1.000e+00      Min.   :42979   Min.   :40543   N:12157       
##  1st Qu.:2.560e+02      1st Qu.:43012   1st Qu.:43014   Y:92373       
##  Median :2.560e+02      Median :43032   Median :43035                 
##  Mean   :1.375e+07      Mean   :43031   Mean   :43034                 
##  3rd Qu.:2.560e+02      3rd Qu.:43051   3rd Qu.:43055                 
##  Max.   :1.074e+09      Max.   :43071   Max.   :43189                 
##                         NA's   :14916                                 
##   star_rating    is_mon_open is_tue_open is_wed_open is_thu_open
##  Min.   :1.900   N:     8    Y:104530    Y:104530    N:     2   
##  1st Qu.:5.000   Y:104522                            Y:104528   
##  Median :5.000                                                  
##  Mean   :4.861                                                  
##  3rd Qu.:5.000                                                  
##  Max.   :5.000                                                  
##                                                                 
##  is_fri_open is_sat_open is_sun_open              gate_code    
##  Y:104530    N:67701     N:68457     3015              :  456  
##              Y:36829     Y:36073     122215 - Mail Room:  218  
##                                      LU2AOK            :  217  
##                                      109               :  154  
##                                      664               :  113  
##                                      (Other)           : 8743  
##                                      NA's              :94629  
##  res_com_flag is_po_box     is_campus     is_freight_forwarder
##  C:18739      Y   :   133   Y   :   789   Y   :  1145         
##  R:85791      NA's:104397   NA's:103741   NA's:103385         
##                                                               
##                                                               
##                                                               
##                                                               
##                                                               
##  is_street     is_apt      package_class     Success      
##  Y   :61505   N   :49212   BPM   :  908   Min.   :0.0000  
##  NA's:43025   Y   :47706   PARCEL:76949   1st Qu.:0.0000  
##               NA's: 7612   STD   :26673   Median :0.0000  
##                                           Mean   :0.2524  
##                                           3rd Qu.:1.0000  
##                                           Max.   :1.0000  
##

table(data$Success)

## 
##     0     1 
## 78151 26379

######### Removing Id columns from the dataset ###########
# Build the working data frame by dropping the identifier columns together
# with the warehouse/city/note text fields, the date fields and the
# per-weekday "open" flags. Columns are dropped by name (the same 17 columns
# as positions 1,3,5,7,10,11,13,14,21,22,25-31 of the file read above), so the
# result no longer depends on the column order of the CSV.
Shipment <- data[, setdiff(names(data), c(
  "tracking_id", "da_employee_id", "shipment_id", "warehouse", "city",
  "delivery_note", "fulfillment_center_id", "pkg_ship_method",
  "delivery_date", "pdd",
  "is_mon_open", "is_tue_open", "is_wed_open", "is_thu_open",
  "is_fri_open", "is_sat_open", "is_sun_open"
)), drop = FALSE]
############Summary of the Data###############################3




# data <-data.frame(data[1:2],datatrain[5:13],datatrain[16],datatrain[18:31])


######### Treating Nulls in data ##################
# Number of missing values in each column of Shipment (named integer vector).
vapply(Shipment, function(column) sum(is.na(column)), integer(1))

##            access_code      da_affinity_index         value_of_goods 
##                  90775                   6860                      0 
##          item_quantity            ship_method        geocode_quality 
##                      0                      0                      0 
##          pkg_ship_cost              pkg_width             pkg_length 
##                      0                      0                      0 
##             pkg_height       pkg_scale_weight shipment_contents_type 
##                      0                      0                      0 
##         is_promise_met            star_rating              gate_code 
##                      0                      0                  94629 
##           res_com_flag              is_po_box              is_campus 
##                      0                 104397                 103741 
##   is_freight_forwarder              is_street                 is_apt 
##                 103385                  43025                   7612 
##          package_class                Success 
##                      0                      0

# Replace missing da_affinity_index values with the mean of the observed ones.
# The right-hand side is evaluated before the assignment, so the mean is taken
# over the original (un-imputed) values.
Shipment$da_affinity_index[is.na(Shipment$da_affinity_index)] <-
  mean(Shipment$da_affinity_index, na.rm = TRUE)

# access_code / gate_code: collapse the many distinct code values into a
# presence indicator factor ("1" = a code was supplied, "0" = missing).
# Missingness is tested with is.na(); comparing against the string "NA" never
# matches a real missing value.
Shipment[c("access_code", "gate_code")] <- lapply(
  Shipment[c("access_code", "gate_code")],
  function(code) as.factor(ifelse(is.na(code), 0, 1))
)

# Address-type flags: "Y" becomes 1; anything else, including a missing value,
# becomes 0. %in% never yields NA, so no separate NA patch-up is needed.
Shipment[c("is_po_box", "is_campus", "is_freight_forwarder",
           "is_street", "is_apt")] <- lapply(
  Shipment[c("is_po_box", "is_campus", "is_freight_forwarder",
             "is_street", "is_apt")],
  function(flag) as.factor(ifelse(flag %in% "Y", 1, 0))
)





#Shipment=data
#install.packages("devtools")
library(devtools)
#install.packages("httr")
# library(httr)
# #install.packages("DBI")
# library(DBI)
# #install.packages("bit")
# library(bit)
# #install.packages("RSQLite")
# library(RSQLite)
# #install.packages("chron")
# library(chron)

#install_github("tomasgreif/riv")
library(woe)

###################################Information Value####################################
# Pass the result of iv.mult() on Shipment (target column "Success") straight
# to iv.plot.summary(). The "Information Value" lines recorded after this call
# are the console output it produced.
# NOTE(review): the third argument (TRUE) presumably asks iv.mult() for its
# summary form - confirm against the woe package documentation.
#iv.mult(Shipment,"Success",TRUE)
iv.plot.summary(iv.mult(Shipment,"Success",TRUE))

## Information Value 0.01 
## Information Value 0.58 
## Information Value 0 
## Information Value 0 
## Information Value 0.08 
## Information Value 0 
## Information Value 0.08 
## Information Value 0 
## Information Value 0 
## Information Value 0 
## Information Value 0 
## Information Value 0 
## Information Value 0.24 
## Information Value 0.02 
## Information Value 0.06 
## Information Value 0.08 
## Information Value 0 
## Information Value 0 
## Information Value 0 
## Information Value 0.03 
## Information Value 0.01 
## Information Value 0

####################Exploratory Data Analysis#####################
# Draw a histogram followed by a boxplot for each continuous column of
# Shipment, in the order listed. The lookup maps column name -> plot title.
local({
  plot_titles <- c(
    value_of_goods    = "Shipment$value_of_goods",
    pkg_ship_cost     = "Package Ship Cost",
    pkg_width         = "Package Width",
    pkg_length        = "Package Length",
    pkg_height        = "Package Height",
    pkg_scale_weight  = "Package Weight",
    geocode_quality   = "Geocode Quality",
    da_affinity_index = "da_affinity_index"
  )
  for (column in names(plot_titles)) {
    # hist() labels the x axis with the deparsed argument expression by
    # default; xlab is set explicitly so the label still reads Shipment$<col>.
    hist(Shipment[[column]],
         main = plot_titles[[column]],
         xlab = paste0("Shipment$", column),
         col = "darkgreen")
    boxplot(Shipment[[column]], main = plot_titles[[column]], col = "pink")
  }
})

################Chi-Square test for all the attributes##########################3
chisq.test(Shipment$Success,Shipment$access_code)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$access_code
## X-squared = 311.83, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$da_affinity_index)

## Warning in chisq.test(Shipment$Success, Shipment$da_affinity_index): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$da_affinity_index
## X-squared = 23462, df = 128, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$item_quantity)

## Warning in chisq.test(Shipment$Success, Shipment$item_quantity): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$item_quantity
## X-squared = 134.04, df = 54, p-value = 9.466e-09

chisq.test(Shipment$Success,Shipment$geocode_quality)

## Warning in chisq.test(Shipment$Success, Shipment$geocode_quality): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$geocode_quality
## X-squared = 942.84, df = 4, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$pkg_ship_cost)

## Warning in chisq.test(Shipment$Success, Shipment$pkg_ship_cost): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$pkg_ship_cost
## X-squared = 7450.1, df = 1273, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$pkg_width)

## Warning in chisq.test(Shipment$Success, Shipment$pkg_width): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$pkg_width
## X-squared = 966.31, df = 765, p-value = 9.188e-07

chisq.test(Shipment$Success,Shipment$pkg_length)

## Warning in chisq.test(Shipment$Success, Shipment$pkg_length): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$pkg_length
## X-squared = 1053.9, df = 815, p-value = 2.691e-08

chisq.test(Shipment$Success,Shipment$pkg_height)

## Warning in chisq.test(Shipment$Success, Shipment$pkg_height): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$pkg_height
## X-squared = 677.95, df = 532, p-value = 1.734e-05

chisq.test(Shipment$Success,Shipment$pkg_scale_weight)

## Warning in chisq.test(Shipment$Success, Shipment$pkg_scale_weight): Chi-
## squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$pkg_scale_weight
## X-squared = 3549.9, df = 3081, p-value = 5.896e-09

chisq.test(Shipment$Success,Shipment$shipment_contents_type)

## Warning in chisq.test(Shipment$Success, Shipment$shipment_contents_type):
## Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$shipment_contents_type
## X-squared = 239.18, df = 169, p-value = 0.0003088

chisq.test(Shipment$Success,Shipment$is_promise_met)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_promise_met
## X-squared = 5578.3, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$star_rating)

## Warning in chisq.test(Shipment$Success, Shipment$star_rating): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$star_rating
## X-squared = 463.46, df = 10, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$gate_code)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$gate_code
## X-squared = 1303.8, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$res_com_flag)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$res_com_flag
## X-squared = 1816, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$is_po_box)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_po_box
## X-squared = 11.445, df = 1, p-value = 0.0007167

chisq.test(Shipment$Success,Shipment$is_campus)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_campus
## X-squared = 25.509, df = 1, p-value = 4.404e-07

chisq.test(Shipment$Success,Shipment$is_freight_forwarder)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_freight_forwarder
## X-squared = 54.136, df = 1, p-value = 1.871e-13

chisq.test(Shipment$Success,Shipment$is_street)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_street
## X-squared = 530.48, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$is_apt)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Shipment$Success and Shipment$is_apt
## X-squared = 136.92, df = 1, p-value < 2.2e-16

chisq.test(Shipment$Success,Shipment$package_class)

## 
##  Pearson's Chi-squared test
## 
## data:  Shipment$Success and Shipment$package_class
## X-squared = 31.757, df = 2, p-value = 1.271e-07

####################################Stratified Sampling####################################
library(caTools)
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

head(Shipment)

##   access_code da_affinity_index value_of_goods item_quantity
## 1           0         0.0000000           9359             2
## 2           0         0.1893939          34999             1
## 3           0         0.0000000           2926             1
## 4           0         0.0000000           7598             2
## 5           0         0.0000000          29925            19
## 6           0         0.3888889           8999             1
##           ship_method geocode_quality pkg_ship_cost pkg_width pkg_length
## 1         AMZL_US_STD               1          6.08     16.25      24.25
## 2        AMZL_US_BULK               1          2.83     15.00      21.00
## 3     AMZL_US_PREMIUM               1          5.76      8.00      13.50
## 4 AMZL_US_PREMIUM_AIR               1         10.79     21.00      26.00
## 5         AMZL_US_STD               1          5.82     14.50      19.00
## 6         AMZL_US_STD               1          4.92      7.00      10.00
##   pkg_height pkg_scale_weight shipment_contents_type is_promise_met
## 1       13.0            14.33              536871168              N
## 2        6.0             8.16              536871168              N
## 3        4.5             1.54                    256              Y
## 4       16.0            13.03                    512              Y
## 5       12.0             9.48                    256              Y
## 6        2.0             1.37                4195328              Y
##   star_rating gate_code res_com_flag is_po_box is_campus
## 1           5         0            R         0         0
## 2           5         0            C         1         1
## 3           5         0            R         0         0
## 4           5         0            R         0         0
## 5           5         1            R         0         0
## 6           5         0            R         0         0
##   is_freight_forwarder is_street is_apt package_class Success
## 1                    0         0      1           STD       1
## 2                    0         1      0        PARCEL       1
## 3                    0         1      0        PARCEL       0
## 4                    0         0      1        PARCEL       0
## 5                    0         0      1           STD       0
## 6                    0         1      0           STD       0

# Seed the RNG so the partition below is reproducible, then build the logical
# split vector: TRUE marks the 70% of rows kept for training, FALSE the 30%
# held out for testing.
set.seed(80)
split <- sample.split(Shipment$Success, SplitRatio = 0.7)

head(split,20)

##  [1]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [12] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE

#check stratified sampling Worked
table(Shipment$Success)

## 
##     0     1 
## 78151 26379

# Training partition: the rows flagged TRUE in the split vector.
datatrain <- Shipment[split, ]
table(datatrain$Success)

## 
##     0     1 
## 54706 18465

# Test partition: the rows flagged FALSE in the split vector.
datatest <- Shipment[!split, ]
table(datatest$Success)

## 
##     0     1 
## 23445  7914

table(datatrain$Success)

## 
##     0     1 
## 54706 18465

# Most frequent value of `x`, returned as a character string.
#
# x      Vector (or factor) whose values are tabulated.
# na.rm  If TRUE (the default) missing values are ignored, which is what
#        this function always did; if FALSE, NA is counted like any other
#        value and can itself be the mode (returned as NA_character_).
#
# Returns the label of the single most frequent value, the literal string
# ">1 mode" when several values tie for first place, or NA_character_ when
# there is nothing to tabulate (empty or all-NA input with na.rm = TRUE).
Mode <- function(x, na.rm = TRUE) {
  counts <- table(x, useNA = if (isTRUE(na.rm)) "no" else "ifany")
  if (length(counts) == 0L) {
    # Avoids the max() warning on an empty table.
    return(NA_character_)
  }
  top <- names(counts)[counts == max(counts)]
  if (length(top) > 1L) {
    top <- ">1 mode"
  }
  top
}


# Cap (winsorise) a numeric vector at two of its own percentiles.
#
# x                Numeric vector to cap.
# upperPercentile  Probability in [0, 1]; values above this quantile of `x`
#                  are replaced by the quantile.
# lowerPercentile  Probability in [0, 1], not larger than upperPercentile;
#                  values below this quantile are replaced by the quantile.
# na.rm            If TRUE (default) missing values are ignored when the
#                  quantiles are computed and stay NA in the result, instead
#                  of making quantile() fail.
#
# Returns a numeric vector of the same length as `x`.
outlier <- function(x, upperPercentile, lowerPercentile, na.rm = TRUE) {
  stopifnot(
    is.numeric(x),
    is.numeric(upperPercentile), length(upperPercentile) == 1L,
    is.numeric(lowerPercentile), length(lowerPercentile) == 1L,
    lowerPercentile <= upperPercentile
  )
  # names = FALSE keeps the "85%"-style quantile labels out of the result.
  limits <- quantile(
    x,
    probs = c(lowerPercentile, upperPercentile),
    na.rm = na.rm,
    names = FALSE
  )
  pmin(pmax(x, limits[1]), limits[2])
}

# Print the mean and then the standard deviation of `x`, both computed with
# missing values removed. The standard deviation is returned invisibly, as
# the value of the final print() call.
MeSt <- function(x) {
  location <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  print(location)
  print(spread)
}


# Cap the continuous training columns with outlier(); each entry below is
# c(upper percentile, lower percentile) for that column.
# NOTE(review): only datatrain is capped here; datatest is left untouched.
datatrain <- local({
  cap_percentiles <- list(
    value_of_goods    = c(.85, .0),
    pkg_ship_cost     = c(.85, .09),
    pkg_width         = c(.99, .0),
    pkg_length        = c(.95, .0),
    pkg_height        = c(.95, .0),
    pkg_scale_weight  = c(.99, .0),
    geocode_quality   = c(.99, .0),
    da_affinity_index = c(.99, .0)
  )
  capped <- datatrain
  for (column in names(cap_percentiles)) {
    capped[[column]] <- outlier(
      capped[[column]],
      cap_percentiles[[column]][1],
      cap_percentiles[[column]][2]
    )
  }
  capped
})
data_clean <- datatrain
# x<-quantile(datatrain$value_of_goods,c(0.01,0.80))
# data_clean <- datatrain[datatrain$value_of_goods >=x[1] & datatrain$value_of_goods<=x[2],]
# Number of missing values left in each training column.
vapply(datatrain, function(column) sum(is.na(column)), integer(1))

##            access_code      da_affinity_index         value_of_goods 
##                      0                      0                      0 
##          item_quantity            ship_method        geocode_quality 
##                      0                      0                      0 
##          pkg_ship_cost              pkg_width             pkg_length 
##                      0                      0                      0 
##             pkg_height       pkg_scale_weight shipment_contents_type 
##                      0                      0                      0 
##         is_promise_met            star_rating              gate_code 
##                      0                      0                      0 
##           res_com_flag              is_po_box              is_campus 
##                      0                      0                      0 
##   is_freight_forwarder              is_street                 is_apt 
##                      0                      0                      0 
##          package_class                Success 
##                      0                      0

# 
# x<-quantile(data_clean$pkg_ship_cost,c(0.20,0.80))
# data_clean <- data_clean[data_clean$pkg_ship_cost >=x[1] & data_clean$pkg_ship_cost<=x[2],]
# 
# x<-quantile(data_clean$pkg_width,c(0.02,0.98))
# data_clean <- data_clean[data_clean$pkg_width >=x[1] & data_clean$pkg_width<=x[2],]
# 
# x<-quantile(data_clean$pkg_length,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_length >=x[1] & data_clean$pkg_length<=x[2],]
# 
# x<-quantile(data_clean$pkg_height,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_height >=x[1] & data_clean$pkg_height<=x[2],]
# 
# x<-quantile(data_clean$pkg_scale_weight,c(0.02,0.95))
# data_clean <- data_clean[data_clean$pkg_scale_weight >=x[1] & data_clean$pkg_scale_weight<=x[2],]
# 
# x<-quantile(data_clean$geocode_quality,c(0.02,0.95))
# data_clean <- data_clean[data_clean$geocode_quality >=x[1] & data_clean$geocode_quality<=x[2],]
# 
# x<-quantile(data_clean$da_affinity_index,c(0.02,0.95))
# data_clean <- data_clean[data_clean$da_affinity_index >=x[1] & data_clean$da_affinity_index<=x[2],]


# Draw a histogram followed by a boxplot for each continuous column of
# data_clean, in the order listed. The lookup maps column name -> plot title.
local({
  plot_titles <- c(
    value_of_goods    = "data_clean$value_of_goods",
    pkg_ship_cost     = "Package Ship Cost",
    pkg_width         = "Package Width",
    pkg_length        = "Package Length",
    pkg_height        = "Package Height",
    pkg_scale_weight  = "Package Weight",
    geocode_quality   = "Geocode Quality",
    da_affinity_index = "da_affinity_index"
  )
  for (column in names(plot_titles)) {
    # hist() labels the x axis with the deparsed argument expression by
    # default; xlab is set explicitly so the label still reads
    # data_clean$<col>.
    hist(data_clean[[column]],
         main = plot_titles[[column]],
         xlab = paste0("data_clean$", column),
         col = "darkgreen")
    boxplot(data_clean[[column]], main = plot_titles[[column]], col = "pink")
  }
})

table(data_clean$Success)

## 
##     0     1 
## 54706 18465

##############Calculating the IV######################################3
# #data<-Shipment
# ######### Treating NA in data ##################
# apply(data, 2, function(x) sum(is.na(x)))
# 
# data$da_affinity_index <- ifelse(is.na(data$da_affinity_index), mean(data$da_affinity_index, na.rm=TRUE), data$da_affinity_index)
# #data$da_affinity_index=round(mean(data$da_affinity_index, na.rm = TRUE))

datatrain <- data_clean

######################## Scaling Train and Test Data #############################
# Standardise the continuous columns. The centre and scale are estimated on
# the training rows only and then re-used for the test rows, so both sets are
# expressed on the same scale. Standardising the test set with its own
# mean/sd would give the same raw value different scaled values in train and
# test.
# scale() returns a one-column matrix; that column type is kept as before.
scaled_columns <- c(
  "value_of_goods", "pkg_ship_cost", "pkg_width", "pkg_length",
  "pkg_height", "pkg_scale_weight", "geocode_quality", "da_affinity_index"
)
for (column in scaled_columns) {
  train_scaled <- scale(datatrain[[column]])
  datatest[[column]] <- scale(
    datatest[[column]],
    center = attr(train_scaled, "scaled:center"),
    scale = attr(train_scaled, "scaled:scale")
  )
  datatrain[[column]] <- train_scaled
}
# Remove the loop helpers so they do not linger in the workspace.
rm(scaled_columns, column, train_scaled)

# data_clean mirrors the scaled training set.
data_clean <- datatrain



######### Removing Id columns from the dataset ###########

# data <-data.frame(data[1:2],datatrain[5:13],datatrain[16],datatrain[18:31])
#dataR <-data[,-c(1,3,5,7,10,11,13,14,21,22,26,27,29)]
# str(data_clean)
# names(data_clean)
#summary
# Class counts of the response.
table(data_clean$Success)

## 
##     0     1 
## 54706 18465

# Missing values per column. Iterating over the columns directly avoids
# apply(), which first coerces the whole data frame to a character matrix.
vapply(data_clean, function(column) sum(is.na(column)), integer(1))

##            access_code      da_affinity_index         value_of_goods 
##                      0                      0                      0 
##          item_quantity            ship_method        geocode_quality 
##                      0                      0                      0 
##          pkg_ship_cost              pkg_width             pkg_length 
##                      0                      0                      0 
##             pkg_height       pkg_scale_weight shipment_contents_type 
##                      0                      0                      0 
##         is_promise_met            star_rating              gate_code 
##                      0                      0                      0 
##           res_com_flag              is_po_box              is_campus 
##                      0                      0                      0 
##   is_freight_forwarder              is_street                 is_apt 
##                      0                      0                      0 
##          package_class                Success 
##                      0                      0

# Carry data_clean forward under the name dataR and print per-column summaries.
dataR <- data_clean
summary(dataR)

##  access_code da_affinity_index.V1  value_of_goods.V1   item_quantity    
##  0:63550     Min.   :-0.739981    Min.   :-1.4532204   Min.   :  1.000  
##  1: 9621     1st Qu.:-0.739981    1st Qu.:-0.8181835   1st Qu.:  1.000  
##              Median :-0.638170    Median :-0.3182466   Median :  1.000  
##              Mean   : 0.000000    Mean   : 0.0000000   Mean   :  2.243  
##              3rd Qu.: 0.763013    3rd Qu.: 0.7117237   3rd Qu.:  3.000  
##              Max.   : 3.429885    Max.   : 1.7627616   Max.   :100.000  
##                                                                         
##               ship_method    geocode_quality.V1    pkg_ship_cost.V1  
##  AMZL_US_BULK       : 3635   Min.   :-0.164574   Min.   :-1.0829243  
##  AMZL_US_KEY        :   11   1st Qu.:-0.164574   1st Qu.:-0.8416553  
##  AMZL_US_LMA        :10285   Median :-0.164574   Median :-0.3869559  
##  AMZL_US_LMA_AIR    : 1248   Mean   : 0.000000   Mean   : 0.0000000  
##  AMZL_US_PREMIUM    :49519   3rd Qu.:-0.164574   3rd Qu.: 0.9121851  
##  AMZL_US_PREMIUM_AIR: 7909   Max.   : 6.117155   Max.   : 1.6731105  
##  AMZL_US_STD        :  564                                           
##     pkg_width.V1        pkg_length.V1       pkg_height.V1    
##  Min.   :-3.228429   Min.   :-3.817250   Min.   :-1.6539987  
##  1st Qu.:-1.025677   1st Qu.:-1.031912   1st Qu.:-0.6592924  
##  Median :-0.057114   Median :-0.140604   Median : 0.0038452  
##  Mean   : 0.000000   Mean   : 0.000000   Mean   : 0.0000000  
##  3rd Qu.: 0.755960   3rd Qu.: 0.555730   3rd Qu.: 0.6669827  
##  Max.   : 2.548287   Max.   : 2.018032   Max.   : 2.3248266  
##                                                              
##  pkg_scale_weight.V1 shipment_contents_type is_promise_met  star_rating   
##  Min.   :-0.734786   Min.   :1.000e+00      N: 8494        Min.   :1.900  
##  1st Qu.:-0.551183   1st Qu.:2.560e+02      Y:64677        1st Qu.:5.000  
##  Median :-0.348846   Median :2.560e+02                     Median :5.000  
##  Mean   : 0.000000   Mean   :1.369e+07                     Mean   :4.862  
##  3rd Qu.: 0.095173   3rd Qu.:2.560e+02                     3rd Qu.:5.000  
##  Max.   : 5.061809   Max.   :1.074e+09                     Max.   :5.000  
##                                                                           
##  gate_code res_com_flag is_po_box is_campus is_freight_forwarder is_street
##  0:66183   C:13208      0:73078   0:72609   0:72362              0:30094  
##  1: 6988   R:59963      1:   93   1:  562   1:  809              1:43077  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##  is_apt    package_class     Success      
##  0:39916   BPM   :  650   Min.   :0.0000  
##  1:33255   PARCEL:53994   1st Qu.:0.0000  
##            STD   :18527   Median :0.0000  
##                           Mean   :0.2524  
##                           3rd Qu.:1.0000  
##                           Max.   :1.0000  
##

# names(dataR)
#Correlation Plot
library(corrplot)

## corrplot 0.84 loaded

# Upper-triangle correlation plot, excluding the listed column positions.
# NOTE(review): `Shipment` is not defined in the visible part of this script
# (the raw file is read into `data`) -- confirm it exists before this call.
corr_plot <- corrplot(
  cor(Shipment[, -c(1, 5, 6, 12, 13, 15:22)]),
  type = "upper",
  method = "circle"
)

# str(dataR)
# geocode_quality as stored in dataR (standardised) ...
summary(dataR$geocode_quality)

##        V1         
##  Min.   :-0.1646  
##  1st Qu.:-0.1646  
##  Median :-0.1646  
##  Mean   : 0.0000  
##  3rd Qu.:-0.1646  
##  Max.   : 6.1172

# ... versus the same column in the raw `data` frame.
summary(data$geocode_quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.054   1.000  10.000

#dataR=data.frame(dataR[,-c(6)])
# Model 1: logistic regression of Success on every other column of datatrain.
datatrain <- dataR
ft <- glm(Success ~ ., family = "binomial", data = datatrain)

summary(ft)

## 
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4751  -0.7326  -0.5945   0.6550   2.3561  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     1.688e+00  1.536e-01  10.993  < 2e-16 ***
## access_code1                   -2.117e-01  3.881e-02  -5.453 4.96e-08 ***
## da_affinity_index              -1.132e-03  9.065e-03  -0.125  0.90064    
## value_of_goods                  7.783e-03  1.144e-02   0.680  0.49621    
## item_quantity                   1.202e-02  4.184e-03   2.872  0.00407 ** 
## ship_methodAMZL_US_KEY          2.753e-01  6.775e-01   0.406  0.68445    
## ship_methodAMZL_US_LMA         -1.364e+00  5.715e-02 -23.862  < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR     -1.707e+00  8.549e-02 -19.968  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM     -5.876e-01  4.508e-02 -13.035  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.889e-01  5.951e-02 -16.616  < 2e-16 ***
## ship_methodAMZL_US_STD         -6.748e-01  1.129e-01  -5.978 2.26e-09 ***
## geocode_quality                 1.495e-01  7.965e-03  18.773  < 2e-16 ***
## pkg_ship_cost                   3.460e-01  1.242e-02  27.853  < 2e-16 ***
## pkg_width                      -3.059e-02  1.858e-02  -1.647  0.09959 .  
## pkg_length                     -5.676e-03  1.977e-02  -0.287  0.77403    
## pkg_height                     -1.174e-02  1.410e-02  -0.832  0.40519    
## pkg_scale_weight               -4.126e-02  1.339e-02  -3.081  0.00206 ** 
## shipment_contents_type         -5.394e-10  1.136e-10  -4.747 2.07e-06 ***
## is_promise_metY                -1.275e+00  2.464e-02 -51.756  < 2e-16 ***
## star_rating                    -6.260e-02  2.036e-02  -3.075  0.00210 ** 
## gate_code1                      8.087e-01  4.182e-02  19.340  < 2e-16 ***
## res_com_flagR                  -1.286e+00  5.670e-02 -22.687  < 2e-16 ***
## is_po_box1                     -1.357e-01  2.367e-01  -0.573  0.56657    
## is_campus1                     -3.599e-02  9.828e-02  -0.366  0.71419    
## is_freight_forwarder1           9.584e-02  8.074e-02   1.187  0.23520    
## is_street1                     -3.314e-01  2.406e-02 -13.772  < 2e-16 ***
## is_apt1                         3.770e-03  2.396e-02   0.157  0.87497    
## package_classPARCEL             4.575e-01  1.102e-01   4.153 3.28e-05 ***
## package_classSTD                5.153e-01  1.098e-01   4.693 2.70e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82670  on 73170  degrees of freedom
## Residual deviance: 75679  on 73142  degrees of freedom
## AIC: 75737
## 
## Number of Fisher Scoring iterations: 4

# Sequential analysis of deviance with chi-square p-values.
# The test has to be requested by name: a bare string in second position is
# absorbed by `...` (extra model objects) and ignored, so no test is run, and
# the valid value is "Chisq". The table gains a Pr(>Chi) column.
anova(ft, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Success
## 
## Terms added sequentially (first to last)
## 
## 
##                        Df Deviance Resid. Df Resid. Dev
## NULL                                   73170      82670
## access_code             1   212.11     73169      82457
## da_affinity_index       1     0.04     73168      82457
## value_of_goods          1    51.47     73167      82406
## item_quantity           1    24.96     73166      82381
## ship_method             6  1188.15     73160      81193
## geocode_quality         1   447.61     73159      80745
## pkg_ship_cost           1   746.49     73158      79999
## pkg_width               1    72.09     73157      79927
## pkg_length              1     3.84     73156      79923
## pkg_height              1    12.53     73155      79910
## pkg_scale_weight        1    13.02     73154      79897
## shipment_contents_type  1    17.27     73153      79880
## is_promise_met          1  2943.06     73152      76937
## star_rating             1    39.72     73151      76897
## gate_code               1   484.18     73150      76413
## res_com_flag            1   401.03     73149      76012
## is_po_box               1     0.53     73148      76011
## is_campus               1     0.12     73147      76011
## is_freight_forwarder    1     0.44     73146      76011
## is_street               1   305.16     73145      75706
## is_apt                  1     0.01     73144      75706
## package_class           2    26.79     73142      75679

# Compact print of the fitted Model 1 (call, coefficients, deviances).
print(ft)

## 
## Call:  glm(formula = Success ~ ., family = "binomial", data = datatrain)
## 
## Coefficients:
##                    (Intercept)                    access_code1  
##                      1.688e+00                      -2.117e-01  
##              da_affinity_index                  value_of_goods  
##                     -1.132e-03                       7.783e-03  
##                  item_quantity          ship_methodAMZL_US_KEY  
##                      1.202e-02                       2.753e-01  
##         ship_methodAMZL_US_LMA      ship_methodAMZL_US_LMA_AIR  
##                     -1.364e+00                      -1.707e+00  
##     ship_methodAMZL_US_PREMIUM  ship_methodAMZL_US_PREMIUM_AIR  
##                     -5.876e-01                      -9.889e-01  
##         ship_methodAMZL_US_STD                 geocode_quality  
##                     -6.748e-01                       1.495e-01  
##                  pkg_ship_cost                       pkg_width  
##                      3.460e-01                      -3.059e-02  
##                     pkg_length                      pkg_height  
##                     -5.676e-03                      -1.174e-02  
##               pkg_scale_weight          shipment_contents_type  
##                     -4.126e-02                      -5.394e-10  
##                is_promise_metY                     star_rating  
##                     -1.275e+00                      -6.260e-02  
##                     gate_code1                   res_com_flagR  
##                      8.087e-01                      -1.286e+00  
##                     is_po_box1                      is_campus1  
##                     -1.357e-01                      -3.599e-02  
##          is_freight_forwarder1                      is_street1  
##                      9.584e-02                      -3.314e-01  
##                        is_apt1             package_classPARCEL  
##                      3.770e-03                       4.575e-01  
##               package_classSTD  
##                      5.153e-01  
## 
## Degrees of Freedom: 73170 Total (i.e. Null);  73142 Residual
## Null Deviance:       82670 
## Residual Deviance: 75680     AIC: 75740

# datatrain$pkg_scaler_weight=datatrain$pkg_outlier_weight
# datatest$pkg_scaler_weight=datatest$pkg_outlier_weight
# Score the test set with Model 1: predicted probability that Success == 1.
# `type` is spelled out in full rather than relying on partial matching.
p <- predict(ft, newdata = datatest, type = "response")
library(ROCR)
pred <- prediction(p, datatest$Success)

# ROC curve with the cutoffs annotated every 0.1.
# performance() is namespace-qualified throughout so every call resolves to ROCR.
pref <- ROCR::performance(pred, "tpr", "fpr")

plot(pref, print.cutoffs.at = seq(0, 1, .1))

# Accuracy across cutoffs.
pref1 <- ROCR::performance(pred, "acc")
plot(pref1)

# Lift chart.
pref2 <- ROCR::performance(pred, "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
# Classify as 1 when the predicted probability reaches the 0.3 cutoff.
cq <- ifelse(p >= .3, 1, 0)
#table(datatest$Success,cq)
# Confusion matrix: rows = actual Success, columns = predicted class.
a <- table(datatest$Success, cq)
a

##    cq
##         0     1
##   0 19438  4007
##   1  4598  3316

# Share of correct predictions, in percent.
Accuracy <- (a[2, 2] + a[1, 1]) / (a[2, 1] + a[2, 2] + a[1, 1] + a[1, 2]) * 100
Accuracy

## [1] 72.55971

# caret's table methods expect predictions in rows and the reference in
# columns, and treat the first level ("0") as the event unless told otherwise.
# `a` has actuals in rows, so it is transposed and Success == 1 is named
# explicitly as the event; otherwise the three figures are mislabeled.
sensitivity(t(a), positive = "1")

## [1] 0.4190043

specificity(t(a), negative = "0")

## [1] 0.8290894

precision(t(a), relevant = "1")

## [1] 0.4528199

######################################## Model 2 ####################################
# Model 2: refit without da_affinity_index (2), is_po_box (17), is_campus (18),
# is_freight_forwarder (19) and is_apt (21) -- column positions in datatrain.
ft1 <- glm(Success ~ ., family = "binomial", data = datatrain[, -c(2, 17, 18, 19, 21)])

summary(ft1)

## 
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain[, 
##     -c(2, 17, 18, 19, 21)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4753  -0.7326  -0.5946   0.6586   2.3559  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     1.686e+00  1.535e-01  10.982  < 2e-16 ***
## access_code1                   -2.115e-01  3.879e-02  -5.453 4.95e-08 ***
## value_of_goods                  7.865e-03  1.143e-02   0.688  0.49147    
## item_quantity                   1.197e-02  4.184e-03   2.860  0.00424 ** 
## ship_methodAMZL_US_KEY          2.737e-01  6.774e-01   0.404  0.68619    
## ship_methodAMZL_US_LMA         -1.360e+00  5.673e-02 -23.977  < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR     -1.704e+00  8.531e-02 -19.968  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM     -5.883e-01  4.497e-02 -13.083  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.899e-01  5.943e-02 -16.658  < 2e-16 ***
## ship_methodAMZL_US_STD         -6.753e-01  1.128e-01  -5.986 2.15e-09 ***
## geocode_quality                 1.492e-01  7.958e-03  18.750  < 2e-16 ***
## pkg_ship_cost                   3.461e-01  1.242e-02  27.865  < 2e-16 ***
## pkg_width                      -3.077e-02  1.858e-02  -1.656  0.09768 .  
## pkg_length                     -5.559e-03  1.977e-02  -0.281  0.77856    
## pkg_height                     -1.169e-02  1.410e-02  -0.829  0.40729    
## pkg_scale_weight               -4.143e-02  1.339e-02  -3.095  0.00197 ** 
## shipment_contents_type         -5.405e-10  1.136e-10  -4.756 1.98e-06 ***
## is_promise_metY                -1.276e+00  2.463e-02 -51.784  < 2e-16 ***
## star_rating                    -6.164e-02  1.985e-02  -3.105  0.00190 ** 
## gate_code1                      8.087e-01  4.178e-02  19.359  < 2e-16 ***
## res_com_flagR                  -1.286e+00  5.649e-02 -22.760  < 2e-16 ***
## is_street1                     -3.330e-01  1.905e-02 -17.480  < 2e-16 ***
## package_classPARCEL             4.587e-01  1.102e-01   4.163 3.14e-05 ***
## package_classSTD                5.165e-01  1.098e-01   4.703 2.56e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82670  on 73170  degrees of freedom
## Residual deviance: 75681  on 73147  degrees of freedom
## AIC: 75729
## 
## Number of Fisher Scoring iterations: 4

# Sequential analysis of deviance with chi-square p-values.
# The test has to be requested by name: a bare string in second position is
# absorbed by `...` (extra model objects) and ignored, so no test is run, and
# the valid value is "Chisq". The table gains a Pr(>Chi) column.
anova(ft1, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Success
## 
## Terms added sequentially (first to last)
## 
## 
##                        Df Deviance Resid. Df Resid. Dev
## NULL                                   73170      82670
## access_code             1   212.11     73169      82457
## value_of_goods          1    51.49     73168      82406
## item_quantity           1    24.96     73167      82381
## ship_method             6  1186.85     73161      81194
## geocode_quality         1   448.01     73160      80746
## pkg_ship_cost           1   746.66     73159      79999
## pkg_width               1    72.10     73158      79927
## pkg_length              1     3.83     73157      79924
## pkg_height              1    12.52     73156      79911
## pkg_scale_weight        1    13.04     73155      79898
## shipment_contents_type  1    17.24     73154      79881
## is_promise_met          1  2943.84     73153      76937
## star_rating             1    39.71     73152      76897
## gate_code               1   484.19     73151      76413
## res_com_flag            1   400.78     73150      76012
## is_street               1   304.37     73149      75708
## package_class           2    26.91     73147      75681

#datatest$pkg_outlier_weight=datatest$pkg_scale_weight
# Score the test set with Model 2: predicted probability that Success == 1.
# `type` is spelled out in full rather than relying on partial matching.
p <- predict(ft1, newdata = datatest, type = "response")
library(ROCR)
pred <- prediction(p, datatest$Success)

# ROC curve with the cutoffs annotated every 0.1.
# performance() is namespace-qualified throughout so every call resolves to ROCR.
pref <- ROCR::performance(pred, "tpr", "fpr")

plot(pref, print.cutoffs.at = seq(0, 1, .1))

# Accuracy across cutoffs.
pref1 <- ROCR::performance(pred, "acc")
plot(pref1)

# Lift chart.
pref2 <- ROCR::performance(pred, "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
# Classify as 1 when the predicted probability reaches the 0.3 cutoff.
cq <- ifelse(p >= .3, 1, 0)
#table(datatest$Success,cq)
# Confusion matrix: rows = actual Success, columns = predicted class.
a <- table(datatest$Success, cq)
a

##    cq
##         0     1
##   0 19455  3990
##   1  4608  3306

# Share of correct predictions, in percent.
Accuracy <- (a[2, 2] + a[1, 1]) / (a[2, 1] + a[2, 2] + a[1, 1] + a[1, 2]) * 100
Accuracy

## [1] 72.58203

# caret's table methods expect predictions in rows and the reference in
# columns, and treat the first level ("0") as the event unless told otherwise.
# `a` has actuals in rows, so it is transposed and Success == 1 is named
# explicitly as the event; otherwise the three figures are mislabeled.
sensitivity(t(a), positive = "1")

## [1] 0.4177407

specificity(t(a), negative = "0")

## [1] 0.8298145

precision(t(a), relevant = "1")

## [1] 0.453125

####################################Model 3 ###################################

# Model 3: Model 2's exclusions plus pkg_length (9), pkg_height (10) and
# pkg_scale_weight (11) -- column positions in datatrain.
ft2 <- glm(
  Success ~ .,
  family = "binomial",
  data = datatrain[, -c(2, 9, 10, 11, 17, 18, 19, 21)]
)

summary(ft2)

## 
## Call:
## glm(formula = Success ~ ., family = "binomial", data = datatrain[, 
##     -c(2, 9, 10, 11, 17, 18, 19, 21)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4588  -0.7328  -0.5958   0.6564   2.3504  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     1.702e+00  1.529e-01  11.128  < 2e-16 ***
## access_code1                   -2.098e-01  3.879e-02  -5.410 6.32e-08 ***
## value_of_goods                  1.875e-04  1.112e-02   0.017  0.98654    
## item_quantity                   9.693e-03  4.142e-03   2.340  0.01927 *  
## ship_methodAMZL_US_KEY          2.850e-01  6.773e-01   0.421  0.67395    
## ship_methodAMZL_US_LMA         -1.359e+00  5.673e-02 -23.953  < 2e-16 ***
## ship_methodAMZL_US_LMA_AIR     -1.674e+00  8.499e-02 -19.691  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM     -5.816e-01  4.493e-02 -12.944  < 2e-16 ***
## ship_methodAMZL_US_PREMIUM_AIR -9.539e-01  5.876e-02 -16.235  < 2e-16 ***
## ship_methodAMZL_US_STD         -6.887e-01  1.126e-01  -6.116 9.57e-10 ***
## geocode_quality                 1.494e-01  7.957e-03  18.775  < 2e-16 ***
## pkg_ship_cost                   3.331e-01  1.200e-02  27.765  < 2e-16 ***
## pkg_width                      -5.494e-02  1.038e-02  -5.291 1.21e-07 ***
## shipment_contents_type         -6.154e-10  1.121e-10  -5.490 4.02e-08 ***
## is_promise_metY                -1.276e+00  2.463e-02 -51.794  < 2e-16 ***
## star_rating                    -6.326e-02  1.984e-02  -3.189  0.00143 ** 
## gate_code1                      8.074e-01  4.177e-02  19.328  < 2e-16 ***
## res_com_flagR                  -1.288e+00  5.648e-02 -22.810  < 2e-16 ***
## is_street1                     -3.322e-01  1.905e-02 -17.441  < 2e-16 ***
## package_classPARCEL             4.451e-01  1.093e-01   4.074 4.63e-05 ***
## package_classSTD                5.211e-01  1.096e-01   4.754 2.00e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82670  on 73170  degrees of freedom
## Residual deviance: 75697  on 73150  degrees of freedom
## AIC: 75739
## 
## Number of Fisher Scoring iterations: 4

# Sequential analysis of deviance with chi-square p-values.
# The test has to be requested by name: a bare string in second position is
# absorbed by `...` (extra model objects) and ignored, so no test is run, and
# the valid value is "Chisq". The table gains a Pr(>Chi) column.
anova(ft2, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Success
## 
## Terms added sequentially (first to last)
## 
## 
##                        Df Deviance Resid. Df Resid. Dev
## NULL                                   73170      82670
## access_code             1   212.11     73169      82457
## value_of_goods          1    51.49     73168      82406
## item_quantity           1    24.96     73167      82381
## ship_method             6  1186.85     73161      81194
## geocode_quality         1   448.01     73160      80746
## pkg_ship_cost           1   746.66     73159      79999
## pkg_width               1    72.10     73158      79927
## shipment_contents_type  1    24.93     73157      79902
## is_promise_met          1  2944.02     73156      76958
## star_rating             1    40.95     73155      76917
## gate_code               1   482.69     73154      76435
## res_com_flag            1   403.50     73153      76031
## is_street               1   303.21     73152      75728
## package_class           2    30.82     73150      75697

#datatest$pkg_outlier_weight=datatest$pkg_scale_weight
# Score the test set with Model 3: predicted probability that Success == 1.
# `type` is spelled out in full rather than relying on partial matching.
p <- predict(ft2, newdata = datatest, type = "response")
library(ROCR)
pred <- prediction(p, datatest$Success)

# ROC curve with the cutoffs annotated every 0.1.
# performance() is namespace-qualified throughout so every call resolves to ROCR.
pref <- ROCR::performance(pred, "tpr", "fpr")

plot(pref, print.cutoffs.at = seq(0, 1, .1))

# Accuracy across cutoffs.
pref1 <- ROCR::performance(pred, "acc")
plot(pref1)

# Lift chart.
pref2 <- ROCR::performance(pred, "lift")
plot(pref2)

#View(cbind(datatest$Success,p))
# Classify as 1 when the predicted probability reaches the 0.3 cutoff.
cq <- ifelse(p >= .3, 1, 0)
#table(datatest$Success,cq)
# Confusion matrix: rows = actual Success, columns = predicted class.
a <- table(datatest$Success, cq)
a

##    cq
##         0     1
##   0 19452  3993
##   1  4605  3309

# Share of correct predictions, in percent.
Accuracy <- (a[2, 2] + a[1, 1]) / (a[2, 1] + a[2, 2] + a[1, 1] + a[1, 2]) * 100
Accuracy

## [1] 72.58203

# caret's table methods expect predictions in rows and the reference in
# columns, and treat the first level ("0") as the event unless told otherwise.
# `a` has actuals in rows, so it is transposed and Success == 1 is named
# explicitly as the event; otherwise the three figures are mislabeled.
sensitivity(t(a), positive = "1")

## [1] 0.4181198

specificity(t(a), negative = "0")

## [1] 0.8296865

precision(t(a), relevant = "1")

## [1] 0.4531635

#####################################################################################################
######################                LASSO Regularization                 ####################################
#####################################################################################################


library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-13

# Predictor matrix for glmnet, built from every column except Success.
# NOTE(review): model.matrix() keeps an "(Intercept)" column by default and
# glmnet fits its own intercept -- this looks like the source of the second,
# empty "(Intercept)" row in the coef() output; confirm.
x <- model.matrix(Success ~ ., data = datatrain)
# Response vector.
y <- datatrain$Success
# Cross-validated search over lambda.
# family = "binomial" => logistic regression, alpha = 1 => lasso penalty;
# see the cv.glmnet docs for other type.measure options.
cv.out <- cv.glmnet(x, y, family = "binomial", alpha = 1, type.measure = "mse")
# Cross-validation curve.
plot(cv.out)

summary(cv.out)

##            Length Class  Mode     
## lambda     67     -none- numeric  
## cvm        67     -none- numeric  
## cvsd       67     -none- numeric  
## cvup       67     -none- numeric  
## cvlo       67     -none- numeric  
## nzero      67     -none- numeric  
## name        1     -none- character
## glmnet.fit 13     lognet list     
## lambda.min  1     -none- numeric  
## lambda.1se  1     -none- numeric

# lambda.min: the lambda with the lowest cross-validated error.
lambda_min <- cv.out$lambda.min
# lambda.1se: the largest lambda whose error is within one standard error of
# that minimum (a sparser model); this is the value used below.
lambda_1se <- cv.out$lambda.1se
# Coefficients at lambda.1se ("." marks a coefficient shrunk to zero).
coef(cv.out, s = lambda_1se)

## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                            1
## (Intercept)                     1.266378e+00
## (Intercept)                     .           
## access_code1                   -4.679346e-03
## da_affinity_index               .           
## value_of_goods                  .           
## item_quantity                   2.972170e-03
## ship_methodAMZL_US_KEY          .           
## ship_methodAMZL_US_LMA         -6.129354e-01
## ship_methodAMZL_US_LMA_AIR     -6.988854e-01
## ship_methodAMZL_US_PREMIUM     -1.405857e-01
## ship_methodAMZL_US_PREMIUM_AIR -2.871593e-01
## ship_methodAMZL_US_STD          .           
## geocode_quality                 1.331367e-01
## pkg_ship_cost                   2.242150e-01
## pkg_width                      -2.133735e-02
## pkg_length                      .           
## pkg_height                      .           
## pkg_scale_weight               -7.978273e-03
## shipment_contents_type         -2.956555e-10
## is_promise_metY                -1.224937e+00
## star_rating                    -3.312618e-02
## gate_code1                      6.309146e-01
## res_com_flagR                  -1.046855e+00
## is_po_box1                      .           
## is_campus1                      .           
## is_freight_forwarder1           .           
## is_street1                     -3.001632e-01
## is_apt1                         .           
## package_classPARCEL             .           
## package_classSTD                3.654807e-02

# Test-set predictor matrix, built with the same formula as the training matrix.
datatest1 <- model.matrix(Success ~ ., data = datatest)
# Predicted probabilities (type = "response", not class labels) at lambda.1se.
lasso_prob <- predict(cv.out, newx = datatest1, s = lambda_1se, type = "response")
# Label an observation 1 only when its probability exceeds 0.8; all others stay 0.
# NOTE(review): the glm models above use a 0.3 cutoff -- confirm 0.8 is intended.
lasso_predict <- replace(rep(0, nrow(datatest)), lasso_prob > .8, 1)
# Confusion matrix: rows = predicted, columns = actual.
table(pred = lasso_predict, true = datatest$Success)

##     true
## pred     0     1
##    0 23434  7878
##    1    11    36

lasso_predict[lasso_prob]

## numeric(0)

# Overall accuracy: share of hold-out rows whose thresholded LASSO label
# matches the observed Success value.
mean(datatest$Success == lasso_predict)

## [1] 0.7484295

# The same cross-tabulation with the axes swapped: observed Success in rows,
# thresholded LASSO label in columns. The dimension names are spelled out so
# the printout keeps an unnamed row header and a "lasso_predict" column header.
table(datatest$Success, lasso_predict, dnn = c("", "lasso_predict"))

##    lasso_predict
##         0     1
##   0 23434    11
##   1  7878    36

# ROC curve for the LASSO model on the hold-out data.
#
# The curve is built from the continuous scores in lasso_prob, not from the
# 0/1 labels in lasso_predict: labels that were already cut at a single
# threshold give a curve with only one interior point, which makes the cutoff
# annotations requested in plot() meaningless.
#
# prediction() is called through its namespace because, in the visible part of
# this script, library(ROCR) is attached only further down.
pred1 <- ROCR::prediction(as.vector(lasso_prob), datatest$Success)

# True-positive rate against false-positive rate across all score cutoffs.
pref3 <- ROCR::performance(pred1, "tpr", "fpr")

# Label the curve at cutoffs 0, 0.1, ..., 1.
plot(pref3, print.cutoffs.at = seq(0, 1, 0.1))

############# Removing columns whose coefficients were 0 in LASSO regularization #############

#####################################################################################################
######################                CART                 ####################################
#####################################################################################################
library(rpart)
library(rpart.plot)
library(ROCR)
# Fit a classification tree of Success on every other column of datatrain,
# leaving rpart's control settings at their defaults, then print the
# node-by-node summary.
cart <- rpart(formula = Success ~ ., data = datatrain, method = "class")
summary(cart)

## Call:
## rpart(formula = Success ~ ., data = datatrain, method = "class")
##   n= 73171 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.08932214      0 1.0000000 1.0000000 0.006363167
## 2 0.04375846      3 0.7320336 0.7320336 0.005685140
## 3 0.02312483      4 0.6882751 0.6882751 0.005549810
## 4 0.01000000      5 0.6651503 0.6651503 0.005475013
## 
## Variable importance
## da_affinity_index    is_promise_met      res_com_flag       ship_method 
##                68                17                 5                 4 
##         is_street            is_apt 
##                 3                 2 
## 
## Node number 1: 73171 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.2523541  P(node) =1
##     class counts: 54706 18465
##    probabilities: 0.748 0.252 
##   left son=2 (64677 obs) right son=3 (8494 obs)
##   Primary splits:
##       is_promise_met    splits as  RL, improve=1455.4990, (0 missing)
##       da_affinity_index < -0.03456654 to the left,  improve= 648.6074, (0 missing)
##       res_com_flag      splits as  RL, improve= 495.7095, (0 missing)
##       ship_method       splits as  RRRRLLL, improve= 400.1945, (0 missing)
##       pkg_ship_cost     < 1.009621    to the left,  improve= 356.3170, (0 missing)
## 
## Node number 2: 64677 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.2162129  P(node) =0.8839158
##     class counts: 50693 13984
##    probabilities: 0.784 0.216 
##   left son=4 (37785 obs) right son=5 (26892 obs)
##   Primary splits:
##       da_affinity_index < -0.03456654 to the left,  improve=728.7503, (0 missing)
##       pkg_ship_cost     < 1.009621    to the left,  improve=428.5726, (0 missing)
##       ship_method       splits as  RRRRLRR, improve=207.1263, (0 missing)
##       gate_code         splits as  LR, improve=186.1909, (0 missing)
##       res_com_flag      splits as  RL, improve=150.3159, (0 missing)
##   Surrogate splits:
##       ship_method      splits as  LRLLLLR, agree=0.585, adj=0.002, (0 split)
##       pkg_width        < -2.936888   to the right, agree=0.584, adj=0.000, (0 split)
##       item_quantity    < 36.5        to the left,  agree=0.584, adj=0.000, (0 split)
##       pkg_length       < -3.343742   to the right, agree=0.584, adj=0.000, (0 split)
##       pkg_scale_weight < -0.7282287  to the right, agree=0.584, adj=0.000, (0 split)
## 
## Node number 3: 8494 observations,    complexity param=0.04375846
##   predicted class=1  expected loss=0.4724511  P(node) =0.1160842
##     class counts:  4013  4481
##    probabilities: 0.472 0.528 
##   left son=6 (6198 obs) right son=7 (2296 obs)
##   Primary splits:
##       res_com_flag    splits as  RL, improve=394.34240, (0 missing)
##       ship_method     splits as  LLRRLLL, improve=280.74210, (0 missing)
##       gate_code       splits as  LR, improve=116.83970, (0 missing)
##       is_street       splits as  RL, improve= 95.83463, (0 missing)
##       geocode_quality < 1.405858    to the left,  improve= 85.44774, (0 missing)
##   Surrogate splits:
##       ship_method          splits as  LLRRLLL, agree=0.951, adj=0.820, (0 split)
##       is_freight_forwarder splits as  LR, agree=0.743, adj=0.049, (0 split)
##       is_campus            splits as  LR, agree=0.737, adj=0.026, (0 split)
##       item_quantity        < 8.5         to the left,  agree=0.734, adj=0.016, (0 split)
##       pkg_scale_weight     < 4.461354    to the left,  agree=0.734, adj=0.016, (0 split)
## 
## Node number 4: 37785 observations
##   predicted class=0  expected loss=0.1528914  P(node) =0.5163931
##     class counts: 32008  5777
##    probabilities: 0.847 0.153 
## 
## Node number 5: 26892 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.3051837  P(node) =0.3675227
##     class counts: 18685  8207
##    probabilities: 0.695 0.305 
##   left son=10 (21736 obs) right son=11 (5156 obs)
##   Primary splits:
##       da_affinity_index < 0.04859362  to the right, improve=5051.8310, (0 missing)
##       pkg_ship_cost     < 1.000341    to the left,  improve= 216.0812, (0 missing)
##       ship_method       splits as  RRRRLRR, improve= 181.2585, (0 missing)
##       gate_code         splits as  LR, improve= 113.0765, (0 missing)
##       res_com_flag      splits as  RL, improve= 102.7381, (0 missing)
##   Surrogate splits:
##       ship_method splits as  LRLLLLL, agree=0.808, adj=0.001, (0 split)
##       pkg_length  < -2.076414   to the right, agree=0.808, adj=0.001, (0 split)
##       pkg_width   < -3.163642   to the right, agree=0.808, adj=0.000, (0 split)
## 
## Node number 6: 6198 observations,    complexity param=0.02312483
##   predicted class=0  expected loss=0.4348177  P(node) =0.08470569
##     class counts:  3503  2695
##    probabilities: 0.565 0.435 
##   left son=12 (2921 obs) right son=13 (3277 obs)
##   Primary splits:
##       is_street       splits as  RL, improve=236.23140, (0 missing)
##       gate_code       splits as  LR, improve=146.55410, (0 missing)
##       pkg_ship_cost   < -0.6235852  to the right, improve= 71.37108, (0 missing)
##       geocode_quality < 1.405858    to the left,  improve= 63.61190, (0 missing)
##       is_apt          splits as  LR, improve= 46.27264, (0 missing)
##   Surrogate splits:
##       is_apt          splits as  LR, agree=0.820, adj=0.619, (0 split)
##       gate_code       splits as  LR, agree=0.567, adj=0.082, (0 split)
##       access_code     splits as  LR, agree=0.558, adj=0.061, (0 split)
##       pkg_ship_cost   < 0.05382405  to the right, agree=0.543, adj=0.031, (0 split)
##       geocode_quality < 1.405858    to the right, agree=0.543, adj=0.030, (0 split)
## 
## Node number 7: 2296 observations
##   predicted class=1  expected loss=0.2221254  P(node) =0.03137855
##     class counts:   510  1786
##    probabilities: 0.222 0.778 
## 
## Node number 10: 21736 observations
##   predicted class=0  expected loss=0.1559165  P(node) =0.2970576
##     class counts: 18347  3389
##    probabilities: 0.844 0.156 
## 
## Node number 11: 5156 observations
##   predicted class=1  expected loss=0.06555469  P(node) =0.07046507
##     class counts:   338  4818
##    probabilities: 0.066 0.934 
## 
## Node number 12: 2921 observations
##   predicted class=0  expected loss=0.2885998  P(node) =0.03992019
##     class counts:  2078   843
##    probabilities: 0.711 0.289 
## 
## Node number 13: 3277 observations
##   predicted class=1  expected loss=0.4348489  P(node) =0.0447855
##     class counts:  1425  1852
##    probabilities: 0.435 0.565

# Draw the fitted tree.
rpart.plot(
  x = cart,
  type = 2,
  extra = 103,
  under = TRUE,
  fallen.leaves = TRUE,
  digits = 2,
  faclen = 0,
  cex = NULL,
  tweak = 1,
  snip = FALSE,
  box.palette = "GnRd",
  shadow.col = 0
)

# Training confusion table: observed Success in rows, the tree's predicted
# class in columns.
a <- table(
  datatrain$Success,
  predict(cart, newdata = datatrain, type = "class")
)
a

##    
##         0     1
##   0 52433  2273
##   1 10009  8456

# Training accuracy in percent: correctly classified rows (the diagonal of the
# confusion table) over all rows.
Accuracy <- sum(diag(a)) / sum(a) * 100
Accuracy

## [1] 83.21466

# `a` has the observed class in rows and the predicted class in columns.
# caret's table methods expect the opposite layout (predictions in rows,
# reference in columns) and, by default, treat the first level ("0") as the
# class of interest. Passing `a` unchanged therefore printed differently-named
# quantities under these three labels. The table is transposed and class "1"
# is named explicitly, matching the TPR calculation later in this script,
# which also treats class 1 as the positive class.
caret::sensitivity(t(a), positive = "1")

## [1] 0.4579475

caret::specificity(t(a), negative = "0")

## [1] 0.9584506

caret::precision(t(a), relevant = "1")

## [1] 0.7881443

# Hold-out confusion table: observed Success in rows, the tree's predicted
# class in columns.
b <- table(
  datatest$Success,
  predict(cart, newdata = datatest, type = "class")
)
b

##    
##         0     1
##   0 22452   993
##   1  4339  3575

# Hold-out accuracy in percent: correctly classified rows (the diagonal of the
# confusion table) over all rows.
Accuracy <- sum(diag(b)) / sum(b) * 100
Accuracy

## [1] 82.99691

# `b` has the observed class in rows and the predicted class in columns.
# caret's table methods expect the opposite layout (predictions in rows,
# reference in columns) and, by default, treat the first level ("0") as the
# class of interest. Passing `b` unchanged therefore printed differently-named
# quantities under these three labels. The table is transposed and class "1"
# is named explicitly, matching the TPR calculation later in this script,
# which also treats class 1 as the positive class.
caret::sensitivity(t(b), positive = "1")

## [1] 0.4517311

caret::specificity(t(b), negative = "0")

## [1] 0.9576456

caret::precision(t(b), relevant = "1")

## [1] 0.7826182

#######################################
######    Cross Validation    ######
#######################################
library(caret)
# Work on a copy so the outcome can be converted without touching datatrain.
datatrain_cart <- datatrain

####### Collected cp value
# Make the outcome a factor before handing it to caret::train().
datatrain_cart$Success <- as.factor(datatrain_cart$Success)

# Plain 10-fold cross-validation. The former `repeats = 3` argument was
# dropped: caret ignores it for method = "cv" and only emitted the warning
# "`repeats` has no meaning for this resampling method", so the resampling
# scheme is unchanged. Switch to method = "repeatedcv" if repeated CV is
# actually wanted.
# The object keeps the name `trainControl` in case other parts of the script
# refer to it; because that name shadows the caret function, the constructor is
# called through its namespace.
trainControl <- caret::trainControl(method = "cv", number = 10)

# Fix the seed before resampling so the run is reproducible.
set.seed(7)
fit.cart <- caret::train(
  Success ~ .,
  data = datatrain_cart,
  metric = "Accuracy",
  method = "rpart",
  trControl = trainControl
)
fit.cart

## CART 
## 
## 73171 samples
##    22 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 65854, 65854, 65854, 65853, 65855, 65853, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.02312483  0.8274317  0.4449825
##   0.04375846  0.8182203  0.4439334
##   0.08932214  0.7804600  0.2284839
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02312483.

# Refit the classification tree on datatrain_cart with the complexity
# parameter reported as the final value by the cross-validation run, then
# print the node-by-node summary.
cart <- rpart(
  formula = Success ~ .,
  data = datatrain_cart,
  method = "class",
  cp = 0.02312483
)
summary(cart)

## Call:
## rpart(formula = Success ~ ., data = datatrain_cart, method = "class", 
##     cp = 0.02312483)
##   n= 73171 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.08932214      0 1.0000000 1.0000000 0.006363167
## 2 0.04375846      3 0.7320336 0.7320336 0.005685140
## 3 0.02312483      4 0.6882751 0.6882751 0.005549810
## 4 0.02312483      5 0.6651503 0.6782562 0.005517691
## 
## Variable importance
## da_affinity_index    is_promise_met      res_com_flag       ship_method 
##                68                17                 5                 4 
##         is_street            is_apt 
##                 3                 2 
## 
## Node number 1: 73171 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.2523541  P(node) =1
##     class counts: 54706 18465
##    probabilities: 0.748 0.252 
##   left son=2 (64677 obs) right son=3 (8494 obs)
##   Primary splits:
##       is_promise_met    splits as  RL, improve=1455.4990, (0 missing)
##       da_affinity_index < -0.03456654 to the left,  improve= 648.6074, (0 missing)
##       res_com_flag      splits as  RL, improve= 495.7095, (0 missing)
##       ship_method       splits as  RRRRLLL, improve= 400.1945, (0 missing)
##       pkg_ship_cost     < 1.009621    to the left,  improve= 356.3170, (0 missing)
## 
## Node number 2: 64677 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.2162129  P(node) =0.8839158
##     class counts: 50693 13984
##    probabilities: 0.784 0.216 
##   left son=4 (37785 obs) right son=5 (26892 obs)
##   Primary splits:
##       da_affinity_index < -0.03456654 to the left,  improve=728.7503, (0 missing)
##       pkg_ship_cost     < 1.009621    to the left,  improve=428.5726, (0 missing)
##       ship_method       splits as  RRRRLRR, improve=207.1263, (0 missing)
##       gate_code         splits as  LR, improve=186.1909, (0 missing)
##       res_com_flag      splits as  RL, improve=150.3159, (0 missing)
##   Surrogate splits:
##       ship_method      splits as  LRLLLLR, agree=0.585, adj=0.002, (0 split)
##       pkg_width        < -2.936888   to the right, agree=0.584, adj=0.000, (0 split)
##       item_quantity    < 36.5        to the left,  agree=0.584, adj=0.000, (0 split)
##       pkg_length       < -3.343742   to the right, agree=0.584, adj=0.000, (0 split)
##       pkg_scale_weight < -0.7282287  to the right, agree=0.584, adj=0.000, (0 split)
## 
## Node number 3: 8494 observations,    complexity param=0.04375846
##   predicted class=1  expected loss=0.4724511  P(node) =0.1160842
##     class counts:  4013  4481
##    probabilities: 0.472 0.528 
##   left son=6 (6198 obs) right son=7 (2296 obs)
##   Primary splits:
##       res_com_flag    splits as  RL, improve=394.34240, (0 missing)
##       ship_method     splits as  LLRRLLL, improve=280.74210, (0 missing)
##       gate_code       splits as  LR, improve=116.83970, (0 missing)
##       is_street       splits as  RL, improve= 95.83463, (0 missing)
##       geocode_quality < 1.405858    to the left,  improve= 85.44774, (0 missing)
##   Surrogate splits:
##       ship_method          splits as  LLRRLLL, agree=0.951, adj=0.820, (0 split)
##       is_freight_forwarder splits as  LR, agree=0.743, adj=0.049, (0 split)
##       is_campus            splits as  LR, agree=0.737, adj=0.026, (0 split)
##       item_quantity        < 8.5         to the left,  agree=0.734, adj=0.016, (0 split)
##       pkg_scale_weight     < 4.461354    to the left,  agree=0.734, adj=0.016, (0 split)
## 
## Node number 4: 37785 observations
##   predicted class=0  expected loss=0.1528914  P(node) =0.5163931
##     class counts: 32008  5777
##    probabilities: 0.847 0.153 
## 
## Node number 5: 26892 observations,    complexity param=0.08932214
##   predicted class=0  expected loss=0.3051837  P(node) =0.3675227
##     class counts: 18685  8207
##    probabilities: 0.695 0.305 
##   left son=10 (21736 obs) right son=11 (5156 obs)
##   Primary splits:
##       da_affinity_index < 0.04859362  to the right, improve=5051.8310, (0 missing)
##       pkg_ship_cost     < 1.000341    to the left,  improve= 216.0812, (0 missing)
##       ship_method       splits as  RRRRLRR, improve= 181.2585, (0 missing)
##       gate_code         splits as  LR, improve= 113.0765, (0 missing)
##       res_com_flag      splits as  RL, improve= 102.7381, (0 missing)
##   Surrogate splits:
##       ship_method splits as  LRLLLLL, agree=0.808, adj=0.001, (0 split)
##       pkg_length  < -2.076414   to the right, agree=0.808, adj=0.001, (0 split)
##       pkg_width   < -3.163642   to the right, agree=0.808, adj=0.000, (0 split)
## 
## Node number 6: 6198 observations,    complexity param=0.02312483
##   predicted class=0  expected loss=0.4348177  P(node) =0.08470569
##     class counts:  3503  2695
##    probabilities: 0.565 0.435 
##   left son=12 (2921 obs) right son=13 (3277 obs)
##   Primary splits:
##       is_street       splits as  RL, improve=236.23140, (0 missing)
##       gate_code       splits as  LR, improve=146.55410, (0 missing)
##       pkg_ship_cost   < -0.6235852  to the right, improve= 71.37108, (0 missing)
##       geocode_quality < 1.405858    to the left,  improve= 63.61190, (0 missing)
##       is_apt          splits as  LR, improve= 46.27264, (0 missing)
##   Surrogate splits:
##       is_apt          splits as  LR, agree=0.820, adj=0.619, (0 split)
##       gate_code       splits as  LR, agree=0.567, adj=0.082, (0 split)
##       access_code     splits as  LR, agree=0.558, adj=0.061, (0 split)
##       pkg_ship_cost   < 0.05382405  to the right, agree=0.543, adj=0.031, (0 split)
##       geocode_quality < 1.405858    to the right, agree=0.543, adj=0.030, (0 split)
## 
## Node number 7: 2296 observations
##   predicted class=1  expected loss=0.2221254  P(node) =0.03137855
##     class counts:   510  1786
##    probabilities: 0.222 0.778 
## 
## Node number 10: 21736 observations
##   predicted class=0  expected loss=0.1559165  P(node) =0.2970576
##     class counts: 18347  3389
##    probabilities: 0.844 0.156 
## 
## Node number 11: 5156 observations
##   predicted class=1  expected loss=0.06555469  P(node) =0.07046507
##     class counts:   338  4818
##    probabilities: 0.066 0.934 
## 
## Node number 12: 2921 observations
##   predicted class=0  expected loss=0.2885998  P(node) =0.03992019
##     class counts:  2078   843
##    probabilities: 0.711 0.289 
## 
## Node number 13: 3277 observations
##   predicted class=1  expected loss=0.4348489  P(node) =0.0447855
##     class counts:  1425  1852
##    probabilities: 0.435 0.565

# Draw the refitted tree.
rpart.plot(
  x = cart,
  type = 2,
  extra = 103,
  under = TRUE,
  fallen.leaves = TRUE,
  digits = 2,
  faclen = 0,
  cex = NULL,
  tweak = 1,
  snip = FALSE,
  box.palette = "GnRd",
  shadow.col = 0
)

# Training confusion table: observed Success in rows, the tree's predicted
# class in columns.
a <- table(
  datatrain$Success,
  predict(cart, newdata = datatrain, type = "class")
)
a

##    
##         0     1
##   0 52433  2273
##   1 10009  8456

# Training accuracy in percent: correctly classified rows (the diagonal of the
# confusion table) over all rows.
Accuracy <- sum(diag(a)) / sum(a) * 100
Accuracy

## [1] 83.21466

# `a` has the observed class in rows and the predicted class in columns.
# caret's table methods expect the opposite layout (predictions in rows,
# reference in columns) and, by default, treat the first level ("0") as the
# class of interest. Passing `a` unchanged therefore printed differently-named
# quantities under these three labels. The table is transposed and class "1"
# is named explicitly, matching the TPRTest calculation further down, which
# also treats class 1 as the positive class.
caret::sensitivity(t(a), positive = "1")

## [1] 0.4579475

caret::specificity(t(a), negative = "0")

## [1] 0.9584506

caret::precision(t(a), relevant = "1")

## [1] 0.7881443

# Hold-out confusion table for the refitted tree: observed Success in rows,
# predicted class in columns.
b <- table(
  datatest$Success,
  predict(cart, newdata = datatest, type = "class")
)
b

##    
##         0     1
##   0 22452   993
##   1  4339  3575

# Hold-out accuracy in percent: correctly classified rows (the diagonal of the
# confusion table) over all rows.
Accuracy <- sum(diag(b)) / sum(b) * 100
Accuracy

## [1] 82.99691

# These three statistics belong to the hold-out table `b`; they were previously
# computed on the training table `a`, which merely repeated the training
# figures. `b` has the observed class in rows and the predicted class in
# columns, while caret's table methods expect predictions in rows and the
# reference in columns and default to the first level ("0") as the class of
# interest. The table is therefore transposed and class "1" named explicitly,
# consistent with TPRTest below.
caret::sensitivity(t(b), positive = "1")

## [1] 0.4517311

caret::specificity(t(b), negative = "0")

## [1] 0.9576456

caret::precision(t(b), relevant = "1")

## [1] 0.7826182

# True-positive rate on the hold-out data in percent: correctly predicted
# class-1 rows over all observed class-1 rows (row 2 of `b`).
TPRTest <- b[2, 2] / (b[2, 1] + b[2, 2]) * 100
TPRTest

## [1] 45.17311

Shipment Delivery Failure Prediction

Vyomesh Upadhyay

March 9, 2018