library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.2.5
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.5
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.5
## Loading required package: DBI
library(scales)
library(foreign)
library(nnet)
library(caTools)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
library(gbm)
## Warning: package 'gbm' was built under R version 3.2.5
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
library(rpart)
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.5
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(caret)

setwd('C:\\Users\\rgupta37\\Desktop\\MarketingProject\\Project')
df<-read.csv('./train_1.csv',header = T,stringsAsFactors = F)
df<-df[,-c(11,12,13,14,15,16,17)]

Feature Engineering

#converting AgeUponOutcome from years,months and weeks to days

df$value<-sapply(df$AgeuponOutcome,function(x) strsplit(as.character(x)," ")[[1]][1])
df$unit<-sapply(df$AgeuponOutcome,function(x) strsplit(as.character(x)," ")[[1]][2])
df$unit_c<-0
df$unit_c[df$unit=="year"]<-365
df$unit_c[df$unit=="years"]<-365
df$unit_c[df$unit=="month"]<-30
df$unit_c[df$unit=="months"]<-30
df$unit_c[df$unit=="week"]<-7
df$unit_c[df$unit=="weeks"]<-7




for(i in 1:nrow(df)){
  df$age[i]<-as.numeric(df[i,"value"])*(as.numeric(df[i,"unit_c"]))
}




df$Breed_1<-sapply(df$Breed,function(x) strsplit(x,"/")[[1]][1])


df[grep("Mix",df$Breed_1),"mix_pure"]<-1

df[-grep("Mix",df$Breed_1),"mix_pure"]<-0

df$Intact <- ifelse(grepl('Intact', df$SexuponOutcome), 1,
               ifelse(grepl('Unknown', df$SexuponOutcome), 'Unknown', 0))


df$Sex <- ifelse(grepl('Male', df$SexuponOutcome), 'Male',
            ifelse(grepl('Unknown', df$Sex), 'Unknown', 'Female'))


  

head(df)
##   AnimalID    Name       Date       Time     OutcomeType OutcomeSubtype
## 1  A671945 Hambone 2014-02-12 18H 22M 0S Return_to_owner               
## 2  A656520   Emily 2013-10-13 12H 44M 0S      Euthanasia      Suffering
## 3  A686464  Pearce 2015-01-31 12H 28M 0S        Adoption         Foster
## 4  A683430         2014-07-11  19H 9M 0S        Transfer        Partner
## 5  A667013         2013-11-15 12H 52M 0S        Transfer        Partner
## 6  A677334    Elsa 2014-04-25  13H 4M 0S        Transfer        Partner
##   AnimalType SexuponOutcome AgeuponOutcome
## 1        Dog  Neutered Male         1 year
## 2        Cat  Spayed Female         1 year
## 3        Dog  Neutered Male        2 years
## 4        Cat    Intact Male        3 weeks
## 5        Dog  Neutered Male        2 years
## 6        Dog  Intact Female        1 month
##                               Breed Weight Code_A Code_B Named Day Month
## 1             Shetland Sheepdog Mix  Small      3      2     1 Wed   Feb
## 2            Domestic Shorthair Mix   #N/A      2      2     1 Sun   Oct
## 3                      Pit Bull Mix Medium      3      2     1 Sat   Jan
## 4            Domestic Shorthair Mix   #N/A      2      1     0 Fri   Jul
## 5       Lhasa Apso/Miniature Poodle  Small      2      1     0 Fri   Nov
## 6 Cairn Terrier/Chihuahua Shorthair  Small      1      2     1 Fri   Apr
##   year hour value  unit unit_c age                Breed_1 mix_pure Intact
## 1 2014   18     1  year    365 365  Shetland Sheepdog Mix        1      0
## 2 2013   12     1  year    365 365 Domestic Shorthair Mix        1      0
## 3 2015   12     2 years    365 730           Pit Bull Mix        1      0
## 4 2014   19     3 weeks      7  21 Domestic Shorthair Mix        1      1
## 5 2013   12     2 years    365 730             Lhasa Apso        0      0
## 6 2014   13     1 month     30  30          Cairn Terrier        0      1
##      Sex
## 1   Male
## 2 Female
## 3   Male
## 4   Male
## 5   Male
## 6 Female

Exploratory Analysis

ggplot(df,aes(OutcomeType))+
  geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
  facet_grid(AnimalType~.)+
  scale_y_continuous(labels = percent_format())+
  ylab("Percent")+
  ggtitle("Distribution of OutcomeType across Animal Type")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(OutcomeType))+
  geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
  scale_y_continuous(labels=percent)+
  facet_grid(Day~.)+
  ylab("Percent")+
  ggtitle("Distribution of OutcomeType across Days")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(OutcomeType))+
  geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
  scale_y_continuous(labels=percent)+
  facet_grid(Month~.)+
  ylab("Percent")+
  ggtitle("Distribution of OutcomeType across Months")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(factor(OutcomeType),hour))+
  geom_boxplot(aes(fill=OutcomeType))+
  ggtitle("Distribution of hours across OutcomeType")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))

Animals with lower ages are adopted compared to other outcomes

ggplot(df,aes(factor(OutcomeType),age))+
  geom_boxplot(aes(fill=OutcomeType))+
  ylab("Age of the animal")+
  xlab("Outcome") + ggtitle("Distribution of Age across OutcomeTypes")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).

x<-df$age
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
df$age<-x

Modeling:Data Prep for modeling

set.seed(200)
df$Named<-as.factor(df$Named)
df$Month<-as.factor(df$Month)
df$SexuponOutcome<-as.factor(df$SexuponOutcome)
df$year<-as.factor(df$year)
df$hour<-as.factor(df$hour)
df$AnimalType<-as.factor(df$AnimalType)
df$mix_pure<-as.factor(df$mix_pure)
df$Intact<-as.factor(df$Intact)
df$Sex<-as.factor(df$Sex)
df$Weight<-as.factor(df$Weight)
df$Code_A<-as.factor(df$Code_A)
df$Code_B<-as.factor(df$Code_B)
df$age.log<-log(df$age+1.01)



#df$Color_Code<-as.factor(df$Color_Code)



df$OutcomeType<-as.factor(df$OutcomeType)
df$OutcomeType1<-relevel(df$OutcomeType,ref="Died")

df_1<-df[,c(5,7,13,14,15,16,17,18,27,24,25,26)]
df_2<-df[,c(28,7,13,14,15,16,17,18,27,24,25,26)]



sample<-sample.split(df_1$OutcomeType,SplitRatio = 0.75)
df_train<-subset(df_1, sample == TRUE)


df_test<-subset(df_1, sample == FALSE)


colnames(df_train)
##  [1] "OutcomeType" "AnimalType"  "Code_B"      "Named"       "Day"        
##  [6] "Month"       "year"        "hour"        "age.log"     "mix_pure"   
## [11] "Intact"      "Sex"
colnames(df_test)
##  [1] "OutcomeType" "AnimalType"  "Code_B"      "Named"       "Day"        
##  [6] "Month"       "year"        "hour"        "age.log"     "mix_pure"   
## [11] "Intact"      "Sex"
#test_1<-test[,c(5,9,13,14,15,16,17,18,22,24,25,26)]

#df_train_1<-df_train[,c(7,13,17,18,19,20,21,22,26,28,29,30,31)]


#df_train<-df[,c(7,8,12,13,14,15,16,20,5)]
#summary(multi_log_reg)

#df_train<-data.frame(df_train,na.action=T)
#sum(is.na(df_train))
#multi_log_reg<-multinom(OutcomeType_1~.,data = df_train)


df_train<-df_train[complete.cases(df_train),]

df_test<-df_test[complete.cases(df_test),]

#levels(test_1$hour)[levels(test_1$hour)=="3"]<-"5"
# Random forst model has to be repeated again- Last accuracy 66.9% 



#model_gbm<-train(df_train$OutcomeType~., data=df_train, method="gbm", distribution="bernoulli")
# prediction<-predict(multi_log_reg,df_test)



# out<-data.frame(out,prediction_test)
# 
# out1<-data.frame(out,predict_multi_log_reg)
# 
# write.csv(out1,'./Out1.csv')

Decision Tree

model_tree<-rpart(OutcomeType~.,data=df_train,method = "class")
summary(model_tree)
## Call:
## rpart(formula = OutcomeType ~ ., data = df_train, method = "class")
##   n= 20035 
## 
##          CP nsplit rel error    xerror        xstd
## 1 0.3429503      0 1.0000000 1.0000000 0.005806322
## 2 0.0100000      1 0.6570497 0.6570497 0.005779139
## 
## Variable importance
##  Intact age.log   Named     Sex    hour 
##      53      18      16       7       5 
## 
## Node number 1: 20035 observations,    complexity param=0.3429503
##   predicted class=Adoption  expected loss=0.5968555  P(node) =1
##     class counts:  8077   148  1164  3589  7057
##    probabilities: 0.403 0.007 0.058 0.179 0.352 
##   left son=2 (13994 obs) right son=3 (6041 obs)
##   Primary splits:
##       Intact     splits as  LRR, improve=2398.0580, (0 missing)
##       Named      splits as  RL, improve=1186.3100, (0 missing)
##       age.log    < 3.400975 to the right, improve= 963.0611, (0 missing)
##       hour       splits as  RRLLLRRRRRRRRLLLLLLR, improve= 689.0439, (0 missing)
##       AnimalType splits as  RL, improve= 546.0741, (0 missing)
##   Surrogate splits:
##       age.log < 3.847417 to the right, agree=0.801, adj=0.339, (0 split)
##       Named   splits as  RL, agree=0.791, adj=0.307, (0 split)
##       Sex     splits as  LLR, agree=0.738, adj=0.131, (0 split)
##       hour    splits as  LLLLLRLLLLLLLLLLLLLL, agree=0.727, adj=0.094, (0 split)
## 
## Node number 2: 13994 observations
##   predicted class=Adoption  expected loss=0.4414035  P(node) =0.6984777
##     class counts:  7817    28   441  3012  2696
##    probabilities: 0.559 0.002 0.032 0.215 0.193 
## 
## Node number 3: 6041 observations
##   predicted class=Transfer  expected loss=0.2780997  P(node) =0.3015223
##     class counts:   260   120   723   577  4361
##    probabilities: 0.043 0.020 0.120 0.096 0.722
fancyRpartPlot(model_tree)

predict_tree<-predict(model_tree,df_test,type="class")
z<-(data.frame(df_test,predict_tree))
c=0
for(i in 1:nrow(z)){
  if(z[i,"predict_tree"]==z[i,"OutcomeType"]){
    c=c+1
  }
  
}
accuracy_tree<-c*100/nrow(z)

tb_1<-table(z$OutcomeType,z$predict_tree)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
## 
##                  
##                   Adoption Died Euthanasia Return_to_owner Transfer
##   Adoption            2591    0          0               0      101
##   Died                   9    0          0               0       40
##   Euthanasia           135    0          0               0      254
##   Return_to_owner      983    0          0               0      213
##   Transfer             886    0          0               0     1463
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6073          
##                  95% CI : (0.5955, 0.6191)
##     No Information Rate : 0.6897          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3591          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adoption Class: Died Class: Euthanasia
## Sensitivity                   0.5628          NA                NA
## Specificity                   0.9512    0.992659           0.94172
## Pos Pred Value                0.9625          NA                NA
## Neg Pred Value                0.4946          NA                NA
## Prevalence                    0.6897    0.000000           0.00000
## Detection Rate                0.3882    0.000000           0.00000
## Detection Prevalence          0.4033    0.007341           0.05828
## Balanced Accuracy             0.7570          NA                NA
##                      Class: Return_to_owner Class: Transfer
## Sensitivity                              NA          0.7064
## Specificity                          0.8208          0.8076
## Pos Pred Value                           NA          0.6228
## Neg Pred Value                           NA          0.8595
## Prevalence                           0.0000          0.3103
## Detection Rate                       0.0000          0.2192
## Detection Prevalence                 0.1792          0.3519
## Balanced Accuracy                        NA          0.7570

Random Forest

model_rf<-randomForest(OutcomeType~.,data=df_train,importance=T,ntree=200)
prediction<-predict(model_rf,df_test)
z<-(data.frame(df_test,prediction))
c=0
for(i in 1:nrow(z)){
  if(z[i,"prediction"]==z[i,"OutcomeType"]){
    c=c+1
  }
  
}

varImpPlot(model_rf)

accuracy<-c*100/nrow(z)
tb_1<-table(z$OutcomeType,z$prediction)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
## 
##                  
##                   Adoption Died Euthanasia Return_to_owner Transfer
##   Adoption            2274    0          4             229      185
##   Died                   6    2          4               2       35
##   Euthanasia            62    0         70              70      187
##   Return_to_owner      539    0          8             492      157
##   Transfer             441    0         21             171     1716
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6822          
##                  95% CI : (0.6709, 0.6934)
##     No Information Rate : 0.4977          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5129          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Adoption Class: Died Class: Euthanasia
## Sensitivity                   0.6845   1.0000000           0.65421
## Specificity                   0.8753   0.9929567           0.95143
## Pos Pred Value                0.8447   0.0408163           0.17995
## Neg Pred Value                0.7369   1.0000000           0.99411
## Prevalence                    0.4977   0.0002996           0.01603
## Detection Rate                0.3407   0.0002996           0.01049
## Detection Prevalence          0.4033   0.0073408           0.05828
## Balanced Accuracy             0.7799   0.9964783           0.80282
##                      Class: Return_to_owner Class: Transfer
## Sensitivity                         0.51037          0.7526
## Specificity                         0.87673          0.8560
## Pos Pred Value                      0.41137          0.7305
## Neg Pred Value                      0.91385          0.8696
## Prevalence                          0.14442          0.3416
## Detection Rate                      0.07371          0.2571
## Detection Prevalence                0.17918          0.3519
## Balanced Accuracy                   0.69355          0.8043
sample1<-sample.split(df_2$OutcomeType1,SplitRatio = 0.75)

df_train1<-subset(df_2, sample1 == TRUE)


df_test1<-subset(df_2, sample1 == FALSE)

df_train1<-df_train1[complete.cases(df_train1),]

df_test1<-df_test1[complete.cases(df_test1),]

multi_log_reg<-multinom(OutcomeType1~.,data = df_train1)
## # weights:  250 (196 variable)
## initial  value 32237.041386 
## iter  10 value 19407.015630
## iter  20 value 18719.709189
## iter  30 value 18021.539516
## iter  40 value 17376.064290
## iter  50 value 16881.130962
## iter  60 value 16689.229299
## iter  70 value 16574.792569
## iter  80 value 16518.789912
## iter  90 value 16488.495270
## iter 100 value 16472.915856
## final  value 16472.915856 
## stopped after 100 iterations
predict_multi_log_reg<-predict(multi_log_reg,df_test1)

z<-(data.frame(df_test1,predict_multi_log_reg))
c=0
for(i in 1:nrow(z)){
  if(z[i,"predict_multi_log_reg"]==z[i,"OutcomeType1"]){
    c=c+1
  }
  
}
accuracy_multiReg<-c*100/nrow(z)


tb_1<-table(z$OutcomeType,z$predict_multi_log_reg)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
## 
##                  
##                   Died Adoption Euthanasia Return_to_owner Transfer
##   Died               0        4          3               3       39
##   Adoption           0     2245          3             246      198
##   Euthanasia         0       58         38              68      225
##   Return_to_owner    0      469          6             569      152
##   Transfer           0      502         23             217     1612
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6683          
##                  95% CI : (0.6568, 0.6796)
##     No Information Rate : 0.4907          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4932          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Died Class: Adoption Class: Euthanasia
## Sensitivity                   NA          0.6849          0.520548
## Specificity             0.992665          0.8686          0.946875
## Pos Pred Value                NA          0.8340          0.097686
## Neg Pred Value                NA          0.7410          0.994436
## Prevalence              0.000000          0.4907          0.010928
## Detection Rate          0.000000          0.3361          0.005689
## Detection Prevalence    0.007335          0.4030          0.058234
## Balanced Accuracy             NA          0.7767          0.733711
##                      Class: Return_to_owner Class: Transfer
## Sensitivity                         0.51587          0.7242
## Specificity                         0.88757          0.8334
## Pos Pred Value                      0.47575          0.6848
## Neg Pred Value                      0.90263          0.8581
## Prevalence                          0.16512          0.3332
## Detection Rate                      0.08518          0.2413
## Detection Prevalence                0.17904          0.3524
## Balanced Accuracy                   0.70172          0.7788