library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.5
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.5
## Loading required package: DBI
library(scales)
library(foreign)
library(nnet)
library(caTools)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
library(gbm)
## Warning: package 'gbm' was built under R version 3.2.5
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
library(rpart)
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.5
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(caret)
setwd('C:\\Users\\rgupta37\\Desktop\\MarketingProject\\Project')
df<-read.csv('./train_1.csv',header = T,stringsAsFactors = F)
df<-df[,-c(11,12,13,14,15,16,17)]
Feature Engineering
#converting AgeUponOutcome from years,months and weeks to days
df$value<-sapply(df$AgeuponOutcome,function(x) strsplit(as.character(x)," ")[[1]][1])
df$unit<-sapply(df$AgeuponOutcome,function(x) strsplit(as.character(x)," ")[[1]][2])
df$unit_c<-0
df$unit_c[df$unit=="year"]<-365
df$unit_c[df$unit=="years"]<-365
df$unit_c[df$unit=="month"]<-30
df$unit_c[df$unit=="months"]<-30
df$unit_c[df$unit=="week"]<-7
df$unit_c[df$unit=="weeks"]<-7
for(i in 1:nrow(df)){
df$age[i]<-as.numeric(df[i,"value"])*(as.numeric(df[i,"unit_c"]))
}
df$Breed_1<-sapply(df$Breed,function(x) strsplit(x,"/")[[1]][1])
df[grep("Mix",df$Breed_1),"mix_pure"]<-1
df[-grep("Mix",df$Breed_1),"mix_pure"]<-0
df$Intact <- ifelse(grepl('Intact', df$SexuponOutcome), 1,
ifelse(grepl('Unknown', df$SexuponOutcome), 'Unknown', 0))
df$Sex <- ifelse(grepl('Male', df$SexuponOutcome), 'Male',
ifelse(grepl('Unknown', df$Sex), 'Unknown', 'Female'))
head(df)
## AnimalID Name Date Time OutcomeType OutcomeSubtype
## 1 A671945 Hambone 2014-02-12 18H 22M 0S Return_to_owner
## 2 A656520 Emily 2013-10-13 12H 44M 0S Euthanasia Suffering
## 3 A686464 Pearce 2015-01-31 12H 28M 0S Adoption Foster
## 4 A683430 2014-07-11 19H 9M 0S Transfer Partner
## 5 A667013 2013-11-15 12H 52M 0S Transfer Partner
## 6 A677334 Elsa 2014-04-25 13H 4M 0S Transfer Partner
## AnimalType SexuponOutcome AgeuponOutcome
## 1 Dog Neutered Male 1 year
## 2 Cat Spayed Female 1 year
## 3 Dog Neutered Male 2 years
## 4 Cat Intact Male 3 weeks
## 5 Dog Neutered Male 2 years
## 6 Dog Intact Female 1 month
## Breed Weight Code_A Code_B Named Day Month
## 1 Shetland Sheepdog Mix Small 3 2 1 Wed Feb
## 2 Domestic Shorthair Mix #N/A 2 2 1 Sun Oct
## 3 Pit Bull Mix Medium 3 2 1 Sat Jan
## 4 Domestic Shorthair Mix #N/A 2 1 0 Fri Jul
## 5 Lhasa Apso/Miniature Poodle Small 2 1 0 Fri Nov
## 6 Cairn Terrier/Chihuahua Shorthair Small 1 2 1 Fri Apr
## year hour value unit unit_c age Breed_1 mix_pure Intact
## 1 2014 18 1 year 365 365 Shetland Sheepdog Mix 1 0
## 2 2013 12 1 year 365 365 Domestic Shorthair Mix 1 0
## 3 2015 12 2 years 365 730 Pit Bull Mix 1 0
## 4 2014 19 3 weeks 7 21 Domestic Shorthair Mix 1 1
## 5 2013 12 2 years 365 730 Lhasa Apso 0 0
## 6 2014 13 1 month 30 30 Cairn Terrier 0 1
## Sex
## 1 Male
## 2 Female
## 3 Male
## 4 Male
## 5 Male
## 6 Female
Exploratory Analysis
ggplot(df,aes(OutcomeType))+
geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
facet_grid(AnimalType~.)+
scale_y_continuous(labels = percent_format())+
ylab("Percent")+
ggtitle("Distribution of OutcomeType across Animal Type")+
theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(OutcomeType))+
geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
scale_y_continuous(labels=percent)+
facet_grid(Day~.)+
ylab("Percent")+
ggtitle("Distribution of OutcomeType across Days")+
theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(OutcomeType))+
geom_bar(aes(fill=OutcomeType,y = (..count..)/tapply(..count..,..PANEL..,sum)[..PANEL..]))+
scale_y_continuous(labels=percent)+
facet_grid(Month~.)+
ylab("Percent")+
ggtitle("Distribution of OutcomeType across Months")+
theme(plot.title = element_text(lineheight=.8, face="bold"))

ggplot(df,aes(factor(OutcomeType),hour))+
geom_boxplot(aes(fill=OutcomeType))+
ggtitle("Distribution of hours across OutcomeType")+
theme(plot.title = element_text(lineheight=.8, face="bold"))

Animals with lower ages are adopted compared to other outcomes
ggplot(df,aes(factor(OutcomeType),age))+
geom_boxplot(aes(fill=OutcomeType))+
ylab("Age of the animal")+
xlab("Outcome") + ggtitle("Distribution of Age across OutcomeTypes")+
theme(plot.title = element_text(lineheight=.8, face="bold"))
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).

x<-df$age
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
df$age<-x
Modeling:Data Prep for modeling
set.seed(200)
df$Named<-as.factor(df$Named)
df$Month<-as.factor(df$Month)
df$SexuponOutcome<-as.factor(df$SexuponOutcome)
df$year<-as.factor(df$year)
df$hour<-as.factor(df$hour)
df$AnimalType<-as.factor(df$AnimalType)
df$mix_pure<-as.factor(df$mix_pure)
df$Intact<-as.factor(df$Intact)
df$Sex<-as.factor(df$Sex)
df$Weight<-as.factor(df$Weight)
df$Code_A<-as.factor(df$Code_A)
df$Code_B<-as.factor(df$Code_B)
df$age.log<-log(df$age+1.01)
#df$Color_Code<-as.factor(df$Color_Code)
df$OutcomeType<-as.factor(df$OutcomeType)
df$OutcomeType1<-relevel(df$OutcomeType,ref="Died")
df_1<-df[,c(5,7,13,14,15,16,17,18,27,24,25,26)]
df_2<-df[,c(28,7,13,14,15,16,17,18,27,24,25,26)]
sample<-sample.split(df_1$OutcomeType,SplitRatio = 0.75)
df_train<-subset(df_1, sample == TRUE)
df_test<-subset(df_1, sample == FALSE)
colnames(df_train)
## [1] "OutcomeType" "AnimalType" "Code_B" "Named" "Day"
## [6] "Month" "year" "hour" "age.log" "mix_pure"
## [11] "Intact" "Sex"
colnames(df_test)
## [1] "OutcomeType" "AnimalType" "Code_B" "Named" "Day"
## [6] "Month" "year" "hour" "age.log" "mix_pure"
## [11] "Intact" "Sex"
#test_1<-test[,c(5,9,13,14,15,16,17,18,22,24,25,26)]
#df_train_1<-df_train[,c(7,13,17,18,19,20,21,22,26,28,29,30,31)]
#df_train<-df[,c(7,8,12,13,14,15,16,20,5)]
#summary(multi_log_reg)
#df_train<-data.frame(df_train,na.action=T)
#sum(is.na(df_train))
#multi_log_reg<-multinom(OutcomeType_1~.,data = df_train)
df_train<-df_train[complete.cases(df_train),]
df_test<-df_test[complete.cases(df_test),]
#levels(test_1$hour)[levels(test_1$hour)=="3"]<-"5"
# Random forst model has to be repeated again- Last accuracy 66.9%
#model_gbm<-train(df_train$OutcomeType~., data=df_train, method="gbm", distribution="bernoulli")
# prediction<-predict(multi_log_reg,df_test)
# out<-data.frame(out,prediction_test)
#
# out1<-data.frame(out,predict_multi_log_reg)
#
# write.csv(out1,'./Out1.csv')
Decision Tree
model_tree<-rpart(OutcomeType~.,data=df_train,method = "class")
summary(model_tree)
## Call:
## rpart(formula = OutcomeType ~ ., data = df_train, method = "class")
## n= 20035
##
## CP nsplit rel error xerror xstd
## 1 0.3429503 0 1.0000000 1.0000000 0.005806322
## 2 0.0100000 1 0.6570497 0.6570497 0.005779139
##
## Variable importance
## Intact age.log Named Sex hour
## 53 18 16 7 5
##
## Node number 1: 20035 observations, complexity param=0.3429503
## predicted class=Adoption expected loss=0.5968555 P(node) =1
## class counts: 8077 148 1164 3589 7057
## probabilities: 0.403 0.007 0.058 0.179 0.352
## left son=2 (13994 obs) right son=3 (6041 obs)
## Primary splits:
## Intact splits as LRR, improve=2398.0580, (0 missing)
## Named splits as RL, improve=1186.3100, (0 missing)
## age.log < 3.400975 to the right, improve= 963.0611, (0 missing)
## hour splits as RRLLLRRRRRRRRLLLLLLR, improve= 689.0439, (0 missing)
## AnimalType splits as RL, improve= 546.0741, (0 missing)
## Surrogate splits:
## age.log < 3.847417 to the right, agree=0.801, adj=0.339, (0 split)
## Named splits as RL, agree=0.791, adj=0.307, (0 split)
## Sex splits as LLR, agree=0.738, adj=0.131, (0 split)
## hour splits as LLLLLRLLLLLLLLLLLLLL, agree=0.727, adj=0.094, (0 split)
##
## Node number 2: 13994 observations
## predicted class=Adoption expected loss=0.4414035 P(node) =0.6984777
## class counts: 7817 28 441 3012 2696
## probabilities: 0.559 0.002 0.032 0.215 0.193
##
## Node number 3: 6041 observations
## predicted class=Transfer expected loss=0.2780997 P(node) =0.3015223
## class counts: 260 120 723 577 4361
## probabilities: 0.043 0.020 0.120 0.096 0.722
fancyRpartPlot(model_tree)

predict_tree<-predict(model_tree,df_test,type="class")
z<-(data.frame(df_test,predict_tree))
c=0
for(i in 1:nrow(z)){
if(z[i,"predict_tree"]==z[i,"OutcomeType"]){
c=c+1
}
}
accuracy_tree<-c*100/nrow(z)
tb_1<-table(z$OutcomeType,z$predict_tree)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
##
##
## Adoption Died Euthanasia Return_to_owner Transfer
## Adoption 2591 0 0 0 101
## Died 9 0 0 0 40
## Euthanasia 135 0 0 0 254
## Return_to_owner 983 0 0 0 213
## Transfer 886 0 0 0 1463
##
## Overall Statistics
##
## Accuracy : 0.6073
## 95% CI : (0.5955, 0.6191)
## No Information Rate : 0.6897
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3591
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adoption Class: Died Class: Euthanasia
## Sensitivity 0.5628 NA NA
## Specificity 0.9512 0.992659 0.94172
## Pos Pred Value 0.9625 NA NA
## Neg Pred Value 0.4946 NA NA
## Prevalence 0.6897 0.000000 0.00000
## Detection Rate 0.3882 0.000000 0.00000
## Detection Prevalence 0.4033 0.007341 0.05828
## Balanced Accuracy 0.7570 NA NA
## Class: Return_to_owner Class: Transfer
## Sensitivity NA 0.7064
## Specificity 0.8208 0.8076
## Pos Pred Value NA 0.6228
## Neg Pred Value NA 0.8595
## Prevalence 0.0000 0.3103
## Detection Rate 0.0000 0.2192
## Detection Prevalence 0.1792 0.3519
## Balanced Accuracy NA 0.7570
Random Forest
model_rf<-randomForest(OutcomeType~.,data=df_train,importance=T,ntree=200)
prediction<-predict(model_rf,df_test)
z<-(data.frame(df_test,prediction))
c=0
for(i in 1:nrow(z)){
if(z[i,"prediction"]==z[i,"OutcomeType"]){
c=c+1
}
}
varImpPlot(model_rf)

accuracy<-c*100/nrow(z)
tb_1<-table(z$OutcomeType,z$prediction)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
##
##
## Adoption Died Euthanasia Return_to_owner Transfer
## Adoption 2274 0 4 229 185
## Died 6 2 4 2 35
## Euthanasia 62 0 70 70 187
## Return_to_owner 539 0 8 492 157
## Transfer 441 0 21 171 1716
##
## Overall Statistics
##
## Accuracy : 0.6822
## 95% CI : (0.6709, 0.6934)
## No Information Rate : 0.4977
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5129
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Adoption Class: Died Class: Euthanasia
## Sensitivity 0.6845 1.0000000 0.65421
## Specificity 0.8753 0.9929567 0.95143
## Pos Pred Value 0.8447 0.0408163 0.17995
## Neg Pred Value 0.7369 1.0000000 0.99411
## Prevalence 0.4977 0.0002996 0.01603
## Detection Rate 0.3407 0.0002996 0.01049
## Detection Prevalence 0.4033 0.0073408 0.05828
## Balanced Accuracy 0.7799 0.9964783 0.80282
## Class: Return_to_owner Class: Transfer
## Sensitivity 0.51037 0.7526
## Specificity 0.87673 0.8560
## Pos Pred Value 0.41137 0.7305
## Neg Pred Value 0.91385 0.8696
## Prevalence 0.14442 0.3416
## Detection Rate 0.07371 0.2571
## Detection Prevalence 0.17918 0.3519
## Balanced Accuracy 0.69355 0.8043
sample1<-sample.split(df_2$OutcomeType1,SplitRatio = 0.75)
df_train1<-subset(df_2, sample1 == TRUE)
df_test1<-subset(df_2, sample1 == FALSE)
df_train1<-df_train1[complete.cases(df_train1),]
df_test1<-df_test1[complete.cases(df_test1),]
multi_log_reg<-multinom(OutcomeType1~.,data = df_train1)
## # weights: 250 (196 variable)
## initial value 32237.041386
## iter 10 value 19407.015630
## iter 20 value 18719.709189
## iter 30 value 18021.539516
## iter 40 value 17376.064290
## iter 50 value 16881.130962
## iter 60 value 16689.229299
## iter 70 value 16574.792569
## iter 80 value 16518.789912
## iter 90 value 16488.495270
## iter 100 value 16472.915856
## final value 16472.915856
## stopped after 100 iterations
predict_multi_log_reg<-predict(multi_log_reg,df_test1)
z<-(data.frame(df_test1,predict_multi_log_reg))
c=0
for(i in 1:nrow(z)){
if(z[i,"predict_multi_log_reg"]==z[i,"OutcomeType1"]){
c=c+1
}
}
accuracy_multiReg<-c*100/nrow(z)
tb_1<-table(z$OutcomeType,z$predict_multi_log_reg)
confusionMatrix(tb_1)
## Confusion Matrix and Statistics
##
##
## Died Adoption Euthanasia Return_to_owner Transfer
## Died 0 4 3 3 39
## Adoption 0 2245 3 246 198
## Euthanasia 0 58 38 68 225
## Return_to_owner 0 469 6 569 152
## Transfer 0 502 23 217 1612
##
## Overall Statistics
##
## Accuracy : 0.6683
## 95% CI : (0.6568, 0.6796)
## No Information Rate : 0.4907
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4932
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Died Class: Adoption Class: Euthanasia
## Sensitivity NA 0.6849 0.520548
## Specificity 0.992665 0.8686 0.946875
## Pos Pred Value NA 0.8340 0.097686
## Neg Pred Value NA 0.7410 0.994436
## Prevalence 0.000000 0.4907 0.010928
## Detection Rate 0.000000 0.3361 0.005689
## Detection Prevalence 0.007335 0.4030 0.058234
## Balanced Accuracy NA 0.7767 0.733711
## Class: Return_to_owner Class: Transfer
## Sensitivity 0.51587 0.7242
## Specificity 0.88757 0.8334
## Pos Pred Value 0.47575 0.6848
## Neg Pred Value 0.90263 0.8581
## Prevalence 0.16512 0.3332
## Detection Rate 0.08518 0.2413
## Detection Prevalence 0.17904 0.3524
## Balanced Accuracy 0.70172 0.7788