Renew: 06/02 line 37~47 => Use to see the number of each outcometype line 50~58 => Not yet. line 61~168 => Breed line 174~202 => Age line 207~290

06/04 line 205~265 => Color
      line 465~617 => Random Forest

處理data

共26729筆資料,9個解釋變數,包含狗以及貓。

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(e1071)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
trainInit<-read.csv("train.csv")
head(trainInit)
##   AnimalID    Name            DateTime     OutcomeType OutcomeSubtype
## 1  A671945 Hambone 2014-02-12 18:22:00 Return_to_owner               
## 2  A656520   Emily 2013-10-13 12:44:00      Euthanasia      Suffering
## 3  A686464  Pearce 2015-01-31 12:28:00        Adoption         Foster
## 4  A683430         2014-07-11 19:09:00        Transfer        Partner
## 5  A667013         2013-11-15 12:52:00        Transfer        Partner
## 6  A677334    Elsa 2014-04-25 13:04:00        Transfer        Partner
##   AnimalType SexuponOutcome AgeuponOutcome
## 1        Dog  Neutered Male         1 year
## 2        Cat  Spayed Female         1 year
## 3        Dog  Neutered Male        2 years
## 4        Cat    Intact Male        3 weeks
## 5        Dog  Neutered Male        2 years
## 6        Dog  Intact Female        1 month
##                               Breed       Color
## 1             Shetland Sheepdog Mix Brown/White
## 2            Domestic Shorthair Mix Cream Tabby
## 3                      Pit Bull Mix  Blue/White
## 4            Domestic Shorthair Mix  Blue Cream
## 5       Lhasa Apso/Miniature Poodle         Tan
## 6 Cairn Terrier/Chihuahua Shorthair   Black/Tan

刪除 Name 以及 OutcomeSubtype

train<-trainInit[,-c(1,2,5)]
head(train)
##              DateTime     OutcomeType AnimalType SexuponOutcome
## 1 2014-02-12 18:22:00 Return_to_owner        Dog  Neutered Male
## 2 2013-10-13 12:44:00      Euthanasia        Cat  Spayed Female
## 3 2015-01-31 12:28:00        Adoption        Dog  Neutered Male
## 4 2014-07-11 19:09:00        Transfer        Cat    Intact Male
## 5 2013-11-15 12:52:00        Transfer        Dog  Neutered Male
## 6 2014-04-25 13:04:00        Transfer        Dog  Intact Female
##   AgeuponOutcome                             Breed       Color
## 1         1 year             Shetland Sheepdog Mix Brown/White
## 2         1 year            Domestic Shorthair Mix Cream Tabby
## 3        2 years                      Pit Bull Mix  Blue/White
## 4        3 weeks            Domestic Shorthair Mix  Blue Cream
## 5        2 years       Lhasa Apso/Miniature Poodle         Tan
## 6        1 month Cairn Terrier/Chihuahua Shorthair   Black/Tan
attach(train)

將狗和貓的data分散開來。

Dogtrain<-train[which(AnimalType=="Dog"),]
Cattrain<-train[-which(AnimalType=="Dog"),]
head(Dogtrain)
##               DateTime     OutcomeType AnimalType SexuponOutcome
## 1  2014-02-12 18:22:00 Return_to_owner        Dog  Neutered Male
## 3  2015-01-31 12:28:00        Adoption        Dog  Neutered Male
## 5  2013-11-15 12:52:00        Transfer        Dog  Neutered Male
## 6  2014-04-25 13:04:00        Transfer        Dog  Intact Female
## 9  2014-02-04 17:17:00        Adoption        Dog  Spayed Female
## 10 2014-05-03 07:48:00        Adoption        Dog  Spayed Female
##    AgeuponOutcome                             Breed       Color
## 1          1 year             Shetland Sheepdog Mix Brown/White
## 3         2 years                      Pit Bull Mix  Blue/White
## 5         2 years       Lhasa Apso/Miniature Poodle         Tan
## 6         1 month Cairn Terrier/Chihuahua Shorthair   Black/Tan
## 9        5 months     American Pit Bull Terrier Mix   Red/White
## 10         1 year                     Cairn Terrier       White
attach(Dogtrain)

n = 15595 p=8

OutcomeType

先看各個 OucomeType 分別有幾筆資料。

## Return to owner = 4286
## Transfer = 3917
## Adoption = 6497
## Died = 50
## Euthanasia = 845

因為“死亡”的個數太少,所以我們決定把他拿掉。 而且,我們將本來的OutcomeType中的,Return_to_owner 改為 Return。 因為後面在做XGBoosting的時候,會出現錯誤,說Return_to_owner超過64位元…….

# Dogtrain<-Dogtrain[-Died,]
# library(dplyr)
# temp <- gsub("_"," ", Dogtrain$OutcomeType) 
# Dogtrain$OutcomeType<-as.factor(strsplit(x = temp, split = " ") %>% sapply(function(x){x[1]}))

將“死亡”的資料,都換成“安樂死”

summary(Dogtrain$OutcomeType)
##        Adoption            Died      Euthanasia Return_to_owner 
##            6497              50             845            4286 
##        Transfer 
##            3917
temp <- gsub("_"," ", Dogtrain$OutcomeType) 
temp <- gsub("Died","Euthanasia", temp) 
Dogtrain$OutcomeType<-as.factor(strsplit(x = temp, split = " ") %>% sapply(function(x){x[1]}))
summary(Dogtrain$OutcomeType)
##   Adoption Euthanasia     Return   Transfer 
##       6497        895       4286       3917

BREED

gsub(" Mix", "", Dogtrain$Breed) -> temp
strsplit(x = temp, split = "/") %>% sapply(function(x){x[1]}) -> Dogtrain$breed1

Dogtrain %>% 
  count(breed1) %>% 
  arrange(desc(n)) %>%  head(40) %>% 
  ggplot(aes(x = reorder(breed1, n), y = n)) +
  geom_bar(stat = "identity", width = 0.8) +
  coord_flip() + 
  theme(axis.title.y = element_blank()) +
  ggtitle("Popular Breeds") +
  ylab("Number of Animals")

strsplit(x = temp, split = "/") %>% sapply(function(x){x[2]}) -> Dogtrain$breed2

Dogtrain %>% 
  count(breed2) %>% 
  arrange(desc(n)) %>%  head(30) %>% 
  ggplot(aes(x = reorder(breed2, n), y = n)) +
  geom_bar(stat = "identity", width = 0.8) +
  coord_flip() + 
  theme(axis.title.y = element_blank()) +
  ggtitle("Popular Breeds") +
  ylab("Number of Animals")

fst301<-c(which(Dogtrain$breed1=="Chihuahua Shorthair")#吉娃娃短毛--1#
         ,which(Dogtrain$breed1=="Pit Bull")#比特犬--2
         ,which(Dogtrain$breed1=="Labrador Retriever")#拉不拉多--3
         ,which(Dogtrain$breed1=="German Shepherd")#德國牧羊犬--4
         ,which(Dogtrain$breed1=="Australian Cattle Dog")#澳洲牧牛犬--5
         ,which(Dogtrain$breed1=="Dachshund")#臘腸狗--6
         ,which(Dogtrain$breed1=="Boxer")#拳師犬--7
         ,which(Dogtrain$breed1=="Border Collie")#邊境牧羊犬--8
         ,which(Dogtrain$breed1=="Miniature Poodle")#迷你貴賓犬--9
         ,which(Dogtrain$breed1=="Australian Shepherd")#澳洲牧羊犬--10
         ,which(Dogtrain$breed1=="Yorkshire Terrier")#約克夏--11
         ,which(Dogtrain$breed1=="Jack Russell Terrier")#傑克羅素???--12
         ,which(Dogtrain$breed1=="Miniature Schnauzer")#迷你雪納瑞--13
         ,which(Dogtrain$breed1=="Beagle")#米格魯--14
         ,which(Dogtrain$breed1=="Catahoula")#加泰霍拉豹犬--15
         ,which(Dogtrain$breed1=="Rat Terrier")#捕鼠???--16
         ,which(Dogtrain$breed1=="Siberian Husky")#西伯利亞雪橇犬--17
         ,which(Dogtrain$breed1=="Rottweiler")#羅威那--18
         ,which(Dogtrain$breed1=="Shih Tzu")#西施犬--19
         ,which(Dogtrain$breed1=="Chihuahua Longhair")#吉娃娃長毛--20
         ,which(Dogtrain$breed1=="Cairn Terrier")#凱恩???21
         ,which(Dogtrain$breed1=="Pointer")#指示犬--22
         ,which(Dogtrain$breed1=="Great Pyrenees")#大白熊犬--23
         ,which(Dogtrain$breed1=="American Bulldog")#美國鬥牛犬--24
         ,which(Dogtrain$breed1=="Anatol Shepherd")#安那托利亞牧羊犬--25
         ,which(Dogtrain$breed1=="Australian Kelpie")#澳洲卡爾比犬--26
         ,which(Dogtrain$breed1=="Staffordshire")#斯塔福郡鬥牛梗--27
         ,which(Dogtrain$breed1=="Pug")#巴哥犬--28
         ,which(Dogtrain$breed1=="Maltese")#馬爾濟斯--29
         ,which(Dogtrain$breed1=="American Staffordshire Terrier")#美國史特富郡梗--30
)

fst302<-c(which(Dogtrain$breed2=="Chihuahua Shorthair")#吉娃娃短毛--1#
               ,which(Dogtrain$breed2=="Pit Bull")#比特犬--2
               ,which(Dogtrain$breed2=="Labrador Retriever")#拉不拉多--3
               ,which(Dogtrain$breed2=="German Shepherd")#德國牧羊犬--4
               ,which(Dogtrain$breed2=="Australian Cattle Dog")#澳洲牧牛犬--5
               ,which(Dogtrain$breed2=="Dachshund")#臘腸狗--6
               ,which(Dogtrain$breed2=="Boxer")#拳師犬--7
               ,which(Dogtrain$breed2=="Border Collie")#邊境牧羊犬--8
               ,which(Dogtrain$breed2=="Miniature Poodle")#迷你貴賓犬--9
               ,which(Dogtrain$breed2=="Australian Shepherd")#澳洲牧羊犬--10
               ,which(Dogtrain$breed2=="Yorkshire Terrier")#約克夏--11
               ,which(Dogtrain$breed2=="Jack Russell Terrier")#傑克羅素???--12
               ,which(Dogtrain$breed2=="Miniature Schnauzer")#迷你雪納瑞--13
               ,which(Dogtrain$breed2=="Beagle")#米格魯--14
               ,which(Dogtrain$breed2=="Catahoula")#加泰霍拉豹犬--15
               ,which(Dogtrain$breed2=="Rat Terrier")#捕鼠???--16
               ,which(Dogtrain$breed2=="Siberian Husky")#西伯利亞雪橇犬--17
               ,which(Dogtrain$breed2=="Rottweiler")#羅威那--18
               ,which(Dogtrain$breed2=="Shih Tzu")#西施犬--19
               ,which(Dogtrain$breed2=="Chihuahua Longhair")#吉娃娃長毛--20
               ,which(Dogtrain$breed2=="Cairn Terrier")#凱恩???21
               ,which(Dogtrain$breed2=="Pointer")#指示犬--22
               ,which(Dogtrain$breed2=="Great Pyrenees")#大白熊犬--23
               ,which(Dogtrain$breed2=="American Bulldog")#美國鬥牛犬--24
               ,which(Dogtrain$breed2=="Anatol Shepherd")#安那托利亞牧羊犬--25
               ,which(Dogtrain$breed2=="Australian Kelpie")#澳洲卡爾比犬--26
               ,which(Dogtrain$breed2=="Staffordshire")#斯塔福郡鬥牛梗--27
               ,which(Dogtrain$breed2=="Pug")#巴哥犬--28
               ,which(Dogtrain$breed2=="Maltese")#馬爾濟斯--29
               ,which(Dogtrain$breed2=="American Staffordshire Terrier")#美國史特富郡梗--30
)
fst302<-fst302[which(is.na(match(fst302,fst301))==T)]

breed<-c()
for(i in 1:dim(Dogtrain)[1]){
  if(is.na(match(i,fst301))==F){
    breed<-c(breed,Dogtrain$breed1[i])
  }else if(is.na(match(i,fst302))==F){
    breed<-c(breed,Dogtrain$breed2[i])
  }else{
    breed<-c(breed,as.character("Other Breed"))
  }
}
Dogtrain$breed<-as.factor(breed)
Dogtrain<-Dogtrain[,-c(which(colnames(Dogtrain)=="breed1"),
                       which(colnames(Dogtrain)=="breed2"),
                       which(colnames(Dogtrain)=="Breed"))]

Age

Change the AgeuponOutcome as “Puppy”, “AdultDog”, “OldDog” three kinds of types.

Age<-Dogtrain$AgeuponOutcome
AgeCha<-as.character(Age)
AgeStr<-strsplit(AgeCha,"[ ]")
Puppy<-c()
AdultDog<-c()
OldDog<-c()

for(i in 1 : length(AgeStr) ){
  if(length(AgeStr[[i]])>0){
  temp1<-strsplit(AgeStr[[i]]," ")[[1]]
  temp1<-as.numeric(temp1)
  temp2<-strsplit(AgeStr[[i]]," ")[[2]]
    if((temp1==0 && length( which(temp2=="years") ) >0 ) |
       length( which(temp2=="day") ) > 0 |
       length( which(temp2=="weeks") ) > 0 |
       length( which(temp2=="month") ) > 0 |
       length( which(temp2=="months") ) > 0 ){
         AgeCha[i]<-"Puppy"
    }else if(temp1 < 7 && length( which(temp2=="years"))>0){
      AgeCha[i]<-"AdultDog"
    }else{
      AgeCha[i]<-"OldDog"
    }
  }
}
head(Dogtrain)
##               DateTime OutcomeType AnimalType SexuponOutcome
## 1  2014-02-12 18:22:00      Return        Dog  Neutered Male
## 3  2015-01-31 12:28:00    Adoption        Dog  Neutered Male
## 5  2013-11-15 12:52:00    Transfer        Dog  Neutered Male
## 6  2014-04-25 13:04:00    Transfer        Dog  Intact Female
## 9  2014-02-04 17:17:00    Adoption        Dog  Spayed Female
## 10 2014-05-03 07:48:00    Adoption        Dog  Spayed Female
##    AgeuponOutcome       Color            breed
## 1          1 year Brown/White      Other Breed
## 3         2 years  Blue/White         Pit Bull
## 5         2 years         Tan Miniature Poodle
## 6         1 month   Black/Tan    Cairn Terrier
## 9        5 months   Red/White      Other Breed
## 10         1 year       White    Cairn Terrier
Dogtrain$Age<-as.factor(AgeCha)
#remove the missing age
Dogtrain<-Dogtrain[-which(Dogtrain[,which(colnames(Dogtrain)=="AgeuponOutcome")]==""),-which(colnames(Dogtrain)=="AgeuponOutcome")]

Color

Replace each color to “Simple”, “Double”, “Tricolor”, “Brindle”, “Tick”, “Merle”, Six categories.

And the “Brindle” contains the color that it has Brindle and Tick, or Brindle and Merle at the same time.

But, there are still some of colors that it not belong to Brindle, like “Blue Tiger”, “Blue cream”, “Smoke” etc.. I will just consider their color, and classify them to “Simple”, “Double” or “Tricolor”.

temp <- as.character(Dogtrain$Color)
Color1 <- strsplit(x = temp, split = "/") %>% sapply(function(x){x[1]})
Color2 <- strsplit(x = temp, split = "/") %>% sapply(function(x){x[2]})
Color11 <- strsplit(x = Color1, split = " ") %>% sapply(function(x){x[1]}) 
Color12 <- strsplit(x = Color1, split = " ") %>% sapply(function(x){x[2]}) 
Color21 <- strsplit(x = Color2, split = " ") %>% sapply(function(x){x[1]})
Color22 <- strsplit(x = Color2, split = " ") %>% sapply(function(x){x[2]}) 

Dogtrain %>% 
  count(Color) %>% 
  arrange(desc(n)) %>%  head(20) %>% 
  ggplot(aes(x = reorder(Color, n), y = n)) +
  geom_bar(stat = "identity", width = 0.8) +
  coord_flip() + 
  theme(axis.title.y = element_blank()) +
  ggtitle("Popular Colors") +
  ylab("Number of Animals")

#先分純or雙色or三色之後
#只要有 Brindle 的 都歸到 Brindle
#其他 例如 Blue tiger, Smoke.... 只要有除了顏色以外的名字,但不是 Brindle, Tick, Merle
#那就讓他以顏色分
ColorFix<-c()
ColorFix[length(Color1)+1]<-0
for(i in 1:length(Color1)){
  if( is.na(Color12[i])==F && is.na(Color22[i])==F ){
    if( Color12[i]=="Brindle" && Color22[i]=="Brindle" ||
        Color12[i]=="Brindle" && Color22[i]=="Tick"    ||
        Color12[i]=="Brindle" && Color22[i]=="Merle"   ||
        Color12[i]=="Tick" && Color22[i]=="Brindle"    ||
        Color12[i]=="Merle" && Color22[i]=="Brindle"   ){
      ColorFix[i]<-"Brindle"
    }else if ( Color12[i] == "Tick" && Color22[i] == "Tick" ){
      ColorFix[i]<-"Tricolor"
    }else if ( Color12[i]== "Merle" && Color22[i] == "Merle"|| 
               Color12[i]=="Tick" && Color22[i]=="Merle"||
               Color12[i]=="Merle" && Color22[i]=="Tick"){
      ColorFix[i]<-"Merle"
    }
  }else if(is.na(Color12[i])==F){
    if( Color12[i]=="Merle" || Color12[i]=="Brindle"){
      ColorFix[i]<-Color12[i]
    }else if(Color12[i]=="Tick"){
      ColorFix[i]<-"Double"
    }
  }else if(is.na(Color22[i])==F){
    if( Color22[i]=="Merle" || Color22[i]=="Brindle" || Color22[i]=="Tick"){
      ColorFix[i]<-Color22[i]
    }else if(Color22[i]=="Tick"){
      ColorFix[i]<-"Double"
    }
  }
  if(is.na(ColorFix[i])==T){
    if(Color11[i]=="Tricolor"){
      ColorFix[i]<-"Tricolor"
    }else if(is.na(Color21[i])==T){
      if(Color11[i]=="Black"||Color11[i]=="Brown"||Color11[i]=="Red"||Color11[i]=="Blue"||
         Color11[i]=="Chocolate"||Color11[i]=="Sable"){
        ColorFix[i]<-"Heavy"
      }else if( Color11[i]=="White" || Color11[i]=="Tan" || Color11[i]=="Buff" || 
                Color11[i]=="Yellow" || Color11[i]=="Fawn" || Color11[i]=="Cream" || 
                Color11[i]=="Gray" || Color11[i]=="Gold"){
        ColorFix[i]<-"Light"
      }else{
        ColorFix[i]<-"Others Simple"
      }
    }else {
      ColorFix[i]<-"Double"
    }
  }
}


ColorFix<-ColorFix[-(length(Color1)+1)]
Dogtrain$ColorFix<-as.factor(ColorFix)
Dogtrain<-Dogtrain[,-which(colnames(Dogtrain)=="Color")]

Time

DateTime<-as.Date(Dogtrain$DateTime)
a<-c()
for(i in 1:length(DateTime)){
  if(format(DateTime[i],"%m")=="06" ||
     format(DateTime[i],"%m")=="07" ||
     format(DateTime[i],"%m")=="08" )
  {
    a<-c(a,"Summer")
  }else if(format(DateTime[i],"%m")=="09" ||
           format(DateTime[i],"%m")=="10" ||
           format(DateTime[i],"%m")=="11" )
  {
    a<-c(a,"Fall")
  }else if(format(DateTime[i],"%m")=="12" ||
           format(DateTime[i],"%m")=="01" ||
           format(DateTime[i],"%m")=="02" )
  {
    a<-c(a,"Winter")
  }else if(format(DateTime[i],"%m")=="03" ||
           format(DateTime[i],"%m")=="04" ||
           format(DateTime[i],"%m")=="05" )
  {
    a<-c(a,"Spring")
  }
}
Dogtrain$DateTime<-as.factor(a)

# DateTime<-as.Date(Dogtrain$DateTime)
# a<-c()
# for(i in 1:length(DateTime)){
#   if(format(DateTime[i],"%m")=="01")
#   {
#     a<-c(a,"January") 
#   }else if(format(DateTime[i],"%m")=="02")
#   {
#     a<-c(a,"February")
#   }else if(format(DateTime[i],"%m")=="03")
#   {
#     a<-c(a,"March")
#   }else if(format(DateTime[i],"%m")=="04")
#   {
#     a<-c(a,"April")
#   }else if(format(DateTime[i],"%m")=="05")
#   {
#     a<-c(a,"May")
#   }else if(format(DateTime[i],"%m")=="06")
#   {
#     a<-c(a,"June")
#   }else if(format(DateTime[i],"%m")=="07")
#   {
#     a<-c(a,"July")
#   }else if(format(DateTime[i],"%m")=="08")
#   {
#     a<-c(a,"August")
#   }else if(format(DateTime[i],"%m")=="09")
#   {
#     a<-c(a,"September")
#   }else if(format(DateTime[i],"%m")=="10")
#   {
#     a<-c(a,"October")
#   }else if(format(DateTime[i],"%m")=="11")
#   {
#     a<-c(a,"November")
#   }else if(format(DateTime[i],"%m")=="12")
#   {
#     a<-c(a,"December")
#   }
# }
# Dogtrain$DateTime<-as.factor(a)

SVM

Try to use SVM to fit model.

method 1 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數

#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain

AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))
choiceddata<-
  c(which(Dogtrainsvm$OutcomeType=="Adoption")[sample(1:AdoL,sample(500:600,1))],
    which(Dogtrainsvm$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],
    which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],
    which(Dogtrainsvm$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))]
    )

cat("The number of training data is ",length(choiceddata))
## The number of training data is  2209
cat("\n")
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]

AdoL<-length(which(SVMtrain$OutcomeType=="Adoption"))
ReL<-length(which(SVMtrain$OutcomeType=="Return"))
EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(SVMtrain$OutcomeType=="Transfer"))
cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is  520
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 535
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 590
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 564
cat("\n")
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 1.466186 secs
testEr<-predict(model,SVMtest[,-2])

trainEr<-predict(model,SVMtrain[,-2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption        204          4     35       72
##   Euthanasia        2        264     78      106
##   Return          312        258    405      238
##   Transfer          2         64     17      148
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2161          4    249      448
##   Euthanasia       60        131    526      648
##   Return         3697        144   2841     1450
##   Transfer         59         25    135      807
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))

trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.5377999
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5562196
cat("\n")
rm(Dogtrainsvm)

但是效果還是沒很好,懷疑是不是為了讓他balance,導致 training data 過少

method 2 : 補上安樂死的data 使他們balance 選出 10000 筆作為 training data

#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain

AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))

Dogtrainsvm<-rbind(Dogtrainsvm,
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],])

choiceddata<-sample(1:dim(Dogtrainsvm)[1],10000)
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]

AdoL<-length(which(SVMtrain$OutcomeType=="Adoption"))
ReL<-length(which(SVMtrain$OutcomeType=="Return"))
EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(SVMtrain$OutcomeType=="Transfer"))
cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is  3442
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 2261
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 2199
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 2098
cat("\n")
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 28.55163 secs
testEr<-predict(model,SVMtest[,-2])

trainEr<-predict(model,SVMtrain[,-2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       3384       1022   1847     1186
##   Euthanasia       28        976    331      374
##   Return            0          4      1        0
##   Transfer         30        197     82      538
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2990        864   1683     1022
##   Euthanasia       34        844    271      379
##   Return            0          2      0        0
##   Transfer         31        221     71      418
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))

trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.5101
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5184598
cat("\n")
rm(Dogtrainsvm)

method 3 : 補上安樂死的data 使他們balance。並使training data也balance NOT FIX!!!!!!!!!!!!!!!!!!

#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain

AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))

Dogtrainsvm<-rbind(Dogtrainsvm,
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
                   Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],])


(AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption")))
## [1] 6497
(ReL<-length(which(Dogtrainsvm$OutcomeType=="Return")))
## [1] 4286
(EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia")))
## [1] 4095
(TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer")))
## [1] 3917
choiceddata<-
  c(which(Dogtrainsvm$OutcomeType=="Adoption")[sample(1:AdoL,sample(2000:2500,1))],
    which(Dogtrainsvm$OutcomeType=="Return")[sample(1:ReL,sample(2000:2500,1))],
    which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(2000:2500,1))],
    which(Dogtrainsvm$OutcomeType=="Transfer")[sample(1:TrL,sample(2000:2500,1))]
    )
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]

(AdoL<-length(which(SVMtrain$OutcomeType=="Adoption")))
## [1] 2230
(ReL<-length(which(SVMtrain$OutcomeType=="Return")))
## [1] 2029
(EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia")))
## [1] 2488
(TrL<-length(which(SVMtrain$OutcomeType=="Transfer")))
## [1] 2493
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 26.42086 secs
testEr<-predict(model,SVMtest[,-2])

trainEr<-predict(model,SVMtrain[,-2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1048        101    318      520
##   Euthanasia      178       1563    532      702
##   Return          985        660   1126      761
##   Transfer         19        164     53      510
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1998         55    345      281
##   Euthanasia      351       1010    592      398
##   Return         1881        417   1273      427
##   Transfer         37        125     47      318
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))

trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.540368
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5186813
cat("\n")
rm(Dogtrainsvm)

Random Forest

method 1 : Training data : 10000 ; Tree 600 & not adjust training data

DogtrainRF<-Dogtrain

choiceddata<-sample(1:dim(DogtrainRF)[1],10000)
RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]

tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 41.52177 secs
trainEr<-predict(model,RFtrain[,-2])
testEr<-predict(model,RFtest[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       3754        184   1518     1176
##   Euthanasia        0         51      3        2
##   Return          363        159    986      295
##   Transfer         75        174    219     1041
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2003         97    987      631
##   Euthanasia        0          9      9        4
##   Return          266        104    392      203
##   Transfer         36        116    172      565
cat("\n")
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.4168
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.4692528
cat("\n")
#rm(DogtrainRF)

Random Forest

method 2 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數 分成 600 個 Tree

DogtrainRF<-Dogtrain

AdoL<-length(which(DogtrainRF$OutcomeType=="Adoption"))
ReL<-length(which(DogtrainRF$OutcomeType=="Return"))
EuL<-length(which(DogtrainRF$OutcomeType=="Euthanasia"))
TrL<-length(which(DogtrainRF$OutcomeType=="Transfer"))

choiceddata<-
  c(which(DogtrainRF$OutcomeType=="Adoption")[sample(1:AdoL,sample(500:600,1))],
    which(DogtrainRF$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],
    which(DogtrainRF$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],
    which(DogtrainRF$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))]
  )
RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]

tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 20.86515 secs
testEr<-predict(model,RFtest[,-2])
trainEr<-predict(model,RFtrain[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption        326         21     54       72
##   Euthanasia       47        427     96       91
##   Return          112         95    377      102
##   Transfer         16         52     29      308
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2650         22    619      623
##   Euthanasia      531        162    879      723
##   Return         2308         76   1864      947
##   Transfer        507         39    368     1051
cat("\n")
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.3537079
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5716209
cat("\n")
rm(DogtrainRF)

Random Forest

method 3 : Training data : 1000 ; Tree : 600 隨機選出10000筆的 training data 後,再使 trainig data 中的 OutcomeType balance。

DogtrainRF<-Dogtrain

choiceddata<-sample(1:dim(DogtrainRF)[1],10000)

RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]

AdoL<-length(which(RFtrain$OutcomeType=="Adoption"))
ReL<-length(which(RFtrain$OutcomeType=="Return"))
EuL<-length(which(RFtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(RFtrain$OutcomeType=="Transfer"))

RFtrain<-rbind(RFtrain,
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],])

RFtrain<-rbind(RFtrain,
               RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],])

RFtrain<-rbind(RFtrain,
               RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],],
               RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(300:400,1))],])


AdoL<-length(which(RFtrain$OutcomeType=="Adoption"))
ReL<-length(which(RFtrain$OutcomeType=="Return"))
EuL<-length(which(RFtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(RFtrain$OutcomeType=="Transfer"))

cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is  4190
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 4106
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 4130
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 4140
cat("\n")
tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 44.51565 secs
testEr<-predict(model,RFtest[,-2])
trainEr<-predict(model,RFtrain[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2277         85    459      812
##   Euthanasia      381       3244    855      754
##   Return         1481        569   2593     1139
##   Transfer         51        232    199     1435
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1138         23    292      294
##   Euthanasia      224        173    385      286
##   Return          905         94    837      371
##   Transfer         40         49     88      395
cat("\n")
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)

cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.7017
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5454058
cat("\n")
rm(DogtrainRF)