Renew: 06/02 line 37~47 => Use to see the number of each outcometype line 50~58 => Not yet. line 61~168 => Breed line 174~202 => Age line 207~290

06/04 line 205~265 => Color
      line 465~617 => Random Forest

處理data

共26729筆資料,9個解釋變數,包含狗以及貓。

library(dplyr)
library(ggplot2)
library(e1071)
library(randomForest)
library(caret)
# Read the raw training table from the working directory and preview it.
trainInit <- read.csv(file = "train.csv")
head(x = trainInit)
##   AnimalID    Name            DateTime     OutcomeType OutcomeSubtype
## 1  A671945 Hambone 2014-02-12 18:22:00 Return_to_owner               
## 2  A656520   Emily 2013-10-13 12:44:00      Euthanasia      Suffering
## 3  A686464  Pearce 2015-01-31 12:28:00        Adoption         Foster
## 4  A683430         2014-07-11 19:09:00        Transfer        Partner
## 5  A667013         2013-11-15 12:52:00        Transfer        Partner
## 6  A677334    Elsa 2014-04-25 13:04:00        Transfer        Partner
##   AnimalType SexuponOutcome AgeuponOutcome
## 1        Dog  Neutered Male         1 year
## 2        Cat  Spayed Female         1 year
## 3        Dog  Neutered Male        2 years
## 4        Cat    Intact Male        3 weeks
## 5        Dog  Neutered Male        2 years
## 6        Dog  Intact Female        1 month
##                               Breed       Color
## 1             Shetland Sheepdog Mix Brown/White
## 2            Domestic Shorthair Mix Cream Tabby
## 3                      Pit Bull Mix  Blue/White
## 4            Domestic Shorthair Mix  Blue Cream
## 5       Lhasa Apso/Miniature Poodle         Tan
## 6 Cairn Terrier/Chihuahua Shorthair   Black/Tan

刪除 AnimalID、Name 以及 OutcomeSubtype

# Drop the identifier, name and outcome-subtype columns. Selecting by name
# (instead of positions 1, 2 and 5) keeps the result correct even if the
# column order of the CSV changes.
train <- trainInit[
  ,
  setdiff(names(trainInit), c("AnimalID", "Name", "OutcomeSubtype")),
  drop = FALSE
]
head(train)
##              DateTime     OutcomeType AnimalType SexuponOutcome
## 1 2014-02-12 18:22:00 Return_to_owner        Dog  Neutered Male
## 2 2013-10-13 12:44:00      Euthanasia        Cat  Spayed Female
## 3 2015-01-31 12:28:00        Adoption        Dog  Neutered Male
## 4 2014-07-11 19:09:00        Transfer        Cat    Intact Male
## 5 2013-11-15 12:52:00        Transfer        Dog  Neutered Male
## 6 2014-04-25 13:04:00        Transfer        Dog  Intact Female
##   AgeuponOutcome                             Breed       Color
## 1         1 year             Shetland Sheepdog Mix Brown/White
## 2         1 year            Domestic Shorthair Mix Cream Tabby
## 3        2 years                      Pit Bull Mix  Blue/White
## 4        3 weeks            Domestic Shorthair Mix  Blue Cream
## 5        2 years       Lhasa Apso/Miniature Poodle         Tan
## 6        1 month Cairn Terrier/Chihuahua Shorthair   Black/Tan
# Put the columns of `train` on the search path so they can be referenced by
# bare name (e.g. AnimalType) without the `train$` prefix.
# NOTE(review): attach() exposes a copy; later modifications of `train` are
# not reflected in the attached columns.
attach(train)

將狗和貓的data分散開來。

# Split the table by species using a logical mask read directly from `train`
# (no reliance on attach()). %in% never yields NA, and unlike -which(...),
# the negated mask still returns every row when no row matches "Dog".
is_dog <- train$AnimalType %in% "Dog"
Dogtrain <- train[is_dog, ]
Cattrain <- train[!is_dog, ]
head(Dogtrain)
##               DateTime     OutcomeType AnimalType SexuponOutcome
## 1  2014-02-12 18:22:00 Return_to_owner        Dog  Neutered Male
## 3  2015-01-31 12:28:00        Adoption        Dog  Neutered Male
## 5  2013-11-15 12:52:00        Transfer        Dog  Neutered Male
## 6  2014-04-25 13:04:00        Transfer        Dog  Intact Female
## 9  2014-02-04 17:17:00        Adoption        Dog  Spayed Female
## 10 2014-05-03 07:48:00        Adoption        Dog  Spayed Female
##    AgeuponOutcome                             Breed       Color
## 1          1 year             Shetland Sheepdog Mix Brown/White
## 3         2 years                      Pit Bull Mix  Blue/White
## 5         2 years       Lhasa Apso/Miniature Poodle         Tan
## 6         1 month Cairn Terrier/Chihuahua Shorthair   Black/Tan
## 9        5 months     American Pit Bull Terrier Mix   Red/White
## 10         1 year                     Cairn Terrier       White
# Put the columns of `Dogtrain` on the search path; being attached last, they
# mask same-named columns from any earlier attach().
# NOTE(review): the attached columns are a copy and do not follow later
# changes to `Dogtrain` (e.g. row removal or relabelling).
attach(Dogtrain)

n = 15595 p=8

OutcomeType

先看各個 OutcomeType 分別有幾筆資料。

## Return to owner = 4286
## Transfer = 3917
## Adoption = 6497
## Died = 50
## Euthanasia = 845

因為“死亡”的個數太少,所以我們決定把他拿掉。 而且,我們將本來的OutcomeType中的,Return_to_owner 改為 Return。 因為後面在做XGBoosting的時候,會出現錯誤,說Return_to_owner超過64位元…….

# Remove the rare "Died" outcome. Filtering by value does not depend on a
# separately prepared index vector and, unlike negative indexing with an
# empty index, keeps all rows when no "Died" row exists.
Dogtrain <- Dogtrain[!(Dogtrain$OutcomeType %in% "Died"), ]
library(dplyr)
# Keep only the first word of each label ("Return_to_owner" -> "Return").
# vapply() guarantees a character vector with one element per row.
temp <- gsub("_", " ", Dogtrain$OutcomeType)
Dogtrain$OutcomeType <- as.factor(
  vapply(strsplit(x = temp, split = " "), function(x) x[1], character(1))
)

(替代方案,目前未採用)將“死亡”的資料,都換成“安樂死”

# summary(Dogtrain$OutcomeType)
# temp <- gsub("_"," ", Dogtrain$OutcomeType) 
# temp <- gsub("Died","Euthanasia", temp) 
# Dogtrain$OutcomeType<-as.factor(strsplit(x = temp, split = " ") %>% sapply(function(x){x[1]}))
# summary(Dogtrain$OutcomeType)

BREED

Age

Recode AgeuponOutcome into three categories: “Puppy”, “AdultDog”, and “OldDog”.

##               DateTime OutcomeType AnimalType SexuponOutcome
## 1  2014-02-12 18:22:00      Return        Dog  Neutered Male
## 3  2015-01-31 12:28:00    Adoption        Dog  Neutered Male
## 5  2013-11-15 12:52:00    Transfer        Dog  Neutered Male
## 6  2014-04-25 13:04:00    Transfer        Dog  Intact Female
## 9  2014-02-04 17:17:00    Adoption        Dog  Spayed Female
## 10 2014-05-03 07:48:00    Adoption        Dog  Spayed Female
##    AgeuponOutcome       Color            breed
## 1          1 year Brown/White      Other Breed
## 3         2 years  Blue/White         Pit Bull
## 5         2 years         Tan Miniature Poodle
## 6         1 month   Black/Tan    Cairn Terrier
## 9        5 months   Red/White      Other Breed
## 10         1 year       White    Cairn Terrier

Color

Map each color to one of six categories: “Simple”, “Double”, “Tricolor”, “Brindle”, “Tick”, and “Merle”.

The “Brindle” category also covers colors that are both Brindle and Tick, or both Brindle and Merle.

However, some colors, such as “Blue Tiger”, “Blue Cream”, and “Smoke”, do not belong to Brindle. For these I consider only their colors and classify them as “Simple”, “Double”, or “Tricolor”.

Time

SVM

Try to use SVM to fit model.

method 1 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數

# SVM, method 1: fit on a class-balanced subsample and evaluate on all
# remaining rows. Dogtrain is copied unchanged (no columns are dropped here).
Dogtrainsvm <- Dogtrain

# For every outcome class draw, without replacement, a random number
# (500-600) of that class's row indices. The inner sample() picks the count,
# sample.int() picks the positions.
choiceddata <- unlist(lapply(
  c("Adoption", "Return", "Euthanasia", "Transfer"),
  function(outcome) {
    class_rows <- which(Dogtrainsvm$OutcomeType == outcome)
    class_rows[sample.int(length(class_rows), sample(500:600, 1))]
  }
))

cat("The number of training data is ", length(choiceddata))
## The number of training data is  2199
cat("\n")
SVMtrain <- Dogtrainsvm[choiceddata, ]
SVMtest <- Dogtrainsvm[-choiceddata, ]

# Per-class row counts of the training subsample.
AdoL <- sum(SVMtrain$OutcomeType == "Adoption", na.rm = TRUE)
ReL <- sum(SVMtrain$OutcomeType == "Return", na.rm = TRUE)
EuL <- sum(SVMtrain$OutcomeType == "Euthanasia", na.rm = TRUE)
TrL <- sum(SVMtrain$OutcomeType == "Transfer", na.rm = TRUE)
cat("The number of Adoption in the training data is ", AdoL)
## The number of Adoption in the training data is  552
cat("\n")
# The label previously read "Return to transfer"; ReL counts the "Return"
# (return to owner) class.
cat("The number of Return to owner in the training data is", ReL)
## The number of Return to owner in the training data is 552
cat("\n")
cat("The number of Euthanasia in the training data is", EuL)
## The number of Euthanasia in the training data is 509
cat("\n")
cat("The number of Transfer in the training data is", TrL)
## The number of Transfer in the training data is 586
cat("\n")
# Time the model fit.
tic <- Sys.time()
model <- svm(OutcomeType ~ ., data = SVMtrain)
toc <- Sys.time()
toc - tic
## Time difference of 1.644709 secs
# Column 2 (passed over via [, -2]) is expected to be OutcomeType.
testEr <- predict(model, SVMtest[, -2])

trainEr <- predict(model, SVMtrain[, -2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption        193          5     38       80
##   Euthanasia        9        208     61       94
##   Return          332        229    430      247
##   Transfer         18         67     23      165
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2172          2    246      440
##   Euthanasia       41        135    505      522
##   Return         3678        159   2819     1457
##   Transfer         54         39    164      912
cat("\n")
# Misclassification rate = mismatches / number of rows scored.
testErVa <- sum(SVMtest$OutcomeType != testEr, na.rm = TRUE) / nrow(SVMtest)

trainErVa <- sum(SVMtrain$OutcomeType != trainEr, na.rm = TRUE) / nrow(SVMtrain)

cat("Training Error Rate =", trainErVa)
## Training Error Rate = 0.5470668
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.5475459
cat("\n")
rm(Dogtrainsvm)

但是效果還是沒很好,懷疑是不是為了讓他balance,導致 training data 過少

method 2 : 補上安樂死的data 使他們balance 選出 10000 筆作為 training data

# SVM, method 2: oversample Euthanasia, then use 10000 random rows for
# training and the rest for testing. Dogtrain is copied unchanged (no columns
# are dropped here).
# NOTE(review): the Euthanasia copies are appended BEFORE the train/test
# split, so copies of one original row can end up in both SVMtrain and
# SVMtest; the test error below is therefore likely optimistic for that class.
Dogtrainsvm <- Dogtrain

# Append six independent draws (each 500-600 rows, without replacement within
# a draw) of the Euthanasia rows of the original table.
Dogtrainsvm <- do.call(rbind, c(
  list(Dogtrainsvm),
  lapply(seq_len(6), function(i) {
    euthanasia_rows <- which(Dogtrainsvm$OutcomeType == "Euthanasia")
    picked <- sample.int(length(euthanasia_rows), sample(500:600, 1))
    Dogtrainsvm[euthanasia_rows[picked], ]
  })
))

choiceddata <- sample.int(nrow(Dogtrainsvm), 10000)
SVMtrain <- Dogtrainsvm[choiceddata, ]
SVMtest <- Dogtrainsvm[-choiceddata, ]

# Per-class row counts of the training set.
AdoL <- sum(SVMtrain$OutcomeType == "Adoption", na.rm = TRUE)
ReL <- sum(SVMtrain$OutcomeType == "Return", na.rm = TRUE)
EuL <- sum(SVMtrain$OutcomeType == "Euthanasia", na.rm = TRUE)
TrL <- sum(SVMtrain$OutcomeType == "Transfer", na.rm = TRUE)
cat("The number of Adoption in the training data is ", AdoL)
## The number of Adoption in the training data is  3473
cat("\n")
# The label previously read "Return to transfer"; ReL counts the "Return"
# (return to owner) class.
cat("The number of Return to owner in the training data is", ReL)
## The number of Return to owner in the training data is 2248
cat("\n")
cat("The number of Euthanasia in the training data is", EuL)
## The number of Euthanasia in the training data is 2269
cat("\n")
cat("The number of Transfer in the training data is", TrL)
## The number of Transfer in the training data is 2010
cat("\n")
# Time the model fit.
tic <- Sys.time()
model <- svm(OutcomeType ~ ., data = SVMtrain)
toc <- Sys.time()
toc - tic
## Time difference of 30.6969 secs
# Column 2 (passed over via [, -2]) is expected to be OutcomeType.
testEr <- predict(model, SVMtest[, -2])

trainEr <- predict(model, SVMtrain[, -2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       3084        649   1574     1020
##   Euthanasia      266       1412    574      490
##   Return            0          0      0        0
##   Transfer        123        208    100      500
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2665        577   1451      924
##   Euthanasia      258       1200    495      471
##   Return            0          0      0        0
##   Transfer        101        197     92      512
cat("\n")
# Misclassification rate = mismatches / number of rows scored.
testErVa <- sum(SVMtest$OutcomeType != testEr, na.rm = TRUE) / nrow(SVMtest)

trainErVa <- sum(SVMtrain$OutcomeType != trainEr, na.rm = TRUE) / nrow(SVMtrain)

cat("Training Error Rate =", trainErVa)
## Training Error Rate = 0.5004
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.5105669
cat("\n")
rm(Dogtrainsvm)

method 3 : 補上安樂死的data 使他們balance。並使training data也balance NOT FIX!!!!!!!!!!!!!!!!!!

# SVM, method 3: oversample Euthanasia in the full table, then train on a
# class-balanced subsample (2000-2500 rows per class) and test on the rest.
# NOTE(review): the oversampling precedes the train/test split, so copies of
# one original Euthanasia row can end up in both SVMtrain and SVMtest.
Dogtrainsvm <- Dogtrain

# Append six independent draws (500-600 rows each) of the Euthanasia rows.
Dogtrainsvm <- do.call(rbind, c(
  list(Dogtrainsvm),
  lapply(seq_len(6), function(i) {
    euthanasia_rows <- which(Dogtrainsvm$OutcomeType == "Euthanasia")
    picked <- sample.int(length(euthanasia_rows), sample(500:600, 1))
    Dogtrainsvm[euthanasia_rows[picked], ]
  })
))


# Class sizes after oversampling (printed).
(AdoL <- sum(Dogtrainsvm$OutcomeType == "Adoption", na.rm = TRUE))
## [1] 6497
(ReL <- sum(Dogtrainsvm$OutcomeType == "Return", na.rm = TRUE))
## [1] 4286
(EuL <- sum(Dogtrainsvm$OutcomeType == "Euthanasia", na.rm = TRUE))
## [1] 4313
(TrL <- sum(Dogtrainsvm$OutcomeType == "Transfer", na.rm = TRUE))
## [1] 3917
# Per class, pick a random 2000-2500 row indices without replacement.
choiceddata <- unlist(lapply(
  c("Adoption", "Return", "Euthanasia", "Transfer"),
  function(outcome) {
    class_rows <- which(Dogtrainsvm$OutcomeType == outcome)
    class_rows[sample.int(length(class_rows), sample(2000:2500, 1))]
  }
))
SVMtrain <- Dogtrainsvm[choiceddata, ]
SVMtest <- Dogtrainsvm[-choiceddata, ]

# Class sizes inside the training subsample (printed).
(AdoL <- sum(SVMtrain$OutcomeType == "Adoption", na.rm = TRUE))
## [1] 2286
(ReL <- sum(SVMtrain$OutcomeType == "Return", na.rm = TRUE))
## [1] 2475
(EuL <- sum(SVMtrain$OutcomeType == "Euthanasia", na.rm = TRUE))
## [1] 2202
(TrL <- sum(SVMtrain$OutcomeType == "Transfer", na.rm = TRUE))
## [1] 2073
# Time the model fit.
tic <- Sys.time()
model <- svm(OutcomeType ~ ., data = SVMtrain)
toc <- Sys.time()
toc - tic
## Time difference of 25.44123 secs
testEr <- predict(model, SVMtest[, -2])

trainEr <- predict(model, SVMtrain[, -2])

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, SVMtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption        812         15    159      275
##   Euthanasia       25       1006    347      377
##   Return         1431        995   1883      896
##   Transfer         18        186     86      525
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, SVMtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1553         19    125      245
##   Euthanasia       37        936    253      356
##   Return         2578        945   1364      801
##   Transfer         43        211     69      442
cat("\n")
# Misclassified rows divided by the size of the respective set.
testErVa <- sum(SVMtest$OutcomeType != testEr, na.rm = TRUE) /
  (nrow(Dogtrainsvm) - length(choiceddata))

trainErVa <- sum(SVMtrain$OutcomeType != trainEr, na.rm = TRUE) /
  length(choiceddata)

cat("Training Error Rate =", trainErVa)
## Training Error Rate = 0.5323152
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.5695099
cat("\n")
rm(Dogtrainsvm)

Random Forest

method 1 : Training data : 10000 ; Tree 600 & not adjust training data

# Random forest, method 1: 10000 random training rows, 600 trees, no class
# rebalancing; the remaining rows form the test set.
DogtrainRF <- Dogtrain

choiceddata <- sample.int(nrow(DogtrainRF), 10000)
RFtrain <- DogtrainRF[choiceddata, ]
RFtest <- DogtrainRF[-choiceddata, ]

# Time the model fit.
tic <- Sys.time()
model <- randomForest(OutcomeType ~ ., data = RFtrain, ntree = 600)
toc <- Sys.time()
toc - tic
## Time difference of 46.98847 secs
trainEr <- predict(model, RFtrain[, -2])
testEr <- predict(model, RFtest[, -2])
# Error curves versus number of trees, one line per column of err.rate.
plot(model, ylim = c(0, 1))
legend("topright", colnames(model$err.rate), col = 1:6, fill = 1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       3761        153   1455     1149
##   Euthanasia        1         49      4        2
##   Return          383        177   1069      342
##   Transfer         67        137    218     1033
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1934         98    954      613
##   Euthanasia        0          5      7        5
##   Return          309        108    428      219
##   Transfer         42        117    151      554
cat("\n")
# Misclassified rows divided by the size of the respective set.
trainErVa <- sum(RFtrain$OutcomeType != trainEr, na.rm = TRUE) /
  length(choiceddata)
testErVa <- sum(RFtest$OutcomeType != testEr, na.rm = TRUE) /
  (nrow(DogtrainRF) - length(choiceddata))

cat("Training Error Rate =", trainErVa)
## Training Error Rate = 0.4088
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.4731241
cat("\n")
#rm(DogtrainRF)

Random Forest

method 2 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數 分成 600 個 Tree

# Random forest, method 2: class-balanced training subsample (a random
# 500-600 rows per outcome class), 600 trees; all other rows form the test set.
DogtrainRF <- Dogtrain

# Full-table class sizes.
AdoL <- sum(DogtrainRF$OutcomeType == "Adoption", na.rm = TRUE)
ReL <- sum(DogtrainRF$OutcomeType == "Return", na.rm = TRUE)
EuL <- sum(DogtrainRF$OutcomeType == "Euthanasia", na.rm = TRUE)
TrL <- sum(DogtrainRF$OutcomeType == "Transfer", na.rm = TRUE)

# Per class, pick a random 500-600 row indices without replacement.
choiceddata <- unlist(lapply(
  c("Adoption", "Return", "Euthanasia", "Transfer"),
  function(outcome) {
    class_rows <- which(DogtrainRF$OutcomeType == outcome)
    class_rows[sample.int(length(class_rows), sample(500:600, 1))]
  }
))
RFtrain <- DogtrainRF[choiceddata, ]
RFtest <- DogtrainRF[-choiceddata, ]

# Time the model fit.
tic <- Sys.time()
model <- randomForest(OutcomeType ~ ., data = RFtrain, ntree = 600)
toc <- Sys.time()
toc - tic
## Time difference of 21.12418 secs
testEr <- predict(model, RFtest[, -2])
trainEr <- predict(model, RFtrain[, -2])
# Error curves versus number of trees, one line per column of err.rate.
plot(model, ylim = c(0, 1))
legend("topright", colnames(model$err.rate), col = 1:6, fill = 1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption        383         28     57       92
##   Euthanasia       45        405     99       71
##   Return           83         61    338       89
##   Transfer         11         54     26      302
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       3112         28    862      797
##   Euthanasia      615        161   1024      789
##   Return         1840         65   1520      750
##   Transfer        408         42    360     1027
cat("\n")
# Misclassified rows divided by the size of the respective set.
trainErVa <- sum(RFtrain$OutcomeType != trainEr, na.rm = TRUE) /
  length(choiceddata)
testErVa <- sum(RFtest$OutcomeType != testEr, na.rm = TRUE) /
  (nrow(DogtrainRF) - length(choiceddata))

cat("Training Error Rate =", trainErVa)
## Training Error Rate = 0.3339552
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.5656716
cat("\n")
rm(DogtrainRF)

Random Forest

method 3 : Training data : 10000 ; Tree : 600 隨機選出10000筆的 training data 後,再使 training data 中的 OutcomeType balance。

# Random forest, method 3: draw 10000 random training rows, then oversample
# the smaller classes inside the training set only (RFtest is left untouched),
# and fit 600 trees.
DogtrainRF <- Dogtrain

choiceddata <- sample.int(nrow(DogtrainRF), 10000)

RFtrain <- DogtrainRF[choiceddata, ]
RFtest <- DogtrainRF[-choiceddata, ]

# Append to `data` one random draw (without replacement within a draw) of its
# `outcome` rows per element of `size_ranges`; the size of each draw is itself
# sampled from the corresponding element.
append_outcome_draws <- function(data, outcome, size_ranges) {
  class_rows <- which(data$OutcomeType == outcome)
  draws <- lapply(size_ranges, function(sizes) {
    data[class_rows[sample.int(length(class_rows), sample(sizes, 1))], ]
  })
  do.call(rbind, c(list(data), draws))
}

RFtrain <- append_outcome_draws(RFtrain, "Euthanasia", rep(list(400:500), 8))

RFtrain <- append_outcome_draws(RFtrain, "Transfer", rep(list(500:600), 3))

RFtrain <- append_outcome_draws(
  RFtrain, "Return", list(500:600, 500:600, 300:400)
)


# Per-class row counts of the rebalanced training set.
AdoL <- sum(RFtrain$OutcomeType == "Adoption", na.rm = TRUE)
ReL <- sum(RFtrain$OutcomeType == "Return", na.rm = TRUE)
EuL <- sum(RFtrain$OutcomeType == "Euthanasia", na.rm = TRUE)
TrL <- sum(RFtrain$OutcomeType == "Transfer", na.rm = TRUE)

cat("The number of Adoption in the training data is ", AdoL)
## The number of Adoption in the training data is  4136
cat("\n")
# The label previously read "Return to transfer"; ReL counts the "Return"
# (return to owner) class.
cat("The number of Return to owner in the training data is", ReL)
## The number of Return to owner in the training data is 4122
cat("\n")
cat("The number of Euthanasia in the training data is", EuL)
## The number of Euthanasia in the training data is 4122
cat("\n")
cat("The number of Transfer in the training data is", TrL)
## The number of Transfer in the training data is 4181
cat("\n")
# Time the model fit.
tic <- Sys.time()
model <- randomForest(OutcomeType ~ ., data = RFtrain, ntree = 600)
toc <- Sys.time()
toc - tic
## Time difference of 48.91821 secs
# Column 2 (passed over via [, -2]) is expected to be OutcomeType.
testEr <- predict(model, RFtest[, -2])
trainEr <- predict(model, RFtrain[, -2])
# Error curves versus number of trees, one line per column of err.rate.
plot(model, ylim = c(0, 1))
legend("topright", colnames(model$err.rate), col = 1:6, fill = 1:6)

cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr, RFtrain$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       2339        136    568      848
##   Euthanasia      396       3264    912      752
##   Return         1321        500   2450     1054
##   Transfer         80        222    192     1527
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr, RFtest$OutcomeType)$table
##             Reference
## Prediction   Adoption Euthanasia Return Transfer
##   Adoption       1196         20    324      298
##   Euthanasia      263        161    375      302
##   Return          846         72    710      352
##   Transfer         56         42    114      413
cat("\n")
testErVa <- sum(RFtest$OutcomeType != testEr, na.rm = TRUE) / nrow(RFtest)
# RFtrain was enlarged by the oversampling above, so the error count must be
# divided by its actual row count, not by length(choiceddata) (still 10000).
trainErVa <- sum(RFtrain$OutcomeType != trainEr, na.rm = TRUE) / nrow(RFtrain)

cat("Training Error Rate =", trainErVa)
# Recorded value recomputed from the training confusion matrix above:
# 6981 misclassified / 16561 rows (the former 0.6981 was 6981 / 10000).
## Training Error Rate = 0.4215325
cat("\n")
cat("Testing Error Rate =", testErVa)
## Testing Error Rate = 0.5526696
cat("\n")
rm(DogtrainRF, append_outcome_draws)