Renew: 06/02 line 37~47 => Use to see the number of each outcometype line 50~58 => Not yet. line 61~168 => Breed line 174~202 => Age line 207~290
06/04 line 205~265 => Color
line 465~617 => Random Forest
共26729筆資料,9個解釋變數,包含狗以及貓。
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(e1071)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
trainInit<-read.csv("train.csv")
head(trainInit)
## AnimalID Name DateTime OutcomeType OutcomeSubtype
## 1 A671945 Hambone 2014-02-12 18:22:00 Return_to_owner
## 2 A656520 Emily 2013-10-13 12:44:00 Euthanasia Suffering
## 3 A686464 Pearce 2015-01-31 12:28:00 Adoption Foster
## 4 A683430 2014-07-11 19:09:00 Transfer Partner
## 5 A667013 2013-11-15 12:52:00 Transfer Partner
## 6 A677334 Elsa 2014-04-25 13:04:00 Transfer Partner
## AnimalType SexuponOutcome AgeuponOutcome
## 1 Dog Neutered Male 1 year
## 2 Cat Spayed Female 1 year
## 3 Dog Neutered Male 2 years
## 4 Cat Intact Male 3 weeks
## 5 Dog Neutered Male 2 years
## 6 Dog Intact Female 1 month
## Breed Color
## 1 Shetland Sheepdog Mix Brown/White
## 2 Domestic Shorthair Mix Cream Tabby
## 3 Pit Bull Mix Blue/White
## 4 Domestic Shorthair Mix Blue Cream
## 5 Lhasa Apso/Miniature Poodle Tan
## 6 Cairn Terrier/Chihuahua Shorthair Black/Tan
刪除 Name 以及 OutcomeSubtype
train<-trainInit[,-c(1,2,5)]
head(train)
## DateTime OutcomeType AnimalType SexuponOutcome
## 1 2014-02-12 18:22:00 Return_to_owner Dog Neutered Male
## 2 2013-10-13 12:44:00 Euthanasia Cat Spayed Female
## 3 2015-01-31 12:28:00 Adoption Dog Neutered Male
## 4 2014-07-11 19:09:00 Transfer Cat Intact Male
## 5 2013-11-15 12:52:00 Transfer Dog Neutered Male
## 6 2014-04-25 13:04:00 Transfer Dog Intact Female
## AgeuponOutcome Breed Color
## 1 1 year Shetland Sheepdog Mix Brown/White
## 2 1 year Domestic Shorthair Mix Cream Tabby
## 3 2 years Pit Bull Mix Blue/White
## 4 3 weeks Domestic Shorthair Mix Blue Cream
## 5 2 years Lhasa Apso/Miniature Poodle Tan
## 6 1 month Cairn Terrier/Chihuahua Shorthair Black/Tan
attach(train)
將狗和貓的data分散開來。
Dogtrain<-train[which(AnimalType=="Dog"),]
Cattrain<-train[-which(AnimalType=="Dog"),]
head(Dogtrain)
## DateTime OutcomeType AnimalType SexuponOutcome
## 1 2014-02-12 18:22:00 Return_to_owner Dog Neutered Male
## 3 2015-01-31 12:28:00 Adoption Dog Neutered Male
## 5 2013-11-15 12:52:00 Transfer Dog Neutered Male
## 6 2014-04-25 13:04:00 Transfer Dog Intact Female
## 9 2014-02-04 17:17:00 Adoption Dog Spayed Female
## 10 2014-05-03 07:48:00 Adoption Dog Spayed Female
## AgeuponOutcome Breed Color
## 1 1 year Shetland Sheepdog Mix Brown/White
## 3 2 years Pit Bull Mix Blue/White
## 5 2 years Lhasa Apso/Miniature Poodle Tan
## 6 1 month Cairn Terrier/Chihuahua Shorthair Black/Tan
## 9 5 months American Pit Bull Terrier Mix Red/White
## 10 1 year Cairn Terrier White
attach(Dogtrain)
n = 15595 p=8
先看各個 OucomeType 分別有幾筆資料。
## Return to owner = 4286
## Transfer = 3917
## Adoption = 6497
## Died = 50
## Euthanasia = 845
因為“死亡”的個數太少,所以我們決定把他拿掉。 而且,我們將本來的OutcomeType中的,Return_to_owner 改為 Return。 因為後面在做XGBoosting的時候,會出現錯誤,說Return_to_owner超過64位元…….
# Dogtrain<-Dogtrain[-Died,]
# library(dplyr)
# temp <- gsub("_"," ", Dogtrain$OutcomeType)
# Dogtrain$OutcomeType<-as.factor(strsplit(x = temp, split = " ") %>% sapply(function(x){x[1]}))
將“死亡”的資料,都換成“安樂死”
summary(Dogtrain$OutcomeType)
## Adoption Died Euthanasia Return_to_owner
## 6497 50 845 4286
## Transfer
## 3917
temp <- gsub("_"," ", Dogtrain$OutcomeType)
temp <- gsub("Died","Euthanasia", temp)
Dogtrain$OutcomeType<-as.factor(strsplit(x = temp, split = " ") %>% sapply(function(x){x[1]}))
summary(Dogtrain$OutcomeType)
## Adoption Euthanasia Return Transfer
## 6497 895 4286 3917
gsub(" Mix", "", Dogtrain$Breed) -> temp
strsplit(x = temp, split = "/") %>% sapply(function(x){x[1]}) -> Dogtrain$breed1
Dogtrain %>%
count(breed1) %>%
arrange(desc(n)) %>% head(40) %>%
ggplot(aes(x = reorder(breed1, n), y = n)) +
geom_bar(stat = "identity", width = 0.8) +
coord_flip() +
theme(axis.title.y = element_blank()) +
ggtitle("Popular Breeds") +
ylab("Number of Animals")
strsplit(x = temp, split = "/") %>% sapply(function(x){x[2]}) -> Dogtrain$breed2
Dogtrain %>%
count(breed2) %>%
arrange(desc(n)) %>% head(30) %>%
ggplot(aes(x = reorder(breed2, n), y = n)) +
geom_bar(stat = "identity", width = 0.8) +
coord_flip() +
theme(axis.title.y = element_blank()) +
ggtitle("Popular Breeds") +
ylab("Number of Animals")
fst301<-c(which(Dogtrain$breed1=="Chihuahua Shorthair")#吉娃娃短毛--1#
,which(Dogtrain$breed1=="Pit Bull")#比特犬--2
,which(Dogtrain$breed1=="Labrador Retriever")#拉不拉多--3
,which(Dogtrain$breed1=="German Shepherd")#德國牧羊犬--4
,which(Dogtrain$breed1=="Australian Cattle Dog")#澳洲牧牛犬--5
,which(Dogtrain$breed1=="Dachshund")#臘腸狗--6
,which(Dogtrain$breed1=="Boxer")#拳師犬--7
,which(Dogtrain$breed1=="Border Collie")#邊境牧羊犬--8
,which(Dogtrain$breed1=="Miniature Poodle")#迷你貴賓犬--9
,which(Dogtrain$breed1=="Australian Shepherd")#澳洲牧羊犬--10
,which(Dogtrain$breed1=="Yorkshire Terrier")#約克夏--11
,which(Dogtrain$breed1=="Jack Russell Terrier")#傑克羅素???--12
,which(Dogtrain$breed1=="Miniature Schnauzer")#迷你雪納瑞--13
,which(Dogtrain$breed1=="Beagle")#米格魯--14
,which(Dogtrain$breed1=="Catahoula")#加泰霍拉豹犬--15
,which(Dogtrain$breed1=="Rat Terrier")#捕鼠???--16
,which(Dogtrain$breed1=="Siberian Husky")#西伯利亞雪橇犬--17
,which(Dogtrain$breed1=="Rottweiler")#羅威那--18
,which(Dogtrain$breed1=="Shih Tzu")#西施犬--19
,which(Dogtrain$breed1=="Chihuahua Longhair")#吉娃娃長毛--20
,which(Dogtrain$breed1=="Cairn Terrier")#凱恩???21
,which(Dogtrain$breed1=="Pointer")#指示犬--22
,which(Dogtrain$breed1=="Great Pyrenees")#大白熊犬--23
,which(Dogtrain$breed1=="American Bulldog")#美國鬥牛犬--24
,which(Dogtrain$breed1=="Anatol Shepherd")#安那托利亞牧羊犬--25
,which(Dogtrain$breed1=="Australian Kelpie")#澳洲卡爾比犬--26
,which(Dogtrain$breed1=="Staffordshire")#斯塔福郡鬥牛梗--27
,which(Dogtrain$breed1=="Pug")#巴哥犬--28
,which(Dogtrain$breed1=="Maltese")#馬爾濟斯--29
,which(Dogtrain$breed1=="American Staffordshire Terrier")#美國史特富郡梗--30
)
fst302<-c(which(Dogtrain$breed2=="Chihuahua Shorthair")#吉娃娃短毛--1#
,which(Dogtrain$breed2=="Pit Bull")#比特犬--2
,which(Dogtrain$breed2=="Labrador Retriever")#拉不拉多--3
,which(Dogtrain$breed2=="German Shepherd")#德國牧羊犬--4
,which(Dogtrain$breed2=="Australian Cattle Dog")#澳洲牧牛犬--5
,which(Dogtrain$breed2=="Dachshund")#臘腸狗--6
,which(Dogtrain$breed2=="Boxer")#拳師犬--7
,which(Dogtrain$breed2=="Border Collie")#邊境牧羊犬--8
,which(Dogtrain$breed2=="Miniature Poodle")#迷你貴賓犬--9
,which(Dogtrain$breed2=="Australian Shepherd")#澳洲牧羊犬--10
,which(Dogtrain$breed2=="Yorkshire Terrier")#約克夏--11
,which(Dogtrain$breed2=="Jack Russell Terrier")#傑克羅素???--12
,which(Dogtrain$breed2=="Miniature Schnauzer")#迷你雪納瑞--13
,which(Dogtrain$breed2=="Beagle")#米格魯--14
,which(Dogtrain$breed2=="Catahoula")#加泰霍拉豹犬--15
,which(Dogtrain$breed2=="Rat Terrier")#捕鼠???--16
,which(Dogtrain$breed2=="Siberian Husky")#西伯利亞雪橇犬--17
,which(Dogtrain$breed2=="Rottweiler")#羅威那--18
,which(Dogtrain$breed2=="Shih Tzu")#西施犬--19
,which(Dogtrain$breed2=="Chihuahua Longhair")#吉娃娃長毛--20
,which(Dogtrain$breed2=="Cairn Terrier")#凱恩???21
,which(Dogtrain$breed2=="Pointer")#指示犬--22
,which(Dogtrain$breed2=="Great Pyrenees")#大白熊犬--23
,which(Dogtrain$breed2=="American Bulldog")#美國鬥牛犬--24
,which(Dogtrain$breed2=="Anatol Shepherd")#安那托利亞牧羊犬--25
,which(Dogtrain$breed2=="Australian Kelpie")#澳洲卡爾比犬--26
,which(Dogtrain$breed2=="Staffordshire")#斯塔福郡鬥牛梗--27
,which(Dogtrain$breed2=="Pug")#巴哥犬--28
,which(Dogtrain$breed2=="Maltese")#馬爾濟斯--29
,which(Dogtrain$breed2=="American Staffordshire Terrier")#美國史特富郡梗--30
)
fst302<-fst302[which(is.na(match(fst302,fst301))==T)]
breed<-c()
for(i in 1:dim(Dogtrain)[1]){
if(is.na(match(i,fst301))==F){
breed<-c(breed,Dogtrain$breed1[i])
}else if(is.na(match(i,fst302))==F){
breed<-c(breed,Dogtrain$breed2[i])
}else{
breed<-c(breed,as.character("Other Breed"))
}
}
Dogtrain$breed<-as.factor(breed)
Dogtrain<-Dogtrain[,-c(which(colnames(Dogtrain)=="breed1"),
which(colnames(Dogtrain)=="breed2"),
which(colnames(Dogtrain)=="Breed"))]
Change the AgeuponOutcome as “Puppy”, “AdultDog”, “OldDog” three kinds of types.
Age<-Dogtrain$AgeuponOutcome
AgeCha<-as.character(Age)
AgeStr<-strsplit(AgeCha,"[ ]")
Puppy<-c()
AdultDog<-c()
OldDog<-c()
for(i in 1 : length(AgeStr) ){
if(length(AgeStr[[i]])>0){
temp1<-strsplit(AgeStr[[i]]," ")[[1]]
temp1<-as.numeric(temp1)
temp2<-strsplit(AgeStr[[i]]," ")[[2]]
if((temp1==0 && length( which(temp2=="years") ) >0 ) |
length( which(temp2=="day") ) > 0 |
length( which(temp2=="weeks") ) > 0 |
length( which(temp2=="month") ) > 0 |
length( which(temp2=="months") ) > 0 ){
AgeCha[i]<-"Puppy"
}else if(temp1 < 7 && length( which(temp2=="years"))>0){
AgeCha[i]<-"AdultDog"
}else{
AgeCha[i]<-"OldDog"
}
}
}
head(Dogtrain)
## DateTime OutcomeType AnimalType SexuponOutcome
## 1 2014-02-12 18:22:00 Return Dog Neutered Male
## 3 2015-01-31 12:28:00 Adoption Dog Neutered Male
## 5 2013-11-15 12:52:00 Transfer Dog Neutered Male
## 6 2014-04-25 13:04:00 Transfer Dog Intact Female
## 9 2014-02-04 17:17:00 Adoption Dog Spayed Female
## 10 2014-05-03 07:48:00 Adoption Dog Spayed Female
## AgeuponOutcome Color breed
## 1 1 year Brown/White Other Breed
## 3 2 years Blue/White Pit Bull
## 5 2 years Tan Miniature Poodle
## 6 1 month Black/Tan Cairn Terrier
## 9 5 months Red/White Other Breed
## 10 1 year White Cairn Terrier
Dogtrain$Age<-as.factor(AgeCha)
#remove the missing age
Dogtrain<-Dogtrain[-which(Dogtrain[,which(colnames(Dogtrain)=="AgeuponOutcome")]==""),-which(colnames(Dogtrain)=="AgeuponOutcome")]
Replace each color to “Simple”, “Double”, “Tricolor”, “Brindle”, “Tick”, “Merle”, Six categories.
And the “Brindle” contains the color that it has Brindle and Tick, or Brindle and Merle at the same time.
But, there are still some of colors that it not belong to Brindle, like “Blue Tiger”, “Blue cream”, “Smoke” etc.. I will just consider their color, and classify them to “Simple”, “Double” or “Tricolor”.
temp <- as.character(Dogtrain$Color)
Color1 <- strsplit(x = temp, split = "/") %>% sapply(function(x){x[1]})
Color2 <- strsplit(x = temp, split = "/") %>% sapply(function(x){x[2]})
Color11 <- strsplit(x = Color1, split = " ") %>% sapply(function(x){x[1]})
Color12 <- strsplit(x = Color1, split = " ") %>% sapply(function(x){x[2]})
Color21 <- strsplit(x = Color2, split = " ") %>% sapply(function(x){x[1]})
Color22 <- strsplit(x = Color2, split = " ") %>% sapply(function(x){x[2]})
Dogtrain %>%
count(Color) %>%
arrange(desc(n)) %>% head(20) %>%
ggplot(aes(x = reorder(Color, n), y = n)) +
geom_bar(stat = "identity", width = 0.8) +
coord_flip() +
theme(axis.title.y = element_blank()) +
ggtitle("Popular Colors") +
ylab("Number of Animals")
#先分純or雙色or三色之後
#只要有 Brindle 的 都歸到 Brindle
#其他 例如 Blue tiger, Smoke.... 只要有除了顏色以外的名字,但不是 Brindle, Tick, Merle
#那就讓他以顏色分
ColorFix<-c()
ColorFix[length(Color1)+1]<-0
for(i in 1:length(Color1)){
if( is.na(Color12[i])==F && is.na(Color22[i])==F ){
if( Color12[i]=="Brindle" && Color22[i]=="Brindle" ||
Color12[i]=="Brindle" && Color22[i]=="Tick" ||
Color12[i]=="Brindle" && Color22[i]=="Merle" ||
Color12[i]=="Tick" && Color22[i]=="Brindle" ||
Color12[i]=="Merle" && Color22[i]=="Brindle" ){
ColorFix[i]<-"Brindle"
}else if ( Color12[i] == "Tick" && Color22[i] == "Tick" ){
ColorFix[i]<-"Tricolor"
}else if ( Color12[i]== "Merle" && Color22[i] == "Merle"||
Color12[i]=="Tick" && Color22[i]=="Merle"||
Color12[i]=="Merle" && Color22[i]=="Tick"){
ColorFix[i]<-"Merle"
}
}else if(is.na(Color12[i])==F){
if( Color12[i]=="Merle" || Color12[i]=="Brindle"){
ColorFix[i]<-Color12[i]
}else if(Color12[i]=="Tick"){
ColorFix[i]<-"Double"
}
}else if(is.na(Color22[i])==F){
if( Color22[i]=="Merle" || Color22[i]=="Brindle" || Color22[i]=="Tick"){
ColorFix[i]<-Color22[i]
}else if(Color22[i]=="Tick"){
ColorFix[i]<-"Double"
}
}
if(is.na(ColorFix[i])==T){
if(Color11[i]=="Tricolor"){
ColorFix[i]<-"Tricolor"
}else if(is.na(Color21[i])==T){
if(Color11[i]=="Black"||Color11[i]=="Brown"||Color11[i]=="Red"||Color11[i]=="Blue"||
Color11[i]=="Chocolate"||Color11[i]=="Sable"){
ColorFix[i]<-"Heavy"
}else if( Color11[i]=="White" || Color11[i]=="Tan" || Color11[i]=="Buff" ||
Color11[i]=="Yellow" || Color11[i]=="Fawn" || Color11[i]=="Cream" ||
Color11[i]=="Gray" || Color11[i]=="Gold"){
ColorFix[i]<-"Light"
}else{
ColorFix[i]<-"Others Simple"
}
}else {
ColorFix[i]<-"Double"
}
}
}
ColorFix<-ColorFix[-(length(Color1)+1)]
Dogtrain$ColorFix<-as.factor(ColorFix)
Dogtrain<-Dogtrain[,-which(colnames(Dogtrain)=="Color")]
DateTime<-as.Date(Dogtrain$DateTime)
a<-c()
for(i in 1:length(DateTime)){
if(format(DateTime[i],"%m")=="06" ||
format(DateTime[i],"%m")=="07" ||
format(DateTime[i],"%m")=="08" )
{
a<-c(a,"Summer")
}else if(format(DateTime[i],"%m")=="09" ||
format(DateTime[i],"%m")=="10" ||
format(DateTime[i],"%m")=="11" )
{
a<-c(a,"Fall")
}else if(format(DateTime[i],"%m")=="12" ||
format(DateTime[i],"%m")=="01" ||
format(DateTime[i],"%m")=="02" )
{
a<-c(a,"Winter")
}else if(format(DateTime[i],"%m")=="03" ||
format(DateTime[i],"%m")=="04" ||
format(DateTime[i],"%m")=="05" )
{
a<-c(a,"Spring")
}
}
Dogtrain$DateTime<-as.factor(a)
# DateTime<-as.Date(Dogtrain$DateTime)
# a<-c()
# for(i in 1:length(DateTime)){
# if(format(DateTime[i],"%m")=="01")
# {
# a<-c(a,"January")
# }else if(format(DateTime[i],"%m")=="02")
# {
# a<-c(a,"February")
# }else if(format(DateTime[i],"%m")=="03")
# {
# a<-c(a,"March")
# }else if(format(DateTime[i],"%m")=="04")
# {
# a<-c(a,"April")
# }else if(format(DateTime[i],"%m")=="05")
# {
# a<-c(a,"May")
# }else if(format(DateTime[i],"%m")=="06")
# {
# a<-c(a,"June")
# }else if(format(DateTime[i],"%m")=="07")
# {
# a<-c(a,"July")
# }else if(format(DateTime[i],"%m")=="08")
# {
# a<-c(a,"August")
# }else if(format(DateTime[i],"%m")=="09")
# {
# a<-c(a,"September")
# }else if(format(DateTime[i],"%m")=="10")
# {
# a<-c(a,"October")
# }else if(format(DateTime[i],"%m")=="11")
# {
# a<-c(a,"November")
# }else if(format(DateTime[i],"%m")=="12")
# {
# a<-c(a,"December")
# }
# }
# Dogtrain$DateTime<-as.factor(a)
Try to use SVM to fit model.
method 1 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數
#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain
AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))
choiceddata<-
c(which(Dogtrainsvm$OutcomeType=="Adoption")[sample(1:AdoL,sample(500:600,1))],
which(Dogtrainsvm$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],
which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],
which(Dogtrainsvm$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))]
)
cat("The number of training data is ",length(choiceddata))
## The number of training data is 2209
cat("\n")
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]
AdoL<-length(which(SVMtrain$OutcomeType=="Adoption"))
ReL<-length(which(SVMtrain$OutcomeType=="Return"))
EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(SVMtrain$OutcomeType=="Transfer"))
cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is 520
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 535
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 590
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 564
cat("\n")
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 1.466186 secs
testEr<-predict(model,SVMtest[,-2])
trainEr<-predict(model,SVMtrain[,-2])
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 204 4 35 72
## Euthanasia 2 264 78 106
## Return 312 258 405 238
## Transfer 2 64 17 148
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 2161 4 249 448
## Euthanasia 60 131 526 648
## Return 3697 144 2841 1450
## Transfer 59 25 135 807
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))
trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.5377999
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5562196
cat("\n")
rm(Dogtrainsvm)
但是效果還是沒很好,懷疑是不是為了讓他balance,導致 training data 過少
method 2 : 補上安樂死的data 使他們balance 選出 10000 筆作為 training data
#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain
AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))
Dogtrainsvm<-rbind(Dogtrainsvm,
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],])
choiceddata<-sample(1:dim(Dogtrainsvm)[1],10000)
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]
AdoL<-length(which(SVMtrain$OutcomeType=="Adoption"))
ReL<-length(which(SVMtrain$OutcomeType=="Return"))
EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(SVMtrain$OutcomeType=="Transfer"))
cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is 3442
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 2261
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 2199
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 2098
cat("\n")
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 28.55163 secs
testEr<-predict(model,SVMtest[,-2])
trainEr<-predict(model,SVMtrain[,-2])
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 3384 1022 1847 1186
## Euthanasia 28 976 331 374
## Return 0 4 1 0
## Transfer 30 197 82 538
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 2990 864 1683 1022
## Euthanasia 34 844 271 379
## Return 0 2 0 0
## Transfer 31 221 71 418
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))
trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.5101
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5184598
cat("\n")
rm(Dogtrainsvm)
method 3 : 補上安樂死的data 使他們balance。並使training data也balance NOT FIX!!!!!!!!!!!!!!!!!!
#remove DateTime & AnimalID
Dogtrainsvm<-Dogtrain
AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption"))
ReL<-length(which(Dogtrainsvm$OutcomeType=="Return"))
EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia"))
TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer"))
Dogtrainsvm<-rbind(Dogtrainsvm,
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],],
Dogtrainsvm[which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],])
(AdoL<-length(which(Dogtrainsvm$OutcomeType=="Adoption")))
## [1] 6497
(ReL<-length(which(Dogtrainsvm$OutcomeType=="Return")))
## [1] 4286
(EuL<-length(which(Dogtrainsvm$OutcomeType=="Euthanasia")))
## [1] 4095
(TrL<-length(which(Dogtrainsvm$OutcomeType=="Transfer")))
## [1] 3917
choiceddata<-
c(which(Dogtrainsvm$OutcomeType=="Adoption")[sample(1:AdoL,sample(2000:2500,1))],
which(Dogtrainsvm$OutcomeType=="Return")[sample(1:ReL,sample(2000:2500,1))],
which(Dogtrainsvm$OutcomeType=="Euthanasia")[sample(1:EuL,sample(2000:2500,1))],
which(Dogtrainsvm$OutcomeType=="Transfer")[sample(1:TrL,sample(2000:2500,1))]
)
SVMtrain<-Dogtrainsvm[choiceddata,]
SVMtest<-Dogtrainsvm[-choiceddata,]
(AdoL<-length(which(SVMtrain$OutcomeType=="Adoption")))
## [1] 2230
(ReL<-length(which(SVMtrain$OutcomeType=="Return")))
## [1] 2029
(EuL<-length(which(SVMtrain$OutcomeType=="Euthanasia")))
## [1] 2488
(TrL<-length(which(SVMtrain$OutcomeType=="Transfer")))
## [1] 2493
tic<-Sys.time()
model<-svm(OutcomeType~.,data = SVMtrain)
toc<-Sys.time()
toc-tic
## Time difference of 26.42086 secs
testEr<-predict(model,SVMtest[,-2])
trainEr<-predict(model,SVMtrain[,-2])
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,SVMtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 1048 101 318 520
## Euthanasia 178 1563 532 702
## Return 985 660 1126 761
## Transfer 19 164 53 510
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,SVMtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 1998 55 345 281
## Euthanasia 351 1010 592 398
## Return 1881 417 1273 427
## Transfer 37 125 47 318
cat("\n")
testErVa<-length(which((SVMtest$OutcomeType==testEr)==F))/(dim(Dogtrainsvm)[1]-length(choiceddata))
trainErVa<-length(which((SVMtrain$OutcomeType==trainEr)==F))/length(choiceddata)
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.540368
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5186813
cat("\n")
rm(Dogtrainsvm)
method 1 : Training data : 10000 ; Tree 600 & not adjust training data
DogtrainRF<-Dogtrain
choiceddata<-sample(1:dim(DogtrainRF)[1],10000)
RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]
tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 41.52177 secs
trainEr<-predict(model,RFtrain[,-2])
testEr<-predict(model,RFtest[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 3754 184 1518 1176
## Euthanasia 0 51 3 2
## Return 363 159 986 295
## Transfer 75 174 219 1041
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 2003 97 987 631
## Euthanasia 0 9 9 4
## Return 266 104 392 203
## Transfer 36 116 172 565
cat("\n")
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.4168
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.4692528
cat("\n")
#rm(DogtrainRF)
method 2 : 使training data中,各個OutcomeType的個數相等。 各個OutcomeType各取 500~600 隨機一個個數 分成 600 個 Tree
DogtrainRF<-Dogtrain
AdoL<-length(which(DogtrainRF$OutcomeType=="Adoption"))
ReL<-length(which(DogtrainRF$OutcomeType=="Return"))
EuL<-length(which(DogtrainRF$OutcomeType=="Euthanasia"))
TrL<-length(which(DogtrainRF$OutcomeType=="Transfer"))
choiceddata<-
c(which(DogtrainRF$OutcomeType=="Adoption")[sample(1:AdoL,sample(500:600,1))],
which(DogtrainRF$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],
which(DogtrainRF$OutcomeType=="Euthanasia")[sample(1:EuL,sample(500:600,1))],
which(DogtrainRF$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))]
)
RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]
tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 20.86515 secs
testEr<-predict(model,RFtest[,-2])
trainEr<-predict(model,RFtrain[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 326 21 54 72
## Euthanasia 47 427 96 91
## Return 112 95 377 102
## Transfer 16 52 29 308
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 2650 22 619 623
## Euthanasia 531 162 879 723
## Return 2308 76 1864 947
## Transfer 507 39 368 1051
cat("\n")
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.3537079
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5716209
cat("\n")
rm(DogtrainRF)
method 3 : Training data : 1000 ; Tree : 600 隨機選出10000筆的 training data 後,再使 trainig data 中的 OutcomeType balance。
DogtrainRF<-Dogtrain
choiceddata<-sample(1:dim(DogtrainRF)[1],10000)
RFtrain<-DogtrainRF[choiceddata,]
RFtest<-DogtrainRF[-choiceddata,]
AdoL<-length(which(RFtrain$OutcomeType=="Adoption"))
ReL<-length(which(RFtrain$OutcomeType=="Return"))
EuL<-length(which(RFtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(RFtrain$OutcomeType=="Transfer"))
RFtrain<-rbind(RFtrain,
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],],
RFtrain[which(RFtrain$OutcomeType=="Euthanasia")[sample(1:EuL,sample(400:500,1))],])
RFtrain<-rbind(RFtrain,
RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],],
RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],],
RFtrain[which(RFtrain$OutcomeType=="Transfer")[sample(1:TrL,sample(500:600,1))],])
RFtrain<-rbind(RFtrain,
RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],],
RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(500:600,1))],],
RFtrain[which(RFtrain$OutcomeType=="Return")[sample(1:ReL,sample(300:400,1))],])
AdoL<-length(which(RFtrain$OutcomeType=="Adoption"))
ReL<-length(which(RFtrain$OutcomeType=="Return"))
EuL<-length(which(RFtrain$OutcomeType=="Euthanasia"))
TrL<-length(which(RFtrain$OutcomeType=="Transfer"))
cat("The number of Adoption in the training data is ",AdoL)
## The number of Adoption in the training data is 4190
cat("\n")
cat("The number of Return to transfer in the training data is",ReL)
## The number of Return to transfer in the training data is 4106
cat("\n")
cat("The number of Euthanasia in the training data is",EuL)
## The number of Euthanasia in the training data is 4130
cat("\n")
cat("The number of Transfer in the training data is",TrL)
## The number of Transfer in the training data is 4140
cat("\n")
tic<-Sys.time()
model<-randomForest(OutcomeType~.,data=RFtrain,ntree=600)
toc<-Sys.time()
toc-tic
## Time difference of 44.51565 secs
testEr<-predict(model,RFtest[,-2])
trainEr<-predict(model,RFtrain[,-2])
plot(model, ylim=c(0,1))
legend('topright', colnames(model$err.rate), col=1:6, fill=1:6)
cat("The confusion Matrix of training data.")
## The confusion Matrix of training data.
confusionMatrix(trainEr,RFtrain$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 2277 85 459 812
## Euthanasia 381 3244 855 754
## Return 1481 569 2593 1139
## Transfer 51 232 199 1435
cat("\n")
cat("The confusion Matrix of testing data.")
## The confusion Matrix of testing data.
confusionMatrix(testEr,RFtest$OutcomeType)$table
## Reference
## Prediction Adoption Euthanasia Return Transfer
## Adoption 1138 23 292 294
## Euthanasia 224 173 385 286
## Return 905 94 837 371
## Transfer 40 49 88 395
cat("\n")
testErVa<-length(which((RFtest$OutcomeType==testEr)==F))/(dim(DogtrainRF)[1]-length(choiceddata))
trainErVa<-length(which((RFtrain$OutcomeType==trainEr)==F))/length(choiceddata)
cat("Training Error Rate =",trainErVa)
## Training Error Rate = 0.7017
cat("\n")
cat("Testing Error Rate =",testErVa)
## Testing Error Rate = 0.5454058
cat("\n")
rm(DogtrainRF)