This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# install.packages('readr')
library(readr)
# install.packages('rmarkdown')
library(ggplot2)
library('RColorBrewer')
# install.packages('ggplot2')
library(psych)

Attaching package: ‘psych’

The following objects are masked from ‘package:ggplot2’:

    %+%, alpha
# Eight-colour "Dark2" palette from RColorBrewer.
colors <- brewer.pal(8, "Dark2")
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
# Read train.csv into `pets` and print a per-column summary.
pets <- read_csv("train.csv")
summary(pets)
   AnimalID             Name              DateTime                   OutcomeType       
 Length:26729       Length:26729       Min.   :2013-10-01 09:31:00   Length:26729      
 Class :character   Class :character   1st Qu.:2014-05-31 16:31:00   Class :character  
 Mode  :character   Mode  :character   Median :2014-12-13 17:10:00   Mode  :character  
                                       Mean   :2014-12-19 00:22:23                     
                                       3rd Qu.:2015-07-19 19:48:00                     
                                       Max.   :2016-02-21 19:17:00                     
 OutcomeSubtype      AnimalType        SexuponOutcome     AgeuponOutcome        Breed          
 Length:26729       Length:26729       Length:26729       Length:26729       Length:26729      
 Class :character   Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                                               
                                                                                               
                                                                                               
    Color          
 Length:26729      
 Class :character  
 Mode  :character  
                   
                   
                   
# List the column names of the loaded data.
names(pets)
 [1] "AnimalID"       "Name"           "DateTime"       "OutcomeType"    "OutcomeSubtype"
 [6] "AnimalType"     "SexuponOutcome" "AgeuponOutcome" "Breed"          "Color"         

# split data by dog and cats




#Data Splits 

#SexuponOutcome -> 
#Gender 
#Male | Female
#Repro
#SN (Spayed/Neutered) | Intact | Unknown (leave out unknown)

#DateTime ->
# Month - Day - Year

# Data Transforms
  # $AgeuponOutcome
    # convert to years
#Split data by dog & cat?



# 
# split data by dog and cats


# factors

# outcome types and subtypes

# Store both outcome columns as factors.
pets$OutcomeType <- factor(pets$OutcomeType)
pets$OutcomeSubtype <- factor(pets$OutcomeSubtype)

# Primary colour: take the part of Color before the first "/", then the first
# space-separated word of that part, and store the result as a factor.
pets$PrimaryColor <- factor(
  word(word(pets$Color, sep = fixed("/")), sep = fixed(" "))
)

# Display the resulting factor.
pets$PrimaryColor

# counts



# counts


# Convert the free-text AgeuponOutcome column ("<count> <unit>") into a numeric
# Age column expressed in days, and attach it to every row of `pets` as `petsd`.
# Rows whose age is missing, or whose unit is not one of the units below, keep
# Age = NA.

# Unit word of an age string (second space-separated word).
age_unit <- function(age) word(age, start = 2, sep = fixed(" "))

# Leading count of an age string (first space-separated word) as a double.
age_count <- function(age) as.double(word(age, start = 1, sep = fixed(" ")))

# Rows of `pets` grouped by the unit their age is recorded in. `%in%` never
# yields NA, so rows with a missing age fall into none of these subsets.
pdays   <- filter(pets, age_unit(AgeuponOutcome) %in% c("day", "days"))
pmonths <- filter(pets, age_unit(AgeuponOutcome) %in% c("month", "months"))
pweeks  <- filter(pets, age_unit(AgeuponOutcome) %in% c("week", "weeks"))
pyears  <- filter(pets, age_unit(AgeuponOutcome) %in% c("year", "years"))

# One (AnimalID, Age) table per unit; the multiplier is days per unit.
# 7.019 and 30.4 are close to 365 / 52 and 365 / 12 -- presumably chosen so
# that 52 weeks or 12 months come out near one 365-day year (TODO confirm).
pweek  <- select(mutate(pweeks,  Age = age_count(AgeuponOutcome) * 7.019),
                 AnimalID, Age)
pmonth <- select(mutate(pmonths, Age = age_count(AgeuponOutcome) * 30.4),
                 AnimalID, Age)
pday   <- select(mutate(pdays,   Age = age_count(AgeuponOutcome) * 1.0),
                 AnimalID, Age)
pyear  <- select(mutate(pyears,  Age = age_count(AgeuponOutcome) * 365),
                 AnimalID, Age)

# Stack the four tables. A full outer merge on both columns acts as a union
# here; the key columns are spelled out instead of relying on merge()'s
# implicit "all shared column names" default.
age_keys <- c("AnimalID", "Age")
pyd      <- merge(pyear, pday, by = age_keys, all = TRUE)
pmw      <- merge(pweek, pmonth, by = age_keys, all = TRUE)
new_pets <- merge(pyd, pmw, by = age_keys, all = TRUE)

# Attach Age to the full data set; all.y keeps pets without a parsed age.
petsd <- merge(new_pets, pets, by = "AnimalID", all.y = TRUE)

# Mean age in days over the rows that have one.
mean(petsd$Age, na.rm = TRUE)
[1] 794.6716
# Drop two of the intermediate age tables.
rm(new_pets, pmw)

# Idea kept for reference (not run): overwrite the Age of "0 years" records
# with the overall mean age in days (794.6716, written as 794 below).
# petsd$Age
# zyr <- filter(petsd, AgeuponOutcome == '0 years')
# zyr$Age = 794
# zyr
# petsd <- merge(petsd, zyr, all.x = TRUE)

#transforms

#SexuponOutcome -> 
#Gender 
#Male | Female
#Repro
#SN (Spayed/Neutered) | Intact | Unknown (leave out unknown)
  # gender <- (word(x, start = 2, sep = fixed(" ")))
  # age <- as.numeric((word (x, start = 1, sep = fixed(" "))))
# Show the distinct SexuponOutcome values as factor levels.
factor(pets$SexuponOutcome)

# Repro: first space-separated word of SexuponOutcome, stored as a factor.
petsd <- mutate(
  petsd,
  Repro = word(SexuponOutcome, start = 1, sep = fixed(" "))
)
petsd$Repro <- as.factor(petsd$Repro)
petsd$SexuponOutcome

# Sex: second word of SexuponOutcome; NA when the value is "Unknown" or
# missing. Computed in place rather than by filtering the known rows, mutating
# them and merging the subset back on every shared column, which depended on
# exact matches across all columns and would multiply any duplicated rows.
# NOTE(review): this keeps the current row order of petsd, whereas merge()
# re-sorted on the key columns -- the two agree as long as petsd is already
# ordered by a unique AnimalID; confirm if row order matters downstream.
petsd <- mutate(
  petsd,
  Sex = ifelse(
    SexuponOutcome != "Unknown",
    word(SexuponOutcome, start = 2),
    NA_character_
  )
)
petsd$Sex <- as.factor(petsd$Sex)

petsd$Breed <- as.factor(petsd$Breed)


petsd$AnimalType <- as.factor(petsd$AnimalType)
# write.csv(petsd, file = "pets.csv")

# Overwrite the intermediate age tables with NULL (the names stay defined).
pday <- pdays <- pweek <- pweeks <- NULL
pmonth <- pmonths <- pyear <- pyears <- pyd <- NULL

# Display pets.
pets
# A tibble: 26,729 x 11
   AnimalID    Name            DateTime     OutcomeType OutcomeSubtype AnimalType SexuponOutcome
      <chr>   <chr>              <time>          <fctr>         <fctr>      <chr>          <chr>
1   A671945 Hambone 2014-02-12 18:22:00 Return_to_owner             NA        Dog  Neutered Male
2   A656520   Emily 2013-10-13 12:44:00      Euthanasia      Suffering        Cat  Spayed Female
3   A686464  Pearce 2015-01-31 12:28:00        Adoption         Foster        Dog  Neutered Male
4   A683430    <NA> 2014-07-11 19:09:00        Transfer        Partner        Cat    Intact Male
5   A667013    <NA> 2013-11-15 12:52:00        Transfer        Partner        Dog  Neutered Male
6   A677334    Elsa 2014-04-25 13:04:00        Transfer        Partner        Dog  Intact Female
7   A699218   Jimmy 2015-03-28 13:11:00        Transfer        Partner        Cat    Intact Male
8   A701489    <NA> 2015-04-30 17:02:00        Transfer        Partner        Cat        Unknown
9   A671784    Lucy 2014-02-04 17:17:00        Adoption             NA        Dog  Spayed Female
10  A677747    <NA> 2014-05-03 07:48:00        Adoption        Offsite        Dog  Spayed Female
# ... with 26,719 more rows, and 4 more variables: AgeuponOutcome <chr>, Breed <chr>, Color <chr>,
#   PrimaryColor <fctr>
library('Amelia')
Loading required package: Rcpp
## 
## Amelia II: Multiple Imputation
## (Version 1.7.4, built: 2015-12-05)
## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
## 
# Draw the missingness map for petsd.
missmap(petsd, main = "Missing Map")

# Recode Sex as 1 for "Male" and 0 otherwise; NA stays NA.
is_male <- petsd$Sex == "Male"
petsd$Sex <- ifelse(is_male, 1, 0)

# Spay_neut: 1 when Repro is "Spayed" or "Neutered", 0 otherwise.
is_altered <- petsd$Repro == "Spayed" | petsd$Repro == "Neutered"
petsd$Spay_neut <- ifelse(is_altered, 1, 0)

# Drop the raw age text and add the age in years (Age is in days).
petsd$AgeuponOutcome <- NULL
petsd$AgeYrs <- petsd$Age / 365

# Print the fitted model summary.
# NOTE(review): model12 is not assigned in any code visible above this line --
# confirm that the chunk which fits it has been run first.
print(model12)
Boosted Logistic Regression 

11134 samples
   15 predictor
    5 classes: 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer' 

Pre-processing: scaled (6), Yeo-Johnson transformation (6) 
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 9094, 9094, 9095, 9094, 9094, 9095, ... 
Resampling results across tuning parameters:

  nIter  Accuracy   Kappa    
  11     0.8319528  0.6890023
  21     0.8375985  0.6997172
  31     0.8345562  0.6946716

Accuracy was used to select the optimal model using  the largest value.
The final value used for the model was nIter = 21. 

# List the columns of petsd after the transformations above.
names(petsd)


LS0tCnRpdGxlOiAiRmluYWwgQW5pbWFsIFNoZWx0ZXIgQ29kZSIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ21kK1NoaWZ0K0VudGVyKi4gCgpgYGB7ciBzZXR1cDIyfQojIGluc3RhbGwucGFja2FnZXMocmVhZHIpCmxpYnJhcnkocmVhZHIpCiMgaW5zdGFsbC5wYWNrYWdlcygncm1hcmtkb3duJykKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KCdSQ29sb3JCcmV3ZXInKQojIGluc3RhbGwucGFja2FnZXMoJ2dncGxvdDInKQpsaWJyYXJ5KHBzeWNoKQpjb2xvcnMgPSBicmV3ZXIucGFsKDgsICJEYXJrMiIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoc3RyaW5ncikKCnBldHMgPSByZWFkX2NzdigndHJhaW4uY3N2JykKc3VtbWFyeShwZXRzKQpuYW1lcyhwZXRzKQoKYGBgCgpgYGB7ciBkYXRhIHRyYW5zZm9ybXN9CgojIHNwbGl0IGRhdGEgYnkgZG9nIGFuZCBjYXRzCgoKCgojRGF0YSBTcGxpdHMgCgojU2V4dXBvbk91dGNvbWUgLT4gCiNHZW5kZXIgCiNNYWxlIHwgRmVtYWxlCiNSZXBybwojU04gKFNwYXllZC9OZXV0ZXJlZCkgfCBJbnRhY3QgfCBVbmtub3duIChsZWF2ZSBvdXQgdW5rbm93bikKCiNEYXRlVGltZSAtPgojIE1vbnRoIC0gRGF5IC0gWWVhcgoKIyBEYXRhIFRyYW5zZm9ybXMKICAjICRBZ2V1cG9uT3V0Y29tZQogICAgIyBjb252ZXJ0IHRvIHllYXJzCiNTcGxpdCBkYXRhIGJ5IGRvZyAmIGNhdD8KCgoKIyAKIyBzcGxpdCBkYXRhIGJ5IGRvZyBhbmQgY2F0cwoKCiMgZmFjdG9ycwoKIyBvdXRjb21lIHR5cGVzIGFuZCBzdWJ0eXBlcwoKcGV0cyRPdXRjb21lVHlwZSA9IGZhY3RvcihwZXRzJE91dGNvbWVUeXBlKQpwZXRzJE91dGNvbWVTdWJ0eXBlID0gZmFjdG9yKHBldHMkT3V0Y29tZVN1YnR5cGUpCgojIGNvbnZlcnRpbmcgY29sb3VyIHRvIGZpcnN0IGNvbG91ciBhcyBwcmltYXJ5IGNvbG91cgpwZXRzJFByaW1hcnlDb2xvciA8LSB3b3JkKHBldHMkQ29sb3IsIHNlcCA9IGZpeGVkKCIvIikpCnBldHMkUHJpbWFyeUNvbG9yIDwtIHdvcmQocGV0cyRQcmltYXJ5Q29sb3IsIHNlcCA9IGZpeGVkKCIgIikpCgojIGZhY3RvciBwcmltYXJ5IGNvbG91cgpwZXRzJFByaW1hcnlDb2xvciA8LSBmYWN0b3IocGV0cyRQcmltYXJ5Q29sb3IpCnBldHMkUHJpbWFyeUNvbG9yCgojIGNvdW50cwoKCgpgYGAKCgpgYGB7ciBkYXRhIHRyYW5zZm9ybSBhZ2V9CgpwZGF5cyA8LSBmaWx0ZXIocGV0cywgKHdvcmQoQWdldXBvbk91dGNvbWUs
IHN0YXJ0ID0gMiwgIHNlcCA9IGZpeGVkKCIgIikpKSAlaW4lIGMoImRheSIsICJkYXlzIikpCnBtb250aHMgPC0gZmlsdGVyKHBldHMsICh3b3JkKEFnZXVwb25PdXRjb21lLCBzdGFydCA9IDIsICBzZXAgPSBmaXhlZCgiICIpKSkgJWluJSBjKCJtb250aCIsICJtb250aHMiKSkKcHdlZWtzIDwtIGZpbHRlcihwZXRzLCAod29yZChBZ2V1cG9uT3V0Y29tZSwgc3RhcnQgPSAyLCAgc2VwID0gZml4ZWQoIiAiKSkpICVpbiUgYygid2VlayIsICJ3ZWVrcyIpKQpweWVhcnMgPC0gZmlsdGVyKHBldHMsICh3b3JkKEFnZXVwb25PdXRjb21lLCBzdGFydCA9IDIsICBzZXAgPSBmaXhlZCgiICIpKSkgJWluJSBjKCJ5ZWFyIiwgInllYXJzIikpCgpwd2VlayA8LSBtdXRhdGUocHdlZWtzLCBBZ2UgPSAoYXMuZG91YmxlKHdvcmQoQWdldXBvbk91dGNvbWUsIHN0YXJ0ID0gMSwgIHNlcCA9IGZpeGVkKCIgIikpKSkgKiA3LjAxOSkKcHdlZWsgPC0gc2VsZWN0KHB3ZWVrLCBBbmltYWxJRCwgQWdlKQoKcG1vbnRoIDwtIG11dGF0ZShwbW9udGhzLCBBZ2UgPSAoYXMuZG91YmxlKHdvcmQoQWdldXBvbk91dGNvbWUsIHN0YXJ0ID0gMSwgIHNlcCA9IGZpeGVkKCIgIikpKSkgKiAzMC40KQpwbW9udGggPC0gc2VsZWN0KHBtb250aCwgQW5pbWFsSUQsIEFnZSkKCnBkYXkgPC0gbXV0YXRlKHBkYXlzLCBBZ2UgPSAoYXMuZG91YmxlKHdvcmQoQWdldXBvbk91dGNvbWUsIHN0YXJ0ID0gMSwgIHNlcCA9IGZpeGVkKCIgIikpKSkgKiAxLjApCnBkYXkgPC0gc2VsZWN0KHBkYXksIEFuaW1hbElELCBBZ2UpCgpweWVhciA8LSBtdXRhdGUocHllYXJzLCBBZ2UgPSBhcy5kb3VibGUoKGFzLmRvdWJsZSh3b3JkKEFnZXVwb25PdXRjb21lLCBzdGFydCA9IDEsICBzZXAgPSBmaXhlZCgiICIpKSkpICogMzY1KSkKcHllYXIgPC0gc2VsZWN0KHB5ZWFyLCBBbmltYWxJRCwgQWdlKQoKcG13IDwtIE5VTEwKcHlkIDwtIE5VTEwKbmV3X3BldHMgPC0gTlVMTApweWQgPC0gbWVyZ2UocHllYXIsIHBkYXksIGFsbCA9IFQpCnBtdyA8LSBtZXJnZShwd2VlaywgcG1vbnRoLCBhbGwgPSBUKQoKbmV3X3BldHMgPC0gTlVMTApuZXdfcGV0cyA8LSBtZXJnZShweWQsIHBtdywgYWxsID0gVCkKcGV0c2QgPC0gbWVyZ2UobmV3X3BldHMsIHBldHMsIGFsbC55ID0gVCkKCm1lYW4ocGV0c2QkQWdlLCBuYS5ybT1UKQoKcmVtb3ZlKG5ld19wZXRzKQpyZW1vdmUocG13KQojIGNoYW5nZSAwIHZhbHVlcyB0aGF0IGVxdWFsICIwIFllYXJzIiB0byB0aGUgYXZlcmFnZSBmb3IgYWxsIAojIDc5NC42NzE2CiMgcGV0c2QkQWdlCiMgenlyIDwtIGZpbHRlcihwZXRzZCwgQWdldXBvbk91dGNvbWUgPT0gJzAgeWVhcnMnKQojIHp5ciRBZ2UgPSA3OTQKIyB6eXIKIyBwZXRzZCA8LSBtZXJnZShwZXRzZCwgenlyLCBhbGwueCA9IFRSVUUpCgpgYGAKCgoKYGBge3Igc2V4IHVwb24gb3V0Y29tZSBhbmQgZmFjdG9yaW5nfQoKI3RyYW5zZm9ybXMKCiNTZXh1cG9uT3V0Y29tZSAtPiAKI0dlbmRlciAKI01hbGUg
fCBGZW1hbGUKI1JlcHJvCiNTTiAoU3BheWVkL05ldXRlcmVkKSB8IEludGFjdCB8IFVua25vd24gKGxlYXZlIG91dCB1bmtub3duKQogICMgZ2VuZGVyIDwtICh3b3JkKHgsIHN0YXJ0ID0gMiwgc2VwID0gZml4ZWQoIiAiKSkpCiAgIyBhZ2UgPC0gYXMubnVtZXJpYygod29yZCAoeCwgc3RhcnQgPSAxLCBzZXAgPSBmaXhlZCgiICIpKSkpCmZhY3RvcihwZXRzJFNleHVwb25PdXRjb21lKQoKcGV0c2QgPC0gbXV0YXRlKHBldHNkLCBSZXBybyA9IHdvcmQoU2V4dXBvbk91dGNvbWUsIHN0YXJ0ID0gMSwgc2VwID0gZml4ZWQoIiAiKSkgKQpwZXRzZCRSZXBybyA8LSBhcy5mYWN0b3IocGV0c2QkUmVwcm8pCnBldHNkJFNleHVwb25PdXRjb21lCgoKCm5vdF91bmsgPC0gZmlsdGVyKHBldHNkLCBTZXh1cG9uT3V0Y29tZSAhPSAiVW5rbm93biIpCm5vdF91bmsgPC0gbXV0YXRlKG5vdF91bmssIFNleCA9IHdvcmQoU2V4dXBvbk91dGNvbWUsIHN0YXJ0ID0gMikpCnBldHNkIDwtIG1lcmdlKG5vdF91bmssIHBldHNkLCBhbGwueSA9IFQpCnJlbW92ZShub3RfdW5rKQpwZXRzZCRTZXggPC0gYXMuZmFjdG9yKHBldHNkJFNleCkKCnBldHNkJEJyZWVkIDwtIGFzLmZhY3RvcihwZXRzZCRCcmVlZCkKCgpwZXRzZCRBbmltYWxUeXBlIDwtIGFzLmZhY3RvcihwZXRzZCRBbmltYWxUeXBlKQoKYGBgCgoKYGBge3IgY3Jvc3Nmb2xkIGFuZCBtb2RlbH0KCiMgd3JpdGUuY3N2KHBldHNkLCBmaWxlID0gInBldHMuY3N2IikKcHllYXJzIDwtIHB3ZWVrcyA8LSBwd2VlayA8LSBweWQgPC0gcHllYXIgPC0gcHllYXJzIDwtIHBtb250aCA8LSBwbW9udGhzIDwtIHBkYXlzIDwtIHBkYXkgPC0gTlVMTAoKcGV0cwpsaWJyYXJ5KCdBbWVsaWEnKQptaXNzbWFwKHBldHNkLCBtYWluPSJNaXNzaW5nIE1hcCIpCgoKcGV0c2QkU2V4IDwtIGlmZWxzZShwZXRzZCRTZXggPT0gJ01hbGUnLCAxLDApCnBldHNkJFNwYXlfbmV1dCA8LSBpZmVsc2UocGV0c2QkUmVwcm8gPT0gJ1NwYXllZCcgfCBwZXRzZCRSZXBybyA9PSAnTmV1dGVyZWQnLCAxLDApCgoKcGV0c2QkQWdldXBvbk91dGNvbWUgPC0gTlVMTAoKcGV0c2QkQWdlWXJzIDwtIHBldHNkJEFnZS8zNjUKYGBgCgpgYGB7cn0KCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocGFydHkpCmxpYnJhcnkocGFydHlraXQpCj9wYXJ0eQo/cGFydHlraXQKCmNvbnRyb2wgPSB0cmFpbkNvbnRyb2wobWV0aG9kPSJyZXBlYXRlZGN2IiwgbnVtYmVyPTMsIHJlcGVhdHM9MTApCgoKCmxpYnJhcnkoZHBseXIpCmRvZ3MgPC0gZmlsdGVyKHBldHNkLCBBbmltYWxUeXBlID09ICdEb2cnKQpjYXRzIDwtIGZpbHRlcihwZXRzZCwgQW5pbWFsVHlwZSA9PSAnQ2F0JykKCmNhdHMkQWdlWXJzTCA8LSBsb2coY2F0cyRBZ2VZcnMpCgpmaXRDb250cm9sPC10cmFpbkNvbnRyb2wobWV0aG9kPSJjdiIsCm51bWJlcj0xMCwgY2xhc3NQcm9icyA9IFRSVUUpCgptb2RlbDEyIDwtIHRyYWluKE91dGNvbWVUeXBlIH4gUmVwcm8gKyBTZXggKyBTcGF5X25ldXQg
KyBBZ2VZcnNMLAogICAgICAgICAgICAgICAgZGF0YSA9IGNhdHMsIAogICAgICAgICAgICAgICAgcHJlUHJvY2Vzcz1jKCJzY2FsZSIsIlllb0pvaG5zb24iKSwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJMb2dpdEJvb3N0IiwKICAgICAgICAgICAgICAgIHRyQ29udHJvbCA9IGZpdENvbnRyb2wKICAgICAgICAgICAgICAgICkKCnByaW50KG1vZGVsMTIpCnBsb3QobW9kZWwxMikKCiMgZG9ncyAtIDgyLTgzJQojIGRvZ3MgLSA4Mi04MyUKCnByaW50KG1vZGVsMTIpCgoKYGBgCgoKYGBge3IgZmVhdHVyZSBlbmdpbmVlcmluZ30KCm5hbWVzKHBldHNkKQoKCgpgYGAKCg==