This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# install.packages(readr)
library(readr)
# install.packages('rmarkdown')
library(ggplot2)
library('RColorBrewer')
# install.packages('ggplot2')
library(psych)
Attaching package: ‘psych’
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
colors = brewer.pal(8, "Dark2")
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
pets = read_csv('train.csv')
summary(pets)
AnimalID Name DateTime OutcomeType
Length:26729 Length:26729 Min. :2013-10-01 09:31:00 Length:26729
Class :character Class :character 1st Qu.:2014-05-31 16:31:00 Class :character
Mode :character Mode :character Median :2014-12-13 17:10:00 Mode :character
Mean :2014-12-19 00:22:23
3rd Qu.:2015-07-19 19:48:00
Max. :2016-02-21 19:17:00
OutcomeSubtype AnimalType SexuponOutcome AgeuponOutcome Breed
Length:26729 Length:26729 Length:26729 Length:26729 Length:26729
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
Color
Length:26729
Class :character
Mode :character
names(pets)
[1] "AnimalID" "Name" "DateTime" "OutcomeType" "OutcomeSubtype"
[6] "AnimalType" "SexuponOutcome" "AgeuponOutcome" "Breed" "Color"
# split data by dog and cats
#Data Splits
#SexuponOutcome ->
#Gender
#Male | Female
#Repro
#SN (Spayed/Neutered) | Intact | Unknown (leave out unknown)
#DateTime ->
# Month - Day - Year
# Data Transforms
# $AgeuponOutcome
# convert to years
#Split data by dog & cat?
#
# split data by dog and cats
# factors
# outcome types and subtypes
pets$OutcomeType = factor(pets$OutcomeType)
pets$OutcomeSubtype = factor(pets$OutcomeSubtype)
# converting colour to first colour as primary colour
pets$PrimaryColor <- word(pets$Color, sep = fixed("/"))
pets$PrimaryColor <- word(pets$PrimaryColor, sep = fixed(" "))
# factor primary colour
pets$PrimaryColor <- factor(pets$PrimaryColor)
pets$PrimaryColor
# counts
pdays <- filter(pets, (word(AgeuponOutcome, start = 2, sep = fixed(" "))) %in% c("day", "days"))
pmonths <- filter(pets, (word(AgeuponOutcome, start = 2, sep = fixed(" "))) %in% c("month", "months"))
pweeks <- filter(pets, (word(AgeuponOutcome, start = 2, sep = fixed(" "))) %in% c("week", "weeks"))
pyears <- filter(pets, (word(AgeuponOutcome, start = 2, sep = fixed(" "))) %in% c("year", "years"))
pweek <- mutate(pweeks, Age = (as.double(word(AgeuponOutcome, start = 1, sep = fixed(" ")))) * 7.019)
pweek <- select(pweek, AnimalID, Age)
pmonth <- mutate(pmonths, Age = (as.double(word(AgeuponOutcome, start = 1, sep = fixed(" ")))) * 30.4)
pmonth <- select(pmonth, AnimalID, Age)
pday <- mutate(pdays, Age = (as.double(word(AgeuponOutcome, start = 1, sep = fixed(" ")))) * 1.0)
pday <- select(pday, AnimalID, Age)
pyear <- mutate(pyears, Age = as.double((as.double(word(AgeuponOutcome, start = 1, sep = fixed(" ")))) * 365))
pyear <- select(pyear, AnimalID, Age)
pmw <- NULL
pyd <- NULL
new_pets <- NULL
pyd <- merge(pyear, pday, all = T)
pmw <- merge(pweek, pmonth, all = T)
new_pets <- NULL
new_pets <- merge(pyd, pmw, all = T)
petsd <- merge(new_pets, pets, all.y = T)
mean(petsd$Age, na.rm=T)
[1] 794.6716
remove(new_pets)
remove(pmw)
# change 0 values that equal "0 Years" to the average for all
# 794.6716
# petsd$Age
# zyr <- filter(petsd, AgeuponOutcome == '0 years')
# zyr$Age = 794
# zyr
# petsd <- merge(petsd, zyr, all.x = TRUE)
#transforms
#SexuponOutcome ->
#Gender
#Male | Female
#Repro
#SN (Spayed/Neutered) | Intact | Unknown (leave out unknown)
# gender <- (word(x, start = 2, sep = fixed(" ")))
# age <- as.numeric((word (x, start = 1, sep = fixed(" "))))
factor(pets$SexuponOutcome)
petsd <- mutate(petsd, Repro = word(SexuponOutcome, start = 1, sep = fixed(" ")) )
petsd$Repro <- as.factor(petsd$Repro)
petsd$SexuponOutcome
not_unk <- filter(petsd, SexuponOutcome != "Unknown")
not_unk <- mutate(not_unk, Sex = word(SexuponOutcome, start = 2))
petsd <- merge(not_unk, petsd, all.y = T)
remove(not_unk)
petsd$Sex <- as.factor(petsd$Sex)
petsd$Breed <- as.factor(petsd$Breed)
petsd$AnimalType <- as.factor(petsd$AnimalType)
# write.csv(petsd, file = "pets.csv")
pyears <- pweeks <- pweek <- pyd <- pyear <- pyears <- pmonth <- pmonths <- pdays <- pday <- NULL
pets
# A tibble: 26,729 x 11
AnimalID Name DateTime OutcomeType OutcomeSubtype AnimalType SexuponOutcome
<chr> <chr> <time> <fctr> <fctr> <chr> <chr>
1 A671945 Hambone 2014-02-12 18:22:00 Return_to_owner NA Dog Neutered Male
2 A656520 Emily 2013-10-13 12:44:00 Euthanasia Suffering Cat Spayed Female
3 A686464 Pearce 2015-01-31 12:28:00 Adoption Foster Dog Neutered Male
4 A683430 <NA> 2014-07-11 19:09:00 Transfer Partner Cat Intact Male
5 A667013 <NA> 2013-11-15 12:52:00 Transfer Partner Dog Neutered Male
6 A677334 Elsa 2014-04-25 13:04:00 Transfer Partner Dog Intact Female
7 A699218 Jimmy 2015-03-28 13:11:00 Transfer Partner Cat Intact Male
8 A701489 <NA> 2015-04-30 17:02:00 Transfer Partner Cat Unknown
9 A671784 Lucy 2014-02-04 17:17:00 Adoption NA Dog Spayed Female
10 A677747 <NA> 2014-05-03 07:48:00 Adoption Offsite Dog Spayed Female
# ... with 26,719 more rows, and 4 more variables: AgeuponOutcome <chr>, Breed <chr>, Color <chr>,
# PrimaryColor <fctr>
library('Amelia')
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.7.4, built: 2015-12-05)
## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
missmap(petsd, main="Missing Map")
petsd$Sex <- ifelse(petsd$Sex == 'Male', 1,0)
petsd$Spay_neut <- ifelse(petsd$Repro == 'Spayed' | petsd$Repro == 'Neutered', 1,0)
petsd$AgeuponOutcome <- NULL
petsd$AgeYrs <- petsd$Age/365
print(model12)
Boosted Logistic Regression
11134 samples
15 predictor
5 classes: 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'
Pre-processing: scaled (6), Yeo-Johnson transformation (6)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 9094, 9094, 9095, 9094, 9094, 9095, ...
Resampling results across tuning parameters:
nIter Accuracy Kappa
11 0.8319528 0.6890023
21 0.8375985 0.6997172
31 0.8345562 0.6946716
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was nIter = 21.
names(petsd)