There are millions of stray pets around the world; some are fortunate enough to be adopted, while many others are not. While adoption is often the definition of success, how quickly a pet is adopted is also a key success factor - pets that take a long time to be adopted contribute to over-crowded animal shelters and can prevent shelters from taking in new strays. Sadly, pets that are not adopted eventually need to be euthanized.
Target variable (AdoptionSpeed) description: predict how quickly, if at all, a pet is adopted.
The values are determined in the following way: 0 - Pet was adopted on the same day as it was listed. 1 - Pet was adopted between 1 and 7 days (1st week) after being listed. 2 - Pet was adopted between 8 and 30 days (1st month) after being listed. 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed. 4 - No adoption after 100 days of being listed. (According to the dataset description, no pets in the data waited between 90 and 100 days, so the gap between categories 3 and 4 is never used.)
This document loads the transformed training data and completes an analysis of the features, with visuals.
# Global chunk options: show the code of every chunk, but keep package
# messages and warnings out of the rendered report.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
library(reshape)
library(ggplot2)
library(purrr)
library(psych)
library(tidyr)
library(corrplot)
Load the data
## Type Breed1 Gender Color1 Color2 Color3 MaturitySize FurLength Vaccinated
## 1 1 0 2 2 0 0 2 1 1
## 2 1 0 2 2 3 0 2 1 1
## Dewormed Sterilized Health Quantity Fee VideoAmt PhotoAmt AdoptionSpeed
## 1 1 2 1 1 0 0 3 3
## 2 1 2 1 1 100 0 5 4
## AgeYears NumColors AllMeds
## 1 0 1 4
## 2 0 2 4
# Rank every feature by the strength of its linear relationship with
# AdoptionSpeed.
#
# Args:
#   data: data frame of numeric columns that includes an `AdoptionSpeed`
#     column.
#
# Returns:
#   A data frame with one row per column of `data` other than `AdoptionSpeed`:
#   `colnames_vector` holds the column name and `res_vector` the absolute
#   Pearson correlation with `AdoptionSpeed`, rounded to 4 decimals. Rows are
#   sorted from strongest to weakest; undefined (NA) correlations sort last.
correlations <- function(data) {
  stopifnot("AdoptionSpeed" %in% names(data))

  # Exclude the target by name. Dropping "the first row after sorting" is only
  # correct while no feature ties the target's perfect self-correlation; with
  # such a tie the feature would be removed and AdoptionSpeed kept.
  is_target <- names(data) == "AdoptionSpeed"
  colnames_vector <- names(data)[!is_target]
  if (length(colnames_vector) == 0L) {
    return(data.frame(colnames_vector = character(), res_vector = numeric()))
  }

  # One vectorised call yields every feature's Pearson r. Pairwise deletion
  # bases each estimate on the rows that are complete for that feature.
  pearson_r <- stats::cor(
    data[!is_target],
    data[["AdoptionSpeed"]],
    method = "pearson",
    use = "pairwise.complete.obs"
  )
  res_vector <- abs(round(as.numeric(pearson_r), 4))

  # order() keeps ties in their original column order and puts NA last.
  strongest_first <- order(-res_vector)
  data.frame(
    colnames_vector = colnames_vector[strongest_first],
    res_vector = res_vector[strongest_first]
  )
}
# All pets (dogs and cats): absolute correlation of each feature with
# AdoptionSpeed, drawn as bars ordered by correlation strength.
corrdf_sort <- correlations(data)
ggplot(
  data = corrdf_sort,
  mapping = aes(x = reorder(colnames_vector, res_vector), y = res_vector)
) +
  geom_col() +
  ggtitle("Correlations to Adoption Speed - Dogs and Cats") +
  theme(axis.text.x = element_text(angle = 90, size = 10))
# Dogs only (Type == 1). Type no longer varies after the filter, so the
# column is dropped before the correlations are computed.
dog_data <- subset(filter(data, Type == 1), select = -Type)
corrdf_sort <- correlations(dog_data)

# Bar chart of the dog-only correlations, ordered by correlation strength.
ggplot(
  data = corrdf_sort,
  mapping = aes(x = reorder(colnames_vector, res_vector), y = res_vector)
) +
  geom_col() +
  ggtitle("Correlations to Adoption Speed - Dogs") +
  theme(axis.text.x = element_text(angle = 90, size = 10))
# Cats only (Type == 2). Type no longer varies after the filter, so the
# column is dropped before the correlations are computed.
cat_data <- subset(filter(data, Type == 2), select = -Type)
corrdf_sort <- correlations(cat_data)

# Bar chart of the cat-only correlations, ordered by correlation strength.
ggplot(
  data = corrdf_sort,
  mapping = aes(x = reorder(colnames_vector, res_vector), y = res_vector)
) +
  geom_col() +
  ggtitle("Correlations to Adoption Speed - Cats") +
  theme(axis.text.x = element_text(angle = 90, size = 10))
# Pairwise correlation matrix of all numeric columns: coefficients as numbers
# in the upper triangle, shaded cells in the lower triangle.
corrplot.mixed(
  data %>% keep(is.numeric) %>% cor(),
  lower = "shade",
  upper = "number",
  tl.pos = "lt",
  tl.col = "black",
  tl.srt = 45,
  shade.col = NA
)
# One boxplot per column: the distribution of AdoptionSpeed within each
# distinct value of that column, titled with the column name.
column_names <- names(data)
# seq_along() yields an empty sequence for a frame without columns, whereas
# 1:ncol(data) would iterate over c(1, 0) and fail.
for (i in seq_along(column_names)) {
  boxplot(
    AdoptionSpeed ~ data[[i]],
    data = data,
    main = column_names[i],
    # Label the x axis with the column name rather than the deparsed
    # `data[[i]]` formula term.
    xlab = column_names[i]
  )
}
# One bar chart per column showing how often each distinct value occurs.
# `column_names` holds names(data) and is assigned before the boxplot loop.
# seq_len() yields an empty sequence for a frame without columns, whereas
# 1:ncol(data) would iterate over c(1, 0) and fail.
for (i in seq_len(ncol(data))) {
  counts <- table(data[[i]])
  barplot(counts, main = column_names[i])
}