# This analysis compares dog breeds in the US with regard to cost. The source of this data is Kaggle: (https://www.kaggle.com/paultimothymooney/best-in-show-data-about-dogs?select=best_in_show.csv)

library(tidyverse) # attaches the core tidyverse packages (ggplot2, dplyr, tidyr, readr, ...); it does not install them
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr) # attaches dplyr; already attached by library(tidyverse) above, so this is redundant
# NOTE(review): setwd() with an absolute, user-specific path only works on the
# author's machine and changes the working directory for the rest of the
# session. Consider a project-relative path instead.
setwd("C:/Users/Dano/Documents/") # sets the working directory
# Reads the CSV from the working directory into the tibble `dogs`.
dogs <- read_csv("best_in_show.csv")
## Warning: Duplicated column names deduplicated: 'avg purchase price' => 'avg
## purchase price_1' [30]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   `datadog score` = col_double(),
##   popularity = col_double(),
##   `TOP DOG  SCORE (with kids) intelligence + longevity + ailments + cost scores (50% each) + 100% grooming score. highest possible score: 5` = col_double(),
##   `TOP DATA DOG RANKING  wo kids` = col_double()
## )
## i Use `spec()` for the full column specifications.
# loads the data
# Keep only complete rows. na.omit() drops every row that has an NA in ANY
# column of the raw table, not only in the columns selected further down.
dogs1 <- na.omit(dogs)

# Normalise the headings in one pass: lower-case them, then replace each space
# with an underscore.
names(dogs1) <- gsub(" ", "_", tolower(names(dogs1)))

# Columns carried forward into the cost analysis.
dogs2 <- select(
  dogs1,
  breed,
  category,
  lifetime_cost,
  trainability,
  food_cost_lifetime,
  avg_long_yr,
  genetics_ailments,
  avg_purchase_price,
  size,
  `weight_(kg)`,
  intelligence,
  `obedience_(%_of_time)`,
  congenital_ailments,
  other_costs__lifetime,
  toys_treats_per_yr,
  pet_sitters_yr,
  grooming_yr,
  vet_fees_yr,
  kennels_yr,
  misc
)

# Open the selected columns in the data viewer.
view(dogs2)
# Convert the text columns that hold numbers into real numeric columns.
#
# Currency columns arrive as strings such as "$1,234". BOTH the dollar sign and
# the thousands separator are stripped from every currency column before
# coercion. Previously avg_purchase_price (and the per-year columns) only had
# "$" removed, so any value containing a comma was coerced to NA by
# as.numeric(); this is the likely cause of the "NAs introduced by coercion"
# warning that used to follow the avg_purchase_price conversion.
currency_cols <- c(
  "lifetime_cost",
  "avg_purchase_price",
  "food_cost_lifetime",
  "other_costs__lifetime",
  "toys_treats_per_yr",
  "pet_sitters_yr",
  "grooming_yr",
  "vet_fees_yr",
  "kennels_yr",
  "misc"
)

# Columns coerced directly with as.numeric(), with no currency stripping.
plain_numeric_cols <- c("genetics_ailments", "avg_long_yr", "weight_(kg)")

# Entries that still are not valid numbers after stripping become NA and
# as.numeric() warns about it; `weight_(kg)` is known to trigger this.
dogs3 <- dogs2 %>%
  mutate(
    across(all_of(currency_cols), ~ as.numeric(gsub("[$,]", "", .x))),
    across(all_of(plain_numeric_cols), as.numeric)
  )

# Show each column's type to confirm the conversions took effect.
# NOTE: any console output recorded after this call comes from a run made
# before the comma fix above; re-run the script to refresh it.
str(dogs3)
## tibble [21 x 20] (S3: tbl_df/tbl/data.frame)
##  $ breed                : chr [1:21] "Welsh Springer Spaniel" "Cocker Spaniel" "Siberian Husky" "Chihuahua" ...
##  $ category             : chr [1:21] "sporting" "sporting" "working" "toy" ...
##  $ lifetime_cost        : num [1:21] 20224 24330 22049 26250 22107 ...
##  $ trainability         : chr [1:21] "31" "20" "45" "67" ...
##  $ food_cost_lifetime   : num [1:21] 3478 6149 5035 4594 3762 ...
##  $ avg_long_yr          : num [1:21] 12.5 12.5 12.6 16.5 13.5 ...
##  $ genetics_ailments    : num [1:21] 1 2 0 1 2 2 1 0 1 1 ...
##  $ avg_purchase_price   : num [1:21] 750 465 650 588 NA 900 294 913 700 650 ...
##  $ size                 : chr [1:21] "medium" "small" "medium" "small" ...
##  $ weight_(kg)          : num [1:21] NA 11 22 2 9 NA 27 NA 28 NA ...
##  $ intelligence         : chr [1:21] "Above average" "Excellent" "Average" "Fair" ...
##  $ obedience_(%_of_time): chr [1:21] ">70" ">85" ">50" ">30" ...
##  $ congenital_ailments  : chr [1:21] "hip dysplasia" "Retinal dysplasia, seborrhea" "none" "Patellar luxation" ...
##  $ other_costs__lifetime: num [1:21] 13064 13074 13158 17258 14131 ...
##  $ toys_treats_per_yr   : num [1:21] 121 121 121 121 121 121 121 121 121 121 ...
##  $ pet_sitters_yr       : num [1:21] 126 126 126 126 126 126 126 126 126 126 ...
##  $ grooming_yr          : num [1:21] 244 244 244 244 244 244 244 244 244 244 ...
##  $ vet_fees_yr          : num [1:21] 177 177 177 177 177 177 177 177 177 177 ...
##  $ kennels_yr           : num [1:21] 116 116 116 116 116 116 116 116 116 116 ...
##  $ misc                 : num [1:21] 200 200 200 200 200 200 200 200 200 200 ...
##  - attr(*, "na.action")= 'omit' Named int [1:152] 1 2 3 4 6 8 9 10 12 13 ...
##   ..- attr(*, "names")= chr [1:152] "1" "2" "3" "4" ...
# displays the structure of each variable to confirm that the columns were converted to numeric
# Average lifetime cost for each dog category, most expensive category first.
# The averages are taken over the 21 breeds left after the NA filtering.
dog_cost <- dogs3 %>%
  group_by(category) %>%
  summarise(lifetime_cost = mean(lifetime_cost)) %>%
  arrange(desc(lifetime_cost))

library(RColorBrewer) # loads RColorBrewer for the colour palettes

# Bar chart of the per-category averages held in dog_cost: one bar per
# category, filled by category using the "Set2" Brewer palette.
ggplot(dog_cost, aes(x = category, y = lifetime_cost, fill = category)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  # plot title and axis titles
  labs(
    title = "Average Lifetime Cost per Dog Category",
    x = "Dog Category",
    y = "Avg. Lifetime Cost (USD)"
  ) +
  # minimal theme first, then the text-size overrides on top of it
  theme_minimal() +
  theme(
    axis.title.y = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )
  # Per-category means of lifetime cost and lifetime food cost, sorted by
  # lifetime cost (highest first). The food-cost column keeps the name
  # `mean(food_cost_lifetime)` because the treemap call refers to it by that name.
  dogsfood <- dogs3 %>%
    group_by(category) %>%
    summarise(
      lifetime_cost = mean(lifetime_cost),
      `mean(food_cost_lifetime)` = mean(food_cost_lifetime)
    ) %>%
    arrange(desc(lifetime_cost))
library(treemap) # loads the treemap package
## Warning: package 'treemap' was built under R version 4.0.4

# Treemap of average lifetime food cost: one tile per category, with tile area
# and tile colour (Blues palette) both driven by the same food-cost column.
treemap(
  dogsfood,
  index = "category",
  vSize = "mean(food_cost_lifetime)",
  vColor = "mean(food_cost_lifetime)",
  type = "manual",
  palette = "Blues",
  title = "Average Lifetime Food Cost",
  title.legend = "Avg. Food Cost per Dog Category (USD)"
)
# Per-category means of lifetime cost and genetic ailments, sorted by lifetime
# cost (highest first). The ailments column keeps the name
# `mean(genetics_ailments)`, which the scatter plot built from this table uses.
dogail <- dogs3 %>%
  group_by(category) %>%
  summarise(
    lifetime_cost = mean(lifetime_cost),
    `mean(genetics_ailments)` = mean(genetics_ailments)
  ) %>%
  arrange(desc(lifetime_cost))
# Scatter plot of average genetic ailments (x) against average lifetime cost
# (y), one point per category coloured by category, with a linear (lm) trend
# layer drawn in blue and no confidence band.
plot1 <- ggplot(
  dogail,
  aes(x = `mean(genetics_ailments)`, y = lifetime_cost, color = category)
) +
  # points inherit x, y and colour from the plot-level mapping
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue", size = .5) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Average Genetic Ailments vs. Average Lifetime Cost",
    x = "Avg. Genetic Ailments",
    y = "Avg. Lifetime Cost (USD)",
    color = "Categories"
  ) +
  theme_minimal()

# Print the plot.
plot1
## `geom_smooth()` using formula 'y ~ x'

# Per-category means of lifetime cost and average lifespan in years, sorted by
# lifetime cost (highest first). The lifespan column keeps the name
# `mean(avg_long_yr)`, which the scatter plot built from this table uses.
dogyr <- dogs3 %>%
  group_by(category) %>%
  summarise(
    lifetime_cost = mean(lifetime_cost),
    `mean(avg_long_yr)` = mean(avg_long_yr)
  ) %>%
  arrange(desc(lifetime_cost))
# Scatter plot of average lifespan (x) against average lifetime cost (y), one
# point per category coloured by category, with a linear (lm) trend layer
# drawn in blue and no confidence band.
plot2 <- ggplot(
  dogyr,
  aes(x = `mean(avg_long_yr)`, y = lifetime_cost, color = category)
) +
  # points inherit x, y and colour from the plot-level mapping
  geom_point(size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "blue", size = .5) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Average Lifespans Years vs. Average Lifetime Cost",
    x = "Avg. Lifespans Years",
    y = "Avg. Lifetime Cost (USD)",
    color = "Categories"
  ) +
  theme_minimal()

# Print the plot.
plot2
## `geom_smooth()` using formula 'y ~ x'

Analysis

I recently adopted a dog, and after its first visit to the vet I realized that caring for a dog can be really expensive. I chose to adopt the only small dog at the shelter because I thought it wouldn’t need much food, so the overall expenses would be low compared to a bigger dog. This analysis compares dog categories based on their associated costs.

The data used in this study is from Kaggle. The dataset “Best in Show” contains information from the American Kennel Club on dog breeds, described by 43 variables. Some of these variables include: category, lifetime cost, trainability, lifetime food cost, average longevity in years, genetic ailments, average purchase price, size, weight and intelligence.

The first step taken was to clean the data by removing all missing values. This particular dataset was a bit messy: it contained variables stored as characters instead of numerical values. I also removed dollar signs and commas from variables and then converted them to numerical values.

Once the data was cleaned, I selected the columns that I wanted to explore. When I first looked at the dataset, I wanted to know which dog category was the most expensive. Next, I wanted to find the reason for the high cost.

Prior to this analysis, I thought larger breeds were more expensive because they eat more, but my first analysis showed that the toy category was the most expensive. I made a treemap to explore the lifetime food cost for all dog categories, and the hound category had the highest associated food cost.

I hypothesized that toy breeds were the most expensive because they may have more genetic ailments but the scatter plot for ailments showed no relation between the number of ailments and the lifetime costs.

Next, I explored other variables such as vet visits and grooming costs, but the dataset had the same rate for all dog breeds, so these variables were not helpful. Lastly, I looked at the average lifespan per dog category and found that the toy category lives the longest. I made a scatter plot and found a strong relationship between lifespan and lifetime costs. On average, the toy category consumes less food and has fewer ailments but lives the longest, so the costs incurred each year (grooming, kennel, vet visits, food) will result in an overall higher lifetime cost.

I further researched my results and found an interesting article from the American Kennel Club (https://www.akc.org/expert-advice/health/why-do-small-dogs-live-longer/). Although the reason for the smaller breeds’ longer lifespans is unknown, the article states that “larger breeds grow from puppies to adults at an accelerated rate, and this may lead to a higher likelihood of abnormal cell growth and death from cancer”.

This analysis surprised me because I thought that, on average, all dogs have the same lifespan, but they are very different. I wish the dataset had specific costs for veterinarian visits instead of yearly averages that were applied to all breeds. I also would have liked to explore data from groomers, as grooming costs can vary per breed and category.