# This analysis compares dog breeds in the US with regard to cost. The source of the data is Kaggle: (https://www.kaggle.com/paultimothymooney/best-in-show-data-about-dogs?select=best_in_show.csv)
library(tidyverse) # attaches the core tidyverse packages (library() loads, it does not install)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # attaches dplyr; redundant here, the tidyverse call above already attached it
# Folder that holds the Kaggle export. The CSV is read through a full path
# built with file.path() instead of calling setwd(), so running the script
# leaves the session's working directory untouched.
data_dir <- "C:/Users/Dano/Documents"
dogs <- read_csv(file.path(data_dir, "best_in_show.csv"))
## Warning: Duplicated column names deduplicated: 'avg purchase price' => 'avg
## purchase price_1' [30]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## `datadog score` = col_double(),
## popularity = col_double(),
## `TOP DOG SCORE (with kids) intelligence + longevity + ailments + cost scores (50% each) + 100% grooming score. highest possible score: 5` = col_double(),
## `TOP DATA DOG RANKING wo kids` = col_double()
## )
## i Use `spec()` for the full column specifications.
# reads the CSV file into `dogs`; the messages above are printed by read_csv()
# Keep only complete rows.
# NOTE(review): na.omit() runs on the full table, before the column selection
# below, so a row is dropped when ANY of its columns is NA - including columns
# that are not used later. Confirm this is intended.
dogs1 <- na.omit(dogs)

# Normalise the headings in one pass: lower-case, spaces replaced by "_".
names(dogs1) <- gsub(" ", "_", tolower(names(dogs1)))

# Variables explored in the rest of the analysis.
dogs2 <- select(
  dogs1,
  breed, category, lifetime_cost, trainability, food_cost_lifetime,
  avg_long_yr, genetics_ailments, avg_purchase_price, size, `weight_(kg)`,
  intelligence, `obedience_(%_of_time)`, congenital_ailments,
  other_costs__lifetime, toys_treats_per_yr, pet_sitters_yr, grooming_yr,
  vet_fees_yr, kennels_yr, misc
)

view(dogs2) # shows the selected columns
# Convert the text columns used in the analysis to numeric values.
#
# Cost columns arrive as currency text: a leading "$" and, in some columns,
# "," inside the amount. Both characters are stripped from EVERY cost column
# before conversion, because a "," left in place makes as.numeric() return NA
# for that value.
currency_cols <- c(
  "lifetime_cost", "avg_purchase_price", "food_cost_lifetime",
  "other_costs__lifetime", "toys_treats_per_yr", "pet_sitters_yr",
  "grooming_yr", "vet_fees_yr", "kennels_yr", "misc"
)

# Columns that hold plain numbers stored as text (no currency formatting).
plain_numeric_cols <- c("genetics_ailments", "avg_long_yr", "weight_(kg)")

# Strips "$" and "," from a character vector and converts it to numeric.
# Entries that are still not a number afterwards become NA (with a warning).
currency_to_number <- function(x) {
  as.numeric(gsub("[$,]", "", x))
}

dogs3 <- dogs2
dogs3[currency_cols] <- lapply(dogs3[currency_cols], currency_to_number)

# NOTE(review): as.numeric() warns "NAs introduced by coercion" for
# `weight_(kg)`, i.e. some weights are non-numeric text - confirm that turning
# them into NA is intended.
dogs3[plain_numeric_cols] <- lapply(dogs3[plain_numeric_cols], as.numeric)

str(dogs3) # confirms that the columns above are now numeric
## tibble [21 x 20] (S3: tbl_df/tbl/data.frame)
## $ breed : chr [1:21] "Welsh Springer Spaniel" "Cocker Spaniel" "Siberian Husky" "Chihuahua" ...
## $ category : chr [1:21] "sporting" "sporting" "working" "toy" ...
## $ lifetime_cost : num [1:21] 20224 24330 22049 26250 22107 ...
## $ trainability : chr [1:21] "31" "20" "45" "67" ...
## $ food_cost_lifetime : num [1:21] 3478 6149 5035 4594 3762 ...
## $ avg_long_yr : num [1:21] 12.5 12.5 12.6 16.5 13.5 ...
## $ genetics_ailments : num [1:21] 1 2 0 1 2 2 1 0 1 1 ...
## $ avg_purchase_price : num [1:21] 750 465 650 588 NA 900 294 913 700 650 ...
## $ size : chr [1:21] "medium" "small" "medium" "small" ...
## $ weight_(kg) : num [1:21] NA 11 22 2 9 NA 27 NA 28 NA ...
## $ intelligence : chr [1:21] "Above average" "Excellent" "Average" "Fair" ...
## $ obedience_(%_of_time): chr [1:21] ">70" ">85" ">50" ">30" ...
## $ congenital_ailments : chr [1:21] "hip dysplasia" "Retinal dysplasia, seborrhea" "none" "Patellar luxation" ...
## $ other_costs__lifetime: num [1:21] 13064 13074 13158 17258 14131 ...
## $ toys_treats_per_yr : num [1:21] 121 121 121 121 121 121 121 121 121 121 ...
## $ pet_sitters_yr : num [1:21] 126 126 126 126 126 126 126 126 126 126 ...
## $ grooming_yr : num [1:21] 244 244 244 244 244 244 244 244 244 244 ...
## $ vet_fees_yr : num [1:21] 177 177 177 177 177 177 177 177 177 177 ...
## $ kennels_yr : num [1:21] 116 116 116 116 116 116 116 116 116 116 ...
## $ misc : num [1:21] 200 200 200 200 200 200 200 200 200 200 ...
## - attr(*, "na.action")= 'omit' Named int [1:152] 1 2 3 4 6 8 9 10 12 13 ...
## ..- attr(*, "names")= chr [1:152] "1" "2" "3" "4" ...
# displays the structure of each variable to confirm that the columns were converted to numeric
# Average lifetime cost for each dog category, highest average first.
# dogs3 holds the 21 breeds that remain after the earlier cleaning.
dog_cost <- dogs3 %>%
  group_by(category) %>%
  summarise(lifetime_cost = mean(lifetime_cost)) %>%
  arrange(desc(lifetime_cost))

library(RColorBrewer) # attaches RColorBrewer for the colour palettes

# Bar chart of the dog_cost table: one bar per category, filled by category.
ggplot(dog_cost, aes(x = category, y = lifetime_cost, fill = category)) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") + # bar colours
  labs(
    title = "Average Lifetime Cost per Dog Category",
    x = "Dog Category",
    y = "Avg. Lifetime Cost (USD)"
  ) +
  theme_minimal() +
  # text sizes for the axis titles and tick labels
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )
# Per-category means of lifetime cost and lifetime food cost, ordered by
# descending lifetime cost. The food column keeps the name
# `mean(food_cost_lifetime)` because the treemap call refers to it by name.
dogsfood <- dogs3 %>%
  group_by(category) %>%
  summarise(
    lifetime_cost = mean(lifetime_cost),
    `mean(food_cost_lifetime)` = mean(food_cost_lifetime)
  ) %>%
  arrange(desc(lifetime_cost))

library(treemap) # attaches the treemap package
## Warning: package 'treemap' was built under R version 4.0.4

# Treemap of average food cost: one tile per category, with tile size and
# tile colour both taken from the average food cost column.
treemap(
  dogsfood,
  index = "category",
  vSize = "mean(food_cost_lifetime)",
  vColor = "mean(food_cost_lifetime)",
  type = "manual",
  palette = "Blues",
  title = "Average Lifetime Food Cost",
  title.legend = "Avg. Food Cost per Dog Category (USD)"
)
# Per-category means of lifetime cost and genetic ailments, ordered by
# descending lifetime cost.
dogail <- dogs3 %>%
  group_by(category) %>%
  summarise(
    lifetime_cost = mean(lifetime_cost),
    `mean(genetics_ailments)` = mean(genetics_ailments)
  ) %>%
  arrange(desc(lifetime_cost))

# Scatter plot of the table above: one point per category, plus a blue
# linear-model trend line without a confidence band.
plot1 <- ggplot(
  dogail,
  aes(x = `mean(genetics_ailments)`, y = lifetime_cost, color = category)
) +
  geom_point(size = 3) + # x, y and colour are inherited from ggplot()
  geom_smooth(method = "lm", se = FALSE, color = "blue", size = .5) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Average Genetic Ailments vs. Average Lifetime Cost",
    x = "Avg. Genetic Ailments",
    y = "Avg. Lifetime Cost (USD)",
    color = "Categories"
  ) +
  theme_minimal()

plot1
## `geom_smooth()` using formula 'y ~ x'
# Per-category means of lifetime cost and average lifespan (years), ordered by
# descending lifetime cost.
dogyr <- dogs3 %>%
  group_by(category) %>%
  summarise(
    lifetime_cost = mean(lifetime_cost),
    `mean(avg_long_yr)` = mean(avg_long_yr)
  ) %>%
  arrange(desc(lifetime_cost))

# Scatter plot of the table above: one point per category, plus a blue
# linear-model trend line without a confidence band.
plot2 <- ggplot(
  dogyr,
  aes(x = `mean(avg_long_yr)`, y = lifetime_cost, color = category)
) +
  geom_point(size = 2) + # x, y and colour are inherited from ggplot()
  geom_smooth(method = "lm", se = FALSE, color = "blue", size = .5) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Average Lifespans Years vs. Average Lifetime Cost",
    x = "Avg. Lifespans Years",
    y = "Avg. Lifetime Cost (USD)",
    color = "Categories"
  ) +
  theme_minimal()

plot2
## `geom_smooth()` using formula 'y ~ x'