Dog_Moves.csv, Dog_Descriptions.csv, and Dog_Travel.csv.
# Import Necessary packages
library(ggplot2) #visualization, plotting, and graphing of data
library(rmarkdown) #create final report
library(readr) #import data from excel files
library(DT) #visualize data in table format
library(plyr) #statistical analysis
library(ggmap) #map visualization
library(corrplot) #used in correlation matrix
library(skimr)
library("tidyverse") #This package is extremely important for EDA and is a combination of many crucial packages
library(dplyr) #cleaning data
'dog_description.csv', 'dog_moves.csv' and 'dog_travel.csv'.'dog_moves' data from Here. The head and tail function gives us a condensed look at the 'dog_moves' data.'dog_moves.csv' has 90 rows (entries) and 5 attributes. Some facts regarding the 'dog_moves.csv' data:-
'dog_moves' data, 1 attribute is a character, 3 attributes are numeric, and 1 attribute is logical.'dog_moves.csv' are recorded as ‘NA’.head(dog_moves, 10)
## # A tibble: 10 x 5
## location exported imported total inUS
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 Texas 635 NA 566 TRUE
## 2 Alabama 268 2 1428 TRUE
## 3 North Carolina 158 14 2627 TRUE
## 4 South Carolina 139 12 1618 TRUE
## 5 Georgia 137 19 3479 TRUE
## 6 Puerto Rico 131 NA NA FALSE
## 7 California 130 3 1664 TRUE
## 8 South Korea 76 NA NA FALSE
## 9 Tennessee 66 20 1769 TRUE
## 10 Kentucky 57 4 1123 TRUE
tail(dog_moves, 10)
## # A tibble: 10 x 5
## location exported imported total inUS
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 Wyoming 1 NA 52 TRUE
## 2 Rhode Island NA 87 607 TRUE
## 3 Minnesota NA 67 958 TRUE
## 4 Vermont NA 45 510 TRUE
## 5 New Hampshire NA 33 335 TRUE
## 6 Delaware NA 29 296 TRUE
## 7 Michigan NA 12 673 TRUE
## 8 Washington DC NA 11 336 TRUE
## 9 Montana NA NA 18 TRUE
## 10 Alaska NA NA 15 TRUE
str(dog_moves)
## spec_tbl_df [90 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ location: chr [1:90] "Texas" "Alabama" "North Carolina" "South Carolina" ...
## $ exported: num [1:90] 635 268 158 139 137 131 130 76 66 57 ...
## $ imported: num [1:90] NA 2 14 12 19 NA 3 NA 20 4 ...
## $ total : num [1:90] 566 1428 2627 1618 3479 ...
## $ inUS : logi [1:90] TRUE TRUE TRUE TRUE TRUE FALSE ...
## - attr(*, "spec")=
## .. cols(
## .. location = col_character(),
## .. exported = col_double(),
## .. imported = col_double(),
## .. total = col_double(),
## .. inUS = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
summary(dog_moves)
## location exported imported total
## Length:90 Min. : 1.00 Min. : 1.00 Min. : 15.0
## Class :character 1st Qu.: 1.00 1st Qu.: 11.25 1st Qu.: 477.5
## Mode :character Median : 4.00 Median : 24.50 Median : 857.0
## Mean : 30.37 Mean : 64.74 Mean :1139.8
## 3rd Qu.: 18.00 3rd Qu.: 73.25 3rd Qu.:1650.0
## Max. :635.00 Max. :390.00 Max. :4002.0
## NA's :9 NA's :52 NA's :39
## inUS
## Mode :logical
## FALSE:39
## TRUE :51
##
##
##
##
'dog_observations.csv' has 58180 rows (entries) and 36 attributes. Some facts regarding the 'dog_observations.csv' data:-
'dog_observations' data, 1 attribute is a character, 3 attributes are numeric, and 1 attribute is logical.dim(dog_descriptions)
## [1] 58180 36
head(dog_descriptions, 10)
## # A tibble: 10 x 36
## id org_id url species breed_primary breed_secondary breed_mixed
## <dbl> <chr> <chr> <chr> <chr> <chr> <lgl>
## 1 46042150 NV163 https://w~ Dog American Staf~ Mixed Breed TRUE
## 2 46042002 NV163 https://w~ Dog Pit Bull Terr~ Mixed Breed TRUE
## 3 46040898 NV99 https://w~ Dog Shepherd <NA> FALSE
## 4 46039877 NV202 https://w~ Dog German Shephe~ <NA> FALSE
## 5 46039306 NV184 https://w~ Dog Dachshund <NA> FALSE
## 6 46039304 NV184 https://w~ Dog Boxer Beagle TRUE
## 7 46039303 NV184 https://w~ Dog Italian Greyh~ Chihuahua TRUE
## 8 46039302 NV184 https://w~ Dog Cattle Dog <NA> TRUE
## 9 46039301 NV184 https://w~ Dog Cattle Dog <NA> TRUE
## 10 46038709 NV184 https://w~ Dog Cattle Dog <NA> TRUE
## # ... with 29 more variables: breed_unknown <lgl>, color_primary <chr>,
## # color_secondary <chr>, color_tertiary <chr>, age <chr>, sex <chr>,
## # size <chr>, coat <chr>, fixed <lgl>, house_trained <lgl>, declawed <lgl>,
## # special_needs <lgl>, shots_current <lgl>, env_children <lgl>,
## # env_dogs <lgl>, env_cats <lgl>, name <chr>, tags <chr>, photo <lgl>,
## # status <chr>, posted <dttm>, contact_city <chr>, contact_state <chr>,
## # contact_zip <chr>, contact_country <chr>, stateQ <chr>, ...
tail(dog_descriptions, 10)
## # A tibble: 10 x 36
## id org_id url species breed_primary breed_secondary breed_mixed
## <dbl> <chr> <chr> <chr> <chr> <chr> <lgl>
## 1 44682662 WY04 https://w~ Dog Labrador Ret~ Pit Bull Terrier TRUE
## 2 44682652 WY04 https://w~ Dog Labrador Ret~ Shar-Pei TRUE
## 3 44658860 WY24 https://w~ Dog Pit Bull Ter~ Mixed Breed TRUE
## 4 44629286 WY04 https://w~ Dog Labrador Ret~ Hound TRUE
## 5 44629272 WY04 https://w~ Dog Shepherd Australian Catt~ TRUE
## 6 44605893 WY20 https://w~ Dog Border Collie <NA> FALSE
## 7 44457061 WY24 https://w~ Dog Australian S~ Australian Catt~ TRUE
## 8 42865848 WY20 https://w~ Dog Border Collie <NA> FALSE
## 9 42734734 WY24 https://w~ Dog Boxer Mixed Breed TRUE
## 10 42663515 WY24 https://w~ Dog Labrador Ret~ Pit Bull Terrier TRUE
## # ... with 29 more variables: breed_unknown <lgl>, color_primary <chr>,
## # color_secondary <chr>, color_tertiary <chr>, age <chr>, sex <chr>,
## # size <chr>, coat <chr>, fixed <lgl>, house_trained <lgl>, declawed <lgl>,
## # special_needs <lgl>, shots_current <lgl>, env_children <lgl>,
## # env_dogs <lgl>, env_cats <lgl>, name <chr>, tags <chr>, photo <lgl>,
## # status <chr>, posted <dttm>, contact_city <chr>, contact_state <chr>,
## # contact_zip <chr>, contact_country <chr>, stateQ <chr>, ...
str(dog_descriptions)
## spec_tbl_df [58,180 x 36] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:58180] 4.6e+07 4.6e+07 4.6e+07 4.6e+07 4.6e+07 ...
## $ org_id : chr [1:58180] "NV163" "NV163" "NV99" "NV202" ...
## $ url : chr [1:58180] "https://www.petfinder.com/dog/harley-46042150/nv/las-vegas/animal-network-nv163/?referrer_id=87b31e7d-4508-41d1"| __truncated__ "https://www.petfinder.com/dog/biggie-46042002/nv/las-vegas/animal-network-nv163/?referrer_id=87b31e7d-4508-41d1"| __truncated__ "https://www.petfinder.com/dog/ziggy-46040898/nv/mesquite/city-of-mesquite-animal-shelter-nv99/?referrer_id=87b3"| __truncated__ "https://www.petfinder.com/dog/gypsy-46039877/nv/pahrump/pets-are-worth-saving-paws-nv202/?referrer_id=87b31e7d-"| __truncated__ ...
## $ species : chr [1:58180] "Dog" "Dog" "Dog" "Dog" ...
## $ breed_primary : chr [1:58180] "American Staffordshire Terrier" "Pit Bull Terrier" "Shepherd" "German Shepherd Dog" ...
## $ breed_secondary: chr [1:58180] "Mixed Breed" "Mixed Breed" NA NA ...
## $ breed_mixed : logi [1:58180] TRUE TRUE FALSE FALSE FALSE TRUE ...
## $ breed_unknown : logi [1:58180] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ color_primary : chr [1:58180] "White / Cream" "Brown / Chocolate" "Brindle" NA ...
## $ color_secondary: chr [1:58180] "Yellow / Tan / Blond / Fawn" "White / Cream" NA NA ...
## $ color_tertiary : chr [1:58180] NA NA NA NA ...
## $ age : chr [1:58180] "Senior" "Adult" "Adult" "Baby" ...
## $ sex : chr [1:58180] "Male" "Male" "Male" "Female" ...
## $ size : chr [1:58180] "Medium" "Large" "Large" "Large" ...
## $ coat : chr [1:58180] "Short" "Short" "Short" NA ...
## $ fixed : logi [1:58180] TRUE TRUE TRUE FALSE TRUE TRUE ...
## $ house_trained : logi [1:58180] TRUE TRUE FALSE FALSE FALSE FALSE ...
## $ declawed : logi [1:58180] NA NA NA NA NA NA ...
## $ special_needs : logi [1:58180] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ shots_current : logi [1:58180] TRUE TRUE TRUE FALSE TRUE TRUE ...
## $ env_children : logi [1:58180] NA NA NA NA TRUE TRUE ...
## $ env_dogs : logi [1:58180] NA NA NA NA TRUE TRUE ...
## $ env_cats : logi [1:58180] NA NA NA NA TRUE TRUE ...
## $ name : chr [1:58180] "HARLEY" "BIGGIE" "Ziggy" "Gypsy" ...
## $ tags : chr [1:58180] NA NA NA NA ...
## $ photo : logi [1:58180] NA NA NA NA NA NA ...
## $ status : chr [1:58180] "adoptable" "adoptable" "adoptable" "adoptable" ...
## $ posted : POSIXct[1:58180], format: "2019-09-20 16:37:59" "2019-09-20 16:24:57" ...
## $ contact_city : chr [1:58180] "Las Vegas" "Las Vegas" "Mesquite" "Pahrump" ...
## $ contact_state : chr [1:58180] "NV" "NV" "NV" "NV" ...
## $ contact_zip : chr [1:58180] "89147" "89147" "89027" "89048" ...
## $ contact_country: chr [1:58180] "US" "US" "US" "US" ...
## $ stateQ : chr [1:58180] "89009" "89009" "89009" "89009" ...
## $ accessed : Date[1:58180], format: "2019-09-20" "2019-09-20" ...
## $ type : chr [1:58180] "Dog" "Dog" "Dog" "Dog" ...
## $ description : chr [1:58180] "Harley is not sure how he wound up at shelter in his senior years but as you see from the pictures the shelter "| __truncated__ "6 year old Biggie has lost his home and really wants a home of his own. We are getting more information about h"| __truncated__ "Approx 2 years old.\n Did I catch your eye? I don't blame you if you had to stop and stare, I am quite cute if "| __truncated__ NA ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. org_id = col_character(),
## .. url = col_character(),
## .. species = col_character(),
## .. breed_primary = col_character(),
## .. breed_secondary = col_character(),
## .. breed_mixed = col_logical(),
## .. breed_unknown = col_logical(),
## .. color_primary = col_character(),
## .. color_secondary = col_character(),
## .. color_tertiary = col_character(),
## .. age = col_character(),
## .. sex = col_character(),
## .. size = col_character(),
## .. coat = col_character(),
## .. fixed = col_logical(),
## .. house_trained = col_logical(),
## .. declawed = col_logical(),
## .. special_needs = col_logical(),
## .. shots_current = col_logical(),
## .. env_children = col_logical(),
## .. env_dogs = col_logical(),
## .. env_cats = col_logical(),
## .. name = col_character(),
## .. tags = col_character(),
## .. photo = col_logical(),
## .. status = col_character(),
## .. posted = col_datetime(format = ""),
## .. contact_city = col_character(),
## .. contact_state = col_character(),
## .. contact_zip = col_character(),
## .. contact_country = col_character(),
## .. stateQ = col_character(),
## .. accessed = col_date(format = ""),
## .. type = col_character(),
## .. description = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
skim(dog_descriptions) # breaks down the variables by type with relevant summary information
| Name | dog_descriptions |
| Number of rows | 58180 |
| Number of columns | 36 |
| _______________________ | |
| Column type frequency: | |
| character | 22 |
| Date | 1 |
| logical | 11 |
| numeric | 1 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| org_id | 0 | 1.00 | 4 | 6 | 0 | 3969 | 0 |
| url | 0 | 1.00 | 112 | 365 | 0 | 58180 | 0 |
| species | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| breed_primary | 0 | 1.00 | 3 | 40 | 0 | 216 | 0 |
| breed_secondary | 37359 | 0.36 | 3 | 36 | 0 | 190 | 0 |
| color_primary | 32046 | 0.45 | 5 | 32 | 0 | 15 | 0 |
| color_secondary | 46121 | 0.21 | 5 | 32 | 0 | 15 | 0 |
| color_tertiary | 56963 | 0.02 | 5 | 32 | 0 | 14 | 0 |
| age | 0 | 1.00 | 4 | 6 | 0 | 4 | 0 |
| sex | 0 | 1.00 | 4 | 7 | 0 | 3 | 0 |
| size | 0 | 1.00 | 5 | 11 | 0 | 4 | 0 |
| coat | 30995 | 0.47 | 4 | 8 | 0 | 6 | 0 |
| name | 0 | 1.00 | 1 | 220 | 0 | 22953 | 0 |
| tags | 53078 | 0.09 | 1 | 281 | 0 | 3962 | 0 |
| status | 0 | 1.00 | 9 | 24 | 0 | 34 | 0 |
| contact_city | 0 | 1.00 | 2 | 22 | 0 | 2189 | 0 |
| contact_state | 0 | 1.00 | 2 | 5 | 0 | 75 | 0 |
| contact_zip | 12 | 1.00 | 2 | 7 | 0 | 3483 | 0 |
| contact_country | 0 | 1.00 | 2 | 5 | 0 | 15 | 0 |
| stateQ | 0 | 1.00 | 2 | 10 | 0 | 62 | 0 |
| type | 640 | 0.99 | 3 | 3 | 0 | 1 | 0 |
| description | 8705 | 0.85 | 1 | 9972 | 0 | 46289 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| accessed | 33 | 1 | 2019-09-20 | 2019-09-20 | 2019-09-20 | 1 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| breed_mixed | 0 | 1.00 | 0.71 | TRU: 41591, FAL: 16589 |
| breed_unknown | 0 | 1.00 | 0.00 | FAL: 58180 |
| fixed | 0 | 1.00 | 0.80 | TRU: 46621, FAL: 11559 |
| house_trained | 0 | 1.00 | 0.35 | FAL: 37624, TRU: 20556 |
| declawed | 58180 | 0.00 | NaN | : |
| special_needs | 0 | 1.00 | 0.04 | FAL: 56034, TRU: 2146 |
| shots_current | 0 | 1.00 | 0.72 | TRU: 42068, FAL: 16112 |
| env_children | 30153 | 0.48 | 0.84 | TRU: 23588, FAL: 4439 |
| env_dogs | 23511 | 0.60 | 0.90 | TRU: 31122, FAL: 3547 |
| env_cats | 38828 | 0.33 | 0.65 | TRU: 12542, FAL: 6810 |
| photo | 58180 | 0.00 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 44251485 | 3818970 | 604115 | 44516882 | 45596624 | 45916674 | 46043149 | ▁▁▁▁▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| posted | 33 | 1 | 2003-05-18 | 2019-09-20 17:32:30 | 2019-08-13 15:55:42 | 48351 |
'dog_travel' data from Here. The head and tail function gives us a condensed look at the 'dog_travel' data.'dog_travel.csv' has 6194 (5833 unique entries) and 8 attributes. Some facts regarding the 'dog_travel.csv' data:-
'dog_travel' data, 1 attribute is numerical, 4 attributes are charachters, and 2 attributes are logical.'dog_travel.csv' are recorded as ‘NA’.head(dog_travel, 10)
## # A tibble: 10 x 8
## id contact_city contact_state description found manual remove still_there
## <dbl> <chr> <chr> <chr> <chr> <chr> <lgl> <lgl>
## 1 4.45e7 Anoka MN "Boris is ~ Arka~ <NA> NA NA
## 2 4.47e7 Groveland FL "Duke is a~ Abac~ Baham~ NA NA
## 3 4.60e7 Adamstown MD "Zac Woof-~ Adam Maryl~ NA NA
## 4 4.45e7 Saint Cloud MN "~~Came in~ Adap~ <NA> TRUE NA
## 5 4.39e7 Pueblo CO "Palang is~ Afgh~ <NA> NA NA
## 6 4.31e7 Manchester CT "Brooke ha~ Afgh~ <NA> NA NA
## 7 4.53e7 Wooster OH "Tate is a~ Akron Ohio NA NA
## 8 4.53e7 Wooster OH "Tate is a~ Akron Ohio NA NA
## 9 4.60e7 Locust Fork AL "Meet Trix~ Alab~ <NA> NA NA
## 10 4.59e7 Locust Fork AL "Meet Reba~ Alab~ <NA> NA NA
tail(dog_travel, 10)
## # A tibble: 10 x 8
## id contact_city contact_state description found manual remove still_there
## <dbl> <chr> <chr> <chr> <chr> <chr> <lgl> <lgl>
## 1 4.47e7 Fairmont WV "Please co~ WV West ~ NA TRUE
## 2 4.43e7 Fairmont WV "Please co~ WV West ~ NA TRUE
## 3 4.26e7 Fairmont WV "Please co~ WV <NA> TRUE NA
## 4 4.21e7 Fairmont WV "This is D~ WV West ~ NA TRUE
## 5 4.13e7 Fairmont WV "Please co~ WV <NA> TRUE NA
## 6 4.05e7 Fairmont WV "Please co~ WV <NA> TRUE NA
## 7 4.58e7 Eagle Mount~ UT "Shiny is ~ Wyom~ <NA> NA NA
## 8 3.43e7 Newnan GA "Yanni is ~ Yazm~ <NA> TRUE NA
## 9 4.45e7 Dayton OH "Callie is~ Young Ohio NA NA
## 10 3.67e7 New York NY "This guy ~ Zazu <NA> TRUE NA
str(dog_travel)
## spec_tbl_df [6,194 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:6194] 44520267 44698509 45983838 44475904 43877389 ...
## $ contact_city : chr [1:6194] "Anoka" "Groveland" "Adamstown" "Saint Cloud" ...
## $ contact_state: chr [1:6194] "MN" "FL" "MD" "MN" ...
## $ description : chr [1:6194] "Boris is a handsome mini schnauzer who made his long trek up her from Arkansas on 4/2019. He loves rope toys an"| __truncated__ "Duke is an almost 2 year old Potcake from Abacos in the Bahamas. He is a happy boy, who loves his toys and bone"| __truncated__ "Zac Woof-ron is a heartthrob movie star looking to settle down with the right person! \n\nAs you know from his "| __truncated__ "~~Came in to the shelter as a transfer from another rescue ~~Interacted with other dogs and was cautious around"| __truncated__ ...
## $ found : chr [1:6194] "Arkansas" "Abacos" "Adam" "Adaptil" ...
## $ manual : chr [1:6194] NA "Bahamas" "Maryland" NA ...
## $ remove : logi [1:6194] NA NA NA TRUE NA NA ...
## $ still_there : logi [1:6194] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. contact_city = col_character(),
## .. contact_state = col_character(),
## .. description = col_character(),
## .. found = col_character(),
## .. manual = col_character(),
## .. remove = col_logical(),
## .. still_there = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
summary(dog_travel)
## id contact_city contact_state description
## Min. : 8619716 Length:6194 Length:6194 Length:6194
## 1st Qu.:44940096 Class :character Class :character Class :character
## Median :45734990 Mode :character Mode :character Mode :character
## Mean :44786280
## 3rd Qu.:45923182
## Max. :46043149
## found manual remove still_there
## Length:6194 Length:6194 Mode:logical Mode:logical
## Class :character Class :character TRUE:1738 TRUE:319
## Mode :character Mode :character NA's:4456 NA's:5875
##
##
##
Now we would like to change the names of the attributes to make them more understandable. The new attribute names are ‘location’, ‘exported_dogs’, ‘imported_dogs’, ‘total_dogs’, ‘location_in_US’.
#Original attribute names
names(dog_moves)
## [1] "location" "exported" "imported" "total" "inUS"
####**Renamed attribute names:-**
colnames(dog_moves) <- c('location', 'exported_dogs', 'imported_dogs', 'total_dogs', 'location_in_US')
names(dog_moves)
## [1] "location" "exported_dogs" "imported_dogs" "total_dogs"
## [5] "location_in_US"
Now we can clean the data by first implementing the is.na() function which shows us the attributes in this dataset that consists of ‘NA’ values.
colSums(is.na(dog_moves))
## location exported_dogs imported_dogs total_dogs location_in_US
## 0 9 52 39 0
exported_dogs= 1, imported_dogs='NA' and exported_dogs='NA' are satisfied. We can do this as these observations wouldn’t be impacting the overall data analysis that much. So the original 'dog_moves' data is transformed into 'dog_moves_clean'. We can also clearly see that the attributes in 'dog_moves_clean' data now have reduced number of ‘NA’ values.'dog_moves_nonUS' and 'dog_moves_US' ) based on whether the location is within the USA or not. This will give us a good idea regarding the dog trends within USA and worldwide.
'dog_moves_nonUS' is 19.'dog_moves_US' is 51.dog_moves_clean <- dog_moves[!(is.na(dog_moves$imported_dogs) & is.na(dog_moves$total_dogs) & dog_moves$exported_dogs== 1),]
colSums(is.na(dog_moves_clean))
## location exported_dogs imported_dogs total_dogs location_in_US
## 0 9 32 19 0
#New entire data:-
dog_moves_clean
## # A tibble: 70 x 5
## location exported_dogs imported_dogs total_dogs location_in_US
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 Texas 635 NA 566 TRUE
## 2 Alabama 268 2 1428 TRUE
## 3 North Carolina 158 14 2627 TRUE
## 4 South Carolina 139 12 1618 TRUE
## 5 Georgia 137 19 3479 TRUE
## 6 Puerto Rico 131 NA NA FALSE
## 7 California 130 3 1664 TRUE
## 8 South Korea 76 NA NA FALSE
## 9 Tennessee 66 20 1769 TRUE
## 10 Kentucky 57 4 1123 TRUE
## # ... with 60 more rows
dim(dog_moves_clean)
## [1] 70 5
#Data from locations other than USA:-
dog_moves_nonUS <- subset(dog_moves_clean, location_in_US == 0)
dog_moves_nonUS
## # A tibble: 19 x 5
## location exported_dogs imported_dogs total_dogs location_in_US
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 Puerto Rico 131 NA NA FALSE
## 2 South Korea 76 NA NA FALSE
## 3 Mexico 54 NA NA FALSE
## 4 China 28 NA NA FALSE
## 5 Thailand 20 NA NA FALSE
## 6 India 18 NA NA FALSE
## 7 Taiwan 18 NA NA FALSE
## 8 Egypt 15 NA NA FALSE
## 9 Bahamas 13 NA NA FALSE
## 10 Spain 8 NA NA FALSE
## 11 Haiti 7 NA NA FALSE
## 12 Oman 7 NA NA FALSE
## 13 Kuwait 5 NA NA FALSE
## 14 Qatar 4 NA NA FALSE
## 15 Costa Rica 3 NA NA FALSE
## 16 Greece 3 NA NA FALSE
## 17 Russia 3 NA NA FALSE
## 18 Afghanistan 2 NA NA FALSE
## 19 England 2 NA NA FALSE
dim(dog_moves_nonUS)
## [1] 19 5
#Data from locations within USA:-
dog_moves_US <- subset(dog_moves_clean, location_in_US == 1)
dog_moves_US
## # A tibble: 51 x 5
## location exported_dogs imported_dogs total_dogs location_in_US
## <chr> <dbl> <dbl> <dbl> <lgl>
## 1 Texas 635 NA 566 TRUE
## 2 Alabama 268 2 1428 TRUE
## 3 North Carolina 158 14 2627 TRUE
## 4 South Carolina 139 12 1618 TRUE
## 5 Georgia 137 19 3479 TRUE
## 6 California 130 3 1664 TRUE
## 7 Tennessee 66 20 1769 TRUE
## 8 Kentucky 57 4 1123 TRUE
## 9 Mississippi 55 NA 510 TRUE
## 10 Louisiana 53 NA 912 TRUE
## # ... with 41 more rows
dim(dog_moves_US)
## [1] 51 5
data.frame(colnames(dog_descriptions)) #Returns column index numbers in table format,df=DataFrame name
## colnames.dog_descriptions.
## 1 id
## 2 org_id
## 3 url
## 4 species
## 5 breed_primary
## 6 breed_secondary
## 7 breed_mixed
## 8 breed_unknown
## 9 color_primary
## 10 color_secondary
## 11 color_tertiary
## 12 age
## 13 sex
## 14 size
## 15 coat
## 16 fixed
## 17 house_trained
## 18 declawed
## 19 special_needs
## 20 shots_current
## 21 env_children
## 22 env_dogs
## 23 env_cats
## 24 name
## 25 tags
## 26 photo
## 27 status
## 28 posted
## 29 contact_city
## 30 contact_state
## 31 contact_zip
## 32 contact_country
## 33 stateQ
## 34 accessed
## 35 type
## 36 description
df <- dog_descriptions[,-c(1,2,3,4,18:20,24:27,31,33:36)] #removing unused variables from above
data.frame(colnames(df))
## colnames.df.
## 1 breed_primary
## 2 breed_secondary
## 3 breed_mixed
## 4 breed_unknown
## 5 color_primary
## 6 color_secondary
## 7 color_tertiary
## 8 age
## 9 sex
## 10 size
## 11 coat
## 12 fixed
## 13 house_trained
## 14 env_children
## 15 env_dogs
## 16 env_cats
## 17 posted
## 18 contact_city
## 19 contact_state
## 20 contact_country
skim(df)
| Name | df |
| Number of rows | 58180 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| logical | 7 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| breed_primary | 0 | 1.00 | 3 | 40 | 0 | 216 | 0 |
| breed_secondary | 37359 | 0.36 | 3 | 36 | 0 | 190 | 0 |
| color_primary | 32046 | 0.45 | 5 | 32 | 0 | 15 | 0 |
| color_secondary | 46121 | 0.21 | 5 | 32 | 0 | 15 | 0 |
| color_tertiary | 56963 | 0.02 | 5 | 32 | 0 | 14 | 0 |
| age | 0 | 1.00 | 4 | 6 | 0 | 4 | 0 |
| sex | 0 | 1.00 | 4 | 7 | 0 | 3 | 0 |
| size | 0 | 1.00 | 5 | 11 | 0 | 4 | 0 |
| coat | 30995 | 0.47 | 4 | 8 | 0 | 6 | 0 |
| contact_city | 0 | 1.00 | 2 | 22 | 0 | 2189 | 0 |
| contact_state | 0 | 1.00 | 2 | 5 | 0 | 75 | 0 |
| contact_country | 0 | 1.00 | 2 | 5 | 0 | 15 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| breed_mixed | 0 | 1.00 | 0.71 | TRU: 41591, FAL: 16589 |
| breed_unknown | 0 | 1.00 | 0.00 | FAL: 58180 |
| fixed | 0 | 1.00 | 0.80 | TRU: 46621, FAL: 11559 |
| house_trained | 0 | 1.00 | 0.35 | FAL: 37624, TRU: 20556 |
| env_children | 30153 | 0.48 | 0.84 | TRU: 23588, FAL: 4439 |
| env_dogs | 23511 | 0.60 | 0.90 | TRU: 31122, FAL: 3547 |
| env_cats | 38828 | 0.33 | 0.65 | TRU: 12542, FAL: 6810 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| posted | 33 | 1 | 2003-05-18 | 2019-09-20 17:32:30 | 2019-08-13 15:55:42 | 48351 |
head(df,150)
## # A tibble: 150 x 20
## breed_primary breed_secondary breed_mixed breed_unknown color_primary
## <chr> <chr> <lgl> <lgl> <chr>
## 1 American Staffordsh~ Mixed Breed TRUE FALSE White / Cream
## 2 Pit Bull Terrier Mixed Breed TRUE FALSE Brown / Choco~
## 3 Shepherd <NA> FALSE FALSE Brindle
## 4 German Shepherd Dog <NA> FALSE FALSE <NA>
## 5 Dachshund <NA> FALSE FALSE <NA>
## 6 Boxer Beagle TRUE FALSE <NA>
## 7 Italian Greyhound Chihuahua TRUE FALSE <NA>
## 8 Cattle Dog <NA> TRUE FALSE <NA>
## 9 Cattle Dog <NA> TRUE FALSE <NA>
## 10 Cattle Dog <NA> TRUE FALSE <NA>
## # ... with 140 more rows, and 15 more variables: color_secondary <chr>,
## # color_tertiary <chr>, age <chr>, sex <chr>, size <chr>, coat <chr>,
## # fixed <lgl>, house_trained <lgl>, env_children <lgl>, env_dogs <lgl>,
## # env_cats <lgl>, posted <dttm>, contact_city <chr>, contact_state <chr>,
## # contact_country <chr>
#there are multiple missing values below as we can see but we are not replacing any of the NA values. For example most of the missing values can't be predicted here such as breed, color etc.
# Remove duplicate entries from dog travel data
dog_travel <- unique(dog_travel)
# Remove unecessary columns from dog travel
dog_descriptions <- dog_descriptions_orig[,-c(2,3,4,18:20,24:27,31,33:36)]
dog_travel <- select(dog_travel, -contact_state, -contact_city, -manual, -still_there, -description, -remove)
# Over Half of the elements in manual field are NA. Due to the inconsistency of the data, we have decided to remove it from our dataset
# Almost 75% of the elements listed in the remove field are NA, for thi reason we have decided to emit it from our dataset
# nearly every entry entry in the still_there field is NA, therefore, we will emit the column from our dataset
# calculated number of relocations by dog_id
dog_relocations <- dplyr::group_by(dog_travel, id) %>%
dplyr::summarise( locations = length(found), relocations = length(found) - 1) %>%
arrange(desc(relocations))
#joined travel data to dog descriptions
joined <- inner_join(dog_descriptions, dog_relocations, by = 'id')
head(joined,5)
## # A tibble: 5 x 23
## id breed_primary breed_secondary breed_mixed breed_unknown color_primary
## <dbl> <chr> <chr> <lgl> <lgl> <chr>
## 1 45923603 German Sheph~ <NA> TRUE FALSE <NA>
## 2 45923600 German Sheph~ <NA> TRUE FALSE <NA>
## 3 45289127 Beagle <NA> FALSE FALSE <NA>
## 4 44801851 Boxer <NA> TRUE FALSE Bicolor
## 5 44731850 Maltese Poodle TRUE FALSE White / Cream
## # ... with 17 more variables: color_secondary <chr>, color_tertiary <chr>,
## # age <chr>, sex <chr>, size <chr>, coat <chr>, fixed <lgl>,
## # house_trained <lgl>, env_children <lgl>, env_dogs <lgl>, env_cats <lgl>,
## # posted <dttm>, contact_city <chr>, contact_state <chr>,
## # contact_country <chr>, locations <int>, relocations <dbl>
# dog_relocation_breed
dog_relocations_breed <- dplyr::group_by(joined, breed_primary) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
# dog_relocation_city
dog_relocations_city <- dplyr::group_by(joined, contact_city) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
# dog_relocation_age
dog_relocations_age <- dplyr::group_by(joined, age) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
# dog_relocation_fixed
dog_relocations_fixed <- dplyr::group_by(joined, fixed) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
# dog_relocation_sex
dog_relocations_sex <- dplyr::group_by(joined, sex) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
# dog_relocation_size
dog_relocations_size <- dplyr::group_by(joined, size) %>%
dplyr::summarise(avg_relocations = mean(relocations)) %>%
arrange(desc(avg_relocations)) %>%
head(5)
'dog_moves_nonUS' and 'dog_moves_US'.'dog_moves_nonUS'), we are only using the exported_dogs attribute.#
top_n(dog_moves_nonUS, n=5, exported_dogs) %>%
ggplot(., aes(x=location, y=exported_dogs))+
geom_bar(stat='identity')
#The top 5 total number of adoptable dogs locations within USA:-
top_n(dog_moves_US, n=5, total_dogs) %>%
ggplot(., aes(x=location, y=total_dogs))+
geom_bar(stat='identity')
#The top 5 dog exporting states within USA:-
top_n(dog_moves_US, n=5, exported_dogs) %>%
ggplot(., aes(x=location, y=exported_dogs))+
geom_bar(stat='identity')
#The top 5 dog importing states within USA:-
top_n(dog_moves_US, n=5, imported_dogs) %>%
ggplot(., aes(x=location, y=imported_dogs))+
geom_bar(stat='identity')
### Exploratory Analyses of Dog_observations Data
breeds_primary <- table(df$breed_primary)
breeds_primary <- sort(breeds_primary, decreasing = TRUE)[1:10]
breeds_primary
##
## Pit Bull Terrier Labrador Retriever
## 7890 7198
## Chihuahua Mixed Breed
## 3766 3242
## Terrier Hound
## 2641 2282
## German Shepherd Dog Boxer
## 2122 2050
## Shepherd American Staffordshire Terrier
## 1972 1862
#histogram of top 10
par(mar=c(11,4,4,4))
plot(breeds_primary, type="h", las = 2, main = "Top 10 Available Breeds", ylab = "Count")
#From these results, we can see the most populous breed in shelters is a Pit Bull Terrier, followed by Labrador Retrievers at almost half that amount.
#ages of available dogs
#create dataframe with age data for OH
dog_age <- subset(df, select = c("age"))
age_order = c("Baby", "Young", "Adult", "Senior") #order data by age
dog_age <- transform(dog_age, age = factor(age, levels=age_order))
age_plot <- ggplot(dog_age, aes(age))
age_plot + geom_bar() + labs(title = "Available Dogs' Age Frequency", x = "Age", y = "Count") + geom_text(stat = 'count', aes(label = ..count..), vjust = -1)
#By far, there was the highest frequency of adult dogs, with 27955 observations, followed by 16194 young dogs.
#Mixed Breed Versus Pure Breed Count
ggplot(df, aes(breed_mixed)) + geom_bar() + scale_x_discrete("Type", labels = c("Purebred", "Mixed")) + labs(title = "Mixed and Purebred Count", y = "Count", x = "Type") + geom_text(stat = 'count', aes(label = ..count..), vjust = -1)
#From this visualization, we can see that there are 41591 mixed dogs, but only 16589 purebred, a ratio of 2.5:1 mixed to purebred.
#Cities with highest count of available dogs
cities_df <- subset(df, select = c("contact_state", "contact_city"))
cities_df <- table(cities_df$contact_city)
cities_df <- sort(cities_df, decreasing = TRUE)[1:10]
cities_df
##
## Phoenix Atlanta Las Vegas New York Chamblee Albuquerque
## 756 710 604 589 495 452
## Seattle Columbia Washington Columbus
## 436 432 417 414
#From these results, it is clear that Phoenix is the city with the highest count of available dogs, followed by Atlanta.
plot1 <- ggplot(data = dog_relocations_breed, aes(x = breed_primary, y = avg_relocations, fill = breed_primary)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 7),
axis.title = element_text(size = 14,face = "bold"))
plot2 <- ggplot(data = dog_relocations_city, aes(x = contact_city, y = avg_relocations, fill = contact_city)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 9),
axis.title = element_text(size = 14,face = "bold"))
plot3 <- ggplot(data = dog_relocations_age, aes(x = age, y = avg_relocations, fill = age)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 9),
axis.title = element_text(size = 14,face = "bold"))
plot4 <- ggplot(data = dog_relocations_fixed, aes(x = fixed, y = avg_relocations, fill = fixed)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 9),
axis.title = element_text(size = 14,face = "bold"))
plot5 <- ggplot(data = dog_relocations_sex, aes(x = sex, y = avg_relocations, fill = sex)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 9),
axis.title = element_text(size = 14,face = "bold"))
plot6 <- ggplot(data = dog_relocations_size, aes(x = size, y = avg_relocations, fill = size)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_brewer(palette = "Dark2") +
theme(axis.text = element_text(size = 7),
axis.title = element_text(size = 14,face = "bold"))
#### Relocation by City