library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
# Read the joined parks/species CSV from the working directory, then print
# skimr's chart-free summary of every column.
national_parks <- read.csv(file = "parks_species_join_og.csv")
skim_without_charts(national_parks)
| Name | national_parks |
| Number of rows | 119248 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Park_Code | 0 | 1 | 4 | 4 | 0 | 56 | 0 |
| Park_Name | 0 | 1 | 18 | 46 | 0 | 56 | 0 |
| State | 0 | 1 | 2 | 10 | 0 | 27 | 0 |
| Park_Code_1 | 0 | 1 | 4 | 4 | 0 | 56 | 0 |
| Species_ID | 0 | 1 | 9 | 9 | 0 | 119248 | 0 |
| Park_Name_1 | 0 | 1 | 18 | 46 | 0 | 56 | 0 |
| Category | 0 | 1 | 4 | 19 | 0 | 14 | 0 |
| Order | 0 | 1 | 5 | 18 | 0 | 85 | 0 |
| Family | 0 | 1 | 7 | 17 | 0 | 220 | 0 |
| Scientific_Name | 0 | 1 | 3 | 69 | 0 | 46022 | 0 |
| Nativeness | 0 | 1 | 6 | 13 | 0 | 5 | 0 |
| Conservation_Status | 0 | 1 | 7 | 19 | 0 | 12 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Acres | 0 | 1 | 932032.19 | 1611322.24 | 5550.00 | 91440.00 | 249561.00 | 865952.00 | 8323148.00 |
| Latitude | 0 | 1 | 40.15 | 9.82 | 19.38 | 35.68 | 38.53 | 44.35 | 67.78 |
| Longitude | 0 | 1 | -111.99 | 21.98 | -159.28 | -121.75 | -112.14 | -102.50 | -68.21 |
# Show the first six rows of the raw table.
head(national_parks, n = 6L)
## Park_Code Park_Name State Acres Latitude Longitude
## 1 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## 2 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## 3 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## 4 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## 5 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## 6 DENA Denali National Park and Preserve AK 3372402 63.33 -150.5
## Park_Code_1 Species_ID Park_Name_1 Category
## 1 DENA DENA-1077 Denali National Park and Preserve Bird
## 2 DENA DENA-1085 Denali National Park and Preserve Bird
## 3 DENA DENA-1211 Denali National Park and Preserve Bird
## 4 DENA DENA-1130 Denali National Park and Preserve Bird
## 5 DENA DENA-1148 Denali National Park and Preserve Bird
## 6 DENA DENA-1214 Denali National Park and Preserve Bird
## Order Family Scientific_Name Nativeness
## 1 Not Confirmed Not Confirmed Melanitta nigra Unknown
## 2 Not Confirmed Not Confirmed Pluvialis dominica Unknown
## 3 Not Confirmed Not Confirmed Picoides dorsalis Unknown
## 4 Not Confirmed Not Confirmed Tympanuchus phasianellus Unknown
## 5 Not Confirmed Not Confirmed Junco hyemalis Unknown
## 6 Not Confirmed Not Confirmed Podiceps auritus Unknown
## Conservation_Status
## 1 Protected
## 2 Protected
## 3 Protected
## 4 Protected
## 5 Protected
## 6 Protected
# Compact overview: one line per column with its type and first values.
national_parks %>%
  glimpse()
## Rows: 119,248
## Columns: 15
## $ Park_Code <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Park_Name <chr> "Denali National Park and Preserve", "Denali Natio~
## $ State <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "A~
## $ Acres <int> 3372402, 3372402, 3372402, 3372402, 3372402, 33724~
## $ Latitude <dbl> 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 6~
## $ Longitude <dbl> -150.5, -150.5, -150.5, -150.5, -150.5, -150.5, -1~
## $ Park_Code_1 <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Species_ID <chr> "DENA-1077", "DENA-1085", "DENA-1211", "DENA-1130"~
## $ Park_Name_1 <chr> "Denali National Park and Preserve", "Denali Natio~
## $ Category <chr> "Bird", "Bird", "Bird", "Bird", "Bird", "Bird", "B~
## $ Order <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Family <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Scientific_Name <chr> "Melanitta nigra", "Pluvialis dominica", "Picoides~
## $ Nativeness <chr> "Unknown", "Unknown", "Unknown", "Unknown", "Unkno~
## $ Conservation_Status <chr> "Protected", "Protected", "Protected", "Protected"~
# Preview the table without the two duplicated "_1" columns.
# The result is only printed; national_parks itself is not reassigned here.
national_parks %>%
  select(-c(Park_Code_1, Park_Name_1)) %>%
  glimpse()
## Rows: 119,248
## Columns: 13
## $ Park_Code <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Park_Name <chr> "Denali National Park and Preserve", "Denali Natio~
## $ State <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "A~
## $ Acres <int> 3372402, 3372402, 3372402, 3372402, 3372402, 33724~
## $ Latitude <dbl> 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 6~
## $ Longitude <dbl> -150.5, -150.5, -150.5, -150.5, -150.5, -150.5, -1~
## $ Species_ID <chr> "DENA-1077", "DENA-1085", "DENA-1211", "DENA-1130"~
## $ Category <chr> "Bird", "Bird", "Bird", "Bird", "Bird", "Bird", "B~
## $ Order <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Family <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Scientific_Name <chr> "Melanitta nigra", "Pluvialis dominica", "Picoides~
## $ Nativeness <chr> "Unknown", "Unknown", "Unknown", "Unknown", "Unkno~
## $ Conservation_Status <chr> "Protected", "Protected", "Protected", "Protected"~
# List each Category value once.
distinct(national_parks, Category)
## Category
## 1 Bird
## 2 Fish
## 3 Fungi
## 4 Insect
## 5 Mammal
## 6 Amphibian
## 7 Invertebrate
## 8 Vascular Plant
## 9 Nonvascular Plant
## 10 Algae
## 11 Reptile
## 12 Slug/Snail
## 13 Spider/Scorpion
## 14 Crab/Lobster/Shrimp
# Number of rows in each Category, largest first.
national_parks %>%
  count(Category, name = "Num_Category") %>%
  arrange(desc(Num_Category))
## Category Num_Category
## 1 Vascular Plant 65221
## 2 Bird 14601
## 3 Insect 14349
## 4 Fungi 6203
## 5 Nonvascular Plant 4278
## 6 Fish 3956
## 7 Mammal 3867
## 8 Invertebrate 1566
## 9 Reptile 1343
## 10 Algae 976
## 11 Slug/Snail 787
## 12 Spider/Scorpion 776
## 13 Amphibian 743
## 14 Crab/Lobster/Shrimp 582
# How many different Category values appear in the data.
national_parks %>%
  distinct(Category) %>%
  count(name = "Category_Num")
## Category_Num
## 1 14
# List each park name once.
distinct(national_parks, Park_Name)
## Park_Name
## 1 Denali National Park and Preserve
## 2 Gates Of The Arctic National Park and Preserve
## 3 Glacier Bay National Park and Preserve
## 4 Katmai National Park and Preserve
## 5 Kenai Fjords National Park
## 6 Kobuk Valley National Park
## 7 Lake Clark National Park and Preserve
## 8 Wrangell - St Elias National Park and Preserve
## 9 Hot Springs National Park
## 10 Grand Canyon National Park
## 11 Petrified Forest National Park
## 12 Saguaro National Park
## 13 Channel Islands National Park
## 14 Joshua Tree National Park
## 15 Lassen Volcanic National Park
## 16 Pinnacles National Park
## 17 Redwood National Park
## 18 Sequoia and Kings Canyon National Parks
## 19 Yosemite National Park
## 20 Black Canyon of the Gunnison National Park
## 21 Great Sand Dunes National Park and Preserve
## 22 Mesa Verde National Park
## 23 Rocky Mountain National Park
## 24 Biscayne National Park
## 25 Dry Tortugas National Park
## 26 Everglades National Park
## 27 Haleakala National Park
## 28 Hawaii Volcanoes National Park
## 29 Mammoth Cave National Park
## 30 Acadia National Park
## 31 Isle Royale National Park
## 32 Voyageurs National Park
## 33 Glacier National Park
## 34 Theodore Roosevelt National Park
## 35 Carlsbad Caverns National Park
## 36 Great Basin National Park
## 37 Cuyahoga Valley National Park
## 38 Crater Lake National Park
## 39 Congaree National Park
## 40 Badlands National Park
## 41 Wind Cave National Park
## 42 Big Bend National Park
## 43 Guadalupe Mountains National Park
## 44 Arches National Park
## 45 Bryce Canyon National Park
## 46 Canyonlands National Park
## 47 Capitol Reef National Park
## 48 Zion National Park
## 49 Shenandoah National Park
## 50 Mount Rainier National Park
## 51 North Cascades National Park
## 52 Olympic National Park
## 53 Grand Teton National Park
## 54 Death Valley National Park
## 55 Great Smoky Mountains National Park
## 56 Yellowstone National Park
# How many different parks appear in the data.
national_parks %>%
  distinct(Park_Name) %>%
  count(name = "Total_Number_U.S._National_Parks")
## Total_Number_U.S._National_Parks
## 1 56
# Number of species rows recorded for each park, largest first.
# The result stays grouped by Park_Name.
national_parks %>%
  group_by(Park_Name) %>%
  count(name = "num_species") %>%
  arrange(desc(num_species))
## # A tibble: 56 x 2
## # Groups: Park_Name [56]
## Park_Name num_species
## <chr> <int>
## 1 Great Smoky Mountains National Park 6623
## 2 Redwood National Park 6310
## 3 Shenandoah National Park 4655
## 4 Death Valley National Park 4439
## 5 Yellowstone National Park 3966
## 6 Crater Lake National Park 3760
## 7 North Cascades National Park 3363
## 8 Hawaii Volcanoes National Park 3298
## 9 Rocky Mountain National Park 3152
## 10 Great Basin National Park 2653
## # ... with 46 more rows
# How many different Order values appear in the data
# (the "Not Confirmed" placeholder is counted as one of them).
national_parks %>%
  distinct(Order) %>%
  count(name = "Num_Order")
## Num_Order
## 1 85
# Rows per taxonomic Order, largest first, leaving out the
# "Not Confirmed" placeholder. The result stays grouped by Order.
national_parks %>%
  filter(Order != "Not Confirmed") %>%
  group_by(Order) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 84 x 2
## # Groups: Order [84]
## Order n
## <chr> <int>
## 1 Asterales 254
## 2 Poales 191
## 3 Passeriformes 161
## 4 Caryophyllales 133
## 5 Fabales 86
## 6 Lamiales 85
## 7 Brassicales 84
## 8 Charadriiformes 70
## 9 Boraginales 62
## 10 Rosales 54
## # ... with 74 more rows
# Rows for every park/category combination present in the data.
# The result stays grouped by both columns.
count(group_by(national_parks, Park_Name, Category))
## # A tibble: 549 x 3
## # Groups: Park_Name, Category [549]
## Park_Name Category n
## <chr> <chr> <int>
## 1 Acadia National Park Amphibian 15
## 2 Acadia National Park Bird 364
## 3 Acadia National Park Fish 38
## 4 Acadia National Park Mammal 55
## 5 Acadia National Park Reptile 11
## 6 Acadia National Park Vascular Plant 1226
## 7 Arches National Park Amphibian 8
## 8 Arches National Park Bird 205
## 9 Arches National Park Fish 11
## 10 Arches National Park Mammal 59
## # ... with 539 more rows
# List each Conservation_Status value once.
distinct(national_parks, Conservation_Status)
## Conservation_Status
## 1 Protected
## 2 Species of Concern
## 3 Under Review
## 4 Threatened
## 5 Endangered
## 6 In Recovery
## 7 Proposed Threatened
## 8 Resident
## 9 Breeder
## 10 Migratory
## 11 Proposed Endangered
## 12 Extinct
# Tally rows per conservation status (every status, not only "Endangered"),
# most frequent first, and show the six largest.
endangered <- national_parks %>%
  count(Conservation_Status, sort = TRUE)
head(endangered, n = 6L)
## Conservation_Status n
## 1 Protected 114530
## 2 Species of Concern 3843
## 3 Endangered 374
## 4 Under Review 194
## 5 Threatened 184
## 6 In Recovery 77
First, I wanted to shorten the [Park_Name] column values by taking “National Park” out.
# Shorten each park name to the text before " National", e.g.
# "Denali National Park and Preserve" -> "Denali".
# - The separator includes the leading space, so the short name has no
#   trailing blank (splitting on "National" alone yields "Denali ").
# - extra = "drop" discards the remainder of the name on purpose, which
#   avoids the "Additional pieces discarded" warning on every row.
# As before, Park_Name is replaced by Park_Name_Shrt in the result.
national_parks_shrt <- national_parks %>%
  separate(
    Park_Name,
    into = "Park_Name_Shrt",
    sep = " National",
    extra = "drop"
  )
## Warning: Expected 1 pieces. Additional pieces discarded in 119248 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Check the shortened park names, one row per park.
distinct(national_parks_shrt, Park_Name_Shrt)
## Park_Name_Shrt
## 1 Denali
## 2 Gates Of The Arctic
## 3 Glacier Bay
## 4 Katmai
## 5 Kenai Fjords
## 6 Kobuk Valley
## 7 Lake Clark
## 8 Wrangell - St Elias
## 9 Hot Springs
## 10 Grand Canyon
## 11 Petrified Forest
## 12 Saguaro
## 13 Channel Islands
## 14 Joshua Tree
## 15 Lassen Volcanic
## 16 Pinnacles
## 17 Redwood
## 18 Sequoia and Kings Canyon
## 19 Yosemite
## 20 Black Canyon of the Gunnison
## 21 Great Sand Dunes
## 22 Mesa Verde
## 23 Rocky Mountain
## 24 Biscayne
## 25 Dry Tortugas
## 26 Everglades
## 27 Haleakala
## 28 Hawaii Volcanoes
## 29 Mammoth Cave
## 30 Acadia
## 31 Isle Royale
## 32 Voyageurs
## 33 Glacier
## 34 Theodore Roosevelt
## 35 Carlsbad Caverns
## 36 Great Basin
## 37 Cuyahoga Valley
## 38 Crater Lake
## 39 Congaree
## 40 Badlands
## 41 Wind Cave
## 42 Big Bend
## 43 Guadalupe Mountains
## 44 Arches
## 45 Bryce Canyon
## 46 Canyonlands
## 47 Capitol Reef
## 48 Zion
## 49 Shenandoah
## 50 Mount Rainier
## 51 North Cascades
## 52 Olympic
## 53 Grand Teton
## 54 Death Valley
## 55 Great Smoky Mountains
## 56 Yellowstone
# First attempt: vertical bar chart counting the "Endangered" rows per park.
# NOTE(review): scale_y_discrete() is applied to the (continuous) count axis,
# which is presumably why the rendered plot shows no y-axis scale - confirm.
national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  ggplot(mapping = aes(x = Park_Name_Shrt, fill = Park_Name_Shrt)) +
  geom_bar(width = 0.9) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents the number of species that are endangered in each park. ",
    x = "U.S. National Parks",
    y = "Number of Endangered Species"
  )
Obviously, this graph is hard to understand, mainly for two reasons: 1. Although I removed “National Park” from each value in the [Park_Name] column, the park labels on the x-axis are still illegible. 2. The y-axis has no scale… how many endangered species does each park have?
This was an attempt to fix both these issues!
# Second attempt: same counts, but with parks on the y axis (horizontal bars),
# smaller axis text, and no manual axis scales.
national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  ggplot(mapping = aes(y = Park_Name_Shrt, fill = Park_Name_Shrt)) +
  geom_bar(width = 0.6) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 6)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents the number of species that are endangered in each park. ",
    x = "Number of Endangered Species",
    y = "U.S. National Parks"
  )
Still not happy haha. I guess there are too many National Parks, which makes the graph hard to interpret. I decided to filter the national parks and include only those with at least 10 endangered species.
# Parks with at least 10 rows whose status is "Endangered".
# Result columns: Park_Name_Shrt, Conservation_Status, Num_Endangered_Species.
# - count(name = ...) names the tally directly instead of renaming `n`.
# - The earlier select() was dropped: count() already returns only the
#   grouping columns and the tally.
# - ungroup() returns a plain tibble so later verbs are not silently
#   applied per park.
endangered_park <- national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  group_by(Park_Name_Shrt) %>%
  count(Conservation_Status, name = "Num_Endangered_Species") %>%
  ungroup() %>%
  filter(Num_Endangered_Species >= 10)
# Horizontal bar chart of the pre-computed endangered counts, limited to the
# parks kept in endangered_park.
endangered_park %>%
  ggplot(
    mapping = aes(
      x = Num_Endangered_Species,
      y = Park_Name_Shrt,
      fill = Park_Name_Shrt
    )
  ) +
  geom_col(width = 0.6) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 7.5)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents U.S. National Parks with the highest number of endangered species.",
    x = "Number of Endangered Species",
    y = "U.S. National Parks"
  )
# Share of each remaining conservation status within every park.
# Six statuses are excluded first, one filter condition per line.
# position = "fill" scales every bar to a total of 1, so the x axis is
# formatted as percentages to agree with its "Percentage" label
# (scales is installed as a dependency of ggplot2).
national_parks_shrt %>%
  filter(
    Conservation_Status != "Protected",
    Conservation_Status != "Resident",
    Conservation_Status != "Migratory",
    Conservation_Status != "Breeder",
    Conservation_Status != "Proposed Threatened",
    Conservation_Status != "Proposed Endangered"
  ) %>%
  ggplot() +
  geom_bar(
    mapping = aes(y = Park_Name_Shrt, fill = Conservation_Status),
    width = 0.6,
    position = "fill"
  ) +
  scale_x_continuous(labels = scales::percent) +
  theme(axis.text.y = element_text(size = 6)) +
  labs(
    title = "Breakdown of Species' Conservation Status",
    x = "Percentage of Conservation Status",
    y = "U.S. National Parks"
  )