1. Load R Packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)

2. Import U.S. National Park File

# Read the joined parks/species CSV (path relative to the working directory).
national_parks <- read.csv(file = "parks_species_join_og.csv")

3. Review The Data

# Print a skimr summary of every column (text only, no inline histograms).
# The call is kept in function form so the summary reports the data frame's
# own name in its "Name" field.
skim_without_charts(national_parks)
Data summary
Name national_parks
Number of rows 119248
Number of columns 15
_______________________
Column type frequency:
character 12
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Park_Code 0 1 4 4 0 56 0
Park_Name 0 1 18 46 0 56 0
State 0 1 2 10 0 27 0
Park_Code_1 0 1 4 4 0 56 0
Species_ID 0 1 9 9 0 119248 0
Park_Name_1 0 1 18 46 0 56 0
Category 0 1 4 19 0 14 0
Order 0 1 5 18 0 85 0
Family 0 1 7 17 0 220 0
Scientific_Name 0 1 3 69 0 46022 0
Nativeness 0 1 6 13 0 5 0
Conservation_Status 0 1 7 19 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Acres 0 1 932032.19 1611322.24 5550.00 91440.00 249561.00 865952.00 8323148.00
Latitude 0 1 40.15 9.82 19.38 35.68 38.53 44.35 67.78
Longitude 0 1 -111.99 21.98 -159.28 -121.75 -112.14 -102.50 -68.21
# Peek at the first rows of the data frame.
national_parks %>%
  head()
##   Park_Code                         Park_Name State   Acres Latitude Longitude
## 1      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
## 2      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
## 3      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
## 4      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
## 5      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
## 6      DENA Denali National Park and Preserve    AK 3372402    63.33    -150.5
##   Park_Code_1 Species_ID                       Park_Name_1 Category
## 1        DENA  DENA-1077 Denali National Park and Preserve     Bird
## 2        DENA  DENA-1085 Denali National Park and Preserve     Bird
## 3        DENA  DENA-1211 Denali National Park and Preserve     Bird
## 4        DENA  DENA-1130 Denali National Park and Preserve     Bird
## 5        DENA  DENA-1148 Denali National Park and Preserve     Bird
## 6        DENA  DENA-1214 Denali National Park and Preserve     Bird
##           Order        Family          Scientific_Name Nativeness
## 1 Not Confirmed Not Confirmed          Melanitta nigra    Unknown
## 2 Not Confirmed Not Confirmed       Pluvialis dominica    Unknown
## 3 Not Confirmed Not Confirmed        Picoides dorsalis    Unknown
## 4 Not Confirmed Not Confirmed Tympanuchus phasianellus    Unknown
## 5 Not Confirmed Not Confirmed           Junco hyemalis    Unknown
## 6 Not Confirmed Not Confirmed         Podiceps auritus    Unknown
##   Conservation_Status
## 1           Protected
## 2           Protected
## 3           Protected
## 4           Protected
## 5           Protected
## 6           Protected
# Column-by-column overview: name, type and the leading values.
national_parks %>%
  glimpse()
## Rows: 119,248
## Columns: 15
## $ Park_Code           <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Park_Name           <chr> "Denali National Park and Preserve", "Denali Natio~
## $ State               <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "A~
## $ Acres               <int> 3372402, 3372402, 3372402, 3372402, 3372402, 33724~
## $ Latitude            <dbl> 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 6~
## $ Longitude           <dbl> -150.5, -150.5, -150.5, -150.5, -150.5, -150.5, -1~
## $ Park_Code_1         <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Species_ID          <chr> "DENA-1077", "DENA-1085", "DENA-1211", "DENA-1130"~
## $ Park_Name_1         <chr> "Denali National Park and Preserve", "Denali Natio~
## $ Category            <chr> "Bird", "Bird", "Bird", "Bird", "Bird", "Bird", "B~
## $ Order               <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Family              <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Scientific_Name     <chr> "Melanitta nigra", "Pluvialis dominica", "Picoides~
## $ Nativeness          <chr> "Unknown", "Unknown", "Unknown", "Unknown", "Unkno~
## $ Conservation_Status <chr> "Protected", "Protected", "Protected", "Protected"~

4. Drop Columns

# Drop the two columns that appear to duplicate Park_Code / Park_Name (they
# carry the same values in the preview above). The result is assigned back so
# the columns are really removed from `national_parks`; without the assignment
# the pipeline only prints a trimmed copy and the data frame keeps 15 columns.
national_parks <- national_parks %>%
  select(-Park_Code_1, -Park_Name_1)

# Confirm that 13 columns remain.
glimpse(national_parks)
## Rows: 119,248
## Columns: 13
## $ Park_Code           <chr> "DENA", "DENA", "DENA", "DENA", "DENA", "DENA", "D~
## $ Park_Name           <chr> "Denali National Park and Preserve", "Denali Natio~
## $ State               <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "A~
## $ Acres               <int> 3372402, 3372402, 3372402, 3372402, 3372402, 33724~
## $ Latitude            <dbl> 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 63.33, 6~
## $ Longitude           <dbl> -150.5, -150.5, -150.5, -150.5, -150.5, -150.5, -1~
## $ Species_ID          <chr> "DENA-1077", "DENA-1085", "DENA-1211", "DENA-1130"~
## $ Category            <chr> "Bird", "Bird", "Bird", "Bird", "Bird", "Bird", "B~
## $ Order               <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Family              <chr> "Not Confirmed", "Not Confirmed", "Not Confirmed",~
## $ Scientific_Name     <chr> "Melanitta nigra", "Pluvialis dominica", "Picoides~
## $ Nativeness          <chr> "Unknown", "Unknown", "Unknown", "Unknown", "Unkno~
## $ Conservation_Status <chr> "Protected", "Protected", "Protected", "Protected"~

5. What categories of species are available in the data?

# List every species category that occurs in the data, in order of appearance.
distinct(national_parks, Category)
##               Category
## 1                 Bird
## 2                 Fish
## 3                Fungi
## 4               Insect
## 5               Mammal
## 6            Amphibian
## 7         Invertebrate
## 8       Vascular Plant
## 9    Nonvascular Plant
## 10               Algae
## 11             Reptile
## 12          Slug/Snail
## 13     Spider/Scorpion
## 14 Crab/Lobster/Shrimp

6. Find the number of instances of each category

# Rows per category, largest first; the count column is named directly.
national_parks %>%
  count(Category, name = "Num_Category") %>%
  arrange(desc(Num_Category))
##               Category Num_Category
## 1       Vascular Plant        65221
## 2                 Bird        14601
## 3               Insect        14349
## 4                Fungi         6203
## 5    Nonvascular Plant         4278
## 6                 Fish         3956
## 7               Mammal         3867
## 8         Invertebrate         1566
## 9              Reptile         1343
## 10               Algae          976
## 11          Slug/Snail          787
## 12     Spider/Scorpion          776
## 13           Amphibian          743
## 14 Crab/Lobster/Shrimp          582

7. How many Categories of Species are there?

# Number of distinct species categories, returned as a one-row data frame.
national_parks %>%
  summarise(Category_Num = n_distinct(Category))
##   Category_Num
## 1           14

8. Which National Parks are included within the Dataset?

# List each park name once, in order of first appearance.
distinct(national_parks, Park_Name)
##                                         Park_Name
## 1               Denali National Park and Preserve
## 2  Gates Of The Arctic National Park and Preserve
## 3          Glacier Bay National Park and Preserve
## 4               Katmai National Park and Preserve
## 5                      Kenai Fjords National Park
## 6                      Kobuk Valley National Park
## 7           Lake Clark National Park and Preserve
## 8  Wrangell - St Elias National Park and Preserve
## 9                       Hot Springs National Park
## 10                     Grand Canyon National Park
## 11                 Petrified Forest National Park
## 12                          Saguaro National Park
## 13                  Channel Islands National Park
## 14                      Joshua Tree National Park
## 15                  Lassen Volcanic National Park
## 16                        Pinnacles National Park
## 17                          Redwood National Park
## 18        Sequoia and Kings Canyon National Parks
## 19                         Yosemite National Park
## 20     Black Canyon of the Gunnison National Park
## 21    Great Sand Dunes National Park and Preserve
## 22                       Mesa Verde National Park
## 23                   Rocky Mountain National Park
## 24                         Biscayne National Park
## 25                     Dry Tortugas National Park
## 26                       Everglades National Park
## 27                        Haleakala National Park
## 28                 Hawaii Volcanoes National Park
## 29                     Mammoth Cave National Park
## 30                           Acadia National Park
## 31                      Isle Royale National Park
## 32                        Voyageurs National Park
## 33                          Glacier National Park
## 34               Theodore Roosevelt National Park
## 35                 Carlsbad Caverns National Park
## 36                      Great Basin National Park
## 37                  Cuyahoga Valley National Park
## 38                      Crater Lake National Park
## 39                         Congaree National Park
## 40                         Badlands National Park
## 41                        Wind Cave National Park
## 42                         Big Bend National Park
## 43              Guadalupe Mountains National Park
## 44                           Arches National Park
## 45                     Bryce Canyon National Park
## 46                      Canyonlands National Park
## 47                     Capitol Reef National Park
## 48                             Zion National Park
## 49                       Shenandoah National Park
## 50                    Mount Rainier National Park
## 51                   North Cascades National Park
## 52                          Olympic National Park
## 53                      Grand Teton National Park
## 54                     Death Valley National Park
## 55            Great Smoky Mountains National Park
## 56                      Yellowstone National Park

9. How many parks are included?

  # Number of distinct parks, returned as a one-row data frame.
  national_parks %>%
    summarise(Total_Number_U.S._National_Parks = n_distinct(Park_Name))
##   Total_Number_U.S._National_Parks
## 1                               56

10. How many species were found in each Park?

# Species records per park, most records first. The result stays grouped by
# Park_Name, as in the printed tibble header.
national_parks %>%
  group_by(Park_Name) %>%
  count(name = "num_species") %>%
  arrange(desc(num_species))
## # A tibble: 56 x 2
## # Groups:   Park_Name [56]
##    Park_Name                           num_species
##    <chr>                                     <int>
##  1 Great Smoky Mountains National Park        6623
##  2 Redwood National Park                      6310
##  3 Shenandoah National Park                   4655
##  4 Death Valley National Park                 4439
##  5 Yellowstone National Park                  3966
##  6 Crater Lake National Park                  3760
##  7 North Cascades National Park               3363
##  8 Hawaii Volcanoes National Park             3298
##  9 Rocky Mountain National Park               3152
## 10 Great Basin National Park                  2653
## # ... with 46 more rows

11. How many different Orders of species are there?

# Number of distinct values in the Order column (the "Not Confirmed"
# placeholder counts as one of them).
national_parks %>%
  summarise(Num_Order = n_distinct(Order))
##   Num_Order
## 1        85

12. Which Order contains the most species?

# Rank the taxonomic orders by number of records, ignoring rows whose order
# is the "Not Confirmed" placeholder. The result stays grouped by Order.
national_parks %>%
  filter(Order != "Not Confirmed") %>%
  group_by(Order) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 84 x 2
## # Groups:   Order [84]
##    Order               n
##    <chr>           <int>
##  1 Asterales         254
##  2 Poales            191
##  3 Passeriformes     161
##  4 Caryophyllales    133
##  5 Fabales            86
##  6 Lamiales           85
##  7 Brassicales        84
##  8 Charadriiformes    70
##  9 Boraginales        62
## 10 Rosales            54
## # ... with 74 more rows

13. Group species by Park Name and Category

# Records for every park/category combination; both grouping columns are
# kept on the result.
national_parks %>%
  group_by(Park_Name, Category) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 549 x 3
## # Groups:   Park_Name, Category [549]
##    Park_Name            Category           n
##    <chr>                <chr>          <int>
##  1 Acadia National Park Amphibian         15
##  2 Acadia National Park Bird             364
##  3 Acadia National Park Fish              38
##  4 Acadia National Park Mammal            55
##  5 Acadia National Park Reptile           11
##  6 Acadia National Park Vascular Plant  1226
##  7 Arches National Park Amphibian          8
##  8 Arches National Park Bird             205
##  9 Arches National Park Fish              11
## 10 Arches National Park Mammal            59
## # ... with 539 more rows

14. Represent the Conservation Status of the Species

# List each conservation status that appears in the data.
distinct(national_parks, Conservation_Status)
##    Conservation_Status
## 1            Protected
## 2   Species of Concern
## 3         Under Review
## 4           Threatened
## 5           Endangered
## 6          In Recovery
## 7  Proposed Threatened
## 8             Resident
## 9              Breeder
## 10           Migratory
## 11 Proposed Endangered
## 12             Extinct
# Tally records per conservation status, most frequent first. Note that
# `endangered` holds the counts for every status, not only "Endangered".
endangered <- count(national_parks, Conservation_Status, sort = TRUE)

# Show the six most frequent statuses.
endangered %>%
  head()
##   Conservation_Status      n
## 1           Protected 114530
## 2  Species of Concern   3843
## 3          Endangered    374
## 4        Under Review    194
## 5          Threatened    184
## 6         In Recovery     77

15. Represent the Number of Endangered Species of each U.S. National Park via Bar Chart

First, I wanted to shorten the [Park_Name] column values by removing everything from “National” onward (e.g., “National Park” or “National Park and Preserve”).

# Replace Park_Name with Park_Name_Shrt: the text that precedes "National"
# (e.g. "Denali National Park and Preserve" -> "Denali").
#   * sep also consumes the whitespace before "National", so the short names
#     no longer carry a trailing space;
#   * extra = "drop" discards the remainder of the name on purpose, which
#     silences the "Additional pieces discarded" warning raised for every row.
national_parks_shrt <- national_parks %>%
  separate(
    Park_Name,
    into = "Park_Name_Shrt",
    sep = "\\s*National",
    extra = "drop"
  )
## Warning: Expected 1 pieces. Additional pieces discarded in 119248 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Check the shortened park names, one row per park.
distinct(national_parks_shrt, Park_Name_Shrt)
##                   Park_Name_Shrt
## 1                        Denali 
## 2           Gates Of The Arctic 
## 3                   Glacier Bay 
## 4                        Katmai 
## 5                  Kenai Fjords 
## 6                  Kobuk Valley 
## 7                    Lake Clark 
## 8           Wrangell - St Elias 
## 9                   Hot Springs 
## 10                 Grand Canyon 
## 11             Petrified Forest 
## 12                      Saguaro 
## 13              Channel Islands 
## 14                  Joshua Tree 
## 15              Lassen Volcanic 
## 16                    Pinnacles 
## 17                      Redwood 
## 18     Sequoia and Kings Canyon 
## 19                     Yosemite 
## 20 Black Canyon of the Gunnison 
## 21             Great Sand Dunes 
## 22                   Mesa Verde 
## 23               Rocky Mountain 
## 24                     Biscayne 
## 25                 Dry Tortugas 
## 26                   Everglades 
## 27                    Haleakala 
## 28             Hawaii Volcanoes 
## 29                 Mammoth Cave 
## 30                       Acadia 
## 31                  Isle Royale 
## 32                    Voyageurs 
## 33                      Glacier 
## 34           Theodore Roosevelt 
## 35             Carlsbad Caverns 
## 36                  Great Basin 
## 37              Cuyahoga Valley 
## 38                  Crater Lake 
## 39                     Congaree 
## 40                     Badlands 
## 41                    Wind Cave 
## 42                     Big Bend 
## 43          Guadalupe Mountains 
## 44                       Arches 
## 45                 Bryce Canyon 
## 46                  Canyonlands 
## 47                 Capitol Reef 
## 48                         Zion 
## 49                   Shenandoah 
## 50                Mount Rainier 
## 51               North Cascades 
## 52                      Olympic 
## 53                  Grand Teton 
## 54                 Death Valley 
## 55        Great Smoky Mountains 
## 56                  Yellowstone
# First attempt: one vertical bar per park, counting its "Endangered" rows.
# NOTE(review): scale_y_discrete() is applied to the count axis produced by
# geom_bar(); this is presumably why the rendered y-axis shows no scale, as
# the text below observes. It is kept as-is because that text discusses it.
national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  ggplot(mapping = aes(x = Park_Name_Shrt, fill = Park_Name_Shrt)) +
  geom_bar(width = 0.9) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents the number of species that are endangered in each park. ",
    x = "U.S. National Parks",
    y = "Number of Endangered Species"
  )

Obviously, this graph is impossible to understand, mainly for two reasons: 1. Although I removed “National Park” from each [Park_Name] value, the park labels on the x-axis are still illegible. 2. The y-axis has no scale, so there is no way to tell how many endangered species each park has.

This was an attempt to fix both these issues!

# Second attempt: horizontal bars (parks on the y-axis) with smaller labels;
# the count axis keeps ggplot2's default scale.
national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  ggplot(mapping = aes(y = Park_Name_Shrt, fill = Park_Name_Shrt)) +
  geom_bar(width = 0.6) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 6)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents the number of species that are endangered in each park. ",
    x = "Number of Endangered Species",
    y = "U.S. National Parks"
  )

Still not happy haha. I guess there are too many National Parks, which makes the graph hard to interpret. I decided to filter the national parks, including only those with at least 10 endangered species.

# Endangered-species count per park, keeping only parks with 10 or more.
# count() already returns just the grouping column, the counted column and
# the tally, so no explicit select() is needed. The result stays grouped by
# Park_Name_Shrt.
endangered_park <- national_parks_shrt %>%
  filter(Conservation_Status == "Endangered") %>%
  group_by(Park_Name_Shrt) %>%
  count(Conservation_Status, name = "Num_Endangered_Species") %>%
  filter(Num_Endangered_Species >= 10)

# Horizontal column chart of the pre-computed counts.
endangered_park %>%
  ggplot(
    mapping = aes(
      y = Park_Name_Shrt,
      x = Num_Endangered_Species,
      fill = Park_Name_Shrt
    )
  ) +
  geom_col(width = 0.6) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 7.5)
  ) +
  labs(
    title = "Endangered Species",
    subtitle = "Data represents U.S. National Parks with the highest number of endangered species.",
    x = "Number of Endangered Species",
    y = "U.S. National Parks"
  )

16. This was random…just for fun

# Per-park share of each remaining conservation status, as stacked bars.
# The six statuses listed below are excluded. Conservation_Status has no
# missing values (see the skim summary), so the %in% test selects the same
# rows as the original chain of != comparisons.
national_parks_shrt %>%
  filter(
    !Conservation_Status %in% c(
      "Protected", "Resident", "Migratory", "Breeder",
      "Proposed Threatened", "Proposed Endangered"
    )
  ) %>%
  ggplot() +
  geom_bar(
    mapping = aes(y = Park_Name_Shrt, fill = Conservation_Status),
    width = 0.6,
    position = "fill"
  ) +
  # position = "fill" produces proportions between 0 and 1; print the ticks
  # as percentages so they agree with the axis title.
  scale_x_continuous(labels = function(p) paste0(100 * p, "%")) +
  theme(axis.text.y = element_text(size = 6)) +
  labs(
    title = "Breakdown of Species' Conservation Status",
    x = "Percentage of Conservation Status",
    y = "U.S. National Parks"
  )