The rendered R Markdown for this file can be viewed here.

The results of this data cleaning can be found on GitHub here. The original data source is the Wikipedia article "Fastest animals" (https://en.wikipedia.org/wiki/Fastest_animals); this code was applied to the first table in the article.

library(rvest)
library(dplyr)
library(stringr)
# Download the article and locate the first table inside its content area.
url <- "https://en.wikipedia.org/wiki/Fastest_animals"
page <- read_html(url)
table_nodes <- html_nodes(page, xpath = '//*[@id="mw-content-text"]/table[1]')
# html_table() yields a list of parsed tables; keep the first entry
scraped_tables <- html_table(table_nodes, fill = TRUE)
fast_animals <- scraped_tables[[1]]

Check out the structure and start cleaning up the table.

head(fast_animals)
##   Sl.no                          Animal                Maximum Speed Class
## 1     1                Peregrine falcon     389 km/h (242 mph)[2][7]  Bird
## 2     2                    Golden eagle   240–320 km/h (150–200 mph)  Bird
## 3     3 White-throated needletail swift 169 km/h (105 mph)[8][9][10]  Bird
## 4     4                  Eurasian hobby       160 km/h (100 mph)[11]  Bird
## 5     5                     Frigatebird        153 km/h (95 mph)[12]  Bird
## 6    5b              Rock dove (pigeon)        148 km/h (92 mph)[12]  Bird
##                                                                                                                                                                                                                                                                                                                 Notes
## 1 The peregrine falcon is the fastest aerial animal, fastest animal in flight, fastest bird, and the overall fastest member of the animal kingdom. Though it is not fast enough in horizontal level flight, its hunting dive, the stoop, it soars to a great height, then dives steeply at speeds of over 200 mph.[2]
## 2                                                                                                                                                                                                                                                                                                                    
## 3                                                                                                                                                                                                                                                                                                                    
## 4                                                                                                                                                                                                                                                                                      Can sometimes outfly the swift
## 5                                                                                                                                                                                                                                                                                                                    
## 6                                                                                                                                                                                                                                         Pigeons have been clocked flying 92.5 mph average speed on a 400 mile race.
str(fast_animals)
## 'data.frame':    22 obs. of  5 variables:
##  $ Sl.no        : chr  "1" "2" "3" "4" ...
##  $ Animal       : chr  "Peregrine falcon" "Golden eagle" "White-throated needletail swift" "Eurasian hobby" ...
##  $ Maximum Speed: chr  "389 km/h (242 mph)[2][7]" "240–320 km/h (150–200 mph)" "169 km/h (105 mph)[8][9][10]" "160 km/h (100 mph)[11]" ...
##  $ Class        : chr  "Bird" "Bird" "Bird" "Bird" ...
##  $ Notes        : chr  "The peregrine falcon is the fastest aerial animal, fastest animal in flight, fastest bird, and the overall fastest member of th"| __truncated__ "" "" "Can sometimes outfly the swift" ...
colnames(fast_animals)
## [1] "Sl.no"         "Animal"        "Maximum Speed" "Class"        
## [5] "Notes"
# Drop the columns we do not need. List-style `[` indexing (no comma) always
# returns a data frame, so the result cannot collapse to a bare vector even
# if only a single column were left after dropping.
drops <- c("Sl.no", "Notes")
fast_animals <- fast_animals[!(names(fast_animals) %in% drops)]
str(fast_animals)
## 'data.frame':    22 obs. of  3 variables:
##  $ Animal       : chr  "Peregrine falcon" "Golden eagle" "White-throated needletail swift" "Eurasian hobby" ...
##  $ Maximum Speed: chr  "389 km/h (242 mph)[2][7]" "240–320 km/h (150–200 mph)" "169 km/h (105 mph)[8][9][10]" "160 km/h (100 mph)[11]" ...
##  $ Class        : chr  "Bird" "Bird" "Bird" "Bird" ...
head(fast_animals)
##                            Animal                Maximum Speed Class
## 1                Peregrine falcon     389 km/h (242 mph)[2][7]  Bird
## 2                    Golden eagle   240–320 km/h (150–200 mph)  Bird
## 3 White-throated needletail swift 169 km/h (105 mph)[8][9][10]  Bird
## 4                  Eurasian hobby       160 km/h (100 mph)[11]  Bird
## 5                     Frigatebird        153 km/h (95 mph)[12]  Bird
## 6              Rock dove (pigeon)        148 km/h (92 mph)[12]  Bird
# Give the three remaining columns short snake_case names (assigned by position)
names(fast_animals) <- c("animal", "max_speed", "class")

All we need is the mph data, so we’ll extract it from the parentheses in the speed column and overwrite that column with just this value.

# Remove footnote markers such as "[12]" from the animal column.
# `[^]]*` confines each match to a single pair of brackets, so text between
# two separate markers and the character right after a marker are preserved.
fast_animals$animal <- gsub("\\[[^]]*\\]", "", fast_animals$animal)
# Keep only the text inside the first "(...)" of each speed entry.
# regexpr() returns exactly one match position per row (-1 when there is
# none), so the result stays a plain character vector; rows without a
# parenthesised value become NA instead of a coerced placeholder string.
speed_raw <- fast_animals$max_speed
paren_hit <- regexpr("\\(.*?\\)", speed_raw)
has_paren <- !is.na(paren_hit) & paren_hit != -1L
speed_in_parens <- rep(NA_character_, length(speed_raw))
# regmatches() with regexpr() data returns matches only for the matching rows
speed_in_parens[has_paren] <- regmatches(speed_raw, paren_hit)
fast_animals$max_speed <- gsub("[\\(\\)]", "", speed_in_parens)
# Strip the literal unit label "mph" from the speed column; the values stay
# character strings at this point (ranges are not resolved here)
fast_animals$max_speed <- gsub("mph", "", fast_animals$max_speed, fixed = TRUE)
# Helper: strip leading and trailing whitespace from each element of x
trim <- function(x) {
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(pattern = edge_whitespace, replacement = "", x = x)
}
fast_animals$max_speed <- trim(fast_animals$max_speed)
# Convert the speed strings to numbers. An entry is either a single value
# ("242") or a range written with a hyphen or en dash ("150–200"); a range is
# replaced by the midpoint of its bounds. Detecting ranges from the text,
# rather than patching fixed row numbers, keeps the result correct if rows
# are added, removed or reordered in the source table.
speed_bounds <- strsplit(fast_animals$max_speed, "[-\u2013]")
fast_animals$max_speed <- vapply(
  speed_bounds,
  function(bounds) {
    # an empty string splits into zero pieces; report it as missing
    if (length(bounds) == 0) {
      return(NA_real_)
    }
    # unparsable text becomes NA (with the usual coercion warning)
    mean(as.numeric(bounds))
  },
  numeric(1)
)
fast_animals
##                                 animal max_speed  class
## 1                     Peregrine falcon    242.00   Bird
## 2                         Golden eagle    175.00   Bird
## 3      White-throated needletail swift    105.00   Bird
## 4                       Eurasian hobby    100.00   Bird
## 5                          Frigatebird     95.00   Bird
## 6                   Rock dove (pigeon)     92.00   Bird
## 7                    Spur-winged goose     88.00   Bird
## 8               Red-breasted merganser     81.00   Bird
## 9                         Black marlin     80.00   Fish
## 10                           Gyrfalcon     80.00   Bird
## 11               Grey-headed albatross     79.00   Bird
## 12                             Cheetah     71.50 Mammal
## 13                            Sailfish     67.85   Fish
## 14                  Anna's hummingbird     61.06   Bird
## 15                           Swordfish     60.00   Fish
## 16                             Ostrich     60.00   Bird
## 17 Mexican free-tailed bat.(in flight)     60.00 Mammal
## 18                           Pronghorn     55.00 Mammal
## 19                           Springbok     55.00 Mammal
## 20                     Blue wildebeest     50.00 Mammal
## 21                                Lion     50.00 Mammal
## 22                           Blackbuck     50.00 Mammal

Now I will add a column with the speed converted to yards per minute and write the results to a CSV file.

# Add the speed in yards per minute (mph * 1760 / 60), then rename the mph
# column and put the columns in their final order in a single select()
fast_animals <- fast_animals %>%
  mutate(max_speed_ym = ((max_speed * 1760) / 60)) %>%
  select(animal, class, max_speed_mph = max_speed, max_speed_ym)
fast_animals
##                                 animal  class max_speed_mph max_speed_ym
## 1                     Peregrine falcon   Bird        242.00     7098.667
## 2                         Golden eagle   Bird        175.00     5133.333
## 3      White-throated needletail swift   Bird        105.00     3080.000
## 4                       Eurasian hobby   Bird        100.00     2933.333
## 5                          Frigatebird   Bird         95.00     2786.667
## 6                   Rock dove (pigeon)   Bird         92.00     2698.667
## 7                    Spur-winged goose   Bird         88.00     2581.333
## 8               Red-breasted merganser   Bird         81.00     2376.000
## 9                         Black marlin   Fish         80.00     2346.667
## 10                           Gyrfalcon   Bird         80.00     2346.667
## 11               Grey-headed albatross   Bird         79.00     2317.333
## 12                             Cheetah Mammal         71.50     2097.333
## 13                            Sailfish   Fish         67.85     1990.267
## 14                  Anna's hummingbird   Bird         61.06     1791.093
## 15                           Swordfish   Fish         60.00     1760.000
## 16                             Ostrich   Bird         60.00     1760.000
## 17 Mexican free-tailed bat.(in flight) Mammal         60.00     1760.000
## 18                           Pronghorn Mammal         55.00     1613.333
## 19                           Springbok Mammal         55.00     1613.333
## 20                     Blue wildebeest Mammal         50.00     1466.667
## 21                                Lion Mammal         50.00     1466.667
## 22                           Blackbuck Mammal         50.00     1466.667
# Write the results to a CSV file in the working directory.
# NOTE(review): write.csv() also writes the row names as an unnamed first
# column by default; pass row.names = FALSE if that column is not wanted.
write.csv(x = fast_animals, file = "fast_animals.csv")