The rendered R Markdown for this file can be viewed here.
The results of this data cleaning can be found on GitHub here. The original data source is a Wikipedia page here; this code was applied to the first table in the article.
library(rvest)
library(dplyr)
library(stringr)
# Download the article, select the first table under the main content node,
# and keep that table as a data frame.
url <- "https://en.wikipedia.org/wiki/Fastest_animals"
fast_animals <- html_table(
  html_nodes(read_html(url), xpath = '//*[@id="mw-content-text"]/table[1]'),
  fill = TRUE
)[[1]]
Check out the structure and start cleaning up the table
# Peek at the first six rows of the raw table.
head(fast_animals, n = 6L)
## Sl.no Animal Maximum Speed Class
## 1 1 Peregrine falcon 389 km/h (242 mph)[2][7] Bird
## 2 2 Golden eagle 240–320 km/h (150–200 mph) Bird
## 3 3 White-throated needletail swift 169 km/h (105 mph)[8][9][10] Bird
## 4 4 Eurasian hobby 160 km/h (100 mph)[11] Bird
## 5 5 Frigatebird 153 km/h (95 mph)[12] Bird
## 6 5b Rock dove (pigeon) 148 km/h (92 mph)[12] Bird
## Notes
## 1 The peregrine falcon is the fastest aerial animal, fastest animal in flight, fastest bird, and the overall fastest member of the animal kingdom. Though it is not fast enough in horizontal level flight, its hunting dive, the stoop, it soars to a great height, then dives steeply at speeds of over 200 mph.[2]
## 2
## 3
## 4 Can sometimes outfly the swift
## 5
## 6 Pigeons have been clocked flying 92.5 mph average speed on a 400 mile race.
# Inspect the column types of the raw table.
str(object = fast_animals)
## 'data.frame': 22 obs. of 5 variables:
## $ Sl.no : chr "1" "2" "3" "4" ...
## $ Animal : chr "Peregrine falcon" "Golden eagle" "White-throated needletail swift" "Eurasian hobby" ...
## $ Maximum Speed: chr "389 km/h (242 mph)[2][7]" "240–320 km/h (150–200 mph)" "169 km/h (105 mph)[8][9][10]" "160 km/h (100 mph)[11]" ...
## $ Class : chr "Bird" "Bird" "Bird" "Bird" ...
## $ Notes : chr "The peregrine falcon is the fastest aerial animal, fastest animal in flight, fastest bird, and the overall fastest member of th"| __truncated__ "" "" "Can sometimes outfly the swift" ...
# List the column names of the raw table.
names(fast_animals)
## [1] "Sl.no" "Animal" "Maximum Speed" "Class"
## [5] "Notes"
# Drop the columns we do not need. List-style `[` (no comma) always returns a
# data frame, so the result cannot collapse to a bare vector if only one
# column survives the filter.
drops <- c("Sl.no", "Notes")
fast_animals <- fast_animals[!(names(fast_animals) %in% drops)]
str(fast_animals)
## 'data.frame': 22 obs. of 3 variables:
## $ Animal : chr "Peregrine falcon" "Golden eagle" "White-throated needletail swift" "Eurasian hobby" ...
## $ Maximum Speed: chr "389 km/h (242 mph)[2][7]" "240–320 km/h (150–200 mph)" "169 km/h (105 mph)[8][9][10]" "160 km/h (100 mph)[11]" ...
## $ Class : chr "Bird" "Bird" "Bird" "Bird" ...
# Preview the table after dropping columns.
head(fast_animals, n = 6L)
## Animal Maximum Speed Class
## 1 Peregrine falcon 389 km/h (242 mph)[2][7] Bird
## 2 Golden eagle 240–320 km/h (150–200 mph) Bird
## 3 White-throated needletail swift 169 km/h (105 mph)[8][9][10] Bird
## 4 Eurasian hobby 160 km/h (100 mph)[11] Bird
## 5 Frigatebird 153 km/h (95 mph)[12] Bird
## 6 Rock dove (pigeon) 148 km/h (92 mph)[12] Bird
# Rename the columns to short snake_case names. The assignment is positional,
# so the table must hold exactly these three columns in this order.
names(fast_animals) <- c("animal", "max_speed", "class")
All we need is the mph data, so we’ll extract it from the parentheses in the speed column and overwrite that column with just this value.
# Remove every bracketed citation marker such as "[12]" from the animal column.
# "[^]]*" stops at the first closing bracket, so each marker is removed on its
# own: neither the text between two markers nor the character that follows a
# marker is deleted.
fast_animals$animal <- gsub("\\[[^]]*\\]", "", fast_animals$animal)
# Keep only the text inside the first "(...)" of each speed entry.
# regexpr() gives exactly one match position per row, so the result stays a
# plain character vector. Rows without parentheses become NA rather than the
# deparsed text that results from coercing a gregexpr() match list to
# character.
fast_animals$max_speed <- local({
  speed <- fast_animals$max_speed
  hit <- regexpr("\\(.*?\\)", speed)
  found <- !is.na(hit) & hit != -1
  inner <- rep(NA_character_, length(speed))
  inner[found] <- regmatches(speed, hit)
  # Strip the surrounding parentheses from the matched text.
  gsub("[\\(\\)]", "", inner)
})
# Strip the "mph" unit, then trim leading and trailing whitespace so that only
# the numeric text (a single value or a range) is left in the speed column.
trim <- function(x) {
  gsub("^\\s+|\\s+$", "", x)
}
fast_animals$max_speed <- trim(gsub("mph", "", fast_animals$max_speed))
# Convert the speed text to numbers. A range such as "150–200" is replaced by
# its midpoint, so range rows are handled wherever they appear in the table
# instead of being patched by hard-coded row numbers. Single values pass
# through unchanged; text that is not numeric becomes NA.
fast_animals$max_speed <- vapply(
  # Split on a hyphen, en dash or em dash, ignoring surrounding spaces.
  strsplit(fast_animals$max_speed, "\\s*[-\u2013\u2014]\\s*"),
  function(bounds) {
    if (length(bounds) == 0L) {
      return(NA_real_)
    }
    mean(as.numeric(bounds))
  },
  numeric(1)
)
fast_animals
## animal max_speed class
## 1 Peregrine falcon 242.00 Bird
## 2 Golden eagle 175.00 Bird
## 3 White-throated needletail swift 105.00 Bird
## 4 Eurasian hobby 100.00 Bird
## 5 Frigatebird 95.00 Bird
## 6 Rock dove (pigeon) 92.00 Bird
## 7 Spur-winged goose 88.00 Bird
## 8 Red-breasted merganser 81.00 Bird
## 9 Black marlin 80.00 Fish
## 10 Gyrfalcon 80.00 Bird
## 11 Grey-headed albatross 79.00 Bird
## 12 Cheetah 71.50 Mammal
## 13 Sailfish 67.85 Fish
## 14 Anna's hummingbird 61.06 Bird
## 15 Swordfish 60.00 Fish
## 16 Ostrich 60.00 Bird
## 17 Mexican free-tailed bat.(in flight) 60.00 Mammal
## 18 Pronghorn 55.00 Mammal
## 19 Springbok 55.00 Mammal
## 20 Blue wildebeest 50.00 Mammal
## 21 Lion 50.00 Mammal
## 22 Blackbuck 50.00 Mammal
Now I will add a column with the speed converted to yards per minute (1 mile = 1,760 yards; 1 hour = 60 minutes) and write the results to a CSV file.
# Add the speed in yards per minute (mph * 1760 / 60), rename the mph column,
# and put the columns in their final order.
fast_animals <- fast_animals %>%
  mutate(max_speed_ym = (max_speed * 1760) / 60) %>%
  rename(max_speed_mph = max_speed) %>%
  select(animal, class, max_speed_mph, max_speed_ym)
fast_animals
## animal class max_speed_mph max_speed_ym
## 1 Peregrine falcon Bird 242.00 7098.667
## 2 Golden eagle Bird 175.00 5133.333
## 3 White-throated needletail swift Bird 105.00 3080.000
## 4 Eurasian hobby Bird 100.00 2933.333
## 5 Frigatebird Bird 95.00 2786.667
## 6 Rock dove (pigeon) Bird 92.00 2698.667
## 7 Spur-winged goose Bird 88.00 2581.333
## 8 Red-breasted merganser Bird 81.00 2376.000
## 9 Black marlin Fish 80.00 2346.667
## 10 Gyrfalcon Bird 80.00 2346.667
## 11 Grey-headed albatross Bird 79.00 2317.333
## 12 Cheetah Mammal 71.50 2097.333
## 13 Sailfish Fish 67.85 1990.267
## 14 Anna's hummingbird Bird 61.06 1791.093
## 15 Swordfish Fish 60.00 1760.000
## 16 Ostrich Bird 60.00 1760.000
## 17 Mexican free-tailed bat.(in flight) Mammal 60.00 1760.000
## 18 Pronghorn Mammal 55.00 1613.333
## 19 Springbok Mammal 55.00 1613.333
## 20 Blue wildebeest Mammal 50.00 1466.667
## 21 Lion Mammal 50.00 1466.667
## 22 Blackbuck Mammal 50.00 1466.667
# Write the results to a CSV file in the working directory.
write.csv(x = fast_animals, file = "fast_animals.csv")