The RPubs version of the markdown file is available here
The resulting CSV file can be found here
library(rvest)
library(stringr)
library(knitr)
This script scrapes a table from a web page by selecting it with the table’s XPath, copied from the page source. I obtained the XPath using Chrome’s developer tools.
# Scrape the first table inside the article body of the Wikipedia page.
url <- "https://en.wikipedia.org/wiki/List_of_birds_by_flight_speed"
# read_html() is the supported replacement for html(), which rvest deprecated.
bird_speeds <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/table[1]') %>%
  html_table()
# The result is indexed with [[1]] below; fail with a clear message rather
# than "subscript out of bounds" when the XPath matched nothing.
if (length(bird_speeds) == 0) {
  stop(
    "No table matched the XPath; the page layout may have changed.",
    call. = FALSE
  )
}
bird_speeds <- bird_speeds[[1]]
colnames(bird_speeds)
## [1] "Bird" "Image"
## [3] "Species" "Family"
## [5] "Average horizontal speed" "Maximum horizontal speed"
## [7] "Average diving speed" "Maximum airspeed"
## [9] "Flight"
# Replace the verbose scraped headers with short snake_case names, listed in
# the same left-to-right order as the scraped table.
names(bird_speeds) <- c(
  "bird", "image", "species", "family",
  "avg_h_speed", "max_h_speed", "avg_d_speed", "max_airspeed",
  "flight"
)
head(bird_speeds)
## bird image species family
## 1 Peregrine falcon NA Falco peregrinus Falconidae
## 2 Golden eagle NA Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross NA Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon NA Falco rusticolus Falconidae
## 5 White-throated needletail NA Hirundapus caudacutus Apodidae
## 6 Swift NA Apus apus Apodidae [14]
## avg_h_speed max_h_speed
## 1 65–90 km/h[3]\n40-56 mph 105–115 km/h[3]\n65-71 mph
## 2 45–51 km/h[6]\n28-32 mph 129 km/h[6]\n80 mph
## 3 127 km/h[8][9][note 1]\n78.9 mph
## 4 80-110 km/h 50-68 mph 145 km/h 90 mph[10]
## 5 169 km/h[13]\n[note 2]\n105 mph
## 6
## avg_d_speed max_airspeed
## 1 180 km/h [4]\n112 mph 389 km/h [5]\n242 mph
## 2 241 km/h [7]\n150 mph 320 km/h[6]\n200 mph
## 3
## 4 187–209 km/h [11]\n116-130 mph 209 km/h [11][12]\n130 mph
## 5
## 6 171 km/h[citation needed]\n106 mph
## flight
## 1 High speed dive - pointed long wings
## 2
## 3 2.2m (7'2") wingspan allows for high power use from wind.
## 4 High speed dive - pointed long wings
## 5 High speed wings
## 6 High speed wings
# Show the first row on its own to see the raw cell contents
# (the table has several columns, so the result is still a data frame).
bird_speeds[1, , drop = FALSE]
## bird image species family
## 1 Peregrine falcon NA Falco peregrinus Falconidae
## avg_h_speed max_h_speed
## 1 65–90 km/h[3]\n40-56 mph 105–115 km/h[3]\n65-71 mph
## avg_d_speed max_airspeed
## 1 180 km/h [4]\n112 mph 389 km/h [5]\n242 mph
## flight
## 1 High speed dive - pointed long wings
To clean up the speed columns, I’ll remove the citation tags in square brackets using gsub.
# Strip citation markers such as "[3]" or "[note 1]" from the speed columns.
# "[^]]+" cannot cross a closing bracket, so each marker is removed on its
# own. The previous greedy ".+" ran from the first "[" to the last "]" in a
# cell and would also delete any text sitting between two markers.
citation_pattern <- "\\[[^]]+\\]"
speed_cols <- c("avg_h_speed", "max_h_speed", "avg_d_speed", "max_airspeed")
for (speed_col in speed_cols) {
  bird_speeds[[speed_col]] <- gsub(
    citation_pattern, "", bird_speeds[[speed_col]]
  )
}
head(bird_speeds)
## bird image species family
## 1 Peregrine falcon NA Falco peregrinus Falconidae
## 2 Golden eagle NA Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross NA Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon NA Falco rusticolus Falconidae
## 5 White-throated needletail NA Hirundapus caudacutus Apodidae
## 6 Swift NA Apus apus Apodidae [14]
## avg_h_speed max_h_speed avg_d_speed
## 1 65–90 km/h\n40-56 mph 105–115 km/h\n65-71 mph 180 km/h \n112 mph
## 2 45–51 km/h\n28-32 mph 129 km/h\n80 mph 241 km/h \n150 mph
## 3 127 km/h\n78.9 mph
## 4 80-110 km/h 50-68 mph 145 km/h 90 mph 187–209 km/h \n116-130 mph
## 5 169 km/h\n105 mph
## 6
## max_airspeed
## 1 389 km/h \n242 mph
## 2 320 km/h\n200 mph
## 3
## 4 209 km/h \n130 mph
## 5
## 6 171 km/h\n106 mph
## flight
## 1 High speed dive - pointed long wings
## 2
## 3 2.2m (7'2") wingspan allows for high power use from wind.
## 4 High speed dive - pointed long wings
## 5 High speed wings
## 6 High speed wings
I will now also split each speed column into separate mph and km/h columns, so that the units are no longer embedded in the data.
# average horizontal speed
# Split each cell once at the "km/h" unit marker: the piece before it is the
# km/h figure and the piece after it is the mph figure. Empty cells yield NA
# for both. vapply() returns plain character vectors; the previous lapply()
# stored list columns, which cannot be written to a CSV file as-is.
avg_h_parts <- strsplit(as.character(bird_speeds$avg_h_speed), "km/h")
bird_speeds$avg_h_speed_mph <- vapply(
  avg_h_parts, function(p) p[2], character(1)
)
bird_speeds$avg_h_speed_km <- vapply(
  avg_h_parts, function(p) p[1], character(1)
)
head(bird_speeds)
## bird image species family
## 1 Peregrine falcon NA Falco peregrinus Falconidae
## 2 Golden eagle NA Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross NA Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon NA Falco rusticolus Falconidae
## 5 White-throated needletail NA Hirundapus caudacutus Apodidae
## 6 Swift NA Apus apus Apodidae [14]
## avg_h_speed max_h_speed avg_d_speed
## 1 65–90 km/h\n40-56 mph 105–115 km/h\n65-71 mph 180 km/h \n112 mph
## 2 45–51 km/h\n28-32 mph 129 km/h\n80 mph 241 km/h \n150 mph
## 3 127 km/h\n78.9 mph
## 4 80-110 km/h 50-68 mph 145 km/h 90 mph 187–209 km/h \n116-130 mph
## 5 169 km/h\n105 mph
## 6
## max_airspeed
## 1 389 km/h \n242 mph
## 2 320 km/h\n200 mph
## 3
## 4 209 km/h \n130 mph
## 5
## 6 171 km/h\n106 mph
## flight
## 1 High speed dive - pointed long wings
## 2
## 3 2.2m (7'2") wingspan allows for high power use from wind.
## 4 High speed dive - pointed long wings
## 5 High speed wings
## 6 High speed wings
## avg_h_speed_mph avg_h_speed_km
## 1 \n40-56 mph 65–90
## 2 \n28-32 mph 45–51
## 3 NA NA
## 4 50-68 mph 80-110
## 5 NA NA
## 6 NA NA
# Drop the line breaks first, then the "mph" unit label, from the mph figures
avg_h_mph <- gsub("\n", "", bird_speeds$avg_h_speed_mph)
bird_speeds$avg_h_speed_mph <- gsub("mph", "", avg_h_mph)
# Keep only the columns still needed; this leaves out the raw avg_h_speed
# column together with image and flight
kept_columns <- c(
  "bird", "species", "family",
  "max_h_speed", "avg_d_speed", "max_airspeed",
  "avg_h_speed_mph", "avg_h_speed_km"
)
bird_speeds <- bird_speeds[, kept_columns, drop = FALSE]
# Check the structure
head(bird_speeds)
## bird species family
## 1 Peregrine falcon Falco peregrinus Falconidae
## 2 Golden eagle Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon Falco rusticolus Falconidae
## 5 White-throated needletail Hirundapus caudacutus Apodidae
## 6 Swift Apus apus Apodidae [14]
## max_h_speed avg_d_speed max_airspeed
## 1 105–115 km/h\n65-71 mph 180 km/h \n112 mph 389 km/h \n242 mph
## 2 129 km/h\n80 mph 241 km/h \n150 mph 320 km/h\n200 mph
## 3 127 km/h\n78.9 mph
## 4 145 km/h 90 mph 187–209 km/h \n116-130 mph 209 km/h \n130 mph
## 5 169 km/h\n105 mph
## 6 171 km/h\n106 mph
## avg_h_speed_mph avg_h_speed_km
## 1 40-56 65–90
## 2 28-32 45–51
## 3 <NA> NA
## 4 50-68 80-110
## 5 <NA> NA
## 6 <NA> NA
Perform these tasks again with other speed measure columns
# Split one speed column at the "km/h" unit marker. The text before the marker
# is the km/h figure; the text after it, minus line breaks and the "mph"
# label, is the mph figure. Returns a list with character vectors `mph` and
# `km`, each the same length as `x`. vapply() is used instead of lapply() so
# the results are plain character vectors rather than list columns, which
# cannot be written to a CSV file as-is.
split_speed <- function(x) {
  parts <- strsplit(as.character(x), "km/h")
  km <- vapply(parts, function(p) p[1], character(1))
  mph <- vapply(parts, function(p) p[2], character(1))
  # Remove the units and the new line characters
  mph <- gsub("mph", "", gsub("\n", "", mph))
  list(mph = mph, km = km)
}

# max horizontal speed, average dive speed, max airspeed
for (speed_col in c("max_h_speed", "avg_d_speed", "max_airspeed")) {
  halves <- split_speed(bird_speeds[[speed_col]])
  bird_speeds[[paste0(speed_col, "_mph")]] <- halves$mph
  bird_speeds[[paste0(speed_col, "_km")]] <- halves$km
}
head(bird_speeds)
## bird species family
## 1 Peregrine falcon Falco peregrinus Falconidae
## 2 Golden eagle Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon Falco rusticolus Falconidae
## 5 White-throated needletail Hirundapus caudacutus Apodidae
## 6 Swift Apus apus Apodidae [14]
## max_h_speed avg_d_speed max_airspeed
## 1 105–115 km/h\n65-71 mph 180 km/h \n112 mph 389 km/h \n242 mph
## 2 129 km/h\n80 mph 241 km/h \n150 mph 320 km/h\n200 mph
## 3 127 km/h\n78.9 mph
## 4 145 km/h 90 mph 187–209 km/h \n116-130 mph 209 km/h \n130 mph
## 5 169 km/h\n105 mph
## 6 171 km/h\n106 mph
## avg_h_speed_mph avg_h_speed_km max_h_speed_mph max_h_speed_km
## 1 40-56 65–90 65-71 105–115
## 2 28-32 45–51 80 129
## 3 <NA> NA 78.9 127
## 4 50-68 80-110 90 145
## 5 <NA> NA 105 169
## 6 <NA> NA <NA> NA
## avg_d_speed_mph avg_d_speed_km max_airspeed_mph max_airspeed_km
## 1 112 180 242 389
## 2 150 241 200 320
## 3 <NA> NA <NA> NA
## 4 116-130 187–209 130 209
## 5 <NA> NA <NA> NA
## 6 <NA> NA 106 171
# Keep the identifying columns plus the split mph / km columns, dropping the
# raw combined speed columns
final_columns <- c(
  "bird", "species", "family",
  "avg_h_speed_mph", "avg_h_speed_km",
  "max_h_speed_mph", "max_h_speed_km",
  "avg_d_speed_mph", "avg_d_speed_km",
  "max_airspeed_mph", "max_airspeed_km"
)
bird_speeds <- bird_speeds[, final_columns, drop = FALSE]
head(bird_speeds)
## bird species family
## 1 Peregrine falcon Falco peregrinus Falconidae
## 2 Golden eagle Aquila chrysaetos Accipitridae
## 3 Grey-headed albatross Thalassarche Chrysostoma Diomedeidae
## 4 Gyrfalcon Falco rusticolus Falconidae
## 5 White-throated needletail Hirundapus caudacutus Apodidae
## 6 Swift Apus apus Apodidae [14]
## avg_h_speed_mph avg_h_speed_km max_h_speed_mph max_h_speed_km
## 1 40-56 65–90 65-71 105–115
## 2 28-32 45–51 80 129
## 3 <NA> NA 78.9 127
## 4 50-68 80-110 90 145
## 5 <NA> NA 105 169
## 6 <NA> NA <NA> NA
## avg_d_speed_mph avg_d_speed_km max_airspeed_mph max_airspeed_km
## 1 112 180 242 389
## 2 150 241 200 320
## 3 <NA> NA <NA> NA
## 4 116-130 187–209 130 209
## 5 <NA> NA <NA> NA
## 6 <NA> NA 106 171
There also appear to be citation brackets in the family column, so I’ll remove them.
# Remove bracketed citation markers such as "[14]" from the family names.
# "[^]]+" cannot cross a closing bracket, so each marker is matched on its own
# and text between two markers in the same cell is kept (a greedy ".+" would
# delete it).
bird_speeds$family <- gsub("\\[[^]]+\\]", "", bird_speeds$family)
I’ll also trim up the whitespace
# Remove leading and trailing whitespace from each element of x.
trim <- function(x) {
  without_leading <- sub("^\\s+", "", x)
  sub("\\s+$", "", without_leading)
}
# Trim surrounding whitespace in every column of the cleaned table
columns_to_trim <- c(
  "bird", "species", "family",
  "avg_h_speed_mph", "avg_h_speed_km",
  "max_h_speed_mph", "max_h_speed_km",
  "avg_d_speed_mph", "avg_d_speed_km",
  "max_airspeed_mph", "max_airspeed_km"
)
for (column_name in columns_to_trim) {
  bird_speeds[[column_name]] <- trim(bird_speeds[[column_name]])
}
Now I’ll show the resulting dataframe and write the results to a CSV
# Render the first six rows of the cleaned table as a markdown table
kable(head(bird_speeds, n = 6L), format = "markdown")
| bird | species | family | avg_h_speed_mph | avg_h_speed_km | max_h_speed_mph | max_h_speed_km | avg_d_speed_mph | avg_d_speed_km | max_airspeed_mph | max_airspeed_km |
|---|---|---|---|---|---|---|---|---|---|---|
| Peregrine falcon | Falco peregrinus | Falconidae | 40-56 | 65–90 | 65-71 | 105–115 | 112 | 180 | 242 | 389 |
| Golden eagle | Aquila chrysaetos | Accipitridae | 28-32 | 45–51 | 80 | 129 | 150 | 241 | 200 | 320 |
| Grey-headed albatross | Thalassarche Chrysostoma | Diomedeidae | NA | NA | 78.9 | 127 | NA | NA | NA | NA |
| Gyrfalcon | Falco rusticolus | Falconidae | 50-68 | 80-110 | 90 | 145 | 116-130 | 187–209 | 130 | 209 |
| White-throated needletail | Hirundapus caudacutus | Apodidae | NA | NA | 105 | 169 | NA | NA | NA | NA |
| Swift | Apus apus | Apodidae | NA | NA | NA | NA | NA | NA | 106 | 171 |
# Coerce every column to character before exporting. Some columns had become
# type "list", which raised a "list" encode error because the data frame was
# no longer a plain 2-dimensional table that write.csv() could export.
character_columns <- lapply(bird_speeds, as.character)
bird_speeds <- data.frame(character_columns, stringsAsFactors = FALSE)
write.csv(bird_speeds, file = "bird_speed_table.csv")