Download and parse https://en.wikipedia.org/wiki/Timeline_of_materials_technology with rvest:
library(rvest, quietly = TRUE)
# Pin a specific revision (oldid) of the article so the scrape is reproducible.
page_url <- "https://en.wikipedia.org/w/index.php?title=Timeline_of_materials_technology&oldid=688238016"
html_page <- read_html(page_url)

# Select every <li> of a <ul> that shares a parent with an <h2>, take its
# text, and replace en dashes with plain hyphens.
# (A `head(-4)` step that would drop the last four nodes is intentionally
# left disabled, as before.)
tech <- gsub(
  "–", "-",
  html_text(
    html_nodes(html_page, xpath = "//h2/../ul/li")
  )
)
Extract numeric year from description:
#' Extract an approximate numeric year from the start of a timeline entry
#'
#' Recognised leading date formats:
#' * a plain year ("1540 - ..."), negated when directly followed by "BC"
#'   ("1200 BC - ...");
#' * an ordinal millennium BC, century BC or century ("3rd millennium BC",
#'   "16th century BC", "4th century");
#' * a two-digit decade BC ("50s BC").
#'
#' @param x A single string (one timeline entry).
#' @return A numeric scalar; years BC are negative. `NA_real_` when `x` is
#'   empty, `NA`, or does not start with a recognised date.
extract_year <- function(x) {
  if (length(x) != 1L || is.na(x)) {
    return(NA_real_)
  }

  # 1. Simple year: the first (up to) four characters are all digits.
  leading <- substr(x, 1, 4)
  if (grepl("^[0-9]+$", leading)) {
    year <- as.numeric(leading)
    # "1200 BC" / "1200s BC" denote a year before the common era.
    if (grepl("^[0-9]{4}s? BC", x)) {
      return(-year)
    }
    return(year)
  }

  # 2. Ordinal millennium BC, century BC or century.
  #    The suffix is a real alternation (st|nd|rd|th); a character class such
  #    as [st|nd|rd|th] would accept any mix of those letters. The single "."
  #    accepts any separator character (space, non-breaking space, hyphen).
  ordinal <- regmatches(
    x,
    regexec(
      "^([0-9]+)(st|nd|rd|th).(millennium BC|century BC|century)",
      x
    )
  )[[1]]
  if (length(ordinal) > 0L) {
    i <- as.numeric(ordinal[2])
    return(switch(ordinal[4],
      # 0.1 of a millennium after its start, e.g. 3rd millennium BC -> -2900
      "millennium BC" = -(i - 0.1) * 1000,
      # middle of the century, e.g. 16th century BC -> -1550
      "century BC" = -(i - 0.5) * 100,
      "century" = (i - 0.5) * 100
    ))
  }

  # 3. Two-digit decade BC, e.g. "50s BC" -> -50
  if (grepl("^[0-9]{2}s BC", x)) {
    return(-as.numeric(substr(x, 1, 2)))
  }

  NA_real_
}
# Parse a year for every entry. vapply() always yields a numeric vector (even
# when `tech` is empty, where unlist(lapply()) would give NULL), and
# USE.NAMES = FALSE stops the entry texts from being attached as names.
years <- vapply(tech, extract_year, numeric(1), USE.NAMES = FALSE)
df1 <- data.frame(year = years, technology = tech, stringsAsFactors = FALSE)

# Keep only the entries for which a year could be parsed.
df1 <- df1[!is.na(df1$year), , drop = FALSE]

# Set factor levels in order of appearance rather than the default sorted
# order.
df1$technology <- factor(df1$technology, levels = unique(df1$technology))
Plot the timeline using ggplot2:
library(ggplot2)
# One point per entry: parsed year on the x axis, technology on the y axis,
# drawn with the character "o" as the marker.
ggplot(data = df1, mapping = aes(x = year, y = technology)) +
  geom_point(shape = "o", size = 3) +
  theme_minimal() +
  labs(title = "Timeline of materials technology")
Timeline of materials technology. (2015, October 30). In Wikipedia, the free encyclopedia. Retrieved from https://en.wikipedia.org/w/index.php?title=Timeline_of_materials_technology&oldid=688238016