library(rvest)
library(dplyr)
library(stringr)
library(readr)
Web Table Data Scraping
Intro
This post aims to demonstrate how to scrape web data as tables. It is recommended to use the Google Chrome browser because it centers at the table that I want to get data when I inspect it. This makes it easier to identify the table’s name and class.
Data
I will get table data ‘Database management system’ from the website below.
Scrap table data
# Read the table from Wikipedia page
<- "https://en.wikipedia.org/wiki/Table_(database)"
url <- read_html(url)
html
# Extract the first table
<- html |>
webtable html_element("table") |>
html_table()
Check the table
head(webtable, 7)
# A tibble: 7 × 2
.mw-parser-output .navbar{display:inline;font-size:88…¹ .mw-parser-output .n…²
<chr> <chr>
1 "Types" "Object-oriented\ncom…
2 "Concepts" "Database\nACID\nArms…
3 "Objects" "Relation\ntable\ncol…
4 "Components" "Concurrency control\…
5 "Functions" "Administration\nQuer…
6 "Related topics" "Database models\nDat…
7 "Category\n Outline" "Category\n Outline"
# ℹ abbreviated names:
# ¹`.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}vteDatabase management systems`,
# ²`.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}vteDatabase management systems`
Rename the columns
# Rename columns as "Attributes" and "Details"
colnames(webtable) <- c("Attributes", "Details")
head(webtable, 7)
# A tibble: 7 × 2
Attributes Details
<chr> <chr>
1 "Types" "Object-oriented\ncomparison\nRelational\nlist\ncomparis…
2 "Concepts" "Database\nACID\nArmstrong's axioms\nCodd's 12 rules\nCA…
3 "Objects" "Relation\ntable\ncolumn\nrow\nView\nTransaction\nTransa…
4 "Components" "Concurrency control\nData dictionary\nJDBC\nXQJ\nODBC\n…
5 "Functions" "Administration\nQuery optimization\nReplication\nShardi…
6 "Related topics" "Database models\nDatabase normalization\nDatabase stora…
7 "Category\n Outline" "Category\n Outline"
Replace with “,” and delete the last row.
<- webtable[1:6,] |> mutate(Details = str_replace_all(Details, "(\n)([a-z])", " \\2"), # Replace \n before lowercase with " "
webtable Details = str_replace_all(Details, "(\n)([A-Z])", ", \\2")) # Replace \n before uppercase with ", "
head(webtable, 7)
# A tibble: 6 × 2
Attributes Details
<chr> <chr>
1 Types Object-oriented comparison, Relational list comparison, Key–va…
2 Concepts Database, ACID, Armstrong's axioms, Codd's 12 rules, CAP theor…
3 Objects Relation table column row, View, Transaction, Transaction log,…
4 Components Concurrency control, Data dictionary, JDBC, XQJ, ODBC, Query l…
5 Functions Administration, Query optimization, Replication, Sharding
6 Related topics Database models, Database normalization, Database storage, Dis…
Save the table to local direstory
# Save the table as a CSV file
write.csv(webtable, file = "/Users/nnthieu/web_scraping.csv", row.names = FALSE)
# Print message confirming save location
cat("Table saved successfully at:", normalizePath("/Users/nnthieu/web_scraping.csv"), "\n")
Table saved successfully at: /Users/nnthieu/web_scraping.csv
Results
# Read the CSV file
<- read_csv("/Users/nnthieu/web_scraping.csv")
webtable
webtable
# A tibble: 6 × 2
Attributes Details
<chr> <chr>
1 Types Object-oriented comparison, Relational list comparison, Key–va…
2 Concepts Database, ACID, Armstrong's axioms, Codd's 12 rules, CAP theor…
3 Objects Relation table column row, View, Transaction, Transaction log,…
4 Components Concurrency control, Data dictionary, JDBC, XQJ, ODBC, Query l…
5 Functions Administration, Query optimization, Replication, Sharding
6 Related topics Database models, Database normalization, Database storage, Dis…
That is what i want.