I go to the public library often to print or work in a quiet space without feeling the obligation of commerce. This is a simple demonstration of scraping useful information from a public website and then organizing it into a table in R.
“Having fun isn’t hard, when you’ve got a library card!”
-Arthur and Friends
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(hms)
##
## Attaching package: 'hms'
##
## The following object is masked from 'package:lubridate':
##
## hms
Scraping data from Multnomah County website:
# Download and parse the hours-and-locations page once; every later
# extraction step reads from this parsed document.
result <- read_html("https://multcolib.org/hours-and-locations")

# Library names and street addresses, each pulled by CSS class.
# html_elements() is the current rvest name for the superseded html_nodes().
lib_names <- result %>%
  html_elements(".p-name") %>%
  html_text()
lib_addresses <- result %>%
  html_elements(".p-street-address") %>%
  html_text()
Creating the table:
# Weekly hours: the text of each '.hours' element is one schedule string.
hours <- result %>%
  html_elements(".hours") %>%
  html_text()
# Split every schedule into pieces on the "\r\n" breaks in the page text
hours <- str_split(hours, "\r\n")
# Drop the first piece of each schedule (the "Weekly Hours" heading)
hours <- lapply(hours, function(x) x[-1])

# Take the number of schedules from the scraped data instead of hard-coding
# it, so the script keeps working when the site adds or removes a location.
n_libraries <- length(hours)

# Labels for the pieces that remain after the heading is dropped. The final
# NA labels the 8th piece (presumably the empty string left after the last
# line break -- confirm against the page); it is discarded below.
day_labels <- c(
  "monday", "tuesday", "wednesday", "thursday",
  "friday", "saturday", "sunday", NA
)
week_days <- day_labels[!is.na(day_labels)]

# Fail loudly if the page layout changed: names, addresses and hours are
# matched purely by position, so a length mismatch would misalign the rows.
stopifnot(
  n_libraries > 0,
  all(lengths(hours) == length(day_labels)),
  length(lib_names) == n_libraries,
  length(lib_addresses) == n_libraries
)

# One column per schedule, one row per piece; columns get temporary
# positional names "1", "2", ...
hours <- data.frame(hours)
colnames(hours) <- seq_len(n_libraries)

# Remove the day-of-week word, leaving only the time interval / "closed"
hours <- hours %>%
  mutate(across(everything(), ~ str_remove(.x, "\\w+day")))

# Long format: one row per (schedule index, day) pair
hours2 <- hours %>%
  mutate(day = day_labels) %>%
  pivot_longer(cols = -day, names_to = "key", values_to = "value") %>%
  select(key, value, day) %>%
  as.data.frame()

# Wide format: one row per schedule, one column per day of the week.
# Rows labelled NA are dropped, and the index becomes numeric so that
# arrange() sorts 1, 2, ..., 10 rather than "1", "10", "2".
hours3 <- hours2 %>%
  filter(!is.na(day)) %>%
  pivot_wider(names_from = day, values_from = value) %>%
  mutate(key = as.numeric(key)) %>%
  select(key, all_of(week_days)) %>%
  arrange(key) %>%
  as.data.frame()

# Bring in library names (matched by position on the page)
hours3$library <- lib_names
# Rename key as id
hours3 <- hours3 %>% rename(id = key)

ds <- hours3
# Add library addresses (matched by position on the page)
ds$address <- lib_addresses
ds <- ds[, c("id", "library", "address", week_days)]
ds
## id library address
## 1 1 Albina Library 216 NE Knott Street
## 2 2 Belmont Library 1038 SE César E. Chávez Boulevard
## 3 3 Capitol Hill Library 10723 SW Capitol Highway
## 4 4 Central Library 801 SW 10th Avenue
## 5 5 Fairview-Columbia Library 1520 NE Village Street
## 6 6 Gregory Heights Library 7921 NE Sandy Boulevard
## 7 7 Gresham Library 385 NW Miller Avenue
## 8 8 Hillsdale Library 1525 SW Sunset Boulevard
## 9 9 Holgate Library 7905 SE Holgate Boulevard
## 10 10 Hollywood Library 4040 NE Tillamook Street
## 11 11 Isom Operations Center 205 NE Russell Street
## 12 12 Kenton Library 8226 N Denver Avenue
## 13 13 Library Administration 919 NE 19th Avenue, Suite 250
## 14 14 Midland Library 805 SE 122nd Avenue
## 15 15 North Portland Library 512 N Killingsworth Street
## 16 16 Northwest Library 2300 NW Thurman Street
## 17 17 Rockwood Library 17917 SE Stark Street
## 18 18 Sellwood-Moreland Library 7860 SE 13th Avenue
## 19 19 St. Johns Library 7510 N Charleston Avenue
## 20 20 The Title Wave Used Bookstore 216 NE Knott Street
## 21 21 Troutdale Library 2451 SW Cherry Park Road
## 22 22 Woodstock Library 6008 SE 49th Avenue
## monday tuesday wednesday thursday
## 1 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 2 10 am - 6 pm 10 am - 8 pm 12 pm - 8 pm 12 pm - 8 pm
## 3 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 4 closed closed closed closed
## 5 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 6 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 7 10 am - 6 pm 10 am - 8 pm 12 pm - 8 pm 12 pm - 8 pm
## 8 10 am - 6 pm 10 am - 8 pm 12 pm - 8 pm 12 pm - 8 pm
## 9 closed closed closed closed
## 10 10 am - 6 pm 10 am - 8 pm 12 pm - 8 pm 12 pm - 8 pm
## 11 closed closed closed closed
## 12 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 13 9 am - 5 pm 9 am - 5 pm 9 am - 5 pm 9 am - 5 pm
## 14 closed closed closed closed
## 15 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 16 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 17 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 18 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 19 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 20 closed closed closed closed
## 21 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## 22 12 pm - 8 pm 12 pm - 8 pm 10 am - 6 pm 10 am - 6 pm
## friday saturday sunday
## 1 10 am - 6 pm closed closed
## 2 10 am - 6 pm 10 am - 6 pm 10 am - 5 pm
## 3 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 4 closed closed closed
## 5 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 6 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 7 10 am - 6 pm 10 am - 6 pm 10 am - 5 pm
## 8 10 am - 6 pm 10 am - 6 pm 10 am - 5 pm
## 9 closed closed closed
## 10 10 am - 6 pm 10 am - 6 pm 10 am - 5 pm
## 11 closed closed closed
## 12 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 13 9 am - 5 pm closed closed
## 14 closed closed closed
## 15 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 16 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 17 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 18 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 19 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 20 closed closed closed
## 21 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm
## 22 10 am - 6 pm 10 am - 6 pm 12 pm - 5 pm