[reference] https://www.r-bloggers.com/web-scraping-product-data-in-r-with-rvest-and-purrr/
This article comes from Joon Im, a student in Business Science University. Joon has completed both the 201 (Advanced Machine Learning with H2O) and 102 (Shiny Web Applications) courses. Joon shows off his progress in this Web Scraping Tutorial with rvest.
My Workflow
library(rvest) # HTML Hacking & Web Scraping
library(jsonlite) # JSON manipulation
library(tidyverse) # Data Manipulation
library(tidyquant) # ggplot2 theme
library(xopen) # Opens URL in Browser
library(knitr) # Pretty HTML Tables
# URL to View All Bikes
url <- "https://www.specialized.com/us/en/shop/bikes/c/bikes?q=%3Aprice-desc%3Aarchived%3Afalse&show=All"
# Read HTML from URL
html <- read_html(url)
html %>%
html_nodes(".product-list__item-wrapper")
## {xml_nodeset (403)}
## [1] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [2] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [3] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [4] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [5] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [6] <div class="product-list__item-wrapper" data-product-ic='{"name":"F ...
## [7] <div class="product-list__item-wrapper" data-product-ic='{"name":"F ...
## [8] <div class="product-list__item-wrapper" data-product-ic='{"name":"F ...
## [9] <div class="product-list__item-wrapper" data-product-ic='{"name":"R ...
## [10] <div class="product-list__item-wrapper" data-product-ic='{"name":"R ...
## [11] <div class="product-list__item-wrapper" data-product-ic='{"name":"R ...
## [12] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [13] <div class="product-list__item-wrapper" data-product-ic="{"nam ...
## [14] <div class="product-list__item-wrapper" data-product-ic="{"nam ...
## [15] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [16] <div class="product-list__item-wrapper" data-product-ic='{"name":"T ...
## [17] <div class="product-list__item-wrapper" data-product-ic='{"name":"T ...
## [18] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [19] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## [20] <div class="product-list__item-wrapper" data-product-ic='{"name":"S ...
## ...
# Store JSON as object
json <- html %>%
html_nodes(".product-list__item-wrapper") %>%
html_attr("data-product-ic") # F12 -> SelectorGadget
# Show the 1st JSON element (1st bike of 399 bikes)
json[1]
## [1] "{\"name\":\"S-Works Roubaix - SRAM Red eTap AXS\",\"id\":\"171042\",\"brand\":\"Specialized\",\"price\":11500,\"currencyCode\":\"USD\",\"position\":\"\",\"variant\":\"56\",\"dimension1\":\"Bikes\",\"dimension2\":\"Road\",\"dimension3\":\"Roubaix\",\"dimension4\":\"\",\"dimension5\":\"Performance Road\",\"dimension6\":\"S-Works\",\"dimension7\":\"\",\"dimension8\":\"Men/Women\"}"
# Make Function
from_json_to_tibble <- function(json) {
json %>%
fromJSON() %>%
as_tibble()
}
json[1] %>%
from_json_to_tibble()
## # A tibble: 1 x 15
## name id brand price currencyCode position variant dimension1
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 S-Wo~ 1710~ Spec~ 11500 USD "" 56 Bikes
## # ... with 7 more variables: dimension2 <chr>, dimension3 <chr>,
## # dimension4 <chr>, dimension5 <chr>, dimension6 <chr>,
## # dimension7 <chr>, dimension8 <chr>
# Iterate - All JSON objects ----
bike_data_list <- json %>%
map(safely(from_json_to_tibble))
# Inspect for Errors
error_tbl <- bike_data_list %>%
# Grab just the $error elements
map(~ pluck(., "error")) %>%
# Convert from list to tibble
enframe(name = "row") %>%
# Return TRUE if element has error
mutate(is_error = map(value, function(x) !is.null(x))) %>%
# Unnest nested list
unnest(is_error) %>%
# Filter where error == TRUE
filter(is_error)
error_tbl
## # A tibble: 2 x 3
## row value is_error
## <int> <list> <lgl>
## 1 218 <smplErrr> TRUE
## 2 286 <smplErrr> TRUE
We got two errors – Bike 222 and 288. We can use pluck() to grab the first error in the “value” column. It’s the result of an errant " symbol that represents inches.
## <simpleError: lexical error: invalid char in json text.
## osition":"","variant":"22.5" TT","dimension1":"Bikes","dimen
## (right here) ------^
## >
## # A tibble: 1 x 15
## name id brand price currencyCode position variant dimension1
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 Demo~ 1543~ Spec~ 2500 USD "" S4 Framesets
## # ... with 7 more variables: dimension2 <chr>, dimension3 <chr>,
## # dimension4 <chr>, dimension5 <chr>, dimension6 <chr>,
## # dimension7 <chr>, dimension8 <chr>
json[222] %>%
str_replace('\\"BMX / Dirt Jump\\"', 'BMX / Dirt Jump') %>%
str_replace('22.5\\" TT', '22.5 TT') %>%
from_json_to_tibble()
## # A tibble: 1 x 15
## name id brand price currencyCode position variant dimension1
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
## 1 Demo~ 1543~ Spec~ 2500 USD "" S4 Framesets
## # ... with 7 more variables: dimension2 <chr>, dimension3 <chr>,
## # dimension4 <chr>, dimension5 <chr>, dimension6 <chr>,
## # dimension7 <chr>, dimension8 <chr>
# Fix errors, re-run
bike_features_tbl <- json %>%
str_replace('\\"BMX / Dirt Jump\\"', 'BMX / Dirt Jump') %>%
str_replace('22.5\\" TT', '22.5 TT') %>%
map_dfr(from_json_to_tibble)
# Show first 6 rows
bike_features_tbl %>%
head() %>%
kable()
name | id | brand | price | currencyCode | position | variant | dimension1 | dimension2 | dimension3 | dimension4 | dimension5 | dimension6 | dimension7 | dimension8 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
S-Works Roubaix - SRAM Red eTap AXS | 171042 | Specialized | 11500 | USD | 56 | Bikes | Road | Roubaix | Performance Road | S-Works | Men/Women | |||
S-Works Roubaix - Shimano Dura-Ace Di2 | 170241 | Specialized | 11000 | USD | 52 | Bikes | Road | Roubaix | Performance Road | S-Works | Men/Women | |||
S-Works Epic AXS | 171229 | Specialized | 11020 | USD | S | Bikes | Mountain | Epic FSR/Epic | Cross Country | S-Works | Men/Women | |||
Stumpjumper EVO Comp Carbon 29 | 173494 | Specialized | 4520 | USD | S3 | Bikes | Mountain | Stumpjumper EVO | Trail | Men/Women | ||||
Stumpjumper EVO Comp Carbon 27.5 | 173495 | Specialized | 4520 | USD | S2 | Bikes | Mountain | Stumpjumper EVO | Trail | Men/Women | ||||
Fuse Expert 29 | 171068 | Specialized | 2150 | USD | XS | Bikes | Mountain | Fuse | Trail | Men/Women |
## Classes 'tbl_df', 'tbl' and 'data.frame': 403 obs. of 15 variables:
## $ name : chr "S-Works Roubaix - SRAM Red eTap AXS" "S-Works Roubaix -<U+00A0>Shimano Dura-Ace Di2" "S-Works Epic AXS" "Stumpjumper EVO Comp Carbon 29" ...
## $ id : chr "171042" "170241" "171229" "173494" ...
## $ brand : chr "Specialized" "Specialized" "Specialized" "Specialized" ...
## $ price : num 11500 11000 11020 4520 4520 ...
## $ currencyCode: chr "USD" "USD" "USD" "USD" ...
## $ position : chr "" "" "" "" ...
## $ variant : chr "56" "52" "S" "S3" ...
## $ dimension1 : chr "Bikes" "Bikes" "Bikes" "Bikes" ...
## $ dimension2 : chr "Road" "Road" "Mountain" "Mountain" ...
## $ dimension3 : chr "Roubaix" "Roubaix" "Epic FSR/Epic" "Stumpjumper EVO" ...
## $ dimension4 : chr "" "" "" "" ...
## $ dimension5 : chr "Performance Road" "Performance Road" "Cross Country" "Trail" ...
## $ dimension6 : chr "S-Works" "S-Works" "S-Works" "" ...
## $ dimension7 : chr "" "" "" "" ...
## $ dimension8 : chr "Men/Women" "Men/Women" "Men/Women" "Men/Women" ...
bike_features_tbl %>%
select(dimension3, price) %>%
mutate(dimension3 = as_factor(dimension3) %>%
fct_reorder(price, .fun = median)) %>%
# boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width, .desc = TRUE), data = iris)
# Plot
ggplot(aes(dimension3, price)) +
geom_boxplot() +
coord_flip() + # 꺼꾸로
theme_tq() + # 외곽선
scale_y_continuous(labels = scales::dollar_format()) + # $표시
labs(title = "Specialized Bike Models by Price") # 제목
bike_features_tbl %>%
select(name, price, dimension3) %>%
mutate(s_works = ifelse(str_detect(name, "S-Works"), "S-Works", "Not S-Works")) %>%
mutate(dimension3 = as_factor(dimension3) %>%
fct_reorder(price, .fun = median)) %>%
# Plot
ggplot(aes(dimension3, price, color = s_works)) +
geom_boxplot() +
coord_flip() +
facet_wrap(~ s_works, ncol = 1, scales = "free_y") +
theme_tq() +
scale_color_tq() +
scale_y_continuous(labels = scales::dollar_format()) +
labs(title = "S-Works Effect on Price by Model")