Assignment 8 110

Author

Javier Mantilla

Webscraping Tutorial with books.toscrape.com

Load the Libraries

library(rvest)
library(tidyverse)

Warning: package 'ggplot2' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Step 1: Load the data and fetch the webpage

# Root URL of the site; the page is downloaded and parsed once, then reused.
url <- "https://books.toscrape.com"
webpage <- url |>
  read_html()

Step 2: Inspect the selectors

Step 3: Extract all the fields (columns of information)

# Select every node matching "article.product_pod" in the parsed page.
books <- html_elements(webpage, "article.product_pod")

How many books are on this page?

# Number of "article.product_pod" nodes that were matched on this page
length(books)

[1] 20

20 books are on this page.

Book Titles

# Read the "title" attribute of the link inside each book's <h3>.
titles <- html_attr(html_element(books, "h3 > a"), "title")
head(titles)

[1] "A Light in the Attic"                 
[2] "Tipping the Velvet"                   
[3] "Soumission"                           
[4] "Sharp Objects"                        
[5] "Sapiens: A Brief History of Humankind"
[6] "The Requiem Red"

Star Ratings

# Full class attribute of each book's "p.star-rating" element; the rating
# word is part of this string and is isolated in the next step.
ratings_raw <- html_attr(html_element(books, "p.star-rating"), "class")

Remove the “star-rating” prefix so that only the rating word (for example, “Three”) remains

# Drop the literal "star-rating " text, leaving only the rating word.
ratings <- ratings_raw |>
  gsub(pattern = "star-rating ", replacement = "", x = _)
head(ratings)

[1] "Three" "One"   "One"   "Four"  "Five"  "One"

Prices

# Displayed price text of each book (still a string at this point).
prices <- html_text2(html_element(books, "p.price_color"))
head(prices)

[1] "£51.77" "£53.74" "£50.10" "£47.82" "£54.23" "£22.65"

Clean the prices

# Strip the pound sign, then convert the remaining text to numbers.
prices <- prices |>
  gsub(pattern = "£", replacement = "", x = _) |>
  as.numeric()
head(prices)

[1] 51.77 53.74 50.10 47.82 54.23 22.65

Stock Status

# Availability text of each book with surrounding whitespace removed.
stock <- trimws(html_text2(html_element(books, "p.availability")))
head(stock)

[1] "In stock" "In stock" "In stock" "In stock" "In stock" "In stock"

Book Images

# Build absolute thumbnail URLs by resolving each relative `src` against the
# URL of the page it was scraped from (`url`).
#
# The earlier gsub("../../", ...) used "." as a regular-expression wildcard,
# so the pattern matched arbitrary text inside the path (e.g. "he/2c/") and
# spliced the site URL into the middle of it ("media/cachttps://...").
# xml2 is installed as a dependency of rvest, so it is called by namespace
# here without an extra library() call.
images <- books |>
  html_element("img.thumbnail") |>
  html_attr("src") |>
  xml2::url_absolute(base = url)
head(images)

[1] "media/cachttps://books.toscrape.com/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"
[2] "media/cachttps://books.toscrape.com/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg"
[3] "media/cachttps://books.toscrape.com/ef/3eef99c9d9adef34639f510662022830.jpg"
[4] "media/cachttps://books.toscrape.com/51/3251cf3a3412f53f339e42cac2134093.jpg"
[5] "media/cachttps://books.toscrape.com/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg"
[6] "media/cachttps://books.toscrape.com/33/68339b4c9bc034267e1da611ab3b34f8.jpg"

Step 5: Build a Clean Data frame

# Assemble the single-page vectors into one tibble, one row per book.
test_df <- as_tibble(list(
  title = titles,
  rating = ratings,
  price = prices,
  stock = stock,
  image = images
))
head(test_df)

# A tibble: 6 × 5
  title                                 rating price stock    image             
  <chr>                                 <chr>  <dbl> <chr>    <chr>             
1 A Light in the Attic                  Three   51.8 In stock media/cachttps://…
2 Tipping the Velvet                    One     53.7 In stock media/cachttps://…
3 Soumission                            One     50.1 In stock media/cachttps://…
4 Sharp Objects                         Four    47.8 In stock media/cachttps://…
5 Sapiens: A Brief History of Humankind Five    54.2 In stock media/cachttps://…
6 The Requiem Red                       One     22.6 In stock media/cachttps://…

Step 6: Paginate

# Scrape every catalogue page, storing one tibble per page in `all_books`.
n_pages <- 50

# Pre-sized list to collect each page's data (avoids regrowing it per page)
all_books <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  # Page 1 uses the root URL; pages 2+ use /catalogue/page-N.html
  if (i == 1) {
    url <- "https://books.toscrape.com"
  } else {
    url <- paste0("https://books.toscrape.com/catalogue/page-", i, ".html")
  }

  page <- read_html(url)
  books <- html_elements(page, "article.product_pod")

  all_books[[i]] <- tibble(
    title = html_attr(html_element(books, "h3 > a"), "title"),
    rating = gsub(
      "star-rating ", "",
      html_attr(html_element(books, "p.star-rating"), "class")
    ),
    # Price is kept as the scraped text here (currency symbol included)
    price = html_text2(html_element(books, "p.price_color")),
    stock = trimws(html_text2(html_element(books, "p.availability"))),
    # Resolve the relative `src` against this page's own URL. The earlier
    # gsub("../../", ...) treated "." as a regex wildcard and corrupted the
    # path ("media/cachttps://...").
    image = xml2::url_absolute(
      html_attr(html_element(books, "img.thumbnail"), "src"),
      url
    )
  )

  # Be polite — pause briefly between requests
  Sys.sleep(0.5)
  cat("Scraped page", i, "\n")
}

Scraped page 1 
Scraped page 2 
Scraped page 3 
Scraped page 4 
Scraped page 5 
Scraped page 6 
Scraped page 7 
Scraped page 8 
Scraped page 9 
Scraped page 10 
Scraped page 11 
Scraped page 12 
Scraped page 13 
Scraped page 14 
Scraped page 15 
Scraped page 16 
Scraped page 17 
Scraped page 18 
Scraped page 19 
Scraped page 20 
Scraped page 21 
Scraped page 22 
Scraped page 23 
Scraped page 24 
Scraped page 25 
Scraped page 26 
Scraped page 27 
Scraped page 28 
Scraped page 29 
Scraped page 30 
Scraped page 31 
Scraped page 32 
Scraped page 33 
Scraped page 34 
Scraped page 35 
Scraped page 36 
Scraped page 37 
Scraped page 38 
Scraped page 39 
Scraped page 40 
Scraped page 41 
Scraped page 42 
Scraped page 43 
Scraped page 44 
Scraped page 45 
Scraped page 46 
Scraped page 47 
Scraped page 48 
Scraped page 49 
Scraped page 50

Combine all pages into one data frame

# Stack the per-page tibbles into a single data frame.
full_df <- all_books |>
  bind_rows()
nrow(full_df) # Should be 1000

[1] 1000

Step 7: Optional Export

# Save the combined data as CSV in the course folder.
# The destination is given as a full path instead of calling setwd(), so the
# session's working directory is not changed as a side effect of exporting.
output_dir <- "~/Documents/Data 110"
write.csv(
  full_df,
  file.path(output_dir, "bookstoscrape.csv"),
  row.names = FALSE
)

Now Explore the Data

# Replace the price text with its numeric value (pound sign removed).
full_df <- full_df |>
  mutate(price = as.numeric(gsub("£", "", price)))
head(full_df)

# A tibble: 6 × 5
  title                                 rating price stock    image             
  <chr>                                 <chr>  <dbl> <chr>    <chr>             
1 A Light in the Attic                  Three   51.8 In stock media/cachttps://…
2 Tipping the Velvet                    One     53.7 In stock media/cachttps://…
3 Soumission                            One     50.1 In stock media/cachttps://…
4 Sharp Objects                         Four    47.8 In stock media/cachttps://…
5 Sapiens: A Brief History of Humankind Five    54.2 In stock media/cachttps://…
6 The Requiem Red                       One     22.6 In stock media/cachttps://…

Convert Rating Words to Numbers

# Lookup table from the rating word to its numeric value
rating_map <- c(One = 1, Two = 2, Three = 3, Four = 4, Five = 5)

# Indexing a named vector by name returns a named result; unname() keeps the
# new column a plain numeric vector instead of one carrying the rating words
# as element names.
full_df1 <- full_df |>
  mutate(rating_num = unname(rating_map[rating]))

Count books per rating

# Tally books per rating word, most frequent rating first.
table1 <- count(full_df1, rating, sort = TRUE)
table1

# A tibble: 5 × 2
  rating     n
  <chr>  <int>
1 One      226
2 Three    203
3 Five     196
4 Two      196
5 Four     179

New Dataset Creation

# Add a price category and a running row index.
#
# cut() with right = FALSE bins price into [-Inf, 20), [20, 40) and [40, Inf),
# i.e. the same cutoffs as "price < 20" / "price < 40" / everything else.
# The result is a factor with levels in the order Low, Medium, High, so counts
# and plot axes follow that order instead of the alphabetical High, Low,
# Medium. A missing price stays NA rather than being labelled "High".
full_df1 <- full_df1 |>
  mutate(
    price_category = cut(
      price,
      breaks = c(-Inf, 20, 40, Inf),
      labels = c("Low", "Medium", "High"),
      right = FALSE
    ),
    index = row_number()
  )

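Problem 1: Based on the scraped books data frame (full_df1), create a scatterplot of price against book index, colored by price category, with a linear trend line for each category: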

# Scatterplot of price against row index, coloured by price category.
# geom_smooth() inherits the colour mapping, so a separate linear fit is
# drawn for each category.
full_df1 |>
  ggplot(mapping = aes(x = index, y = price, color = price_category)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dotdash") +
  theme_minimal() +
  labs(
    x = "Book Index",
    y = "Price (£)",
    color = "Price Category",
    title = "Book Prices Across Dataset by Category",
    caption = "Data source: books.toscrape.com"
  )

`geom_smooth()` using formula = 'y ~ x'

Problem 2: Create a bar graph

# Number of books in each price category
price_table <- count(full_df1, price_category)

# Bar heights come straight from the precomputed counts in `n`; the legend is
# hidden because the x axis already names each category.
price_table |>
  ggplot(aes(x = price_category, y = n, fill = price_category)) +
  geom_col() +
  theme_classic() +
  theme(legend.position = "none") +
  labs(
    x = "Price Category",
    y = "Number of Books",
    title = "Number of Books by Price Category",
    caption = "Data source: books.toscrape.com"
  )

Problem 3: Create one more plot of your own

# Boxplot of the price distribution within each price category.
full_df1 |>
  ggplot(mapping = aes(x = price_category, y = price, fill = price_category)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Price Category",
    y = "Price (£)",
    title = "Price Distribution by Category",
    caption = "Data source: books.toscrape.com"
  )

Explanations

Problem 1:

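This scatterplot shows how book prices vary across the dataset, with colors representing price categories. Because color is mapped to price category, a separate dot-dash regression line is drawn for each category, showing the linear trend of price against book index within that category.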

Problem 2:

This bar chart shows the number of books in each price category. It helps visualize how books are distributed across low, medium, and high price ranges.

Problem 3:

This boxplot compares the distribution of prices across the three categories. It clearly shows differences in spread and median price within each group.