Webscrape Wikipedia Formula One Drivers

Author

Gamaliel Ngouafon

Load the libraries

library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)

Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding
library(webshot2)

Extract all tables

# Pull every table carrying the "wikitable" CSS class out of the parsed page
# and convert each one to a tibble.
# NOTE(review): `webpage` is not created in the visible part of this document;
# presumably it is the result of read_html() on the Wikipedia page - confirm.
# html_elements() replaces the superseded html_nodes(), and the deprecated
# `fill` argument is dropped (missing cells are filled automatically).
tables <- webpage |>
  html_elements("table.wikitable") |>
  html_table()

# How many tables did we find?
length(tables)
[1] 3

There are three tables.

Inspect the size of each table

# Print the number of rows in each table.
# vapply() guarantees an integer vector even if `tables` were empty, whereas
# sapply() would silently return an empty list in that case.
vapply(tables, nrow, integer(1))
[1]   3 878  42

Select the Drivers Table (the second table)

# Keep the second scraped table: it is the one with one row per driver.
drivers_raw <- tables |> pluck(2)

# Preview the column names and types
drivers_raw |> glimpse()
Rows: 878
Columns: 11
$ `Driver name`            <chr> "Carlo Abate", "George Abecassis", "Kenny Ach…
$ Nationality              <chr> "Italy", "United Kingdom", "United Kingdom", …
$ `Seasons competed`       <chr> "1962–1963", "1951–1952", "1983, 1985", "1968…
$ `Drivers' Championships` <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
$ `Race entries`           <chr> "3", "2", "10", "36", "2", "1[b]", "1", "9[b]…
$ `Race starts`            <chr> "0", "2", "3", "30", "2", "1", "1", "8", "4",…
$ `Pole positions`         <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", …
$ `Race wins`              <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
$ Podiums                  <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
$ `Fastest laps`           <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", …
$ `Points[a]`              <chr> "0", "0", "0", "6", "0", "0", "0", "1.5", "0"…
drivers_raw |> head(n = 6)
# A tibble: 6 × 11
  `Driver name`     Nationality    `Seasons competed` `Drivers' Championships`
  <chr>             <chr>          <chr>              <chr>                   
1 Carlo Abate       Italy          1962–1963          0                       
2 George Abecassis  United Kingdom 1951–1952          0                       
3 Kenny Acheson     United Kingdom 1983, 1985         0                       
4 Andrea de Adamich Italy          1968, 1970–1973    0                       
5 Philippe Adams    Belgium        1994               0                       
6 Walt Ader         United States  1950               0                       
# ℹ 7 more variables: `Race entries` <chr>, `Race starts` <chr>,
#   `Pole positions` <chr>, `Race wins` <chr>, Podiums <chr>,
#   `Fastest laps` <chr>, `Points[a]` <chr>

Clean the table

# Normalise the column names: lower-case them and turn spaces into
# underscores. Other characters (apostrophes, "[a]") are left as they are.
names(drivers_raw) <- names(drivers_raw) |>
  tolower() |>
  gsub(pattern = " ", replacement = "_")
names(drivers_raw)
 [1] "driver_name"            "nationality"            "seasons_competed"      
 [4] "drivers'_championships" "race_entries"           "race_starts"           
 [7] "pole_positions"         "race_wins"              "podiums"               
[10] "fastest_laps"           "points[a]"             

Use the Formula_one Drivers data frame to create your own visualization

Be sure it has a title, axis labels, and a caption for the data source (the website URL). Make sure you include at least 3 variables in your plot, distinguished by color, size, or other visual elements.

# Convert one scraped statistics column from text to numeric.
#
# The cells are text and can carry footnote markers (e.g. "1[b]"). Deleting
# every non-digit character, as was done before, glues separate numbers in a
# cell together (a numeric footnote, or a second figure in parentheses),
# which yields wrong values or NA. Instead this helper:
#   1. removes bracketed footnote markers,
#   2. removes thousands separators,
#   3. keeps only the first number found in the cell.
# Cells without any number become NA.
#
# @param x Character vector of scraped cell values.
# @return Numeric vector of the same length as `x`.
parse_stat <- function(x) {
  x |>
    str_remove_all("\\[[^\\]]*\\]") |>
    str_remove_all("(?<=[0-9]),(?=[0-9]{3})") |>
    str_extract("[0-9]+(?:\\.[0-9]+)?") |>
    as.numeric()
}

# Build the analysis table with numeric statistics.
# `pole_positions` is converted as well so that it can be mapped to a
# continuous aesthetic (e.g. point size) later on.
# NOTE(review): championship cells presumably also list the title years after
# the count; taking the first number keeps only the count - confirm on the page.
drivers_clean <- drivers_raw |>
  mutate(
    drivers_championships = parse_stat(`drivers'_championships`),
    across(
      c(race_wins, race_entries, race_starts, pole_positions),
      parse_stat
    ),
    points = parse_stat(`points[a]`)
  ) |>
  # Drop rows where a key statistic could not be parsed.
  drop_na(drivers_championships, race_wins, race_entries, points)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `points = as.numeric(gsub("[^0-9.]", "", `points[a]`))`.
Caused by warning:
! NAs introduced by coercion

Plot to determine whether pole positions influence race wins

library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# options(scipen = 999)

# URL shown as the data-source caption.
# NOTE(review): assumed to be the page that was loaded into `webpage` - confirm.
source_url <- "https://en.wikipedia.org/wiki/List_of_Formula_One_drivers"
source_caption <- paste("Source:", source_url)

# Scatter plot of race wins against race starts for drivers with at least
# four championships; colour encodes nationality, point size encodes poles.
p <- drivers_clean |>
  filter(drivers_championships >= 4) |>
  ggplot(aes(x = race_starts, y = race_wins)) +
  geom_point(aes(color = nationality, size = pole_positions), alpha = 0.8) +
  labs(
    title = "Elite Formula One Drivers Only (Champions)",
    x = "Race Starts",
    y = "Race Wins",
    color = "Nationality",
    size = "Pole Positions",
    caption = source_caption
  ) +
  theme_minimal()

# ggplotly() does not render the ggplot caption, so the source is repeated as
# a plotly annotation below the plotting area (extra bottom margin makes room).
ggplotly(p) |>
  plotly::layout(
    margin = list(b = 90),
    annotations = list(
      text = source_caption,
      xref = "paper",
      yref = "paper",
      x = 1,
      y = -0.2,
      xanchor = "right",
      yanchor = "top",
      showarrow = FALSE,
      font = list(size = 10)
    )
  )
Warning: Using size for a discrete variable is not advised.