webscraping world cup

Author

Ayomide Joe-Adigwe

Load the libraries

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(rvest)


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

Link

link <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"

Parse HTML content and getting attributes

webpage <- read_html(link)

Select the table of interest

attendance_table <- attendance_table[[4]] # Adjust the index as needed

# Extract the attendance table and parse it into a data frame
attendance_table <- webpage |>
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[4]') |>
  html_table(fill = TRUE)


# View the first few rows to confirm
head(attendance_table)

[[1]]
# A tibble: 27 × 9
   Year  Hosts     `Venues/Cities` `Totalattendance †` Matches Averageattendance
   <chr> <chr>     <chr>           <chr>               <chr>   <chr>            
 1 Year  Hosts     Venues/Cities   Totalattendance †   Matches Averageattendance
 2 1930  Uruguay   3/1             590,549             18      32,808           
 3 1934  Italy     8/8             363,000             17      21,353           
 4 1938  France    10/9            375,700             18      20,872           
 5 1950  Brazil    6/6             1,045,246           22      47,511           
 6 1954  Switzerl… 6/6             768,607             26      29,562           
 7 1958  Sweden    12/12           819,810             35      23,423           
 8 1962  Chile     4/4             893,172             32      27,912           
 9 1966  England   8/7             1,563,135           32      48,848           
10 1970  Mexico    5/5             1,603,975           32      50,124           
# ℹ 17 more rows
# ℹ 3 more variables: `Highest attendances ‡` <chr>,
#   `Highest attendances ‡` <chr>, `Highest attendances ‡` <chr>

Extra Credit

# Load necessary libraries
library(tidyverse)
library(rvest)

# Specify the URL for the FIFA World Cup Wikipedia page
url <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"

# Read and parse the HTML content from the URL
webpage <- read_html(url)

# Extract the attendance table by locating the correct table index on the page
attendance_df <- webpage %>%
  html_table(fill = TRUE) %>%
  .[[4]]  # This extracts the 4th table on the page, which is the attendance table

# Display the first few rows to verify successful extraction
head(attendance_df)

# A tibble: 6 × 9
  Year  Hosts      `Venues/Cities` `Totalattendance †` Matches Averageattendance
  <chr> <chr>      <chr>           <chr>               <chr>   <chr>            
1 Year  Hosts      Venues/Cities   Totalattendance †   Matches Averageattendance
2 1930  Uruguay    3/1             590,549             18      32,808           
3 1934  Italy      8/8             363,000             17      21,353           
4 1938  France     10/9            375,700             18      20,872           
5 1950  Brazil     6/6             1,045,246           22      47,511           
6 1954  Switzerla… 6/6             768,607             26      29,562           
# ℹ 3 more variables: `Highest attendances ‡` <chr>,
#   `Highest attendances ‡` <chr>, `Highest attendances ‡` <chr>

Renaming Columns to Resolve Duplicates

# Rename columns manually to resolve duplicates
colnames(attendance_df) <- c("Year", "Hosts", "Venues_Cities", "Total_Attendance", "Matches", "Average_Attendance", "Champions", "Runners_up", "Third")

# Verify the new column names
colnames(attendance_df)

[1] "Year"               "Hosts"              "Venues_Cities"     
[4] "Total_Attendance"   "Matches"            "Average_Attendance"
[7] "Champions"          "Runners_up"         "Third"

Cleaning the data

attendance_df <- attendance_df %>%
  mutate(
    Total_Attendance = as.numeric(gsub(",", "", Total_Attendance)),
    Average_Attendance = as.numeric(gsub(",", "", Average_Attendance)),
    Year = as.numeric(Year)
  ) %>%
  filter(!is.na(Year))  # Remove rows where Year is NA

Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `Total_Attendance = as.numeric(gsub(",", "",
  Total_Attendance))`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.

Visualization

  ggplot(attendance_df, aes(x = Year, y = Total_Attendance)) +
  geom_line(aes(group = 1), color = "steelblue", size = 1) +  # Line plot for trends
  geom_point(aes(size = Average_Attendance, color = Hosts), alpha = 0.7) +  # Points to show attendance
  scale_color_viridis_d() +  # Use a colorblind-friendly palette
  geom_text(aes(label = ifelse(Year %% 10 == 0, Year, "")), vjust = -0.5, color = "darkgrey", size = 3) + # Labels every 10 years
  labs(
    title = "FIFA World Cup Attendance Over the Years",
    subtitle = "Total and Average Attendance with Highlighted Hosts",
    x = "Year",
    y = "Total Attendance",
    size = "Average Attendance",
    color = "Host Country",
    caption = "Data Source: Wikipedia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_line()`).

Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_text()`).