Load the Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(RColorBrewer)

Parsing HTML Content and Getting Attributes

# Download and parse the page.
# NOTE(review): `link` is not defined in the visible part of this document;
# presumably it holds the page URL and is set in an earlier chunk -- confirm.
webpage <- read_html(link)

# Extract the attendance table. html_elements() replaces the superseded
# html_nodes(), and the deprecated `fill` argument is dropped (rvest >= 1.0
# always fills missing cells). The XPath selects the table by its position on
# the page, so it must be revisited if the page layout changes.
attendance_table <- webpage |>
  html_elements(xpath = '//*[@id="mw-content-text"]/div[1]/table[4]') |>
  html_table()

# Preview the result. html_table() on a node set returns a list, so head()
# shows the list's single element (the whole table), not just six rows.
head(attendance_table)
## [[1]]
## # A tibble: 27 × 9
##    Year  Hosts     `Venues/Cities` `Totalattendance †` Matches Averageattendance
##    <chr> <chr>     <chr>           <chr>               <chr>   <chr>            
##  1 Year  Hosts     Venues/Cities   Totalattendance †   Matches Averageattendance
##  2 1930  Uruguay   3/1             590,549             18      32,808           
##  3 1934  Italy     8/8             363,000             17      21,353           
##  4 1938  France    10/9            375,700             18      20,872           
##  5 1950  Brazil    6/6             1,045,246           22      47,511           
##  6 1954  Switzerl… 6/6             768,607             26      29,562           
##  7 1958  Sweden    12/12           819,810             35      23,423           
##  8 1962  Chile     4/4             893,172             32      27,912           
##  9 1966  England   8/7             1,563,135           32      48,848           
## 10 1970  Mexico    5/5             1,603,975           32      50,124           
## # ℹ 17 more rows
## # ℹ 3 more variables: `Highest attendances ‡` <chr>,
## #   `Highest attendances ‡` <chr>, `Highest attendances ‡` <chr>

Setting Up and Plotting

# Convert the scraped result into a base data frame. data.frame() also rewrites
# the column names into syntactic ones (e.g. "Venues/Cities" becomes
# "Venues.Cities"); the later steps refer to the columns by those names.
attendance <- attendance_table |>
  data.frame()
attendance
##         Year                       Hosts Venues.Cities Totalattendance..
## 1       Year                       Hosts Venues/Cities Totalattendance †
## 2       1930                     Uruguay           3/1           590,549
## 3       1934                       Italy           8/8           363,000
## 4       1938                      France          10/9           375,700
## 5       1950                      Brazil           6/6         1,045,246
## 6       1954                 Switzerland           6/6           768,607
## 7       1958                      Sweden         12/12           819,810
## 8       1962                       Chile           4/4           893,172
## 9       1966                     England           8/7         1,563,135
## 10      1970                      Mexico           5/5         1,603,975
## 11      1974                West Germany           9/9         1,865,753
## 12      1978                   Argentina           6/5         1,545,791
## 13      1982                       Spain         17/14         2,109,723
## 14      1986                      Mexico         12/11         2,394,031
## 15      1990                       Italy         12/12         2,516,215
## 16      1994               United States           9/9         3,587,538
## 17      1998                      France         10/10         2,785,100
## 18      2002           South Korea Japan         20/20         2,705,197
## 19      2006                     Germany         12/12         3,359,439
## 20      2010                South Africa          10/9         3,178,856
## 21      2014                      Brazil         12/12         3,429,873
## 22      2018                      Russia         12/11         3,031,768
## 23      2022                       Qatar           8/5         3,404,252
## 24      2026 Canada Mexico United States         16/16                  
## 25 2030[n 1]      Morocco Portugal Spain                                
## 26      2034                Saudi Arabia                                
## 27   Overall                     Overall       Overall        43,936,730
##    Matches Averageattendance Highest.attendances..
## 1  Matches Averageattendance                Number
## 2       18            32,808                93,000
## 3       17            21,353                55,000
## 4       18            20,872                58,455
## 5       22            47,511           173,850[95]
## 6       26            29,562                63,000
## 7       35            23,423                50,928
## 8       32            27,912                68,679
## 9       32            48,848                98,270
## 10      32            50,124               108,192
## 11      38            49,099                83,168
## 12      38            40,679                71,712
## 13      52            40,572                95,500
## 14      52            46,039               114,600
## 15      52            48,389                74,765
## 16      52            68,991                94,194
## 17      64            43,517                80,000
## 18      64            42,269                69,029
## 19      64            52,491                72,000
## 20      64            49,670                84,490
## 21      64            53,592                74,738
## 22      64            47,371                78,011
## 23      64            53,191                88,966
## 24     104                                        
## 25     104                                        
## 26     104                                        
## 27     964            45,577           173,850[95]
##                    Highest.attendances...1
## 1                                    Venue
## 2           Estadio Centenario, Montevideo
## 3               Stadio Nazionale PNF, Rome
## 4             Olympique de Colombes, Paris
## 5         Maracanã Stadium, Rio de Janeiro
## 6                   Wankdorf Stadium, Bern
## 7               Ullevi Stadium, Gothenburg
## 8               Estadio Nacional, Santiago
## 9                  Wembley Stadium, London
## 10             Estadio Azteca, Mexico City
## 11                  Olympiastadion, Munich
## 12        Estadio Monumental, Buenos Aires
## 13                     Camp Nou, Barcelona
## 14             Estadio Azteca, Mexico City
## 15                         San Siro, Milan
## 16         Rose Bowl, Pasadena, California
## 17            Stade de France, Saint-Denis
## 18  International Stadium, Yokohama, Japan
## 19                  Olympiastadion, Berlin
## 20               Soccer City, Johannesburg
## 21        Maracanã Stadium, Rio de Janeiro
## 22                Luzhniki Stadium, Moscow
## 23                   Lusail Stadium, Qatar
## 24                                        
## 25                                        
## 26                                        
## 27 Maracanã Stadium, Rio de Janeiro (1950)
##                                              Highest.attendances...2
## 1                                                            Game(s)
## 2                                 Uruguay 6–1 Yugoslavia, semi-final
## 3                                    Italy 2–1 Czechoslovakia, final
## 4                                    France 1–3 Italy, quarter-final
## 5                                 Brazil 1–2 Uruguay, deciding match
## 6                                    West Germany 3–2 Hungary, final
## 7                               Brazil 2–0 Soviet Union, group stage
## 8                                       Brazil 4–2 Chile, semi-final
## 9                                    England 4–2 West Germany, final
## 10                                   Mexico 1–0 Belgium, group stage
## 11                               West Germany 1–0 Chile, group stage
## 12                                  Italy 1–0 Argentina, group stage
## 13                              Argentina 0–1 Belgium, Opening match
## 14 Mexico 1–1 Paraguay, group stageArgentina 3–2 West Germany, final
## 15                          West Germany 4–1 Yugoslavia, group stage
## 16                                    Brazil 0–0 (3–2p) Italy, final
## 17                                          Brazil 0–3 France, final
## 18                                         Brazil 2–0 Germany, final
## 19                       Germany 1–1 (4–2p) Argentina, quarter-final
## 20                                      Spain 1–0 Netherlands, final
## 21                                      Germany 1–0 Argentina, final
## 22                                         France 4–2 Croatia, final
## 23                                Argentina 3–3 (4–2p) France, final
## 24                                                                  
## 25                                                                  
## 26                                                                  
## 27
# Keep every row except 1 and 24 to 27
attendance2 <- attendance[!(seq_len(nrow(attendance)) %in% c(1, 24:27)), ]

Why did I decide to remove these rows? If you look at the original dataset, the first row simply repeats the variable names, which we don’t need twice, so it can go. Rows 24–26 are future World Cups, meaning we don’t have the necessary data for those events yet. Row 27 is an overall total, which I won’t need for my graph.

# Inspect how the columns of interest are currently stored
class(attendance2[["Year"]])
## [1] "character"
class(attendance2[["Hosts"]])
## [1] "character"
class(attendance2[["Totalattendance.."]])
## [1] "character"

Interesting. The year and totalattendance variables are supposed to be numeric, and hosts is supposed to be a factor. Let’s fix that by first removing the commas from the totalattendance values.

# Strip the thousands separators from the attendance figures so they can be
# converted to numbers afterwards. Removing every comma by pattern works for
# any value, whereas a value-by-value recode() leaves figures it does not list
# untouched. The column is still character after this step.
attendance2 <- attendance2 |>
  mutate(Totalattendance.. = str_remove_all(Totalattendance.., ","))
attendance2
##    Year             Hosts Venues.Cities Totalattendance.. Matches
## 2  1930           Uruguay           3/1            590549      18
## 3  1934             Italy           8/8            363000      17
## 4  1938            France          10/9            375700      18
## 5  1950            Brazil           6/6           1045246      22
## 6  1954       Switzerland           6/6            768607      26
## 7  1958            Sweden         12/12            819810      35
## 8  1962             Chile           4/4            893172      32
## 9  1966           England           8/7           1563135      32
## 10 1970            Mexico           5/5           1603975      32
## 11 1974      West Germany           9/9           1865753      38
## 12 1978         Argentina           6/5           1545791      38
## 13 1982             Spain         17/14           2109723      52
## 14 1986            Mexico         12/11           2394031      52
## 15 1990             Italy         12/12           2516215      52
## 16 1994     United States           9/9           3587538      52
## 17 1998            France         10/10           2785100      64
## 18 2002 South Korea Japan         20/20           2705197      64
## 19 2006           Germany         12/12           3359439      64
## 20 2010      South Africa          10/9           3178856      64
## 21 2014            Brazil         12/12           3429873      64
## 22 2018            Russia         12/11           3031768      64
## 23 2022             Qatar           8/5           3404252      64
##    Averageattendance Highest.attendances..
## 2             32,808                93,000
## 3             21,353                55,000
## 4             20,872                58,455
## 5             47,511           173,850[95]
## 6             29,562                63,000
## 7             23,423                50,928
## 8             27,912                68,679
## 9             48,848                98,270
## 10            50,124               108,192
## 11            49,099                83,168
## 12            40,679                71,712
## 13            40,572                95,500
## 14            46,039               114,600
## 15            48,389                74,765
## 16            68,991                94,194
## 17            43,517                80,000
## 18            42,269                69,029
## 19            52,491                72,000
## 20            49,670                84,490
## 21            53,592                74,738
## 22            47,371                78,011
## 23            53,191                88,966
##                   Highest.attendances...1
## 2          Estadio Centenario, Montevideo
## 3              Stadio Nazionale PNF, Rome
## 4            Olympique de Colombes, Paris
## 5        Maracanã Stadium, Rio de Janeiro
## 6                  Wankdorf Stadium, Bern
## 7              Ullevi Stadium, Gothenburg
## 8              Estadio Nacional, Santiago
## 9                 Wembley Stadium, London
## 10            Estadio Azteca, Mexico City
## 11                 Olympiastadion, Munich
## 12       Estadio Monumental, Buenos Aires
## 13                    Camp Nou, Barcelona
## 14            Estadio Azteca, Mexico City
## 15                        San Siro, Milan
## 16        Rose Bowl, Pasadena, California
## 17           Stade de France, Saint-Denis
## 18 International Stadium, Yokohama, Japan
## 19                 Olympiastadion, Berlin
## 20              Soccer City, Johannesburg
## 21       Maracanã Stadium, Rio de Janeiro
## 22               Luzhniki Stadium, Moscow
## 23                  Lusail Stadium, Qatar
##                                              Highest.attendances...2
## 2                                 Uruguay 6–1 Yugoslavia, semi-final
## 3                                    Italy 2–1 Czechoslovakia, final
## 4                                    France 1–3 Italy, quarter-final
## 5                                 Brazil 1–2 Uruguay, deciding match
## 6                                    West Germany 3–2 Hungary, final
## 7                               Brazil 2–0 Soviet Union, group stage
## 8                                       Brazil 4–2 Chile, semi-final
## 9                                    England 4–2 West Germany, final
## 10                                   Mexico 1–0 Belgium, group stage
## 11                               West Germany 1–0 Chile, group stage
## 12                                  Italy 1–0 Argentina, group stage
## 13                              Argentina 0–1 Belgium, Opening match
## 14 Mexico 1–1 Paraguay, group stageArgentina 3–2 West Germany, final
## 15                          West Germany 4–1 Yugoslavia, group stage
## 16                                    Brazil 0–0 (3–2p) Italy, final
## 17                                          Brazil 0–3 France, final
## 18                                         Brazil 2–0 Germany, final
## 19                       Germany 1–1 (4–2p) Argentina, quarter-final
## 20                                      Spain 1–0 Netherlands, final
## 21                                      Germany 1–0 Argentina, final
## 22                                         France 4–2 Croatia, final
## 23                                Argentina 3–3 (4–2p) France, final

Now we can go ahead and change the class of the variables.

# Convert the columns to their intended types: numeric year and attendance,
# categorical host
attendance2 <- attendance2 |>
  mutate(
    across(c(Year, Totalattendance..), as.numeric),
    Hosts = factor(Hosts)
  )

# Confirm how the three columns are stored now
class(attendance2[["Year"]])
## [1] "numeric"
class(attendance2[["Totalattendance.."]])
## [1] "numeric"
class(attendance2[["Hosts"]])
## [1] "factor"
# Relabel West Germany as Germany. Hosts is a factor at this point, so the
# level itself is recoded with fct_recode(), which keeps the column a factor.
# A case_when() that mixes the factor with a replacement string would coerce
# the column back to character.
attendance2 <- attendance2 |>
  mutate(Hosts = fct_recode(Hosts, "Germany" = "West Germany"))
# Source : ChatGPT
# Line chart of total attendance, with the tournament years as x-axis
# categories and a single series that is hidden from the legend.
plot <- highchart() |>
  hc_title(text = "Total Attendance for FIFA World Cup Matches") |>
  hc_chart(type = "line") |>
  hc_yAxis(title = list(text = "Total Attendance")) |>
  hc_xAxis(categories = attendance2[["Year"]]) |>
  hc_add_series(
    data = attendance2[["Totalattendance.."]],
    name = "Total Attendance",
    showInLegend = FALSE,
    color = "#8D3C53"
  )
plot