library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(RColorBrewer)
HTTP GET request https://en.wikipedia.org/wiki/FIFA_World_Cup
# Download the Wikipedia article on the FIFA World Cup.
link <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"
webpage <- read_html(link)
# Extract the attendance table.
# The XPath selects the table purely by position (the fourth `table` child of
# the first `div` under `#mw-content-text`), so it has to be re-checked
# whenever the article layout changes.
# `html_elements()` replaces the superseded `html_nodes()`, and the deprecated
# `fill` argument is dropped because rvest >= 1.0.0 always fills missing cells.
# The result is a list holding one tibble per matched node.
attendance_table <- webpage |>
  html_elements(xpath = '//*[@id="mw-content-text"]/div[1]/table[4]') |>
  html_table()
# Preview the scraped result. `attendance_table` is a one-element list, so
# `head()` returns that list and the tibble inside prints with its own default
# row limit rather than being cut down to 6 rows.
head(attendance_table, n = 6L)
## [[1]]
## # A tibble: 27 × 9
## Year Hosts `Venues/Cities` `Totalattendance †` Matches Averageattendance
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Year Hosts Venues/Cities Totalattendance † Matches Averageattendance
## 2 1930 Uruguay 3/1 590,549 18 32,808
## 3 1934 Italy 8/8 363,000 17 21,353
## 4 1938 France 10/9 375,700 18 20,872
## 5 1950 Brazil 6/6 1,045,246 22 47,511
## 6 1954 Switzerl… 6/6 768,607 26 29,562
## 7 1958 Sweden 12/12 819,810 35 23,423
## 8 1962 Chile 4/4 893,172 32 27,912
## 9 1966 England 8/7 1,563,135 32 48,848
## 10 1970 Mexico 5/5 1,603,975 32 50,124
## # ℹ 17 more rows
## # ℹ 3 more variables: `Highest attendances ‡` <chr>,
## # `Highest attendances ‡` <chr>, `Highest attendances ‡` <chr>
# Turn the table into a data frame.
# `attendance_table` is a one-element list, so this yields a single data frame.
# data.frame() also rewrites the column names into syntactic, unique ones
# (e.g. `Venues.Cities`, `Totalattendance..`, `Highest.attendances...1`);
# the cleaning steps further down refer to the columns by these rewritten names.
attendance <- data.frame(attendance_table)
# Print the whole data frame for inspection.
attendance
## Year Hosts Venues.Cities Totalattendance..
## 1 Year Hosts Venues/Cities Totalattendance †
## 2 1930 Uruguay 3/1 590,549
## 3 1934 Italy 8/8 363,000
## 4 1938 France 10/9 375,700
## 5 1950 Brazil 6/6 1,045,246
## 6 1954 Switzerland 6/6 768,607
## 7 1958 Sweden 12/12 819,810
## 8 1962 Chile 4/4 893,172
## 9 1966 England 8/7 1,563,135
## 10 1970 Mexico 5/5 1,603,975
## 11 1974 West Germany 9/9 1,865,753
## 12 1978 Argentina 6/5 1,545,791
## 13 1982 Spain 17/14 2,109,723
## 14 1986 Mexico 12/11 2,394,031
## 15 1990 Italy 12/12 2,516,215
## 16 1994 United States 9/9 3,587,538
## 17 1998 France 10/10 2,785,100
## 18 2002 South Korea Japan 20/20 2,705,197
## 19 2006 Germany 12/12 3,359,439
## 20 2010 South Africa 10/9 3,178,856
## 21 2014 Brazil 12/12 3,429,873
## 22 2018 Russia 12/11 3,031,768
## 23 2022 Qatar 8/5 3,404,252
## 24 2026 Canada Mexico United States 16/16
## 25 2030[n 1] Morocco Portugal Spain
## 26 2034 Saudi Arabia
## 27 Overall Overall Overall 43,936,730
## Matches Averageattendance Highest.attendances..
## 1 Matches Averageattendance Number
## 2 18 32,808 93,000
## 3 17 21,353 55,000
## 4 18 20,872 58,455
## 5 22 47,511 173,850[95]
## 6 26 29,562 63,000
## 7 35 23,423 50,928
## 8 32 27,912 68,679
## 9 32 48,848 98,270
## 10 32 50,124 108,192
## 11 38 49,099 83,168
## 12 38 40,679 71,712
## 13 52 40,572 95,500
## 14 52 46,039 114,600
## 15 52 48,389 74,765
## 16 52 68,991 94,194
## 17 64 43,517 80,000
## 18 64 42,269 69,029
## 19 64 52,491 72,000
## 20 64 49,670 84,490
## 21 64 53,592 74,738
## 22 64 47,371 78,011
## 23 64 53,191 88,966
## 24 104
## 25 104
## 26 104
## 27 964 45,577 173,850[95]
## Highest.attendances...1
## 1 Venue
## 2 Estadio Centenario, Montevideo
## 3 Stadio Nazionale PNF, Rome
## 4 Olympique de Colombes, Paris
## 5 Maracanã Stadium, Rio de Janeiro
## 6 Wankdorf Stadium, Bern
## 7 Ullevi Stadium, Gothenburg
## 8 Estadio Nacional, Santiago
## 9 Wembley Stadium, London
## 10 Estadio Azteca, Mexico City
## 11 Olympiastadion, Munich
## 12 Estadio Monumental, Buenos Aires
## 13 Camp Nou, Barcelona
## 14 Estadio Azteca, Mexico City
## 15 San Siro, Milan
## 16 Rose Bowl, Pasadena, California
## 17 Stade de France, Saint-Denis
## 18 International Stadium, Yokohama, Japan
## 19 Olympiastadion, Berlin
## 20 Soccer City, Johannesburg
## 21 Maracanã Stadium, Rio de Janeiro
## 22 Luzhniki Stadium, Moscow
## 23 Lusail Stadium, Qatar
## 24
## 25
## 26
## 27 Maracanã Stadium, Rio de Janeiro (1950)
## Highest.attendances...2
## 1 Game(s)
## 2 Uruguay 6–1 Yugoslavia, semi-final
## 3 Italy 2–1 Czechoslovakia, final
## 4 France 1–3 Italy, quarter-final
## 5 Brazil 1–2 Uruguay, deciding match
## 6 West Germany 3–2 Hungary, final
## 7 Brazil 2–0 Soviet Union, group stage
## 8 Brazil 4–2 Chile, semi-final
## 9 England 4–2 West Germany, final
## 10 Mexico 1–0 Belgium, group stage
## 11 West Germany 1–0 Chile, group stage
## 12 Italy 1–0 Argentina, group stage
## 13 Argentina 0–1 Belgium, Opening match
## 14 Mexico 1–1 Paraguay, group stageArgentina 3–2 West Germany, final
## 15 West Germany 4–1 Yugoslavia, group stage
## 16 Brazil 0–0 (3–2p) Italy, final
## 17 Brazil 0–3 France, final
## 18 Brazil 2–0 Germany, final
## 19 Germany 1–1 (4–2p) Argentina, quarter-final
## 20 Spain 1–0 Netherlands, final
## 21 Germany 1–0 Argentina, final
## 22 France 4–2 Croatia, final
## 23 Argentina 3–3 (4–2p) France, final
## 24
## 25
## 26
## 27
# Keep only the tournaments that have already been played.
# A row is kept when `Year` is a plain four-digit year and a total attendance
# figure (digits and thousands separators only) is recorded. On the table as
# scraped this removes the same rows as the positional `-c(1, 24:27)`: the
# repeated header (row 1), the future World Cups (rows 24-26) and the
# "Overall" summary (row 27). Unlike fixed row numbers, it keeps working if
# rows are added to or removed from the source table.
# Base subsetting is used so the original row names are preserved.
attendance2 <- attendance[
  grepl("^[0-9]{4}$", attendance$Year) &
    grepl("^[0-9][0-9,]*$", attendance$Totalattendance..),
]
Why did I decide to remove these rows? In the original dataset, the first row just repeats the variable names, which we don’t need twice, so it can go. Rows 24–26 are future World Cups, so there is no attendance data for those events yet. Row 27 is an overall total, which I won’t need for my graph.
# Inspect the current type of each column that will need converting
class(attendance2[["Year"]])
## [1] "character"
class(attendance2[["Hosts"]])
## [1] "character"
class(attendance2[["Totalattendance.."]])
## [1] "character"
Interesting. The Year and Totalattendance variables are supposed to be numeric, and Hosts is supposed to be a factor. Let’s fix that by first rewriting the Totalattendance values without commas.
# Strip the thousands separators so the totals can be converted to numbers.
# One vectorised substitution replaces the value-by-value recode() lookup,
# which only covered figures typed in by hand and would silently leave any
# other value (e.g. a newly added tournament) untouched.
# The column is still character after this step.
attendance2 <- mutate(
  attendance2,
  Totalattendance.. = gsub(",", "", Totalattendance.., fixed = TRUE)
)
# Print the data frame to confirm the commas are gone.
attendance2
## Year Hosts Venues.Cities Totalattendance.. Matches
## 2 1930 Uruguay 3/1 590549 18
## 3 1934 Italy 8/8 363000 17
## 4 1938 France 10/9 375700 18
## 5 1950 Brazil 6/6 1045246 22
## 6 1954 Switzerland 6/6 768607 26
## 7 1958 Sweden 12/12 819810 35
## 8 1962 Chile 4/4 893172 32
## 9 1966 England 8/7 1563135 32
## 10 1970 Mexico 5/5 1603975 32
## 11 1974 West Germany 9/9 1865753 38
## 12 1978 Argentina 6/5 1545791 38
## 13 1982 Spain 17/14 2109723 52
## 14 1986 Mexico 12/11 2394031 52
## 15 1990 Italy 12/12 2516215 52
## 16 1994 United States 9/9 3587538 52
## 17 1998 France 10/10 2785100 64
## 18 2002 South Korea Japan 20/20 2705197 64
## 19 2006 Germany 12/12 3359439 64
## 20 2010 South Africa 10/9 3178856 64
## 21 2014 Brazil 12/12 3429873 64
## 22 2018 Russia 12/11 3031768 64
## 23 2022 Qatar 8/5 3404252 64
## Averageattendance Highest.attendances..
## 2 32,808 93,000
## 3 21,353 55,000
## 4 20,872 58,455
## 5 47,511 173,850[95]
## 6 29,562 63,000
## 7 23,423 50,928
## 8 27,912 68,679
## 9 48,848 98,270
## 10 50,124 108,192
## 11 49,099 83,168
## 12 40,679 71,712
## 13 40,572 95,500
## 14 46,039 114,600
## 15 48,389 74,765
## 16 68,991 94,194
## 17 43,517 80,000
## 18 42,269 69,029
## 19 52,491 72,000
## 20 49,670 84,490
## 21 53,592 74,738
## 22 47,371 78,011
## 23 53,191 88,966
## Highest.attendances...1
## 2 Estadio Centenario, Montevideo
## 3 Stadio Nazionale PNF, Rome
## 4 Olympique de Colombes, Paris
## 5 Maracanã Stadium, Rio de Janeiro
## 6 Wankdorf Stadium, Bern
## 7 Ullevi Stadium, Gothenburg
## 8 Estadio Nacional, Santiago
## 9 Wembley Stadium, London
## 10 Estadio Azteca, Mexico City
## 11 Olympiastadion, Munich
## 12 Estadio Monumental, Buenos Aires
## 13 Camp Nou, Barcelona
## 14 Estadio Azteca, Mexico City
## 15 San Siro, Milan
## 16 Rose Bowl, Pasadena, California
## 17 Stade de France, Saint-Denis
## 18 International Stadium, Yokohama, Japan
## 19 Olympiastadion, Berlin
## 20 Soccer City, Johannesburg
## 21 Maracanã Stadium, Rio de Janeiro
## 22 Luzhniki Stadium, Moscow
## 23 Lusail Stadium, Qatar
## Highest.attendances...2
## 2 Uruguay 6–1 Yugoslavia, semi-final
## 3 Italy 2–1 Czechoslovakia, final
## 4 France 1–3 Italy, quarter-final
## 5 Brazil 1–2 Uruguay, deciding match
## 6 West Germany 3–2 Hungary, final
## 7 Brazil 2–0 Soviet Union, group stage
## 8 Brazil 4–2 Chile, semi-final
## 9 England 4–2 West Germany, final
## 10 Mexico 1–0 Belgium, group stage
## 11 West Germany 1–0 Chile, group stage
## 12 Italy 1–0 Argentina, group stage
## 13 Argentina 0–1 Belgium, Opening match
## 14 Mexico 1–1 Paraguay, group stageArgentina 3–2 West Germany, final
## 15 West Germany 4–1 Yugoslavia, group stage
## 16 Brazil 0–0 (3–2p) Italy, final
## 17 Brazil 0–3 France, final
## 18 Brazil 2–0 Germany, final
## 19 Germany 1–1 (4–2p) Argentina, quarter-final
## 20 Spain 1–0 Netherlands, final
## 21 Germany 1–0 Argentina, final
## 22 France 4–2 Croatia, final
## 23 Argentina 3–3 (4–2p) France, final
Now we can go ahead and change the class of the variables.
# Convert the columns to their intended types: Year and total attendance
# become numeric, Hosts becomes a factor.
attendance2 <- attendance2 |>
  mutate(
    across(c(Year, Totalattendance..), as.numeric),
    Hosts = factor(Hosts)
  )
# Confirm that the conversions took effect
class(attendance2[["Year"]])
## [1] "numeric"
class(attendance2[["Totalattendance.."]])
## [1] "numeric"
class(attendance2[["Hosts"]])
## [1] "factor"
# Renaming West Germany to Germany.
# `Hosts` was converted to a factor just above; mixing a replacement string
# with the factor inside case_when() does not keep the factor type, which
# would undo that conversion. fct_recode() relabels the level instead,
# merging it with the "Germany" level already present, and leaves `Hosts`
# a factor.
attendance2 <- attendance2 |>
  mutate(Hosts = fct_recode(Hosts, "Germany" = "West Germany"))
# Source : ChatGPT
# Interactive line chart: the tournament years are used as x-axis categories
# and the total attendance is plotted as a single series without a legend.
plot <- highchart() |>
  hc_chart(type = "line") |>
  hc_title(text = "Total Attendance for FIFA World Cup Matches") |>
  hc_xAxis(categories = attendance2[["Year"]]) |>
  hc_yAxis(title = list(text = "Total Attendance")) |>
  hc_add_series(
    data = attendance2[["Totalattendance.."]],
    name = "Total Attendance",
    color = "#8D3C53",
    showInLegend = FALSE
  )
# Render the chart.
plot