Wikipedia World Cup Webscrape Assignment

Author

Annet Isa

# load libraries
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rvest)

Warning: package 'rvest' was built under R version 4.3.3


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

# link to URL for scraping
link <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"

# parse HTML content and getting attributes
webpage <- read_html(link)

# extract attendance table
attendance_table <- webpage %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[4]') %>%
  html_table(fill = TRUE)

# show first 6 rows of attendance table
head(attendance_table)

[[1]]
# A tibble: 27 × 9
   Year  Hosts     `Venues/Cities` `Totalattendance †` Matches Averageattendance
   <chr> <chr>     <chr>           <chr>               <chr>   <chr>            
 1 Year  Hosts     Venues/Cities   Totalattendance †   Matches Averageattendance
 2 1930  Uruguay   3/1             590,549             18      32,808           
 3 1934  Italy     8/8             363,000             17      21,353           
 4 1938  France    10/9            375,700             18      20,872           
 5 1950  Brazil    6/6             1,045,246           22      47,511           
 6 1954  Switzerl… 6/6             768,607             26      29,562           
 7 1958  Sweden    12/12           819,810             35      23,423           
 8 1962  Chile     4/4             893,172             32      27,912           
 9 1966  England   8/7             1,563,135           32      48,848           
10 1970  Mexico    5/5             1,603,975           32      50,124           
# ℹ 17 more rows
# ℹ 3 more variables: `Highest attendances ‡` <chr>,
#   `Highest attendances ‡` <chr>, `Highest attendances ‡` <chr>

# attendance_table is a list so I am turning it into a dataframe with chatGPT's help
attendance_df <- attendance_table[[1]]

# now the environment window looks familiar - 27 obs of 9
# loading relevant libraries
library(ggplot2)
library(highcharter)

Warning: package 'highcharter' was built under R version 4.3.3

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo

Highcharts (www.highcharts.com) is a Highsoft software product which is

not free for commercial and Governmental use

library(ggrepel)

Warning: package 'ggrepel' was built under R version 4.3.3

# at_df = attendance table dataframe
at_df <- attendance_df[2:23,1:7]
tail(at_df)

# A tibble: 6 × 7
  Year  Hosts      `Venues/Cities` `Totalattendance †` Matches Averageattendance
  <chr> <chr>      <chr>           <chr>               <chr>   <chr>            
1 2002  South Kor… 20/20           2,705,197           64      42,269           
2 2006  Germany    12/12           3,359,439           64      52,491           
3 2010  South Afr… 10/9            3,178,856           64      49,670           
4 2014  Brazil     12/12           3,429,873           64      53,592           
5 2018  Russia     12/11           3,031,768           64      47,371           
6 2022  Qatar      8/5             3,404,252           64      53,191           
# ℹ 1 more variable: `Highest attendances ‡` <chr>

# converting data types
# chatGPT provided the gsub(",", "") - it first removes commas from the numbers so R can convert the values to numeric
at_df$Year <-as.numeric(at_df$Year)

at_df$`Totalattendance †` <- as.numeric(gsub(",", "", at_df$`Totalattendance †`))

at_df$Matches <- as.numeric(at_df$Matches)

at_df$Averageattendance <- as.numeric(gsub(",", "", at_df$Averageattendance))

str(at_df)

tibble [22 × 7] (S3: tbl_df/tbl/data.frame)
 $ Year                 : num [1:22] 1930 1934 1938 1950 1954 ...
 $ Hosts                : chr [1:22] "Uruguay" "Italy" "France" "Brazil" ...
 $ Venues/Cities        : chr [1:22] "3/1" "8/8" "10/9" "6/6" ...
 $ Totalattendance †    : num [1:22] 590549 363000 375700 1045246 768607 ...
 $ Matches              : num [1:22] 18 17 18 22 26 35 32 32 32 38 ...
 $ Averageattendance    : num [1:22] 32808 21353 20872 47511 29562 ...
 $ Highest attendances ‡: chr [1:22] "93,000" "55,000" "58,455" "173,850[95]" ...

# once more converting
at_df$`Highest attendances ‡` <-as.numeric(gsub(",", "",at_df$`Highest attendances ‡`))

Warning: NAs introduced by coercion

# found separate_wider_delim after searching in the Help pane
at_2_df <- at_df %>% separate_wider_delim(`Venues/Cities`, delim = "/", names = c("Venues", "Cities"))
# and the crowd goes wild as this works!!!!!

# converting the new columns to the numeric class
# could I convert in the same chunk as separate_wider_delim using the %>% operator? 
at_2_df$Venues <- as.numeric(at_2_df$Venues)

at_2_df$Cities <- as.numeric(at_2_df$Cities)

# manually updating the "Highest Attendances" value for 1950 Brazil. 
at_2_df[4,8] <- 173850

tail(at_2_df, 8)

# A tibble: 8 × 8
   Year Hosts        Venues Cities `Totalattendance †` Matches Averageattendance
  <dbl> <chr>         <dbl>  <dbl>               <dbl>   <dbl>             <dbl>
1  1994 United Stat…      9      9             3587538      52             68991
2  1998 France           10     10             2785100      64             43517
3  2002 South Korea…     20     20             2705197      64             42269
4  2006 Germany          12     12             3359439      64             52491
5  2010 South Africa     10      9             3178856      64             49670
6  2014 Brazil           12     12             3429873      64             53592
7  2018 Russia           12     11             3031768      64             47371
8  2022 Qatar             8      5             3404252      64             53191
# ℹ 1 more variable: `Highest attendances ‡` <dbl>

# mutating by adding the countries to a new "Region" variable. 
# could come in handy if I figure out the drillable tree map

at_2_df <- at_2_df %>%
  mutate(region = case_when(
    Hosts %in% c("Uruguay", "Brazil", "Chile", "Argentina") ~ "South America",
    Hosts %in% c("South Korea Japan") ~ "Asia",
    Hosts %in% c("Qatar") ~ "Middle East",
    Hosts %in% c("South Africa") ~ "Africa",
    Hosts %in% c("Mexico", "United States", "Canada") ~ "North America",
    .default = "Europe"
  ))

#for some reason, "South Korea Japan" is being marked as Europe, not Asia. changing manually

at_2_df[17,9] <- "Asia"

# also, have to rename Total attendance column
at_2_df <- at_2_df %>%
  rename(TotalAttendance = `Totalattendance †`)

at_2_df %>% 
  hchart(
    "scatter",
    hcaes(
      x = region,
      y = Year,
      group = region,
      size = TotalAttendance,
      name = Hosts)
  ) %>%
  hc_plotOptions(
    series = list(
      jitter = list(x = 0.24),
      marker = list(radius = 3))
  ) %>%
  hc_tooltip(
    pointFormat = "<b>{point.Hosts}</b><br>Year: {point.Year}<br>Total<br>Attendance: {point.TotalAttendance:,.0f}<br>Venues: {point.Venues}<br>Cities: {point.Cities}"
  ) %>%
  hc_title(
    text = "World Cup Hosts 1930-2022"
  ) %>%
  hc_caption(
    text = "Fifa World Cup   https://en.wikipedia.org/wiki/FIFA_World_Cup"
  ) %>%
  hc_xAxis(
    title = list(text = "World Regions")
  ) %>%
  hc_yAxis(
    title = list(text = "Year")
  )