── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach the packages used below: ggplot2 for plotting, rvest for scraping.
# Each library() call must sit on its own line; fused calls do not parse.
library(ggplot2)
library(rvest)
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
Link
# Address of the Wikipedia article that holds the World Cup tables
link <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"
Parse the HTML content and get its attributes
# Download the page at `link` and parse it into an HTML document
webpage <- link |>
  read_html()
Select the table of interest
# Extract the attendance table and parse it into a data frame.
# The extraction has to run before any indexing, otherwise `attendance_table`
# does not exist yet.
attendance_table <- webpage |>
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[4]') |>
  html_table(fill = TRUE)

# Fail early with a clear message if the XPath no longer matches anything.
if (length(attendance_table) == 0) {
  stop(
    "No table matched the XPath; the page layout may have changed.",
    call. = FALSE
  )
}

# The XPath already targets a single table, so html_table() returns a
# one-element list; take that element (index 4 would be out of bounds).
attendance_table <- attendance_table[[1]] # Adjust the XPath above as needed

# View the first few rows to confirm
head(attendance_table)
# Load necessary libraries
library(tidyverse)
library(rvest)

# Specify the URL for the FIFA World Cup Wikipedia page
url <- "https://en.wikipedia.org/wiki/FIFA_World_Cup"

# Read and parse the HTML content from the URL
webpage <- read_html(url)

# Extract the attendance table by locating the correct table index on the page.
# html_table() on the whole document returns one table per <table> element;
# the 4th one is the attendance table, so adjust the index if the page
# layout changes.
attendance_df <- webpage %>%
  html_table(fill = TRUE) %>%
  .[[4]]

# Display the first few rows to verify successful extraction
head(attendance_df)
# Convert the scraped text columns to numbers and keep only rows with a
# usable year. Thousands separators are stripped before conversion; values
# that still cannot be converted become NA (with a coercion warning).
attendance_df <- attendance_df |>
  mutate(
    Total_Attendance = as.numeric(gsub(",", "", Total_Attendance)),
    Average_Attendance = as.numeric(gsub(",", "", Average_Attendance)),
    Year = as.numeric(Year)
  ) |>
  # Remove rows where Year is NA
  filter(!is.na(Year))
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `Total_Attendance = as.numeric(gsub(",", "",
Total_Attendance))`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
Visualization
# Plot total attendance by year: a trend line, points sized by average
# attendance and coloured by host, and a year label on every tenth year.
# Rows with missing values are dropped silently (na.rm = TRUE) instead of
# raising one warning per layer.
ggplot(attendance_df, aes(x = Year, y = Total_Attendance)) +
  # Line plot for trends; `linewidth` replaces `size`, which is deprecated
  # for lines since ggplot2 3.4.0.
  geom_line(
    aes(group = 1),
    color = "steelblue",
    linewidth = 1,
    na.rm = TRUE
  ) +
  # Points to show attendance
  geom_point(
    aes(size = Average_Attendance, color = Hosts),
    alpha = 0.7,
    na.rm = TRUE
  ) +
  # Use a colorblind-friendly palette
  scale_color_viridis_d() +
  # Labels every 10 years
  geom_text(
    aes(label = ifelse(Year %% 10 == 0, Year, "")),
    vjust = -0.5,
    color = "darkgrey",
    size = 3,
    na.rm = TRUE
  ) +
  labs(
    title = "FIFA World Cup Attendance Over the Years",
    subtitle = "Total and Average Attendance with Highlighted Hosts",
    x = "Year",
    y = "Total Attendance",
    size = "Average Attendance",
    color = "Host Country",
    caption = "Data Source: Wikipedia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, face = "italic"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_text()`).