library(rvest)
library(tidyverse)
library(magrittr)
library(scales)
library(knitr)
library(lubridate)
library(plyr)
Setting up page by page sequence for Bobby Fischer
# Listing URL prefix and the page numbers to request for Bobby Fischer.
url <- "https://www.chess.com/games/bobby-fischer?&page="
page <- seq_len(38)

# Append each page number in `x` to the global `url` prefix.
# Returns a character vector with one full URL per element of `x`.
unitedata <- function(x) {
  paste0(url, x)
}

finalurl <- unitedata(page)
Web scrape Function
#' Scrape one listing page of games into a tibble.
#'
#' @param x URL of the listing page to read (passed to `read_html()`).
#' @return A tibble with the columns `White`, `Black`, `Opening`, `Score`,
#'   `Moves` and `Year`, holding the raw (uncleaned) text of the nodes matched
#'   by each CSS selector.
MatchScrape <- function(x) {
  # Fetch and parse the page a single time; all six columns are read from
  # this one document instead of issuing a separate request per column.
  doc <- read_html(x)

  # Text of every node matching `css`, in document order.
  node_text <- function(css) {
    doc %>% html_nodes(css) %>% html_text()
  }

  # combine, name, and make it a tibble
  chart <- data.frame(
    White = node_text(".master-games-user-tagline:nth-child(1) .master-games-username"),
    Black = node_text(".master-games-user-tagline+ .master-games-user-tagline .master-games-username"),
    Opening = node_text(".master-games-opening span:nth-child(2)"),
    # The trailing newline is part of the original selector string.
    Score = node_text(".master-games-text-center .master-games-text-middle\n"),
    Moves = node_text(".master-games-text-right .master-games-text-middle"),
    Year = node_text(".master-games-date .master-games-text-middle"),
    stringsAsFactors = FALSE
  )

  # as_tibble() replaces the deprecated as.tibble() alias.
  as_tibble(chart)
}
# Try the scraper on the first page, then run it over every URL in
# `finalurl` and row-bind the per-page results into one data frame.
MatchScrape("https://www.chess.com/games/bobby-fischer?&page=1")
matches <- finalurl %>% map_df(MatchScrape)
Clean up base data
# Strip the newlines and padding left over from the scraped HTML text.
# Each column gets the same gsub() patterns as before, followed by trimws().
# Moves is converted to integer as the very last step: calling trimws() on an
# integer column returns character, which previously undid the conversion.
clean_matches <- matches %>%
  mutate(
    White = trimws(White),
    Black = trimws(Black),
    Opening = trimws(gsub("\n ", "", Opening)),
    Score = trimws(gsub(" ", "", gsub("\n ", "", Score))),
    Year = trimws(gsub("\n", "", Year)),
    Moves = as.integer(trimws(gsub(" ", "", gsub("\n", "", Moves))))
  )
clean_matches
Results as factor
# Store the score strings as a factor so their levels can be relabelled.
clean_matches[["Score"]] <- as.factor(clean_matches[["Score"]])
Renaming factor levels. Conditions for the player as White: 1 = Win, 2 = Loss, 3 = Draw
# Relabel the score levels as outcomes and store them in a new Result column.
clean_matches[["Result"]] <- plyr::revalue(
  clean_matches[["Score"]],
  replace = c("1-0" = "Win", "0-1" = "Loss", "½-½" = "Draw")
)
Bobby as White + final results (easier to interpret)
# Keep only the games in which Fischer is listed as the White player.
bobby <- dplyr::filter(clean_matches, White == "Fischer Robert J")
RENAMING OPENINGS, then sorting into factors.
# Collapse the detailed opening names into four named families; every opening
# that matches none of them becomes "Other". The column is then made a factor.
bobby$Opening[grepl("Sicilian", bobby$Opening)] <- "Sicilian Defense"
bobby$Opening[grepl("French", bobby$Opening)] <- "French Defense"
bobby$Opening[grepl("Caro-Kann", bobby$Opening)] <- "Caro-Kann Defense"
bobby$Opening[grepl("King's Indian", bobby$Opening)] <- "King's Indian Defense"
bobby$Opening[
  !grepl("Sicilian|French|Caro-Kann|King's Indian", bobby$Opening)
] <- "Other"
bobby$Opening <- as.factor(bobby$Opening)
Reordering factor levels
# Fix the level order of both factors explicitly (as ordered factors).
bobby$Result <- factor(
  bobby$Result,
  levels = c("Win", "Loss", "Draw"),
  ordered = TRUE
)
bobby$Opening <- factor(
  bobby$Opening,
  levels = c(
    "Sicilian Defense",
    "French Defense",
    "Caro-Kann Defense",
    "King's Indian Defense",
    "Other"
  ),
  ordered = TRUE
)
# Bar chart of Fischer's results as White, one facet per opening family.
# Grouping by Opening is what makes ..prop.. a within-opening share;
# each bar is labelled with its percentage.
bg <- ggplot(bobby, aes(x = Result, group = Opening)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(y = ..prop.., label = scales::percent(..prop..)),
    stat = "count",
    vjust = -0.5
  ) +
  facet_grid(~Opening) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Robert James Fischer (as white)",
    y = "W/L/D Percent",
    fill = "Result"
  )
Setting up page by page sequence. (Change for Magnus)
# Listing URL prefix and the page numbers to request for Magnus Carlsen.
url1 <- "https://www.chess.com/games/magnus-carlsen?&page="
page1 <- seq_len(60)
We seem to get an error after page 60, though the function does work on some individual pages. We don’t need all of those pages anyway.
# Append each page number in `x` to the global `url1` prefix.
# Returns a character vector with one full URL per element of `x`.
unitedata1 <- function(x) {
  paste0(url1, x)
}

finalurl1 <- unitedata1(page1)
Web scrape
#' Scrape one listing page of games into a tibble.
#'
#' @param x URL of the listing page to read (passed to `read_html()`).
#' @return A tibble with the columns `White`, `Black`, `Opening`, `Score`,
#'   `Moves` and `Year`, holding the raw (uncleaned) text of the nodes matched
#'   by each CSS selector.
MatchScrape1 <- function(x) {
  # Fetch and parse the page a single time; all six columns are read from
  # this one document instead of issuing a separate request per column.
  doc <- read_html(x)

  # Text of every node matching `css`, in document order.
  node_text <- function(css) {
    doc %>% html_nodes(css) %>% html_text()
  }

  # combine, name, and make it a tibble
  chart1 <- data.frame(
    White = node_text(".master-games-user-tagline:nth-child(1) .master-games-username"),
    Black = node_text(".master-games-user-tagline+ .master-games-user-tagline .master-games-username"),
    Opening = node_text(".master-games-opening span+ span"),
    Score = node_text(".master-games-text-center .master-games-text-middle"),
    Moves = node_text(".master-games-text-right .master-games-text-middle"),
    Year = node_text(".master-games-date .master-games-text-middle"),
    stringsAsFactors = FALSE
  )

  # as_tibble() replaces the deprecated as.tibble() alias.
  as_tibble(chart1)
}
Just to test if my function gets the right values
# Try the scraper on a single page, then run it over every URL in
# `finalurl1` and row-bind the per-page results into one data frame.
MatchScrape1("https://www.chess.com/games/magnus-carlsen?&page=51")
matches_c <- finalurl1 %>% map_df(MatchScrape1)
Clean up base data
# Strip the newlines and padding left over from the scraped HTML text.
# Each column gets the same gsub() patterns as before, followed by trimws().
# Year is now trimmed on this data frame; the earlier code trimmed
# clean_matches$Year here by mistake and left clean_matches_c$Year untouched.
# Moves is converted to integer as the very last step: calling trimws() on an
# integer column returns character, which previously undid the conversion.
clean_matches_c <- matches_c %>%
  mutate(
    White = trimws(White),
    Black = trimws(Black),
    Opening = trimws(gsub("\n ", "", Opening)),
    Score = trimws(gsub(" ", "", gsub("\n ", "", Score))),
    Year = trimws(gsub("\n", "", Year)),
    Moves = as.integer(trimws(gsub(" ", "", gsub("\n", "", Moves))))
  )
clean_matches_c
Results as factor
# Store the score strings as a factor so their levels can be relabelled.
clean_matches_c[["Score"]] <- as.factor(clean_matches_c[["Score"]])
Renaming factor levels. Conditions for the player as White: 1 = Win, 2 = Loss, 3 = Draw
# Relabel the score levels as outcomes and store them in a new Result column.
clean_matches_c[["Result"]] <- plyr::revalue(
  clean_matches_c[["Score"]],
  replace = c("1-0" = "Win", "0-1" = "Loss", "½-½" = "Draw")
)
Magnus as White + final results (easier to interpret)
# Keep only the games in which Carlsen is listed as the White player;
# two spellings of his name are matched.
carlsen <- dplyr::filter(
  clean_matches_c,
  White %in% c("Magnus Carlsen", "Carlsen, M.")
)
carlsen
RENAMING OPENINGS, then sorting into factors.
# Collapse the detailed opening names into four named families; every opening
# that matches none of them becomes "Other". The column is then made a factor.
carlsen$Opening[grepl("Sicilian", carlsen$Opening)] <- "Sicilian Defense"
carlsen$Opening[grepl("French", carlsen$Opening)] <- "French Defense"
carlsen$Opening[grepl("Caro-Kann", carlsen$Opening)] <- "Caro-Kann Defense"
carlsen$Opening[grepl("King's Indian", carlsen$Opening)] <- "King's Indian Defense"
carlsen$Opening[
  !grepl("Sicilian|French|Caro-Kann|King's Indian", carlsen$Opening)
] <- "Other"
carlsen$Opening <- as.factor(carlsen$Opening)
Reordering factor levels
# Fix the level order of both factors explicitly (as ordered factors).
carlsen$Result <- factor(
  carlsen$Result,
  levels = c("Win", "Loss", "Draw"),
  ordered = TRUE
)
carlsen$Opening <- factor(
  carlsen$Opening,
  levels = c(
    "Sicilian Defense",
    "French Defense",
    "Caro-Kann Defense",
    "King's Indian Defense",
    "Other"
  ),
  ordered = TRUE
)
# Bar chart of Carlsen's results as White, one facet per opening family.
# Grouping by Opening is what makes ..prop.. a within-opening share;
# each bar is labelled with its percentage.
cg <- ggplot(carlsen, aes(x = Result, group = Opening)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(y = ..prop.., label = scales::percent(..prop..)),
    stat = "count",
    vjust = -0.5
  ) +
  facet_grid(~Opening) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Magnus Carlsen (as White)",
    y = "W/L/D Percent",
    fill = "Result"
  )
# Tabulate results and opening families for both players.
plyr::count(carlsen$Result)
plyr::count(bobby$Result)
plyr::count(carlsen$Opening)
plyr::count(bobby$Opening)

# Win and loss shares as labelled percent strings.
# NOTE(review): the counts below are hard-coded, presumably transcribed from
# the tables printed above; re-check them whenever the scraped data changes.
CarlsenWin <- setNames(percent(439 / 777), "Carlsen Winrate")
CarlsenLoss <- setNames(percent(100 / 777), "Carlsen Lossrate")
BobbyWin <- setNames(percent(336 / 541), "Bobby Winrate")
BobbyLoss <- setNames(percent(70 / 541), "Bobby Lossrate")

# Draw both plots, then print the four rates.
cg
bg
CarlsenWin
## Carlsen Winrate
## "56.5%"
BobbyWin
## Bobby Winrate
## "62.1%"
CarlsenLoss
## Carlsen Lossrate
## "12.9%"
BobbyLoss
## Bobby Lossrate
## "12.9%"
We can see that Bobby Fischer actually has a higher percentage of wins, though Fischer and Carlsen share the same percentage of losses, meaning that Carlsen has a higher proportion of drawn games.
When looking at the graphical data of how each of them did against certain defenses as White, we can see Carlsen was actually stronger against most of the more common opening defenses, with the exception of the Sicilian.
Looking at our data, though, this could be partially incorrect, as we have fewer samples of games using the particular defenses we listed: the “Other” opening category makes up 66.5% of the sampled matches for Carlsen, whereas for Bobby it accounts for only 43.8%. The other most frequent defensive opening is the Sicilian, accounting for 18.1% and 32.5% of Carlsen’s and Bobby’s matches respectively, meaning it is likely to be the more accurate indicator of their performance against different openings by Black.
In conclusion, based on the data here, we can say Bobby Fischer is likely to have been the more dominant player as white, comparing him in his era vs. Carlsen in the modern era.