Description

The goal of this project is to parse chess tournament results into:

Player Number, Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

Read the file with `read_lines()`; the data can be obtained from my GitHub.

# Fetch the raw tournament cross-table as a character vector, one element per
# line of the text file.
lines <- read_lines(
  file = "https://raw.githubusercontent.com/ksooklall/CUNY-SPS-Masters-DS/main/DATA_607/projects/project_1/tournamentinfo.txt"
)
# Show the first seven lines to inspect the layout before parsing.
lines[seq_len(7)]
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [7] "-----------------------------------------------------------------------------------------"

My approach will involve two data frames: player data (`player_rows`) and round data (`round_rows`).

Looking at the raw data, I see that the first row with player data is row 5. Each row containing the player name is followed by a second row with more player data and then by a dashed separator row, so each player occupies a block of 3 rows. The entire file is 195 rows in total. My approach is to iterate over a vector starting at 5 and ending at 195 in steps of 3. Each row and the row after it (row + 1) are parsed for player data and round data.

# Parse every player's two-line record into two structures:
#   player_rows - character matrix, one row per player:
#                 number, name, state, total points, pre-rating
#   round_rows  - data frame, one row per player per round:
#                 raw round cell, player number, player's pre-rating
#
# A record starts on the line whose first '|'-delimited field is the pair
# number; the line right after it holds the state and the ratings. Records are
# located by pattern rather than by fixed positions (5, 8, ..., 194) so the
# parse does not depend on the exact length of the file.
record_starts <- grep('^\\s*[0-9]+\\s*\\|', lines)

# Collect per-player results in preallocated lists and bind them once after
# the loop; growing a matrix/data frame with rbind() on every iteration copies
# the accumulated result each time.
player_list <- vector('list', length(record_starts))
round_list <- vector('list', length(record_starts))

for (k in seq_along(record_starts)) {
  i <- record_starts[k]

  # First line: pair number | player name | total points | round cells
  row1 <- unlist(strsplit(lines[i], '\\|'))
  player_num <- as.numeric(gsub(' ', '', row1[1]))
  player_name <- str_to_title(str_trim(row1[2]))
  total_points <- str_trim(row1[3])

  # Second line: state | "USCF ID / R: pre -> post" | ...
  row2 <- unlist(strsplit(lines[i + 1], '\\|'))
  players_state <- str_trim(row2[1])
  # Digit runs appear in the order USCF ID, pre-rating, ...; the second run is
  # the pre-rating.
  players_pre_rating <- unlist(str_extract_all(row2[2], "[[:digit:]]+"))[2]

  # c() coerces the numeric player number to character, so every player
  # vector is a character vector of length 5.
  player_list[[k]] <- c(
    player_num, player_name, players_state, total_points, players_pre_rating
  )

  # Fields 4 to 10 of the first line are the seven round cells. The scalar
  # player number and pre-rating are recycled onto each of the seven rows.
  # stringsAsFactors = FALSE keeps the text columns as character on every R
  # version, so later as.numeric() conversions read the digits, not factor
  # codes.
  round_list[[k]] <- data.frame(
    wl_opponent_id = row1[4:10],
    player_num = player_num,
    players_pre_rating = players_pre_rating,
    stringsAsFactors = FALSE
  )
}

player_rows <- do.call(rbind, player_list)
round_rows <- do.call(rbind, round_list)

With the file completely parsed, the player rows can now be aggregated into a data frame and the columns renamed.

# Turn the matrix of parsed player fields into a data frame and label its five
# columns in the order they were stored, then preview the first rows.
df <- setNames(
  data.frame(player_rows),
  c('player_num', 'player_name', 'player_state', 'total_points', 'players_pre_rating')
)
head(df)
##   player_num         player_name player_state total_points players_pre_rating
## 1          1            Gary Hua           ON          6.0               1794
## 2          2     Dakshesh Daruri           MI          6.0               1553
## 3          3        Aditya Bajaj           MI          6.0               1384
## 4          4 Patrick H Schilling           MI          5.5               1716
## 5          5          Hanshi Zuo           MI          5.5               1655
## 6          6         Hansen Song           OH          5.0               1686

Similarly, aggregate the round rows data into a data frame. There is a lot of string cleaning as well as data conversion done in this data frame.

# Clean the per-round table: split each raw round cell into the result letter
# (wl) and the opponent's pair number (opponent_id), and make the rating
# numeric.
colnames(round_rows) <- c('wl_opponent_id', 'player_num', 'players_pre_rating')

# Work on plain character strings regardless of how the column was stored.
raw_cells <- as.character(round_rows$wl_opponent_id)

# Result letter: the first space-delimited token of the raw cell.
# vapply() pins the result to an unnamed character vector; sapply() would
# return a list on empty input and attach the cell text as names.
round_rows$wl <- vapply(
  strsplit(raw_cells, ' '),
  function(tok) tok[1],
  character(1)
)

# Opponent number: collapse runs of whitespace (gsub() is already vectorised)
# so the number is always the second token. Cells with no second token give
# NA.
squeezed_cells <- gsub('\\s+', ' ', raw_cells)
round_rows$opponent_id <- as.numeric(
  vapply(
    strsplit(squeezed_cells, ' '),
    function(tok) tok[2],
    character(1)
  )
)

# Go through as.character() so a factor column converts by its labels rather
# than by its internal integer codes.
round_rows$players_pre_rating <- as.numeric(
  as.character(round_rows$players_pre_rating)
)

# Keep only the cleaned columns, in reporting order.
round_rows <- round_rows[, c('player_num', 'wl', 'opponent_id', 'players_pre_rating')]
head(round_rows)
##   player_num wl opponent_id players_pre_rating
## 1          1  W          39               1794
## 2          1  W          21               1794
## 3          1  W          18               1794
## 4          1  W          14               1794
## 5          1  W           7               1794
## 6          1  D          12               1794

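With `round_rows` clean, I now calculate the averages. Each row holds a player's own pre-rating next to the number of the opponent they faced, so grouping by `opponent_id` and averaging `players_pre_rating` gives, for each player number, the average pre-rating of everyone who played against that player.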

# Columns kept in the final report, in output order.
final_cols <- c('player_name', 'player_state', 'total_points', 'players_pre_rating', 'avg')

# Every row of round_rows pairs the rating of the player in that row
# (players_pre_rating) with the number of the player they faced (opponent_id).
# Averaging within opponent_id therefore averages the ratings of everyone
# listed as having faced that number.
# NOTE(review): this equals "average rating of a player's opponents" only if
# each game appears once from each side - confirm against the data.
# as.integer() drops the fractional part of the mean; it does not round.
avg_pre <- summarise(
  group_by(round_rows, opponent_id),
  avg = as.integer(mean(players_pre_rating)),
  .groups = 'drop'
)

Finally, merge the player data (`df`) and the opponent averages (`avg_pre`) into one data frame.

# Attach each player's average to their row by matching the player number in
# df against the opponent number the averages were grouped by, then keep only
# the reporting columns.
# NOTE(review): player_num reaches this point as text, so the merged rows
# appear to be ordered as strings (1, 10, 11, ...) rather than numerically -
# confirm whether that ordering is intended.
df <- merge(x = df, y = avg_pre, by.x = "player_num", by.y = "opponent_id")
df <- df[, final_cols]
# total_points was parsed as text; convert it for numeric use below.
df[["total_points"]] <- as.numeric(df[["total_points"]])
head(df)
##                player_name player_state total_points players_pre_rating  avg
## 1                 Gary Hua           ON          6.0               1794 1605
## 2                Anvit Rao           MI          5.0               1365 1554
## 3 Cameron William Mc Leman           MI          4.5               1712 1467
## 4           Kenneth J Tack           MI          4.5               1663 1506
## 5        Torrance Henry Jr           MI          4.5               1666 1497
## 6             Bradley Shaw           MI          4.5               1610 1515

Save the result to a CSV file for further use.

# Persist the final table in the working directory.
# NOTE(review): write.csv() also writes the row names as a leading unnamed
# column by default - confirm downstream readers expect it.
write.csv(x = df, file = 'chess_data.csv')

Exploratory data analysis

The top 20 players by average opponent pre-rating

# Horizontal bar chart of the players with the 20 largest `avg` values
# (top_n() keeps ties), with bars ordered and shaded by `avg`.
df %>%
  top_n(n = 20, wt = avg) %>%
  ggplot(aes(x = reorder(player_name, avg), y = avg, fill = avg)) +
  geom_col() +
  coord_flip() +
  labs(y = 'Average', x = 'Player Name', title = 'Player Averages')

The top 20 players by total points

# Horizontal bar chart of the players with the 20 largest total_points values
# (top_n() keeps ties), with bars ordered and shaded by total_points.
df %>%
  top_n(n = 20, wt = total_points) %>%
  ggplot(
    aes(
      x = reorder(player_name, total_points),
      y = total_points,
      fill = total_points
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(y = 'Total Points', x = 'Player Name', title = 'Player Total Points')

It looks like Gary Hua is the best overall

# Count players per state; as.data.frame() on a table yields the columns Var1
# (the state) and Freq (the count). Draw one bar per state with its count
# printed just above the bar.
as.data.frame(table(df$player_state)) %>%
  ggplot(aes(x = Var1, y = Freq, fill = Freq)) +
  geom_col() +
  geom_text(aes(label = Freq), nudge_y = 1) +
  labs(x = 'Player State')

MI is by far the most common player state