In this project, we were given a text file with chess tournament results where the information had a cross-table structure. Our job was to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Here is an example of what the file looks like:
image text-file-example
library(tidyverse)
df <- read_csv("https://raw.githubusercontent.com/justinm0rgan/data607/main/Projects/1/data/7645617.txt", show_col_types = F, skip = 1)
Create empty data frame with correct data types and final structure.
df2 <- data.frame(matrix(ncol = 6, nrow = 64))
colnames(df2) <- c("num", "name", "state", "total_pts", "pre_rating", "avg_rating_opp")
df <- df %>%
filter((row_number() %% 3 == 0) | (row_number() %% 3 == !0))
num, name and total_pts data.Use regex to extract appropriate data for each column.
df_num_name_pts <- df %>%
filter(row_number() %% 2 == 0)
head(df_num_name_pts)
# A tibble: 6 × 1
`Pair | Player Name |Total|Round|Round|Round|Round|Round…`
<chr>
1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 1…
2 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 2…
3 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 1…
4 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 1…
5 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 1…
6 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 2…
df2$num <- df_num_name_pts$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract("^.{2}") %>%
trimws() %>%
as.numeric()
df2$name <- df_num_name_pts$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract("([A-Z]+\\s[A-Z]+) | ([A-Z]+\\s[A-Z]+\\s[A-Z]+)") %>%
trimws()
df2$total_pts <- df_num_name_pts$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract("\\d\\.\\d") %>%
as.numeric()
state and pre_rating.df_get_state_pre <- df[-1,] #delete first row
df_get_state_pre <- df_get_state_pre %>%
filter(row_number() %% 2 == 0)
head(df_get_state_pre)
# A tibble: 6 × 1
`Pair | Player Name |Total|Round|Round|Round|Round|Round…`
<chr>
1 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B …
2 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W …
3 MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B …
4 MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B …
5 MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W …
6 OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W …
df2$state <- df_get_state_pre$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract("[A-Z]+")
df2$pre_rating <- df_get_state_pre$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract("(\\d+).*?(\\d+)") %>%
str_sub(15,19) %>%
trimws() %>%
as.numeric()
avg_rating_opp.# extract rounds with opponent numbers
df_opp <- df_num_name_pts$`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` %>%
str_extract_all("\\s\\d{1,2}.*", simplify = T) %>%
data.frame()
# create oppoenent df
df_opp <- read.csv(text=sub(
"\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)$",
",\\1,\\2,\\3,\\4,\\5,\\6", df_opp$.),
col.names=c('rnd1', 'rnd2', 'rnd3','rnd4','rnd5','rnd6','rnd7'), header=FALSE)
# set player id
df_opp$player_id <- 1:nrow(df_opp)
# rearrange
df_opp <- df_opp %>%
select(player_id, rnd1, rnd2, rnd3, rnd4, rnd5, rnd6, rnd7)
head(df_opp)
player_id rnd1 rnd2 rnd3 rnd4 rnd5 rnd6 rnd7
1 1 39|W 21|W 18|W 14|W 7|D 12|D 4|
2 2 63|W 58|L 4|W 17|W 16|W 20|W 7|
3 3 8|W 61|W 25|W 21|W 11|W 13|W 12|
4 4 23|D 28|W 2|W 26|D 5|W 19|D 1|
5 5 45|W 37|D 12|D 13|D 4|W 14|W 17|
6 6 34|D 29|L 11|W 35|D 10|W 27|W 21|
# extract numbers from each column
df_opp <- df_opp %>%
as_tibble() %>%
mutate(across(.cols = rnd1:rnd7, .fns=str_extract, pattern = "\\d+"))
# change dtype of columns to numeric
df_opp <- df_opp %>%
mutate(across(.cols = rnd1:rnd7, .fns = as.numeric))
head(df_opp)
# A tibble: 6 × 8
player_id rnd1 rnd2 rnd3 rnd4 rnd5 rnd6 rnd7
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 39 21 18 14 7 12 4
2 2 63 58 4 17 16 20 7
3 3 8 61 25 21 11 13 12
4 4 23 28 2 26 5 19 1
5 5 45 37 12 13 4 14 17
6 6 34 29 11 35 10 27 21
pre-rating from main data frame to opponents data frame.df_opp$rnd1_opp <- df2$pre_rating[match(df_opp$rnd1, df2$num)]
df_opp$rnd2_opp <- df2$pre_rating[match(df_opp$rnd2, df2$num)]
df_opp$rnd3_opp <- df2$pre_rating[match(df_opp$rnd3, df2$num)]
df_opp$rnd4_opp <- df2$pre_rating[match(df_opp$rnd4, df2$num)]
df_opp$rnd5_opp <- df2$pre_rating[match(df_opp$rnd5, df2$num)]
df_opp$rnd6_opp <- df2$pre_rating[match(df_opp$rnd6, df2$num)]
df_opp$rnd7_opp <- df2$pre_rating[match(df_opp$rnd7, df2$num)]
df_opp <- df_opp %>%
select(player_id, rnd1_opp:rnd7_opp)
df_opp$opp_mean <- df_opp[,2:8] %>%
rowMeans(na.rm = T) %>%
round(3)
head(df_opp)
# A tibble: 6 × 9
player_id rnd1_opp rnd2_opp rnd3_opp rnd4_opp rnd5_opp rnd6_opp rnd7_opp
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1436 1563 1600 1610 1649 1663 1716
2 2 1175 917 1716 1629 1604 1595 1649
3 3 1641 955 1745 1563 1712 1666 1663
4 4 1363 1507 1553 1579 1655 1564 1794
5 5 1242 980 1663 1666 1716 1610 1629
6 6 1399 1602 1712 1438 1365 1552 1563
# … with 1 more variable: opp_mean <dbl>
df2$avg_rating_opp <- df_opp$opp_mean[match(df_opp$player_id, df2$num)]
head(df2)
num name state total_pts pre_rating avg_rating_opp
1 1 GARY HUA ON 6.0 1794 1605.286
2 2 DAKSHESH DARURI MI 6.0 1553 1469.286
3 3 ADITYA BAJAJ MI 6.0 1384 1563.571
4 4 PATRICK H SCHILLING MI 5.5 1716 1573.571
5 5 HANSHI ZUO MI 5.5 1655 1500.857
6 6 HANSEN SONG OH 5.0 1686 1518.714
df2 %>%
write_csv('./data/cleaned_chess_data.csv')