# Defining the column widths based on the structure of the file
col_widths <- c(8, 33, 6, 6, 6, 6, 6, 6, 6, 6)Project 1
Introduction
In this project, I take a text file with chess tournament results, and create Quarto file that generates a .CSV file with the following information for all of the players: Player’s Name, Player’s state, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.
Data Transformation
First, I determine column widths by browsing the text file.
Next, I import the data with columns of fixed width.
# URL of text file saved on github so data is accessible for anyone running the script.
file_url <-
"https://github.com/naomibuell/DATA607/raw/main/DATA607_project1.txt"
# Read the fixed-width format file into a data frame
chess_data <-
read_fwf(file_url,
fwf_widths(col_widths),
skip = 4,
show_col_types = FALSE) |>
filter(X1 != "--------") |> # remove blank rows of dashes
mutate_all(~ str_replace_all(as.character(.), c("\\|" = ""))) |> # removing "|"s from the data
mutate(row_id = row_number(), # creating row ids
pid = round_half_up((row_number()) / 2, digits = 0)) # creating person IDsSince text file structure has 2 rows per person-level observation, I transform it to just 1 row per observation by splitting each player’s two rows into separate sets of columns.
chess_data_left <-
chess_data |> # get left 12 cols of per-person obs
filter(pid == as.integer(X1))
chess_data_right <-
chess_data |> # get right 12 cols of per-person obs
filter(pid == row_id / 2)Then I join the data back together and clean up the variables.
# Join and clean variables
chess_joined <-
full_join(chess_data_left, chess_data_right, by = "pid") |> # join left and right sides of data based on pid var
janitor::clean_names() |> # rename vars
rename(
name = x2_x,
state = x1_y,
total = x3_x,
opp1 = x4_x,
opp2 = x5_x,
opp3 = x6_x,
opp4 = x7_x,
opp5 = x8_x,
opp6 = x9_x,
opp7 = x10_x,
rating = x2_y
) |>
select(-c(starts_with("row_id"), starts_with("x"))) |> # dropping unneeded columns
mutate(
# trimming strings and assigning var types
name = str_trim(name),
total = parse_double(total),
opp1 = parse_number(opp1),
opp2 = parse_number(opp2),
opp3 = parse_number(opp3),
opp4 = parse_number(opp4),
opp5 = parse_number(opp5),
opp6 = parse_number(opp6),
opp7 = parse_number(opp7),
state = as.factor(str_trim(state)),
rating = parse_number(str_extract(rating, " \\d+"))
)Here is the tidied data set with 1 row per observation. Variables opp1 through opp7 represent each player’s opponents’ ID.
head(chess_joined)# A tibble: 6 × 12
name total opp1 opp2 opp3 opp4 opp5 opp6 opp7 pid state rating
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
1 GARY HUA 6 39 21 18 14 7 12 4 1 ON 1794
2 DAKSHESH D… 6 63 58 4 17 16 20 7 2 MI 1553
3 ADITYA BAJ… 6 8 61 25 21 11 13 12 3 MI 1384
4 PATRICK H … 5.5 23 28 2 26 5 19 1 4 MI 1716
5 HANSHI ZUO 5.5 45 37 12 13 4 14 17 5 MI 1655
6 HANSEN SONG 5 34 29 11 35 10 27 21 6 OH 1686
Using these IDs, I find the corresponding opponents’ rating, and assign the average rating of each player’s opponents to a new variable opp_avg. I also drop variables from the data that are not required by the assignment.
# Look up average pre-chess rating of opponents:
chess_lookup <- chess_joined |>
mutate(
opp1 = ifelse(opp1 %in% pid, rating[match(opp1, pid)], NA),
opp2 = ifelse(opp2 %in% pid, rating[match(opp2, pid)], NA),
opp3 = ifelse(opp3 %in% pid, rating[match(opp3, pid)], NA),
opp4 = ifelse(opp4 %in% pid, rating[match(opp4, pid)], NA),
opp5 = ifelse(opp5 %in% pid, rating[match(opp5, pid)], NA),
opp6 = ifelse(opp6 %in% pid, rating[match(opp6, pid)], NA),
opp7 = ifelse(opp7 %in% pid, rating[match(opp7, pid)], NA)
)
chess_avgs <- chess_lookup |>
mutate(opp_avg = rowMeans(select(chess_lookup, starts_with("opp")), na.rm = TRUE)) |>
select(-c(pid, matches("opp\\d"))) # drop unneeded varsNow, the data set has all required info: player’s name (name), player’s state (state), total number of points (total), player’s pre-rating (rating), and average pre-chess rating of opponents (opp_avg).
head(chess_avgs)# A tibble: 6 × 5
name total state rating opp_avg
<chr> <dbl> <fct> <dbl> <dbl>
1 GARY HUA 6 ON 1794 1605.
2 DAKSHESH DARURI 6 MI 1553 1469.
3 ADITYA BAJAJ 6 MI 1384 1564.
4 PATRICK H SCHILLING 5.5 MI 1716 1574.
5 HANSHI ZUO 5.5 MI 1655 1501.
6 HANSEN SONG 5 OH 1686 1519.
Lastly, I export the file as a .CSV.
write_csv(chess_avgs, "DATA607chessdata.csv")Before I conclude, I also explore the top players in the final data set.
chess_explore <- mutate(chess_avgs, ratio = rating / opp_avg)
arrange(chess_explore, desc(rating)) # top player ratings# A tibble: 64 × 6
name total state rating opp_avg ratio
<chr> <dbl> <fct> <dbl> <dbl> <dbl>
1 GARY HUA 6 ON 1794 1605. 1.12
2 LOREN SCHWIEBERT 3.5 MI 1745 1363. 1.28
3 PATRICK H SCHILLING 5.5 MI 1716 1574. 1.09
4 CAMERON WILLIAM MC LEMAN 4.5 MI 1712 1468. 1.17
5 HANSEN SONG 5 OH 1686 1519. 1.11
6 TORRANCE HENRY JR 4.5 MI 1666 1498. 1.11
7 KENNETH J TACK 4.5 MI 1663 1506. 1.10
8 HANSHI ZUO 5.5 MI 1655 1501. 1.10
9 GARY DEE SWATHELL 5 MI 1649 1372. 1.20
10 EZEKIEL HOUGHTON 5 MI 1641 1468. 1.12
# ℹ 54 more rows
arrange(chess_explore, desc(opp_avg)) # top player opponents' ratings# A tibble: 64 × 6
name total state rating opp_avg ratio
<chr> <dbl> <fct> <dbl> <dbl> <dbl>
1 GARY HUA 6 ON 1794 1605. 1.12
2 PATRICK H SCHILLING 5.5 MI 1716 1574. 1.09
3 ADITYA BAJAJ 6 MI 1384 1564. 0.885
4 ANVIT RAO 5 MI 1365 1554. 0.878
5 BRIAN LIU 3 MI 1423 1539. 0.925
6 STEFANO LEE 5 ON 1411 1523. 0.926
7 SOFIA ADINA STANESCU-BELLU 3.5 MI 1507 1522. 0.990
8 HANSEN SONG 5 OH 1686 1519. 1.11
9 BRADLEY SHAW 4.5 MI 1610 1515 1.06
10 MAX ZHU 3.5 ON 1579 1507. 1.05
# ℹ 54 more rows
arrange(chess_explore, desc(ratio)) # highest ratio of player:opponents ratings# A tibble: 64 × 6
name total state rating opp_avg ratio
<chr> <dbl> <fct> <dbl> <dbl> <dbl>
1 GEORGE AVERY JONES 3.5 ON 1522 1144. 1.33
2 ASHWIN BALAJI 1 MI 1530 1186 1.29
3 LOREN SCHWIEBERT 3.5 MI 1745 1363. 1.28
4 GAURAV GIDWANI 3.5 MI 1552 1222. 1.27
5 JOSHUA DAVID LEE 3.5 MI 1438 1150. 1.25
6 CHIEDOZIE OKORIE 3.5 MI 1602 1314. 1.22
7 GARY DEE SWATHELL 5 MI 1649 1372. 1.20
8 EUGENE L MCCLURE 4 MI 1555 1300. 1.20
9 RISHI SHETTY 3.5 MI 1494 1260. 1.19
10 CAMERON WILLIAM MC LEMAN 4.5 MI 1712 1468. 1.17
# ℹ 54 more rows
Conclusion
In this assignment, I used REGEX, joins, and other techniques to transform and tidy chess tournament data from a .TXT file. By browsing the final data set, I found that Gary Hua had both the highest chess rating and, in turn, the highest rated chess opponents, on average. Interestingly, Aditya Bajaj played the third toughest opponents on average, despite being ranked 36th out of all pre-chess ratings of players in the tournament.