In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
# Location of the raw tournament results file; both views below read from it.
tournament_url <- "https://raw.githubusercontent.com/Nick-Climaco/DATA-607-NC/main/tournament_info.txt"
# Data-frame view: the first three lines are skipped and the fourth is consumed
# as the header row, so only the lines after it become data rows.
tournament_data <- read.csv(file = tournament_url, skip = 3, header = TRUE)
# Raw view: a character vector holding every line of the file, none skipped.
tournament_data2 <- read_lines(file = tournament_url)
# Player names.
# Each player's first record line starts with the numeric pair number followed
# by "|"; the name is the whole second pipe-delimited field of that line.
# Taking the entire field (instead of matching two or three words) keeps names
# of any length intact, e.g. four-word names are no longer cut to three words.
# Working line by line also yields exactly one name per player, in file order;
# a line without a match gives NA in place rather than shifting later players.
player_lines <- str_subset(tournament_data2, "^\\s*\\d+\\s*\\|")
player_names <- str_match(player_lines, "^[^|]*\\|([^|]*)\\|")[, 2] %>%
  str_trim() # remove the padding around the name field
# Player state / province.
# Each player's second record line starts with a two-letter upper-case code
# followed by "|". Extracting once per such line (rather than searching the
# whole table coerced to a single string) keeps the result aligned with the
# players: a line without a match yields NA in place instead of shifting every
# later value.
rating_lines <- str_subset(tournament_data2, "^\\s*[A-Z]{2}\\s*\\|")
player_state <- str_match(rating_lines, "^\\s*([A-Z]{2})\\s*\\|")[, 2]
# Total points per player (kept as character here; converted to numeric when
# the data frame is built).
# The total is the pipe-delimited field of the form "d.d" on each player's
# first record line (the line starting with the pair number). Anchoring the
# pattern between two "|" avoids picking up any other decimal in the file and
# gives exactly one value per player, in file order.
player_lines <- str_subset(tournament_data2, "^\\s*\\d+\\s*\\|")
total_number_points <- str_match(player_lines, "\\|\\s*(\\d+\\.\\d)\\s*\\|")[, 2]
# Pre-tournament rating per player (kept as character here; converted to
# numeric when the data frame is built).
# The rating follows "R:" on each player's second record line (the line that
# starts with the two-letter state code). Ratings are not all the same width,
# so the capture group takes however many digits follow "R:" and any padding;
# anything after the digits (e.g. a provisional suffix) is ignored.
rating_lines <- str_subset(tournament_data2, "^\\s*[A-Z]{2}\\s*\\|")
player_pre_rating <- str_match(rating_lines, "R:\\s*(\\d+)")[, 2]
Adding the cleaned fields to a data frame…
# Combine the extracted vectors into one row per player.
# data.frame() silently recycles a vector whose length divides the row count,
# so verify up front that every field was extracted exactly once per player.
n_players <- length(player_names)
stopifnot(
  length(player_state) == n_players,
  length(total_number_points) == n_players,
  length(player_pre_rating) == n_players
)
df_tournament <- data.frame(
  Number = seq_len(n_players), # pair number, 1..n in file order
  Name = player_names,
  State = player_state,
  Total_Points = as.numeric(total_number_points),
  Pre_Rating = as.numeric(player_pre_rating)
)
# checking if it worked
head(df_tournament)
## Number Name State Total_Points Pre_Rating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
# Opponent pair numbers for every player and round.
# opponents_data: each player's first record line (the one starting with the
# numeric pair number), selected by pattern instead of by fixed line positions.
opponents_data <- str_subset(tournament_data2, "^\\s*\\d+\\s*\\|")
# Splitting a player line on "|" gives: pair number, name, total points, then
# one cell per round (fields 4 to 10 for the seven rounds).
round_cells <- str_split(opponents_data, "\\|", simplify = TRUE)[, 4:10, drop = FALSE]
# A round cell holds a result letter and, when a game was played, the
# opponent's pair number. Cells without digits (byes, unplayed rounds) become
# NA directly, so no "NAs introduced by coercion" warning is raised.
# t() + as.vector() flattens player by player: all seven rounds of player 1,
# then player 2, and so on.
opponent_num <- as.numeric(str_extract(as.vector(t(round_cells)), "\\d+"))
# Split the opponent numbers into one group of 7 (one per round) per player.
rounds_per_player <- 7
n_groups <- length(opponent_num) %/% rounds_per_player
opponent_groups <- split(
  opponent_num,
  rep(seq_len(n_groups), each = rounds_per_player)
)
# Look up each opponent's pre-rating once per player. match() maps a pair
# number to its row in df_tournament; unplayed rounds (NA) stay NA and are
# dropped by na.rm = TRUE below. Unlike %in%, match() keeps one entry per game,
# so a repeated opponent would be counted once for each game played.
opponent_ratings <- lapply(opponent_groups, function(opponents) {
  df_tournament$Pre_Rating[match(opponents, df_tournament$Number)]
})
# Matrix with the total and the average opponent pre-rating, one row per player.
opponent_pre_rating <- cbind(
  Total = vapply(opponent_ratings, sum, numeric(1), na.rm = TRUE,
                 USE.NAMES = FALSE),
  Average = vapply(opponent_ratings, mean, numeric(1), na.rm = TRUE,
                   USE.NAMES = FALSE)
)
# Store the average, rounded to a whole rating point, alongside each player.
df_tournament$Average_Opponents_Rating <- round(opponent_pre_rating[, "Average"],
                                                digits = 0)
# Render the final player table in the knitted document.
df_tournament %>%
  kable(row.names = NA)
| Number | Name | State | Total_Points | Pre_Rating | Average_Opponents_Rating |
|---|---|---|---|---|---|
| 1 | GARY HUA | ON | 6.0 | 1794 | 1605 |
| 2 | DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| 3 | ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| 4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| 5 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| 6 | HANSEN SONG | OH | 5.0 | 1686 | 1519 |
| 7 | GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372 |
| 8 | EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468 |
| 9 | STEFANO LEE | ON | 5.0 | 1411 | 1523 |
| 10 | ANVIT RAO | MI | 5.0 | 1365 | 1554 |
| 11 | CAMERON WILLIAM MC | MI | 4.5 | 1712 | 1468 |
| 12 | KENNETH J TACK | MI | 4.5 | 1663 | 1506 |
| 13 | TORRANCE HENRY JR | MI | 4.5 | 1666 | 1498 |
| 14 | BRADLEY SHAW | MI | 4.5 | 1610 | 1515 |
| 15 | ZACHARY JAMES HOUGHTON | MI | 4.5 | 1220 | 1484 |
| 16 | MIKE NIKITIN | MI | 4.0 | 1604 | 1386 |
| 17 | RONALD GRZEGORCZYK | MI | 4.0 | 1629 | 1499 |
| 18 | DAVID SUNDEEN | MI | 4.0 | 1600 | 1480 |
| 19 | DIPANKAR ROY | MI | 4.0 | 1564 | 1426 |
| 20 | JASON ZHENG | MI | 4.0 | 1595 | 1411 |
| 21 | DINH DANG BUI | ON | 4.0 | 1563 | 1470 |
| 22 | EUGENE L MCCLURE | MI | 4.0 | 1555 | 1300 |
| 23 | ALAN BUI | ON | 4.0 | 1363 | 1214 |
| 24 | MICHAEL R ALDRICH | MI | 4.0 | 1229 | 1357 |
| 25 | LOREN SCHWIEBERT | MI | 3.5 | 1745 | 1363 |
| 26 | MAX ZHU | ON | 3.5 | 1579 | 1507 |
| 27 | GAURAV GIDWANI | MI | 3.5 | 1552 | 1222 |
| 28 | SOFIA ADINA STANESCU | MI | 3.5 | 1507 | 1522 |
| 29 | CHIEDOZIE OKORIE | MI | 3.5 | 1602 | 1314 |
| 30 | GEORGE AVERY JONES | ON | 3.5 | 1522 | 1144 |
| 31 | RISHI SHETTY | MI | 3.5 | 1494 | 1260 |
| 32 | JOSHUA PHILIP MATHEWS | ON | 3.5 | 1441 | 1379 |
| 33 | JADE GE | MI | 3.5 | 1449 | 1277 |
| 34 | MICHAEL JEFFERY THOMAS | MI | 3.5 | 1399 | 1375 |
| 35 | JOSHUA DAVID LEE | MI | 3.5 | 1438 | 1150 |
| 36 | SIDDHARTH JHA | MI | 3.5 | 1355 | 1388 |
| 37 | AMIYATOSH PWNANANDAM | MI | 3.5 | 980 | 1385 |
| 38 | BRIAN LIU | MI | 3.0 | 1423 | 1539 |
| 39 | JOEL R HENDON | MI | 3.0 | 1436 | 1430 |
| 40 | FOREST ZHANG | MI | 3.0 | 1348 | 1391 |
| 41 | KYLE WILLIAM MURPHY | MI | 3.0 | 1403 | 1248 |
| 42 | JARED GE | MI | 3.0 | 1332 | 1150 |
| 43 | ROBERT GLEN VASEY | MI | 3.0 | 1283 | 1107 |
| 44 | JUSTIN D SCHILLING | MI | 3.0 | 1199 | 1327 |
| 45 | DEREK YAN | MI | 3.0 | 1242 | 1152 |
| 46 | JACOB ALEXANDER LAVALLEY | MI | 3.0 | 377 | 1358 |
| 47 | ERIC WRIGHT | MI | 2.5 | 1362 | 1392 |
| 48 | DANIEL KHAIN | MI | 2.5 | 1382 | 1356 |
| 49 | MICHAEL J MARTIN | MI | 2.5 | 1291 | 1286 |
| 50 | SHIVAM JHA | MI | 2.5 | 1056 | 1296 |
| 51 | TEJAS AYYAGARI | MI | 2.5 | 1011 | 1356 |
| 52 | ETHAN GUO | MI | 2.5 | 935 | 1495 |
| 53 | JOSE C YBARRA | MI | 2.0 | 1393 | 1345 |
| 54 | LARRY HODGE | MI | 2.0 | 1270 | 1206 |
| 55 | ALEX KONG | MI | 2.0 | 1186 | 1406 |
| 56 | MARISA RICCI | MI | 2.0 | 1153 | 1414 |
| 57 | MICHAEL LU | MI | 2.0 | 1092 | 1363 |
| 58 | VIRAJ MOHILE | MI | 2.0 | 917 | 1391 |
| 59 | SEAN M MC | MI | 2.0 | 853 | 1319 |
| 60 | JULIA SHEN | MI | 1.5 | 967 | 1330 |
| 61 | JEZZEL FARKAS | ON | 1.5 | 955 | 1327 |
| 62 | ASHWIN BALAJI | MI | 1.0 | 1530 | 1186 |
| 63 | THOMAS JOSEPH HOSMER | MI | 1.0 | 1175 | 1350 |
| 64 | BEN LI | MI | 1.0 | 1163 | 1263 |
# Export the final table to a CSV file in the working directory, without
# writing R's row names as an extra column.
write.csv(
  x = df_tournament,
  file = "chess_tournament.csv",
  row.names = FALSE
)