In this project, you’re given a text file with chess tournament
results where the information has some structure. Your job is to create
an R Markdown file that generates a .CSV file (that could for example be
imported into a SQL database) with the following information for all of
the players:
Player’s Name, Player’s State, Total Number of Points, Player’s
Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be:
Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
library(tidyverse)
#provide the github data path
dt_path <- "https://raw.githubusercontent.com/Naik-Khyati/607_p1_chess/main/data/tournamentinfo.txt"
# read the txt data file
chess_raw_data <- read.csv(dt_path, header = FALSE, sep = "|")
head(chess_raw_data,12)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair
## 3 Num
## 4 -----------------------------------------------------------------------------------------
## 5 1
## 6 ON
## 7 -----------------------------------------------------------------------------------------
## 8 2
## 9 MI
## 10 -----------------------------------------------------------------------------------------
## 11 3
## 12 MI
## V2 V3 V4 V5 V6 V7 V8 V9
## 1
## 2 Player Name Total Round Round Round Round Round Round
## 3 USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5 6
## 4
## 5 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12
## 6 15445895 / R: 1794 ->1817 N:2 W B W B W B
## 7
## 8 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20
## 9 14598900 / R: 1553 ->1663 N:2 B W B W B W
## 10
## 11 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11 W 13
## 12 14959604 / R: 1384 ->1640 N:2 W B W B W B
## V10 V11
## 1 NA
## 2 Round NA
## 3 7 NA
## 4 NA
## 5 D 4 NA
## 6 W NA
## 7 NA
## 8 W 7 NA
## 9 B NA
## 10 NA
## 11 W 12 NA
## 12 W NA
chess_dt <- chess_raw_data %>% slice(-1:-4)
head(chess_dt)
## V1
## 1 1
## 2 ON
## 3 -----------------------------------------------------------------------------------------
## 4 2
## 5 MI
## 6 -----------------------------------------------------------------------------------------
## V2 V3 V4 V5 V6 V7 V8 V9
## 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12
## 2 15445895 / R: 1794 ->1817 N:2 W B W B W B
## 3
## 4 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20
## 5 14598900 / R: 1553 ->1663 N:2 B W B W B W
## 6
## V10 V11
## 1 D 4 NA
## 2 W NA
## 3 NA
## 4 W 7 NA
## 5 B NA
## 6 NA
#delete rows with ----
chess_dt <- chess_dt[!grepl("---", chess_dt[ ,1]), ]
head(chess_dt)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B W
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W B W B
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B W B W
## V9 V10 V11
## 1 D 12 D 4 NA
## 2 B W NA
## 4 W 20 W 7 NA
## 5 W B NA
## 7 W 13 W 12 NA
## 8 B W NA
unique(chess_dt$V11)
## [1] NA
chess_dt <- chess_dt %>% select(-V11)
head(chess_dt)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B W
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W B W B
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B W B W
## V9 V10
## 1 D 12 D 4
## 2 B W
## 4 W 20 W 7
## 5 W B
## 7 W 13 W 12
## 8 B W
Essentially our data is stored in alternate rows. Hence, i will be saving records in even rows as a data frame and records in odd rows into another data frame.
Also, I will only keep relevant columns in the data frame that i will require int he final data frame.
chess_dt_odd <- chess_dt[seq(1, nrow(chess_dt), 2), ]
# select (V2,V3) %>% rename (name = V2, tot_pts = V3)
chess_dt_even <- chess_dt[seq(2, nrow(chess_dt), 2), ] %>% select (V1,V2) %>% rename (state = V1, pre_rt =V2)
chess_dt_comb <- cbind(chess_dt_odd, chess_dt_even)
head(chess_dt_comb)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 10 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 13 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## 16 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10
## V9 V10 state pre_rt
## 1 D 12 D 4 ON 15445895 / R: 1794 ->1817
## 4 W 20 W 7 MI 14598900 / R: 1553 ->1663
## 7 W 13 W 12 MI 14959604 / R: 1384 ->1640
## 10 W 19 D 1 MI 12616049 / R: 1716 ->1744
## 13 W 14 W 17 MI 14601533 / R: 1655 ->1690
## 16 W 27 W 21 OH 15055204 / R: 1686 ->1687
#split the pre rating column by character R:
chess_dt_comb <- separate(chess_dt_comb, pre_rt, c("uscf_id", "rating" ), "R:")
#cleaning the rating column further to keep only digits
chess_dt_comb$pre_rating <- str_extract(chess_dt_comb$rating, "\\d+\\d")
# Convert pre_rating column to numeric
chess_dt_comb$pre_rating <- as.numeric(chess_dt_comb$pre_rating)
# Convert V3 column to numeric
chess_dt_comb$V3 <- as.numeric(chess_dt_comb$V3)
# Remove white space from column V1
chess_dt_comb$V1 <- trimws(chess_dt_comb$V1)
# Remove white space from column V2
chess_dt_comb$V2 <- trimws(chess_dt_comb$V2)
# Remove white space from column state
chess_dt_comb$state <- trimws(chess_dt_comb$state)
opp_data <- chess_dt_comb %>%
select(V1,V4:V10) %>%
gather(key='opp_nbr',value="opp_id",V4:V10) %>% arrange(V1)
glimpse(opp_data)
## Rows: 448
## Columns: 3
## $ V1 <chr> "1", "1", "1", "1", "1", "1", "1", "10", "10", "10", "10", "10…
## $ opp_nbr <chr> "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V4", "V5", "V6", "…
## $ opp_id <chr> "W 39", "W 21", "W 18", "W 14", "W 7", "D 12", "D 4",…
# keep only numeric values in opp_id column
opp_data$opp_id <- str_extract(opp_data$opp_id,"[0-9]+")
opp_data %>%
select(V1,opp_id) %>%
summarise_all(funs(sum(is.na(.))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## V1 opp_id
## 1 0 40
opp_data <- opp_data %>% drop_na(opp_id)
opp_data %>%
select(V1,opp_id) %>%
summarise_all(funs(sum(is.na(.))))
## V1 opp_id
## 1 0 0
# Left join opp_data with chess_dt_comb to fetch the average pre rating for opponents
opp_data <- opp_data %>%
left_join(chess_dt_comb, c("opp_id" = "V1")) %>% select (V1, pre_rating) %>%
group_by (V1) %>%
summarize(avg_opp_rating= round(mean(pre_rating),0))
glimpse(opp_data)
## Rows: 64
## Columns: 2
## $ V1 <chr> "1", "10", "11", "12", "13", "14", "15", "16", "17", "1…
## $ avg_opp_rating <dbl> 1605, 1554, 1468, 1506, 1498, 1515, 1484, 1386, 1499, 1…
chess_dt_final <- chess_dt_comb %>%
left_join(opp_data, c("V1" = "V1")) %>% select (V2, state, V3, pre_rating,avg_opp_rating) %>% rename(player_name = V2, tot_points = V3)
head(chess_dt_final,100)
## player_name state tot_points pre_rating avg_opp_rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1468
## 12 KENNETH J TACK MI 4.5 1663 1506
## 13 TORRANCE HENRY JR MI 4.5 1666 1498
## 14 BRADLEY SHAW MI 4.5 1610 1515
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1484
## 16 MIKE NIKITIN MI 4.0 1604 1386
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1499
## 18 DAVID SUNDEEN MI 4.0 1600 1480
## 19 DIPANKAR ROY MI 4.0 1564 1426
## 20 JASON ZHENG MI 4.0 1595 1411
## 21 DINH DANG BUI ON 4.0 1563 1470
## 22 EUGENE L MCCLURE MI 4.0 1555 1300
## 23 ALAN BUI ON 4.0 1363 1214
## 24 MICHAEL R ALDRICH MI 4.0 1229 1357
## 25 LOREN SCHWIEBERT MI 3.5 1745 1363
## 26 MAX ZHU ON 3.5 1579 1507
## 27 GAURAV GIDWANI MI 3.5 1552 1222
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1522
## 29 CHIEDOZIE OKORIE MI 3.5 1602 1314
## 30 GEORGE AVERY JONES ON 3.5 1522 1144
## 31 RISHI SHETTY MI 3.5 1494 1260
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1379
## 33 JADE GE MI 3.5 1449 1277
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1375
## 35 JOSHUA DAVID LEE MI 3.5 1438 1150
## 36 SIDDHARTH JHA MI 3.5 1355 1388
## 37 AMIYATOSH PWNANANDAM MI 3.5 980 1385
## 38 BRIAN LIU MI 3.0 1423 1539
## 39 JOEL R HENDON MI 3.0 1436 1430
## 40 FOREST ZHANG MI 3.0 1348 1391
## 41 KYLE WILLIAM MURPHY MI 3.0 1403 1248
## 42 JARED GE MI 3.0 1332 1150
## 43 ROBERT GLEN VASEY MI 3.0 1283 1107
## 44 JUSTIN D SCHILLING MI 3.0 1199 1327
## 45 DEREK YAN MI 3.0 1242 1152
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377 1358
## 47 ERIC WRIGHT MI 2.5 1362 1392
## 48 DANIEL KHAIN MI 2.5 1382 1356
## 49 MICHAEL J MARTIN MI 2.5 1291 1286
## 50 SHIVAM JHA MI 2.5 1056 1296
## 51 TEJAS AYYAGARI MI 2.5 1011 1356
## 52 ETHAN GUO MI 2.5 935 1495
## 53 JOSE C YBARRA MI 2.0 1393 1345
## 54 LARRY HODGE MI 2.0 1270 1206
## 55 ALEX KONG MI 2.0 1186 1406
## 56 MARISA RICCI MI 2.0 1153 1414
## 57 MICHAEL LU MI 2.0 1092 1363
## 58 VIRAJ MOHILE MI 2.0 917 1391
## 59 SEAN M MC CORMICK MI 2.0 853 1319
## 60 JULIA SHEN MI 1.5 967 1330
## 61 JEZZEL FARKAS ON 1.5 955 1327
## 62 ASHWIN BALAJI MI 1.0 1530 1186
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1350
## 64 BEN LI MI 1.0 1163 1263
write.csv(chess_dt_final,"chess_tournmnt_clean_data.csv",row.names = FALSE)