library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (readr)
library(stringr)
# Location of the raw tournament cross-table (plain text, one record line per
# row of the file).
urlfile <- "https://raw.githubusercontent.com/ursulapodosenin/DAT-607/main/tournament_info.txt"

# Read the file line by line and wrap the lines in a one-column data frame.
# The column name is derived from the expression passed to data.frame(), so
# the call is kept exactly as `readLines(urlfile)`.
raw_tournament_data <- data.frame(readLines(urlfile))
## Warning in readLines(urlfile): incomplete final line found on
## 'https://raw.githubusercontent.com/ursulapodosenin/DAT-607/main/tournament_info.txt'

# Quick structural checks: one character column, one row per line of the file.
str(raw_tournament_data)
## 'data.frame': 196 obs. of 1 variable:
## $ readLines.urlfile.: chr "-----------------------------------------------------------------------------------------" " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| " " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | " "-----------------------------------------------------------------------------------------" ...
dim(raw_tournament_data)
## [1] 196 1
# Flatten the raw input to a plain character vector (one element per line of
# the file). stringr works on atomic vectors; passing the data frame itself
# made stringi coerce it and emit "argument is not an atomic vector; coercing"
# on every call below.
raw_lines <- unlist(raw_tournament_data, use.names = FALSE)

# Pair number: one or two digits preceded by 3-4 whitespace characters and
# followed by whitespace.
player_number <- as.numeric(unlist(
  str_extract_all(raw_lines, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")
))

# Player name: the words that follow "<digit> | " up to the next "|".
player_name <- unlist(
  str_extract_all(
    raw_lines,
    "(?<=\\d\\s\\|\\s)([A-z, -]*\\s){1,}[[:alpha:]]*(?=\\s*\\|)"
  )
)

# State / province: two upper-case letters immediately followed by " |".
player_state <- unlist(str_extract_all(raw_lines, "[[:upper:]]{2}(?=\\s\\|)"))

# Total points: a "d.d" number directly after a "|".
total_points <- as.numeric(unlist(
  str_extract_all(raw_lines, "(?<=\\|)\\d\\.\\d")
))

# Pre-tournament rating: 3-4 digits after "R:" (followed by whitespace), or
# 3-4 digits followed by a provisional marker such as "P12" and then "-".
player_pre_rating <- as.numeric(unlist(
  str_extract_all(
    raw_lines,
    "(?<=R:\\s{1,2})(\\d{3,4}(?=\\s))|(\\d{3,4}(?=P\\d{1,2}\\s*-))"
  )
))

# Assemble one row per player and print the result for inspection.
tournament_data <- data.frame(
  player_number,
  player_name,
  player_state,
  total_points,
  player_pre_rating
)
tournament_data
## player_number player_name player_state total_points
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## 6 6 HANSEN SONG OH 5.0
## 7 7 GARY DEE SWATHELL MI 5.0
## 8 8 EZEKIEL HOUGHTON MI 5.0
## 9 9 STEFANO LEE ON 5.0
## 10 10 ANVIT RAO MI 5.0
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5
## 12 12 KENNETH J TACK MI 4.5
## 13 13 TORRANCE HENRY JR MI 4.5
## 14 14 BRADLEY SHAW MI 4.5
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5
## 16 16 MIKE NIKITIN MI 4.0
## 17 17 RONALD GRZEGORCZYK MI 4.0
## 18 18 DAVID SUNDEEN MI 4.0
## 19 19 DIPANKAR ROY MI 4.0
## 20 20 JASON ZHENG MI 4.0
## 21 21 DINH DANG BUI ON 4.0
## 22 22 EUGENE L MCCLURE MI 4.0
## 23 23 ALAN BUI ON 4.0
## 24 24 MICHAEL R ALDRICH MI 4.0
## 25 25 LOREN SCHWIEBERT MI 3.5
## 26 26 MAX ZHU ON 3.5
## 27 27 GAURAV GIDWANI MI 3.5
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5
## 29 29 CHIEDOZIE OKORIE MI 3.5
## 30 30 GEORGE AVERY JONES ON 3.5
## 31 31 RISHI SHETTY MI 3.5
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5
## 33 33 JADE GE MI 3.5
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5
## 35 35 JOSHUA DAVID LEE MI 3.5
## 36 36 SIDDHARTH JHA MI 3.5
## 37 37 AMIYATOSH PWNANANDAM MI 3.5
## 38 38 BRIAN LIU MI 3.0
## 39 39 JOEL R HENDON MI 3.0
## 40 40 FOREST ZHANG MI 3.0
## 41 41 KYLE WILLIAM MURPHY MI 3.0
## 42 42 JARED GE MI 3.0
## 43 43 ROBERT GLEN VASEY MI 3.0
## 44 44 JUSTIN D SCHILLING MI 3.0
## 45 45 DEREK YAN MI 3.0
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0
## 47 47 ERIC WRIGHT MI 2.5
## 48 48 DANIEL KHAIN MI 2.5
## 49 49 MICHAEL J MARTIN MI 2.5
## 50 50 SHIVAM JHA MI 2.5
## 51 51 TEJAS AYYAGARI MI 2.5
## 52 52 ETHAN GUO MI 2.5
## 53 53 JOSE C YBARRA MI 2.0
## 54 54 LARRY HODGE MI 2.0
## 55 55 ALEX KONG MI 2.0
## 56 56 MARISA RICCI MI 2.0
## 57 57 MICHAEL LU MI 2.0
## 58 58 VIRAJ MOHILE MI 2.0
## 59 59 SEAN M MC CORMICK MI 2.0
## 60 60 JULIA SHEN MI 1.5
## 61 61 JEZZEL FARKAS ON 1.5
## 62 62 ASHWIN BALAJI MI 1.0
## 63 63 THOMAS JOSEPH HOSMER MI 1.0
## 64 64 BEN LI MI 1.0
## player_pre_rating
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
## 6 1686
## 7 1649
## 8 1641
## 9 1411
## 10 1365
## 11 1712
## 12 1663
## 13 1666
## 14 1610
## 15 1220
## 16 1604
## 17 1629
## 18 1600
## 19 1564
## 20 1595
## 21 1563
## 22 1555
## 23 1363
## 24 1229
## 25 1745
## 26 1579
## 27 1552
## 28 1507
## 29 1602
## 30 1522
## 31 1494
## 32 1441
## 33 1449
## 34 1399
## 35 1438
## 36 1355
## 37 980
## 38 1423
## 39 1436
## 40 1348
## 41 1403
## 42 1332
## 43 1283
## 44 1199
## 45 1242
## 46 377
## 47 1362
## 48 1382
## 49 1291
## 50 1056
## 51 1011
## 52 935
## 53 1393
## 54 1270
## 55 1186
## 56 1153
## 57 1092
## 58 917
## 59 853
## 60 967
## 61 955
## 62 1530
## 63 1175
## 64 1163
# Drop the first five rows of the player table and preview what remains.
# NOTE(review): the CSV export further down is built from the extracted
# vectors rather than from this data frame, so this trim only affects the
# preview -- confirm that removing these players is intended.
tournament_data <- tournament_data[-seq_len(5), ]
head(tournament_data)
## player_number player_name player_state total_points
## 6 6 HANSEN SONG OH 5.0
## 7 7 GARY DEE SWATHELL MI 5.0
## 8 8 EZEKIEL HOUGHTON MI 5.0
## 9 9 STEFANO LEE ON 5.0
## 10 10 ANVIT RAO MI 5.0
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5
## player_pre_rating
## 6 1686
## 7 1649
## 8 1641
## 9 1411
## 10 1365
## 11 1712
# Average pre-tournament rating of each player's opponents.
#
# The previous loop indexed `player_number` by itself, which only ever yielded
# the player's own pair number, so the "opponent" rating was simply the
# player's own pre-rating -- and the result was never written to the CSV.
#
# Opponents are read from each player's result line, i.e. the line that starts
# with the pair number.
# NOTE(review): assumes each round cell on that line looks like "|W  39|"
# (one result letter, whitespace, opponent pair number) and that rounds
# without an opponent carry no number -- confirm against the input file.
raw_lines <- unlist(raw_tournament_data, use.names = FALSE)
result_lines <- raw_lines[str_detect(raw_lines, "^\\s*\\d+\\s*\\|")]
opponent_numbers <- lapply(
  str_match_all(result_lines, "\\|[A-Z]\\s+(\\d+)(?=\\s*\\|)"),
  function(cells) as.numeric(cells[, 2])
)

# One result line per extracted player is required for the ratings to line up.
stopifnot(length(opponent_numbers) == length(player_number))

opponent_rating <- vapply(
  opponent_numbers,
  function(opponents) {
    # A player with no recorded opponents has no meaningful average.
    if (length(opponents) == 0) {
      return(NA_real_)
    }
    # Map pair numbers to positions explicitly instead of assuming that
    # pair number == row index.
    round(mean(player_pre_rating[match(opponents, player_number)]), digits = 0)
  },
  numeric(1)
)

# Final table: the original columns plus the opponents' average pre-rating.
chess_tournament_data <- data.frame(
  player_number,
  player_name,
  player_state,
  total_points,
  player_pre_rating,
  opponent_rating
)
write.csv(chess_tournament_data, file = "chess_tournament_data.csv")