#Load the required library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the text file into a character vector
text_lines <- read_lines("Chess Tournament.txt")
# Skip the first four lines
text_lines <- tail(text_lines, -4)
Note: I tried to use separate_wider_regex but using my regex as the delims arguments for each of the columns did not work. So, I created and empty data frame, extracted the values needed, and added a row at a time.
line_list = list()
# Create an empty data frame. Data will be added to this data frame later
df <- data.frame(player_no = character(0), name = character(0), state = character(0), score = character(0), current_rating = character(0), next_round_rating = character(0), pair_ids = list())
# Loop through the text_lines vector three lines at a time
for (i in seq(1, length(text_lines), by = 3)){
# Save the first two lines as input strings. Ignore the third line with the dashes
input_string1 <- text_lines[i]
input_string2 <- text_lines[i + 1]
line_list <- append(line_list, str_flatten(input_string1, input_string2))
# Define regular expressions to extract the information
# I used positive look behind and positive look ahead extension notations to isolate the value that I need but are really not necessary
# since each value is the first occurrence of it's character type combination and str_extract only extract the first occurrence.
# However, positive look behind and positive look ahead extension notations were necessary for the next_round_rating and pair_ids.
player_no <- str_extract(input_string1, r"((?<=\s{3})(\d+)(?=\s\|))")
player_name <- str_extract(input_string1, r"((?<=\|\s)(\b[A-Z,\-,\s]+\b)(?=\s+\|\d+\.\d+\s+\|))")
state <- str_extract(input_string2, r"((\b[A-Z]{2}\b))")
rating <- str_extract(input_string1, r"((\b\d*\.\d*\b))")
current_rating <- str_extract(input_string2, r"((?<=R:\s{0,5})\d{3,4})")
next_round_rating <- str_extract(input_string2, r"((?<=->\s{0,5})\d+)")
pair_ids <- unlist(str_extract_all(input_string1, "(?<=[A-Z]\\s{1,4})(\\d*)(?=\\|)"))
# Add rows one at a time to df data frame.
new_row <- c(player_no, player_name, state, rating, current_rating, next_round_rating, pair_ids)
df <- rbind(df, new_row)
}
# Rename the columns
names(df) <- c("id", "name", "state", "total_pts", "pre_rtg", "post_rtg", "pair_1", "pair_2", "pair_3", "pair_4", "pair_5", "pair_6", "pair_7")
# For each row, iterate through each of columns pair_1 to pair_7 and replace the value
# with the corresponding value in pre_rtg column when there is a match.
# Otherwise leave the value of the column unchanged.
# Display the data frame
print(df)
## id name state total_pts pre_rtg post_rtg pair_1 pair_2
## 1 1 GARY HUA ON 6.0 1794 1817 39 21
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663 63 58
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640 8 61
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744 23 28
## 5 5 HANSHI ZUO MI 5.5 1655 1690 45 37
## 6 6 HANSEN SONG OH 5.0 1686 1687 34 29
## 7 7 GARY DEE SWATHELL MI 5.0 1649 1673 57 46
## 8 8 EZEKIEL HOUGHTON MI 5.0 1641 1657 3 32
## 9 9 STEFANO LEE ON 5.0 1411 1564 25 18
## 10 10 ANVIT RAO MI 5.0 1365 1544 16 19
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1696 38 56
## 12 12 KENNETH J TACK MI 4.5 1663 1670 42 33
## 13 13 TORRANCE HENRY JR MI 4.5 1666 1662 36 27
## 14 14 BRADLEY SHAW MI 4.5 1610 1618 54 44
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1416 19 16
## 16 16 MIKE NIKITIN MI 4.0 1604 1613 10 15
## 17 17 RONALD GRZEGORCZYK MI 4.0 1629 1610 48 41
## 18 18 DAVID SUNDEEN MI 4.0 1600 1600 47 9
## 19 19 DIPANKAR ROY MI 4.0 1564 1570 15 10
## 20 20 JASON ZHENG MI 4.0 1595 1569 40 49
## 21 21 DINH DANG BUI ON 4.0 1563 1562 43 1
## 22 22 EUGENE L MCCLURE MI 4.0 1555 1529 64 52
## 23 23 ALAN BUI ON 4.0 1363 1371 4 43
## 24 24 MICHAEL R ALDRICH MI 4.0 1229 1300 28 47
## 25 25 LOREN SCHWIEBERT MI 3.5 1745 1681 9 53
## 26 26 MAX ZHU ON 3.5 1579 1564 49 40
## 27 27 GAURAV GIDWANI MI 3.5 1552 1539 51 13
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1513 24 4
## 29 29 CHIEDOZIE OKORIE MI 3.5 1602 1508 50 6
## 30 30 GEORGE AVERY JONES ON 3.5 1522 1444 52 64
## 31 31 RISHI SHETTY MI 3.5 1494 1444 58 55
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1433 61 8
## 33 33 JADE GE MI 3.5 1449 1421 60 12
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1400 6 60
## 35 35 JOSHUA DAVID LEE MI 3.5 1438 1392 46 38
## 36 36 SIDDHARTH JHA MI 3.5 1355 1367 13 57
## 37 37 AMIYATOSH PWNANANDAM MI 3.5 980 1077 5
## 38 38 BRIAN LIU MI 3.0 1423 1439 11 35
## 39 39 JOEL R HENDON MI 3.0 1436 1413 1 54
## 40 40 FOREST ZHANG MI 3.0 1348 1346 20 26
## 41 41 KYLE WILLIAM MURPHY MI 3.0 1403 1341 59 17
## 42 42 JARED GE MI 3.0 1332 1256 12 50
## 43 43 ROBERT GLEN VASEY MI 3.0 1283 1244 21 23
## 44 44 JUSTIN D SCHILLING MI 3.0 1199 1199 14
## 45 45 DEREK YAN MI 3.0 1242 1191 5 51
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0 377 1076 35 7
## 47 47 ERIC WRIGHT MI 2.5 1362 1341 18 24
## 48 48 DANIEL KHAIN MI 2.5 1382 1335 17 63
## 49 49 MICHAEL J MARTIN MI 2.5 1291 1259 26 20
## 50 50 SHIVAM JHA MI 2.5 1056 1111 29 42
## 51 51 TEJAS AYYAGARI MI 2.5 1011 1097 27 45
## 52 52 ETHAN GUO MI 2.5 935 1092 30 22
## 53 53 JOSE C YBARRA MI 2.0 1393 1359 25
## 54 54 LARRY HODGE MI 2.0 1270 1200 14 39
## 55 55 ALEX KONG MI 2.0 1186 1163 62 31
## 56 56 MARISA RICCI MI 2.0 1153 1140 11
## 57 57 MICHAEL LU MI 2.0 1092 1079 7 36
## 58 58 VIRAJ MOHILE MI 2.0 917 941 31 2
## 59 59 SEAN M MC CORMICK MI 2.0 853 878 41
## 60 60 JULIA SHEN MI 1.5 967 984 33 34
## 61 61 JEZZEL FARKAS ON 1.5 955 979 32 3
## 62 62 ASHWIN BALAJI MI 1.0 1530 1535 55
## 63 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1125 2 48
## 64 64 BEN LI MI 1.0 1163 1112 22 30
## pair_3 pair_4 pair_5 pair_6 pair_7
## 1 18 14 7 12 4
## 2 4 17 16 20 7
## 3 25 21 11 13 12
## 4 2 26 5 19 1
## 5 12 13 4 14 17
## 6 11 35 10 27 21
## 7 13 11 1 9 2
## 8 14 9 47 28 19
## 9 59 8 26 7 20
## 10 55 31 6 25 18
## 11 6 7 3 34 26
## 12 5 38 1 3
## 13 7 5 33 3 32
## 14 8 1 27 5 31
## 15 30 22 54 33 38
## 16 39 2 36
## 17 26 2 23 22 5
## 18 1 32 19 38 10
## 19 52 28 18 4 8
## 20 23 41 28 2 9
## 21 47 3 40 39 6
## 22 28 15 17 40
## 23 20 58 17 37 46
## 24 43 25 60 44 39
## 25 3 24 34 10 47
## 26 17 4 9 32 11
## 27 46 37 14 6
## 28 22 19 20 8 36
## 29 38 34 52 48
## 30 15 55 31 61 50
## 31 64 10 30 50 14
## 32 44 18 51 26 13
## 33 50 36 13 15 51
## 34 37 29 25 11 52
## 35 56 6 57 52 48
## 36 51 33 16 28
## 37 34 27 23 61
## 38 29 12 18 15
## 39 40 16 44 21 24
## 40 39 59 21 56 22
## 41 58 20
## 42 57 60 61 64 56
## 43 24 63 59 46 55
## 44 32 53 39 24 59
## 45 60 56 63 55 58
## 46 27 50 64 43 23
## 47 21 61 8 51 25
## 48 52 29 35
## 49 63 64 58
## 50 33 46 31 30
## 51 36 57 32 47 33
## 52 19 48 29 35 34
## 53 44 57
## 54 61 15 59 64
## 55 10 30 45 43
## 56 35 45 40 42
## 57 42 51 35 53
## 58 41 23 49 45
## 59 9 40 43 54 44
## 60 45 42 24
## 61 54 47 42 30 37
## 62
## 63 49 43 45
## 64 31 49 46 42 54
#print(my_list)
#matches[[1]][, -1]
#class(line_pairs)
#line_pairs
df <- df |> mutate(
total_pts = as.numeric(total_pts),
pre_rtg <- as.numeric(pre_rtg),
post_rtg <- as.numeric(post_rtg),
pair_1 = as.numeric(pre_rtg[as.numeric(pair_1)]),
pair_2 = as.numeric(pre_rtg[as.numeric(pair_2)]),
pair_3 = as.numeric(pre_rtg[as.numeric(pair_3)]),
pair_4 = as.numeric(pre_rtg[as.numeric(pair_4)]),
pair_5 = as.numeric(pre_rtg[as.numeric(pair_5)]),
pair_6 = as.numeric(pre_rtg[as.numeric(pair_6)]),
pair_7 = as.numeric(pre_rtg[as.numeric(pair_7)])
)
df
## id name state total_pts pre_rtg post_rtg pair_1 pair_2
## 1 1 GARY HUA ON 6.0 1794 1817 1436 1563
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663 1175 917
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640 1641 955
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744 1363 1507
## 5 5 HANSHI ZUO MI 5.5 1655 1690 1242 980
## 6 6 HANSEN SONG OH 5.0 1686 1687 1399 1602
## 7 7 GARY DEE SWATHELL MI 5.0 1649 1673 1092 377
## 8 8 EZEKIEL HOUGHTON MI 5.0 1641 1657 1384 1441
## 9 9 STEFANO LEE ON 5.0 1411 1564 1745 1600
## 10 10 ANVIT RAO MI 5.0 1365 1544 1604 1564
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1696 1423 1153
## 12 12 KENNETH J TACK MI 4.5 1663 1670 1332 1449
## 13 13 TORRANCE HENRY JR MI 4.5 1666 1662 1355 1552
## 14 14 BRADLEY SHAW MI 4.5 1610 1618 1270 1199
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1416 1564 1604
## 16 16 MIKE NIKITIN MI 4.0 1604 1613 1365 1220
## 17 17 RONALD GRZEGORCZYK MI 4.0 1629 1610 1382 1403
## 18 18 DAVID SUNDEEN MI 4.0 1600 1600 1362 1411
## 19 19 DIPANKAR ROY MI 4.0 1564 1570 1220 1365
## 20 20 JASON ZHENG MI 4.0 1595 1569 1348 1291
## 21 21 DINH DANG BUI ON 4.0 1563 1562 1283 1794
## 22 22 EUGENE L MCCLURE MI 4.0 1555 1529 1163 935
## 23 23 ALAN BUI ON 4.0 1363 1371 1716 1283
## 24 24 MICHAEL R ALDRICH MI 4.0 1229 1300 1507 1362
## 25 25 LOREN SCHWIEBERT MI 3.5 1745 1681 1411 1393
## 26 26 MAX ZHU ON 3.5 1579 1564 1291 1348
## 27 27 GAURAV GIDWANI MI 3.5 1552 1539 1011 1666
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1513 1229 1716
## 29 29 CHIEDOZIE OKORIE MI 3.5 1602 1508 1056 1686
## 30 30 GEORGE AVERY JONES ON 3.5 1522 1444 935 1163
## 31 31 RISHI SHETTY MI 3.5 1494 1444 917 1186
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1433 955 1641
## 33 33 JADE GE MI 3.5 1449 1421 967 1663
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1400 1686 967
## 35 35 JOSHUA DAVID LEE MI 3.5 1438 1392 377 1423
## 36 36 SIDDHARTH JHA MI 3.5 1355 1367 1666 1092
## 37 37 AMIYATOSH PWNANANDAM MI 3.5 980 1077 NA 1655
## 38 38 BRIAN LIU MI 3.0 1423 1439 1712 1438
## 39 39 JOEL R HENDON MI 3.0 1436 1413 1794 1270
## 40 40 FOREST ZHANG MI 3.0 1348 1346 1595 1579
## 41 41 KYLE WILLIAM MURPHY MI 3.0 1403 1341 853 1629
## 42 42 JARED GE MI 3.0 1332 1256 1663 1056
## 43 43 ROBERT GLEN VASEY MI 3.0 1283 1244 1563 1363
## 44 44 JUSTIN D SCHILLING MI 3.0 1199 1199 NA 1610
## 45 45 DEREK YAN MI 3.0 1242 1191 1655 1011
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0 377 1076 1438 1649
## 47 47 ERIC WRIGHT MI 2.5 1362 1341 1600 1229
## 48 48 DANIEL KHAIN MI 2.5 1382 1335 1629 1175
## 49 49 MICHAEL J MARTIN MI 2.5 1291 1259 1579 1595
## 50 50 SHIVAM JHA MI 2.5 1056 1111 1602 1332
## 51 51 TEJAS AYYAGARI MI 2.5 1011 1097 1552 1242
## 52 52 ETHAN GUO MI 2.5 935 1092 1522 1555
## 53 53 JOSE C YBARRA MI 2.0 1393 1359 NA 1745
## 54 54 LARRY HODGE MI 2.0 1270 1200 1610 1436
## 55 55 ALEX KONG MI 2.0 1186 1163 1530 1494
## 56 56 MARISA RICCI MI 2.0 1153 1140 NA 1712
## 57 57 MICHAEL LU MI 2.0 1092 1079 1649 1355
## 58 58 VIRAJ MOHILE MI 2.0 917 941 1494 1553
## 59 59 SEAN M MC CORMICK MI 2.0 853 878 1403 NA
## 60 60 JULIA SHEN MI 1.5 967 984 1449 1399
## 61 61 JEZZEL FARKAS ON 1.5 955 979 1441 1384
## 62 62 ASHWIN BALAJI MI 1.0 1530 1535 1186 NA
## 63 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1125 1553 1382
## 64 64 BEN LI MI 1.0 1163 1112 1555 1522
## pair_3 pair_4 pair_5 pair_6 pair_7 pre_rtg <- as.numeric(pre_rtg)
## 1 1600 1610 1649 1663 1716 1794
## 2 1716 1629 1604 1595 1649 1553
## 3 1745 1563 1712 1666 1663 1384
## 4 1553 1579 1655 1564 1794 1716
## 5 1663 1666 1716 1610 1629 1655
## 6 1712 1438 1365 1552 1563 1686
## 7 1666 1712 1794 1411 1553 1649
## 8 1610 1411 1362 1507 1564 1641
## 9 853 1641 1579 1649 1595 1411
## 10 1186 1494 1686 1745 1600 1365
## 11 1686 1649 1384 1399 1579 1712
## 12 1655 1423 NA 1794 1384 1663
## 13 1649 1655 1449 1384 1441 1666
## 14 1641 1794 1552 1655 1494 1610
## 15 1522 1555 1270 1449 1423 1220
## 16 NA 1436 1553 1355 NA 1604
## 17 1579 1553 1363 1555 1655 1629
## 18 1794 1441 1564 1423 1365 1600
## 19 935 1507 1600 1716 1641 1564
## 20 1363 1403 1507 1553 1411 1595
## 21 1362 1384 1348 1436 1686 1563
## 22 1507 1220 NA 1629 1348 1555
## 23 1595 917 1629 980 377 1363
## 24 1283 1745 967 1199 1436 1229
## 25 1384 1229 1399 1365 1362 1745
## 26 1629 1716 1411 1441 1712 1579
## 27 377 980 1610 1686 NA 1552
## 28 1555 1564 1595 1641 1355 1507
## 29 1423 1399 935 1382 NA 1602
## 30 1220 1186 1494 955 1056 1522
## 31 1163 1365 1522 1056 1610 1494
## 32 1199 1600 1011 1579 1666 1441
## 33 1056 1355 1666 1220 1011 1449
## 34 980 1602 1745 1712 935 1399
## 35 1153 1686 1092 935 1382 1438
## 36 1011 1449 NA 1604 1507 1355
## 37 1399 1552 NA 1363 955 980
## 38 1602 1663 NA 1600 1220 1423
## 39 1348 1604 1199 1563 1229 1436
## 40 1436 853 1563 1153 1555 1348
## 41 917 1595 NA NA NA 1403
## 42 1092 967 955 1163 1153 1332
## 43 1229 1175 853 377 1186 1283
## 44 1441 1393 1436 1229 853 1199
## 45 967 1153 1175 1186 917 1242
## 46 1552 1056 1163 1283 1363 377
## 47 1563 955 1641 1011 1745 1362
## 48 NA 935 NA 1602 1438 1382
## 49 1175 1163 917 NA NA 1291
## 50 1449 377 NA 1494 1522 1056
## 51 1355 1092 1441 1362 1449 1011
## 52 1564 1382 1602 1438 1399 935
## 53 NA 1199 NA 1092 NA 1393
## 54 955 NA 1220 853 1163 1270
## 55 1365 1522 NA 1242 1283 1186
## 56 1438 1242 NA 1348 1332 1153
## 57 1332 1011 1438 1393 NA 1092
## 58 1403 1363 1291 NA 1242 917
## 59 1411 1348 1283 1270 1199 853
## 60 1242 1332 1229 NA NA 967
## 61 1270 1362 1332 1522 980 955
## 62 NA NA NA NA NA 1530
## 63 1291 1283 1242 NA NA 1175
## 64 1494 1291 377 1332 1270 1163
## post_rtg <- as.numeric(post_rtg)
## 1 1817
## 2 1663
## 3 1640
## 4 1744
## 5 1690
## 6 1687
## 7 1673
## 8 1657
## 9 1564
## 10 1544
## 11 1696
## 12 1670
## 13 1662
## 14 1618
## 15 1416
## 16 1613
## 17 1610
## 18 1600
## 19 1570
## 20 1569
## 21 1562
## 22 1529
## 23 1371
## 24 1300
## 25 1681
## 26 1564
## 27 1539
## 28 1513
## 29 1508
## 30 1444
## 31 1444
## 32 1433
## 33 1421
## 34 1400
## 35 1392
## 36 1367
## 37 1077
## 38 1439
## 39 1413
## 40 1346
## 41 1341
## 42 1256
## 43 1244
## 44 1199
## 45 1191
## 46 1076
## 47 1341
## 48 1335
## 49 1259
## 50 1111
## 51 1097
## 52 1092
## 53 1359
## 54 1200
## 55 1163
## 56 1140
## 57 1079
## 58 941
## 59 878
## 60 984
## 61 979
## 62 1535
## 63 1125
## 64 1112
# Specify the range of columns whose values I want to average
#start_col <- 7 # Starting column index
#end_col <- 13 # Ending column index
# Find the average of columns pair_1 to pair_7 across each row. Ignore columns with NA
# Select and display the required columns
df_final <- df |>
rowwise() |>
mutate(mean_rtg = round(mean(c_across(starts_with("pair_")), na.rm = TRUE))) |>
select("Player Name" = name,
"Player State" = state,
"Total Points" = total_pts,
"Pre-Rating" = pre_rtg,
"Average Pre-Rating" = mean_rtg
)
print(df_final)
## # A tibble: 64 × 5
## # Rowwise:
## `Player Name` `Player State` `Total Points` `Pre-Rating` `Average Pre-Rating`
## <chr> <chr> <dbl> <chr> <dbl>
## 1 GARY HUA ON 6 1794 1605
## 2 DAKSHESH DAR… MI 6 1553 1469
## 3 ADITYA BAJAJ MI 6 1384 1564
## 4 PATRICK H SC… MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5 1686 1519
## 7 GARY DEE SWA… MI 5 1649 1372
## 8 EZEKIEL HOUG… MI 5 1641 1468
## 9 STEFANO LEE ON 5 1411 1523
## 10 ANVIT RAO MI 5 1365 1554
## # ℹ 54 more rows