In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
###Loading the text file into Github. From there I converted it to a readable data frame. I skipped the first 4 rows as it was not necessary for my analysis. My plan for the assignment was to create a dataframe for each of the information needed and pasting it together at the end.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
url <- "https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%201/tournamentinfo.txt"
df <- read.csv(url,header = FALSE, sep = "|", skip = 4)
view(df)
###Creating a data frame for each player
#Checking the V2 column which contains all the names of the players
head(df$V2,20)
## [1] " GARY HUA " " 15445895 / R: 1794 ->1817 "
## [3] "" " DAKSHESH DARURI "
## [5] " 14598900 / R: 1553 ->1663 " ""
## [7] " ADITYA BAJAJ " " 14959604 / R: 1384 ->1640 "
## [9] "" " PATRICK H SCHILLING "
## [11] " 12616049 / R: 1716 ->1744 " ""
## [13] " HANSHI ZUO " " 14601533 / R: 1655 ->1690 "
## [15] "" " HANSEN SONG "
## [17] " 15055204 / R: 1686 ->1687 " ""
## [19] " GARY DEE SWATHELL " " 11146376 / R: 1649 ->1673 "
#Cleaning the data and searching for 2 consecutive capital letters
trim_df2 <- str_trim(df$V2)
player_names <-grep("[A-Z][A-Z]", trim_df2, value = TRUE)
#Data frame has 1 column, 64 rows
glimpse(player_names)
## chr [1:64] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" ...
###Creating a data frame for each state
head(df$V1,20)
## [1] " 1 "
## [2] " ON "
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 "
## [5] " MI "
## [6] "-----------------------------------------------------------------------------------------"
## [7] " 3 "
## [8] " MI "
## [9] "-----------------------------------------------------------------------------------------"
## [10] " 4 "
## [11] " MI "
## [12] "-----------------------------------------------------------------------------------------"
## [13] " 5 "
## [14] " MI "
## [15] "-----------------------------------------------------------------------------------------"
## [16] " 6 "
## [17] " OH "
## [18] "-----------------------------------------------------------------------------------------"
## [19] " 7 "
## [20] " MI "
#Same as before but with the V1 column that contains the 2 letter state
states <- str_trim(grep("[A-ZA-Z]",df$V1, value = TRUE))
#Data frame has 1 column, 64 rows
glimpse(states)
## chr [1:64] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" ...
#Creating a data frame for total points of each player
head(df$V3,20)
## [1] "6.0 " "N:2 " "" "6.0 " "N:2 " "" "6.0 " "N:2 " ""
## [10] "5.5 " "N:2 " "" "5.5 " "N:2 " "" "5.0 " "N:3 " ""
## [19] "5.0 " "N:3 "
#Searching the V3 column for a digit followed by a dot followed by another digit
total_points <- str_trim(grep("\\d(.)\\d",df$V3, value = TRUE))
#Data frame has 1 column, 64 rows
glimpse(total_points)
## chr [1:64] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" ...
#Creating a data frame for player’s prerating
#The prerating is in the same column as the player name which we used before
head(trim_df2,20)
## [1] "GARY HUA" "15445895 / R: 1794 ->1817"
## [3] "" "DAKSHESH DARURI"
## [5] "14598900 / R: 1553 ->1663" ""
## [7] "ADITYA BAJAJ" "14959604 / R: 1384 ->1640"
## [9] "" "PATRICK H SCHILLING"
## [11] "12616049 / R: 1716 ->1744" ""
## [13] "HANSHI ZUO" "14601533 / R: 1655 ->1690"
## [15] "" "HANSEN SONG"
## [17] "15055204 / R: 1686 ->1687" ""
## [19] "GARY DEE SWATHELL" "11146376 / R: 1649 ->1673"
#Searching for the "R: 1794" pattern
r_rating <- grep("R:\\s+\\d+", trim_df2, value = TRUE)
#Data frame has 1 column, 64 rows but there is other stuff I don't need
glimpse(r_rating)
## chr [1:64] "15445895 / R: 1794 ->1817" "14598900 / R: 1553 ->1663" ...
#Positive lookbehind: Matching digit only if preceded by R with 1 or 2 spaces to account for 3 and 4 digit ratings
pre_rating <- str_extract(r_rating, "(?<=R: )\\d+|(?<=R: )\\d+")
#This data frame just has the 64 ratings
glimpse(pre_rating)
## chr [1:64] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" ...
###Checking how the data looks so far
player_data4 <- data.frame(player_names, states, total_points, pre_rating)
head(player_data4,20)
## player_names states total_points pre_rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 KENNETH J TACK MI 4.5 1663
## 13 TORRANCE HENRY JR MI 4.5 1666
## 14 BRADLEY SHAW MI 4.5 1610
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 MIKE NIKITIN MI 4.0 1604
## 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 DAVID SUNDEEN MI 4.0 1600
## 19 DIPANKAR ROY MI 4.0 1564
## 20 JASON ZHENG MI 4.0 1595
#Getting the opponents from each round
#Used a for loop to iterate over the orginial data frame(df) starting from column 4 to 10.
for (i in 4:10) {
#Searched for 5 spaces or a pattern ending in a digit
col_values <- str_subset(df[[i]], pattern = " |([0-9]$)")
#Replaced 5 spaces with na value
col_plus_na <- str_replace_all(col_values, " ", "NA")
#Replaced the W,L,or D with spaces and trimmed it. Saved it as a column
col_clean <- as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D", ""))))
#Assigning the values to columns named round_1 to round_7
assign(paste0("round_", i-3), col_clean)
}
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion
#Manually combining the all the data frames. Tried appending the rounds in the for loop but it kept giving me an error
rounds <- cbind(player_data4,round_1,round_2,round_3,round_4,round_5,round_6,round_7)
head(rounds,20)
## player_names states total_points pre_rating round_1 round_2
## 1 GARY HUA ON 6.0 1794 39 21
## 2 DAKSHESH DARURI MI 6.0 1553 63 58
## 3 ADITYA BAJAJ MI 6.0 1384 8 61
## 4 PATRICK H SCHILLING MI 5.5 1716 23 28
## 5 HANSHI ZUO MI 5.5 1655 45 37
## 6 HANSEN SONG OH 5.0 1686 34 29
## 7 GARY DEE SWATHELL MI 5.0 1649 57 46
## 8 EZEKIEL HOUGHTON MI 5.0 1641 3 32
## 9 STEFANO LEE ON 5.0 1411 25 18
## 10 ANVIT RAO MI 5.0 1365 16 19
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 38 56
## 12 KENNETH J TACK MI 4.5 1663 42 33
## 13 TORRANCE HENRY JR MI 4.5 1666 36 27
## 14 BRADLEY SHAW MI 4.5 1610 54 44
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 19 16
## 16 MIKE NIKITIN MI 4.0 1604 10 15
## 17 RONALD GRZEGORCZYK MI 4.0 1629 48 41
## 18 DAVID SUNDEEN MI 4.0 1600 47 9
## 19 DIPANKAR ROY MI 4.0 1564 15 10
## 20 JASON ZHENG MI 4.0 1595 40 49
## round_3 round_4 round_5 round_6 round_7
## 1 18 14 7 12 4
## 2 4 17 16 20 7
## 3 25 21 11 13 12
## 4 2 26 5 19 1
## 5 12 13 4 14 17
## 6 11 35 10 27 21
## 7 13 11 1 9 2
## 8 14 9 47 28 19
## 9 59 8 26 7 20
## 10 55 31 6 25 18
## 11 6 7 3 34 26
## 12 5 38 NA 1 3
## 13 7 5 33 3 32
## 14 8 1 27 5 31
## 15 30 22 54 33 38
## 16 NA 39 2 36 NA
## 17 26 2 23 22 5
## 18 1 32 19 38 10
## 19 52 28 18 4 8
## 20 23 41 28 2 9
###Finding Average Pre Chess Rating of Opponents
#Created a list with 64 rows
prematch_ratings <- vector("list", nrow(rounds))
#Iterating over the 64 rows
for (i in 1:nrow(rounds)) {
current_ratings <- c()
#Row 1, Fifth column = 39
for (k in 5:11) {
opponent <- rounds[i,k]
#Appending prerating of opponent 39 to current ratings vector
if (!is.na(opponent)) {
current_ratings <- c(current_ratings, pre_rating[opponent])
}
}
#Storing the rating in a list
prematch_ratings[[i]] <- as.numeric(current_ratings)
}
head(prematch_ratings)
## [[1]]
## [1] 1436 1563 1600 1610 1649 1663 1716
##
## [[2]]
## [1] 1175 917 1716 1629 1604 1595 1649
##
## [[3]]
## [1] 1641 955 1745 1563 1712 1666 1663
##
## [[4]]
## [1] 1363 1507 1553 1579 1655 1564 1794
##
## [[5]]
## [1] 1242 980 1663 1666 1716 1610 1629
##
## [[6]]
## [1] 1399 1602 1712 1438 1365 1552 1563
mean_ratings <- sapply(prematch_ratings, mean)
rounds$opponent_avg_rating <- round(mean_ratings)
head(rounds,20)
## player_names states total_points pre_rating round_1 round_2
## 1 GARY HUA ON 6.0 1794 39 21
## 2 DAKSHESH DARURI MI 6.0 1553 63 58
## 3 ADITYA BAJAJ MI 6.0 1384 8 61
## 4 PATRICK H SCHILLING MI 5.5 1716 23 28
## 5 HANSHI ZUO MI 5.5 1655 45 37
## 6 HANSEN SONG OH 5.0 1686 34 29
## 7 GARY DEE SWATHELL MI 5.0 1649 57 46
## 8 EZEKIEL HOUGHTON MI 5.0 1641 3 32
## 9 STEFANO LEE ON 5.0 1411 25 18
## 10 ANVIT RAO MI 5.0 1365 16 19
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 38 56
## 12 KENNETH J TACK MI 4.5 1663 42 33
## 13 TORRANCE HENRY JR MI 4.5 1666 36 27
## 14 BRADLEY SHAW MI 4.5 1610 54 44
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 19 16
## 16 MIKE NIKITIN MI 4.0 1604 10 15
## 17 RONALD GRZEGORCZYK MI 4.0 1629 48 41
## 18 DAVID SUNDEEN MI 4.0 1600 47 9
## 19 DIPANKAR ROY MI 4.0 1564 15 10
## 20 JASON ZHENG MI 4.0 1595 40 49
## round_3 round_4 round_5 round_6 round_7 opponent_avg_rating
## 1 18 14 7 12 4 1605
## 2 4 17 16 20 7 1469
## 3 25 21 11 13 12 1564
## 4 2 26 5 19 1 1574
## 5 12 13 4 14 17 1501
## 6 11 35 10 27 21 1519
## 7 13 11 1 9 2 1372
## 8 14 9 47 28 19 1468
## 9 59 8 26 7 20 1523
## 10 55 31 6 25 18 1554
## 11 6 7 3 34 26 1468
## 12 5 38 NA 1 3 1506
## 13 7 5 33 3 32 1498
## 14 8 1 27 5 31 1515
## 15 30 22 54 33 38 1484
## 16 NA 39 2 36 NA 1386
## 17 26 2 23 22 5 1499
## 18 1 32 19 38 10 1480
## 19 52 28 18 4 8 1426
## 20 23 41 28 2 9 1411
###Selecting the columns and importing to csv
chess_tour <- rounds %>%
select(player_names, states, total_points, pre_rating, opponent_avg_rating)
head(chess_tour,20)
## player_names states total_points pre_rating opponent_avg_rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1468
## 12 KENNETH J TACK MI 4.5 1663 1506
## 13 TORRANCE HENRY JR MI 4.5 1666 1498
## 14 BRADLEY SHAW MI 4.5 1610 1515
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1484
## 16 MIKE NIKITIN MI 4.0 1604 1386
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1499
## 18 DAVID SUNDEEN MI 4.0 1600 1480
## 19 DIPANKAR ROY MI 4.0 1564 1426
## 20 JASON ZHENG MI 4.0 1595 1411
write.csv(chess_tour, file = "Chess Project.csv", row.names = FALSE)