Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

###Loading the text file into Github. From there I converted it to a readable data frame. I skipped the first 4 rows as it was not necessary for my analysis. My plan for the assignment was to create a dataframe for each of the information needed and pasting it together at the end.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

url <- "https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%201/tournamentinfo.txt"
df <- read.csv(url,header = FALSE, sep = "|", skip = 4)
view(df)

###Creating a data frame for each player

#Checking the V2 column which contains all the names of the players
head(df$V2,20)

##  [1] " GARY HUA                        " " 15445895 / R: 1794   ->1817     "
##  [3] ""                                  " DAKSHESH DARURI                 "
##  [5] " 14598900 / R: 1553   ->1663     " ""                                 
##  [7] " ADITYA BAJAJ                    " " 14959604 / R: 1384   ->1640     "
##  [9] ""                                  " PATRICK H SCHILLING             "
## [11] " 12616049 / R: 1716   ->1744     " ""                                 
## [13] " HANSHI ZUO                      " " 14601533 / R: 1655   ->1690     "
## [15] ""                                  " HANSEN SONG                     "
## [17] " 15055204 / R: 1686   ->1687     " ""                                 
## [19] " GARY DEE SWATHELL               " " 11146376 / R: 1649   ->1673     "

#Cleaning the data and searching for 2 consecutive capital letters 
trim_df2 <- str_trim(df$V2)
player_names <-grep("[A-Z][A-Z]", trim_df2, value = TRUE)
#Data frame has 1 column, 64 rows
glimpse(player_names)

##  chr [1:64] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" ...

###Creating a data frame for each state

head(df$V1,20)

##  [1] "    1 "                                                                                   
##  [2] "   ON "                                                                                   
##  [3] "-----------------------------------------------------------------------------------------"
##  [4] "    2 "                                                                                   
##  [5] "   MI "                                                                                   
##  [6] "-----------------------------------------------------------------------------------------"
##  [7] "    3 "                                                                                   
##  [8] "   MI "                                                                                   
##  [9] "-----------------------------------------------------------------------------------------"
## [10] "    4 "                                                                                   
## [11] "   MI "                                                                                   
## [12] "-----------------------------------------------------------------------------------------"
## [13] "    5 "                                                                                   
## [14] "   MI "                                                                                   
## [15] "-----------------------------------------------------------------------------------------"
## [16] "    6 "                                                                                   
## [17] "   OH "                                                                                   
## [18] "-----------------------------------------------------------------------------------------"
## [19] "    7 "                                                                                   
## [20] "   MI "

#Same as before but with the V1 column that contains the 2 letter state
states <- str_trim(grep("[A-ZA-Z]",df$V1, value = TRUE))
#Data frame has 1 column, 64 rows
glimpse(states)

##  chr [1:64] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" ...

#Creating a data frame for total points of each player

head(df$V3,20)

##  [1] "6.0  " "N:2  " ""      "6.0  " "N:2  " ""      "6.0  " "N:2  " ""     
## [10] "5.5  " "N:2  " ""      "5.5  " "N:2  " ""      "5.0  " "N:3  " ""     
## [19] "5.0  " "N:3  "

#Searching the V3 column for a digit followed by a dot followed by another digit
total_points <- str_trim(grep("\\d(.)\\d",df$V3, value = TRUE))
#Data frame has 1 column, 64 rows
glimpse(total_points)

##  chr [1:64] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" ...

#Creating a data frame for player’s prerating

#The prerating is in the same column as the player name which we used before
head(trim_df2,20)

##  [1] "GARY HUA"                    "15445895 / R: 1794   ->1817"
##  [3] ""                            "DAKSHESH DARURI"            
##  [5] "14598900 / R: 1553   ->1663" ""                           
##  [7] "ADITYA BAJAJ"                "14959604 / R: 1384   ->1640"
##  [9] ""                            "PATRICK H SCHILLING"        
## [11] "12616049 / R: 1716   ->1744" ""                           
## [13] "HANSHI ZUO"                  "14601533 / R: 1655   ->1690"
## [15] ""                            "HANSEN SONG"                
## [17] "15055204 / R: 1686   ->1687" ""                           
## [19] "GARY DEE SWATHELL"           "11146376 / R: 1649   ->1673"

#Searching for the "R: 1794" pattern
r_rating <- grep("R:\\s+\\d+", trim_df2, value = TRUE)
#Data frame has 1 column, 64 rows but there is other stuff I don't need
glimpse(r_rating)

##  chr [1:64] "15445895 / R: 1794   ->1817" "14598900 / R: 1553   ->1663" ...

#Positive lookbehind: Matching digit only if preceded by R with 1 or 2 spaces to account for 3 and 4 digit ratings
pre_rating <- str_extract(r_rating, "(?<=R: )\\d+|(?<=R:  )\\d+")
#This data frame just has the 64 ratings
glimpse(pre_rating)

##  chr [1:64] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" ...

###Checking how the data looks so far

player_data4 <- data.frame(player_names, states, total_points, pre_rating)
head(player_data4,20)

##                player_names states total_points pre_rating
## 1                  GARY HUA     ON          6.0       1794
## 2           DAKSHESH DARURI     MI          6.0       1553
## 3              ADITYA BAJAJ     MI          6.0       1384
## 4       PATRICK H SCHILLING     MI          5.5       1716
## 5                HANSHI ZUO     MI          5.5       1655
## 6               HANSEN SONG     OH          5.0       1686
## 7         GARY DEE SWATHELL     MI          5.0       1649
## 8          EZEKIEL HOUGHTON     MI          5.0       1641
## 9               STEFANO LEE     ON          5.0       1411
## 10                ANVIT RAO     MI          5.0       1365
## 11 CAMERON WILLIAM MC LEMAN     MI          4.5       1712
## 12           KENNETH J TACK     MI          4.5       1663
## 13        TORRANCE HENRY JR     MI          4.5       1666
## 14             BRADLEY SHAW     MI          4.5       1610
## 15   ZACHARY JAMES HOUGHTON     MI          4.5       1220
## 16             MIKE NIKITIN     MI          4.0       1604
## 17       RONALD GRZEGORCZYK     MI          4.0       1629
## 18            DAVID SUNDEEN     MI          4.0       1600
## 19             DIPANKAR ROY     MI          4.0       1564
## 20              JASON ZHENG     MI          4.0       1595

#Getting the opponents from each round

#Used a for loop to iterate over the orginial data frame(df) starting from column 4 to 10.
for (i in 4:10) {
  #Searched for 5 spaces or a pattern ending in a digit
  col_values <- str_subset(df[[i]], pattern = "     |([0-9]$)")
  #Replaced 5 spaces with na value
  col_plus_na <- str_replace_all(col_values, "     ", "NA")
  #Replaced the W,L,or D with spaces and trimmed it. Saved it as a column
  col_clean <- as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D", ""))))
  #Assigning the values to columns named round_1 to round_7
  assign(paste0("round_", i-3), col_clean)
}

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

## Warning in as.vector(as.numeric(str_trim(str_replace_all(col_plus_na, "W|L|D",
## : NAs introduced by coercion

#Manually combining the all the data frames. Tried appending the rounds in the for loop but it kept giving me an error
rounds <- cbind(player_data4,round_1,round_2,round_3,round_4,round_5,round_6,round_7)
head(rounds,20)

##                player_names states total_points pre_rating round_1 round_2
## 1                  GARY HUA     ON          6.0       1794      39      21
## 2           DAKSHESH DARURI     MI          6.0       1553      63      58
## 3              ADITYA BAJAJ     MI          6.0       1384       8      61
## 4       PATRICK H SCHILLING     MI          5.5       1716      23      28
## 5                HANSHI ZUO     MI          5.5       1655      45      37
## 6               HANSEN SONG     OH          5.0       1686      34      29
## 7         GARY DEE SWATHELL     MI          5.0       1649      57      46
## 8          EZEKIEL HOUGHTON     MI          5.0       1641       3      32
## 9               STEFANO LEE     ON          5.0       1411      25      18
## 10                ANVIT RAO     MI          5.0       1365      16      19
## 11 CAMERON WILLIAM MC LEMAN     MI          4.5       1712      38      56
## 12           KENNETH J TACK     MI          4.5       1663      42      33
## 13        TORRANCE HENRY JR     MI          4.5       1666      36      27
## 14             BRADLEY SHAW     MI          4.5       1610      54      44
## 15   ZACHARY JAMES HOUGHTON     MI          4.5       1220      19      16
## 16             MIKE NIKITIN     MI          4.0       1604      10      15
## 17       RONALD GRZEGORCZYK     MI          4.0       1629      48      41
## 18            DAVID SUNDEEN     MI          4.0       1600      47       9
## 19             DIPANKAR ROY     MI          4.0       1564      15      10
## 20              JASON ZHENG     MI          4.0       1595      40      49
##    round_3 round_4 round_5 round_6 round_7
## 1       18      14       7      12       4
## 2        4      17      16      20       7
## 3       25      21      11      13      12
## 4        2      26       5      19       1
## 5       12      13       4      14      17
## 6       11      35      10      27      21
## 7       13      11       1       9       2
## 8       14       9      47      28      19
## 9       59       8      26       7      20
## 10      55      31       6      25      18
## 11       6       7       3      34      26
## 12       5      38      NA       1       3
## 13       7       5      33       3      32
## 14       8       1      27       5      31
## 15      30      22      54      33      38
## 16      NA      39       2      36      NA
## 17      26       2      23      22       5
## 18       1      32      19      38      10
## 19      52      28      18       4       8
## 20      23      41      28       2       9

###Finding Average Pre Chess Rating of Opponents

#Created a list with 64 rows
prematch_ratings <- vector("list", nrow(rounds))
#Iterating over the 64 rows
for (i in 1:nrow(rounds)) {
  current_ratings <- c()
  #Row 1, Fifth column = 39 
  for (k in 5:11) {
    opponent <- rounds[i,k]
    #Appending prerating of opponent 39 to current ratings vector
    if (!is.na(opponent)) {
      current_ratings <- c(current_ratings, pre_rating[opponent])
    }
  }
  #Storing the rating in a list 
  prematch_ratings[[i]] <- as.numeric(current_ratings)
}
head(prematch_ratings)

## [[1]]
## [1] 1436 1563 1600 1610 1649 1663 1716
## 
## [[2]]
## [1] 1175  917 1716 1629 1604 1595 1649
## 
## [[3]]
## [1] 1641  955 1745 1563 1712 1666 1663
## 
## [[4]]
## [1] 1363 1507 1553 1579 1655 1564 1794
## 
## [[5]]
## [1] 1242  980 1663 1666 1716 1610 1629
## 
## [[6]]
## [1] 1399 1602 1712 1438 1365 1552 1563

mean_ratings <- sapply(prematch_ratings, mean)
rounds$opponent_avg_rating <- round(mean_ratings)
head(rounds,20)

##                player_names states total_points pre_rating round_1 round_2
## 1                  GARY HUA     ON          6.0       1794      39      21
## 2           DAKSHESH DARURI     MI          6.0       1553      63      58
## 3              ADITYA BAJAJ     MI          6.0       1384       8      61
## 4       PATRICK H SCHILLING     MI          5.5       1716      23      28
## 5                HANSHI ZUO     MI          5.5       1655      45      37
## 6               HANSEN SONG     OH          5.0       1686      34      29
## 7         GARY DEE SWATHELL     MI          5.0       1649      57      46
## 8          EZEKIEL HOUGHTON     MI          5.0       1641       3      32
## 9               STEFANO LEE     ON          5.0       1411      25      18
## 10                ANVIT RAO     MI          5.0       1365      16      19
## 11 CAMERON WILLIAM MC LEMAN     MI          4.5       1712      38      56
## 12           KENNETH J TACK     MI          4.5       1663      42      33
## 13        TORRANCE HENRY JR     MI          4.5       1666      36      27
## 14             BRADLEY SHAW     MI          4.5       1610      54      44
## 15   ZACHARY JAMES HOUGHTON     MI          4.5       1220      19      16
## 16             MIKE NIKITIN     MI          4.0       1604      10      15
## 17       RONALD GRZEGORCZYK     MI          4.0       1629      48      41
## 18            DAVID SUNDEEN     MI          4.0       1600      47       9
## 19             DIPANKAR ROY     MI          4.0       1564      15      10
## 20              JASON ZHENG     MI          4.0       1595      40      49
##    round_3 round_4 round_5 round_6 round_7 opponent_avg_rating
## 1       18      14       7      12       4                1605
## 2        4      17      16      20       7                1469
## 3       25      21      11      13      12                1564
## 4        2      26       5      19       1                1574
## 5       12      13       4      14      17                1501
## 6       11      35      10      27      21                1519
## 7       13      11       1       9       2                1372
## 8       14       9      47      28      19                1468
## 9       59       8      26       7      20                1523
## 10      55      31       6      25      18                1554
## 11       6       7       3      34      26                1468
## 12       5      38      NA       1       3                1506
## 13       7       5      33       3      32                1498
## 14       8       1      27       5      31                1515
## 15      30      22      54      33      38                1484
## 16      NA      39       2      36      NA                1386
## 17      26       2      23      22       5                1499
## 18       1      32      19      38      10                1480
## 19      52      28      18       4       8                1426
## 20      23      41      28       2       9                1411

###Selecting the columns and importing to csv

chess_tour <- rounds %>% 
  select(player_names, states, total_points, pre_rating, opponent_avg_rating)
head(chess_tour,20)

##                player_names states total_points pre_rating opponent_avg_rating
## 1                  GARY HUA     ON          6.0       1794                1605
## 2           DAKSHESH DARURI     MI          6.0       1553                1469
## 3              ADITYA BAJAJ     MI          6.0       1384                1564
## 4       PATRICK H SCHILLING     MI          5.5       1716                1574
## 5                HANSHI ZUO     MI          5.5       1655                1501
## 6               HANSEN SONG     OH          5.0       1686                1519
## 7         GARY DEE SWATHELL     MI          5.0       1649                1372
## 8          EZEKIEL HOUGHTON     MI          5.0       1641                1468
## 9               STEFANO LEE     ON          5.0       1411                1523
## 10                ANVIT RAO     MI          5.0       1365                1554
## 11 CAMERON WILLIAM MC LEMAN     MI          4.5       1712                1468
## 12           KENNETH J TACK     MI          4.5       1663                1506
## 13        TORRANCE HENRY JR     MI          4.5       1666                1498
## 14             BRADLEY SHAW     MI          4.5       1610                1515
## 15   ZACHARY JAMES HOUGHTON     MI          4.5       1220                1484
## 16             MIKE NIKITIN     MI          4.0       1604                1386
## 17       RONALD GRZEGORCZYK     MI          4.0       1629                1499
## 18            DAVID SUNDEEN     MI          4.0       1600                1480
## 19             DIPANKAR ROY     MI          4.0       1564                1426
## 20              JASON ZHENG     MI          4.0       1595                1411

write.csv(chess_tour, file = "Chess Project.csv", row.names = FALSE)

Project 1

Jian Quan Chen

2023-02-13