txt to csv

Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

Loading the Data

Load the raw data from the GitHub repository, where it is available at https://raw.githubusercontent.com/AlphaCurse/TXT_to_CSV/main/chesstornament.txt.

library(stringr)
library(readr)

# Location of the raw tournament results (plain text, hosted on GitHub).
url = "https://raw.githubusercontent.com/AlphaCurse/TXT_to_CSV/main/chesstornament.txt"
# Read the file into a character vector, one element per line of text.
df = readLines(con = url)

## Warning in readLines(url): incomplete final line found on 'https://
## raw.githubusercontent.com/AlphaCurse/TXT_to_CSV/main/chesstornament.txt'

Visualize the data we have.

# Show the first six lines to see how the file is laid out.
head(x = df, n = 6L)

## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Edit the Data

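The raw text is hard to work with directly: each player occupies two consecutive lines, with the fields separated by `|` characters. We first locate the column delimiters and then split the lines into the two kinds of player rows.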

# Locate the column boundaries. The first player row (line 5) is used as the
# template: every '|' in it marks the end of a field. str_locate_all() returns
# one start/end matrix per input string; the first column holds the positions
# of the delimiters. The lookup is done once and reused for x1..x3.
pipes = unname(str_locate_all(pattern = '\\|', df[5])[[1]][, 1])
stopifnot(length(pipes) >= 3)

x0 = 0               # left edge, used as the substr() start of the first field
x1 = pipes[1]        # first delimiter
x2 = pipes[2]        # second delimiter
x3 = pipes[3]        # third delimiter
x4 = max(nchar(df))  # width of the longest line, used as the right edge

# Player records start at line 5 and repeat every 3 lines. The first line of
# each record goes to group1 and the line right after it to group2. The
# sequences are derived from the number of lines actually read instead of a
# hard-coded file length, and seq2 is derived from seq1 so that both groups
# always contain the same number of rows.
stopifnot(length(df) >= 6)
seq1 = seq(5, length(df) - 1, by = 3)
seq2 = seq1 + 1
group1 = df[seq1]
group2 = df[seq2]

Capture the Data

Now that the data is in a more organized format, we can begin capturing features.

# Player name: the text between the first and second delimiter of the first
# row of each record, with surrounding whitespace removed.
name = substr(group1, start = x1 + 1, stop = x2 - 2)
PlayerName = str_trim(string = name)

# Player state: the text before the first delimiter of the second row.
state = substr(group2, start = x0, stop = x1 - 1)
State = str_trim(string = state)

# Total points: the text between the second and third delimiter of the first
# row (still character data at this point).
totalpts = substr(group1, start = x2 + 1, stop = x3 - 1)

# Pre-rating: take the second field of the second row and keep the colon
# followed by the first run of at least two digits.
pre = str_extract(
  string = substr(group2, start = x1 + 1, stop = x2 - 1),
  pattern = ': *\\d{2,}'
)

# Assemble the data frame: names and states first, then the points formatted
# with one decimal place and the pre-rating digits converted to integer.
chess_df = data.frame(PlayerName, State)
chess_df[["TotalPts"]] = sprintf("%.1f", as.numeric(totalpts))
chess_df[["PreRating"]] = as.integer(
  str_extract(string = pre, pattern = '\\d{2,}')
)

Calculate Average Pre Rating of Opponent

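To calculate the average pre-rating of each player's opponents, I first extract the opponents' pair numbers from the round columns and then use the following function, which looks up the pre-rating of every opponent by pair number and returns the rounded mean.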

# Take everything to the right of the third delimiter on the first row of each
# record, pull out every run of digits that starts at a word boundary, and
# convert the resulting list with as.matrix() so it can be processed row by
# row.
opp = as.matrix(
  str_extract_all(
    string = substr(group1, start = x3 + 1, stop = x4),
    pattern = '\\b\\d{1,}'
  )
)

# Average pre-rating of opponents.
#
# Arguments:
#   y        A list whose elements are vectors of opponent pair numbers
#            (character or numeric), e.g. one row of the `opp` matrix as
#            handed over by apply().
#   z        Optional index into `y`. When omitted (as it is when the function
#            is called through apply()), every element of `y` is used.
#   ratings  Vector of pre-ratings indexed by pair number. Defaults to
#            chess_df$PreRating; the default is only evaluated when no value
#            is supplied.
#
# Returns:
#   A numeric vector with one value per selected element of `y`: the mean of
#   the opponents' ratings rounded to a whole number, or NA when the element
#   lists no opponents.
avgoppprerate = function(y, z, ratings = chess_df$PreRating) {
  # Make the "no index given" case explicit instead of relying on a missing
  # argument being forwarded to `[`.
  x = if (missing(z)) y else y[z]

  vapply(
    x,
    function(ids) {
      # No opponents: report NA rather than dividing zero by zero.
      if (length(ids) == 0) {
        return(NA_real_)
      }
      opp_ratings = ratings[as.numeric(ids)]
      round(sum(opp_ratings) / length(opp_ratings))
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Compute the average opponent pre-rating for every row of `opp` and store it
# as a new column.
chess_df[["AvgOppPreRating"]] = vapply(
  seq_len(nrow(opp)),
  function(i) avgoppprerate(opp[i, ]),
  numeric(1)
)

Visualize DataFrame

# Open the assembled data frame in the data viewer for inspection.
View(chess_df)

Export the data frame to a CSV file

# Write the player table to disk. row.names = FALSE keeps the automatic row
# index out of the file; otherwise it is written as an extra, unnamed first
# column (it shows up as `...1` when the file is read back with read_csv()).
write.csv(chess_df, "tournamentinfo.csv", row.names = FALSE)

View CSV version

# Read the copy of tournamentinfo.csv hosted in the GitHub repository.
csv_df = read_csv(
  file = "https://raw.githubusercontent.com/AlphaCurse/TXT_to_CSV/main/tournamentinfo.csv"
)

## New names:
## Rows: 64 Columns: 6
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (2): PlayerName, State dbl (4): ...1, TotalPts, PreRating, AvgOppPreRating
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`

# Print the tibble that was read back from the CSV.
csv_df

## # A tibble: 64 x 6
##     ...1 PlayerName          State TotalPts PreRating AvgOppPreRating
##    <dbl> <chr>               <chr>    <dbl>     <dbl>           <dbl>
##  1     1 GARY HUA            ON         6        1794            1605
##  2     2 DAKSHESH DARURI     MI         6        1553            1469
##  3     3 ADITYA BAJAJ        MI         6        1384            1564
##  4     4 PATRICK H SCHILLING MI         5.5      1716            1574
##  5     5 HANSHI ZUO          MI         5.5      1655            1501
##  6     6 HANSEN SONG         OH         5        1686            1519
##  7     7 GARY DEE SWATHELL   MI         5        1649            1372
##  8     8 EZEKIEL HOUGHTON    MI         5        1641            1468
##  9     9 STEFANO LEE         ON         5        1411            1523
## 10    10 ANVIT RAO           MI         5        1365            1554
## # ... with 54 more rows