Project 1

Getting started

Load packages

Let’s load these packages

library(tidyverse)
library(stringr)
library(knitr)

Import the chess data from github

chess_df = read_csv(file = 'https://raw.githubusercontent.com/Kossi-Akplaka/Data607-data_acquisition_and_management/main/Project%201/chess.txt', col_names = FALSE)

Let’s have a glimpse of the data

head(chess_df,10)

## # A tibble: 10 × 1
##    X1                                                                           
##    <chr>                                                                        
##  1 ----------------------------------------------------------------------------…
##  2 Pair | Player Name                     |Total|Round|Round|Round|Round|Round|…
##  3 Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |…
##  4 ----------------------------------------------------------------------------…
##  5 1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  …
##  6 ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B …
##  7 ----------------------------------------------------------------------------…
##  8 2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  …
##  9 MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W …
## 10 ----------------------------------------------------------------------------…

Preprocess the Data

We can drop the first 4 rows of the data and process the rest of the data. The way the data is structured, we can extract:

Player Number (Useful to find the average), Player’s Name, and the opponents of the players in one line.
Player’s State, Total Number of Points, Player’s Pre-Rating on the other line

In this part, we can extract those information into 2 different dataset

Dataset 1: Extract every line starting by a number

chess <- matrix(unlist(chess_df), byrow=TRUE)
chess_df1 <- chess[seq(5,length(chess),3)]
head(chess_df1)

## [1] "1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

Extract data starting by Player’s state

chess_df2 <- chess[seq(6,length(chess),3)]
head(chess_df2)

## [1] "ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Data manipulation

Now, let’s extract some useful information

Extract the player’s number

PairNum <- as.numeric(str_extract(chess_df1, '\\d+'))

Extract the player’s full name (matching first combination of letters until |, then extract the full name)

Name <- str_extract(chess_df1, '[A-z].{1,32}') 
Name <- str_trim(str_extract(Name, '.+\\s{2,}'))

Extract the player’s state (matching the first 2 letters)

State <- str_extract(chess_df2, '[A-Z]{2}')

Extract the player’s pre-rating (matching any combination starting from R until -, then taking the first 4 numbers)

PreRating <- str_extract(chess_df2, 'R:.{8,}-')
PreRating <- as.numeric(str_extract(PreRating, '\\d{1,4}'))

Extract the player’s rounds (combination of letter, space, 2 numbers, then taking the numbers)

Rounds <- str_extract_all(chess_df1, '[A-Z]\\s{2,}\\d+')
Rounds <- str_extract_all(Rounds, '\\d+')

Extract the total number of points (combination of 1 number, period, a number)

TotalNumberofPoints <- as.numeric(str_extract(chess_df1, '\\d+\\.\\d'))

Calculate the average Pre chess rating of opponents

average_Opponents <- c()
for(i in c(1:length(Rounds))){
  average_Opponents[i] <- round(mean(PreRating[as.numeric(Rounds[[i]])]),0)
}

Creating a new dataframe

chess_dataframe <- data.frame(PairNum,Name,State,TotalNumberofPoints,PreRating,average_Opponents)
kable(head(chess_dataframe,10), row.names = NA)

PairNum	Name	State	TotalNumberofPoints	PreRating	average_Opponents
1	GARY HUA	ON	6.0	1794	1605
2	DAKSHESH DARURI	MI	6.0	1553	1469
3	ADITYA BAJAJ	MI	6.0	1384	1564
4	PATRICK H SCHILLING	MI	5.5	1716	1574
5	HANSHI ZUO	MI	5.5	1655	1501
6	HANSEN SONG	OH	5.0	1686	1519
7	GARY DEE SWATHELL	MI	5.0	1649	1372
8	EZEKIEL HOUGHTON	MI	5.0	1641	1468
9	STEFANO LEE	ON	5.0	1411	1523
10	ANVIT RAO	MI	5.0	1365	1554

Load into a csv format

We can load this data frame into a csv format.

write_csv(chess_dataframe, 'chess.csv')