Introduction

The objective of this project is to import data from a text file into R, manipulate the data, and export the result as a CSV file, using the chess tournament results data set.

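The exported file must contain the following information for every given player: name, state, total points, pre-tournament rating, and the average pre-tournament rating of the player's opponents.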

Load Required Libraries

To achieve the task, the stringr library is needed to extract and manipulate strings before they are converted into other data types.

library(stringr)

Load Data

Load the data into the R environment. The data can be obtained from GitHub.

# Read the tournament report one text line per row: with sep = "\n" every
# line of the file lands in the single column V1.
# quote = "" and comment.char = "" switch off read.table's quote and comment
# processing, so a line containing an apostrophe, a double quote or '#' is
# kept intact instead of being merged with following lines or truncated.
rawdata_tb <- read.table(
  "https://raw.githubusercontent.com/aliharb/R-Programming/master/tournamentinfo.txt",
  header = FALSE,
  sep = "\n",
  quote = "",
  comment.char = ""
)

head(rawdata_tb)
##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
tail(rawdata_tb)
##                                                                                            V1
## 191    63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |
## 192    MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |
## 193 -----------------------------------------------------------------------------------------
## 194    64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|
## 195    MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |
## 196 -----------------------------------------------------------------------------------------

Data Manipulation

The table above revealed some useful pattern information:

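As shown, each player's record spans three consecutive lines (a dashed separator followed by two data rows), so the same type of information repeats every third line.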

Throughout the rest of this document, data will be extracted and manipulated based on the above observation.

step 1.

The first three rows of the table (a dashed line followed by two rows of column descriptions) carry no player data, so they are removed.

# Take the single column of the raw table as a plain vector and drop its
# first three entries (the leading dashed line and the two header lines).
data_tb <- rawdata_tb[[1]][-(1:3)]
head(data_tb)
## [1] -----------------------------------------------------------------------------------------
## [2]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [3]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [4] -----------------------------------------------------------------------------------------
## [5]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [6]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 131 Levels: ----------------------------------------------------------------------------------------- ...

step 2.

The dashed lines must be eliminated and similar data must be grouped into two separate vectors to facilitate string manipulation with regular expressions. Thus, two new variables will be created: firstrows and secondrows.

# Number of remaining lines. Each player occupies three of them:
# a dashed separator, a first row and a second row.
l <- length(data_tb)

# Positions 2, 5, 8, ... : the first row of every player.
firstrows <- data_tb[seq(from = 2, to = l, by = 3)]
# Positions 3, 6, 9, ... : the second row of every player.
secondrows <- data_tb[seq(from = 3, to = l, by = 3)]

head(firstrows)
## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [3]     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## [4]     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
## [5]     5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
## [6]     6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
## 131 Levels: ----------------------------------------------------------------------------------------- ...
head(secondrows)
## [1]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [2]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [3]    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [4]    MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## [5]    MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6]    OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |
## 131 Levels: ----------------------------------------------------------------------------------------- ...

step 3.

The names, scores, and opponents can be extracted from the firstrows.

# Player name: the second "|"-delimited field of the first row, trimmed.
# Taking the whole field (instead of matching two or three space-separated
# words) keeps names with more than three words, or containing characters
# such as a hyphen, intact.
first_fields <- str_split_fixed(as.character(firstrows), fixed("|"), 3)
names <- str_trim(first_fields[, 2])

# Total points: the first "digit.digit" token of each row.
# str_extract() returns exactly one value per row (NA when nothing matches),
# so the scores always stay aligned with the players.
scores <- as.double(str_extract(firstrows, "[:digit:][.][:digit:]"))

# Opponent pair numbers: every run of digits immediately followed by "|".
# Result cells without a number contribute nothing.
opponentstemp <- str_extract_all(firstrows, "[:digit:]+?\\|")

# Strip the trailing "|" element by element. Working on each character
# vector directly avoids coercing the whole list to text first, which would
# turn an empty element into the string "character(0)" and yield a bogus "0".
opponents <- lapply(opponentstemp, str_extract, pattern = "\\d+")

head(names)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
head(scores)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
head(opponents)
## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4" 
## 
## [[2]]
## [1] "63" "58" "4"  "17" "16" "20" "7" 
## 
## [[3]]
## [1] "8"  "61" "25" "21" "11" "13" "12"
## 
## [[4]]
## [1] "23" "28" "2"  "26" "5"  "19" "1" 
## 
## [[5]]
## [1] "45" "37" "12" "13" "4"  "14" "17"
## 
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"

step 4.

The states and pre-ratings can be extracted from the secondrows.

# State code: the first run of word characters on each second row.
states <- str_extract(string = secondrows, pattern = "\\w+")

# Pre-rating candidate: the first run of one to four digits that has a
# non-digit character on both sides, so longer digit runs cannot match.
preratingtemp <- str_extract(
  string = secondrows,
  pattern = "[^[:digit:]][:digit:]{1,4}[^[:digit:]]"
)

# Keep only the digits of the candidate and convert them to integer.
prerating <- as.integer(str_extract(string = preratingtemp, pattern = "\\d+"))

head(states)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
head(prerating)
## [1] 1794 1553 1384 1716 1655 1686

step 5.

Get the average pre-rating of every player's opponents.

# Average pre-rating of each player's opponents.
# `opponents` holds, per player, the pair numbers of the opponents faced;
# those numbers are used directly as positions in `prerating`.
# vapply() over the list replaces the 1:length() loop, which would iterate
# over c(1, 0) on empty input and grew the result one element at a time.
avg_preRating <- vapply(
  opponents,
  function(ids) mean(prerating[as.numeric(ids)]),
  numeric(1)
)

# Round half up: add 0.5, then let as.integer() truncate the fraction.
avg_preRating <- as.integer(avg_preRating + 0.5)

head(avg_preRating)
## [1] 1605 1469 1564 1574 1501 1519

step 6.

Compose all vectors generated into a single data frame

# One row per player; each column is named after the vector it comes from.
data <- data.frame(
  names = names,
  states = states,
  scores = scores,
  prerating = prerating,
  avg_preRating = avg_preRating
)

step 7.

Convert the data frame into a csv file

# Export as a regular CSV file. row.names = FALSE leaves out the automatic
# row numbers, so the header line has exactly as many fields as every data
# line and the file opens correctly in spreadsheets and other CSV readers.
write.csv(data, file = "Ali_Harb_Project1.csv", row.names = FALSE)


# Read the file back to confirm that the export round-trips.
Read_csv <- read.csv("Ali_Harb_Project1.csv")

head(Read_csv)
##                 names states scores prerating avg_preRating
## 1            GARY HUA     ON    6.0      1794          1605
## 2     DAKSHESH DARURI     MI    6.0      1553          1469
## 3        ADITYA BAJAJ     MI    6.0      1384          1564
## 4 PATRICK H SCHILLING     MI    5.5      1716          1574
## 5          HANSHI ZUO     MI    5.5      1655          1501
## 6         HANSEN SONG     OH    5.0      1686          1519