Project1

Data Manipulation

The table above revealed some useful pattern information:

Each player's information is provided in two rows that repeat sequentially, with each pair of rows separated by a dashed-line row.
- The first row is the dashed line.
- The second row is the first line of the player’s information.
- The third row is the second line of the player’s information.

As shown, the same type of information is repeated every three rows (rows n and n+3 hold the same kind of data).

Throughout the rest of this document, data will be extracted and manipulated based on the above observation.

step 1.

The first three rows of the table are informational: they provide the column descriptions and a dashed-line separator. They are not needed, so they are going to be eliminated.

# Drop the three leading informational rows; everything after them is kept.
header_rows <- seq_len(3)
data_tb <- rawdata_tb[-header_rows, ]
head(data_tb)

## [1] -----------------------------------------------------------------------------------------
## [2]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [3]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [4] -----------------------------------------------------------------------------------------
## [5]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [6]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 131 Levels: ----------------------------------------------------------------------------------------- ...

step 2.

The dashed lines must be eliminated, and similar data must be grouped into two separate vectors to facilitate string manipulation using regular expressions. Thus, two new variables will be created: firstrows and secondrows.

# Total number of remaining lines; the layout repeats every three lines
# (dashed line, first player line, second player line).
l <- length(data_tb)

# Positions 2, 5, 8, ... up to 'l' hold the first line of each player.
first_idx <- seq(from = 2, to = l, by = 3)
firstrows <- data_tb[first_idx]
# Positions 3, 6, 9, ... up to 'l' hold the second line of each player.
second_idx <- seq(from = 3, to = l, by = 3)
secondrows <- data_tb[second_idx]

head(firstrows)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [3]     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## [4]     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
## [5]     5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
## [6]     6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
## 131 Levels: ----------------------------------------------------------------------------------------- ...

# Preview the leading entries of secondrows to confirm the split worked.
head(secondrows)

## [1]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [2]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [3]    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [4]    MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## [5]    MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6]    OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |
## 131 Levels: ----------------------------------------------------------------------------------------- ...

step 3.

The names, scores, and opponents can be extracted from the firstrows.

# Player name: the text between the first two '|' separators, with the
# surrounding padding removed. Taking the whole field (instead of matching
# "2 to 3 words") keeps names that have four or more words or contain
# characters such as '-' intact.
names <- str_match(firstrows, "\\|\\s*([^|]+?)\\s*\\|")[, 2]

# Total score: the first decimal number on the line (e.g. "6.0").
# str_extract() returns exactly one value per line (NA when absent), so the
# result always stays aligned with 'names'.
scores <- as.double(str_extract(firstrows, "\\d+\\.\\d+"))

# Opponent pair numbers: every run of digits that is immediately followed
# by '|'. The look-ahead leaves the '|' out of the match, so no second
# clean-up pass over an intermediate list is needed.
# The result is a list with one character vector per player.
opponents <- str_extract_all(firstrows, "\\d+(?=\\|)")

head(names)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

# Preview the leading entries of the parsed scores.
head(scores)

## [1] 6.0 6.0 6.0 5.5 5.5 5.0

# Preview the opponent numbers extracted for the leading players.
head(opponents)

## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4" 
## 
## [[2]]
## [1] "63" "58" "4"  "17" "16" "20" "7" 
## 
## [[3]]
## [1] "8"  "61" "25" "21" "11" "13" "12"
## 
## [[4]]
## [1] "23" "28" "2"  "26" "5"  "19" "1" 
## 
## [[5]]
## [1] "45" "37" "12" "13" "4"  "14" "17"
## 
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"

step 4.

The states and pre-ratings can be extracted from secondrows.

# State: the first run of word characters on the line (e.g. "ON", "MI").
states <- str_extract(secondrows, "\\w+")

# Pre-rating: the digits that directly follow the "R:" label. Anchoring on
# the label avoids depending on the length of the ID that precedes it, and
# stops at any non-digit suffix after the rating.
prerating <- as.integer(str_match(secondrows, "R:\\s*(\\d+)")[, 2])

head(states)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

# Preview the leading entries of the parsed pre-ratings.
head(prerating)

## [1] 1794 1553 1384 1716 1655 1686

step 5.

Get the average pre-rating of each player's opponents.

# For every player, average the pre-ratings of the opponents they faced.
# 'opponents' holds, per player, the opponents' pair numbers as strings;
# those numbers are used as positions into 'prerating'.
# vapply() returns one double per player without growing a vector in a loop
# and is safe when there are no players at all.
avg_preRating <- vapply(
  opponents,
  function(opponent_ids) {
    mean(prerating[as.integer(opponent_ids)])
  },
  numeric(1)
)

# Adding 0.5 before truncating with as.integer() rounds halves up.
avg_preRating <- as.integer(avg_preRating + 0.5)

head(avg_preRating)

## [1] 1605 1469 1564 1574 1501 1519

step 6.

Compose all vectors generated into a single data frame

# Assemble the per-player vectors into one table, one column per vector.
data <- data.frame(
  names = names,
  states = states,
  scores = scores,
  prerating = prerating,
  avg_preRating = avg_preRating
)

step 7.

Convert the data frame into a csv file

# Write the table as comma-separated values.
# row.names = FALSE keeps the header aligned with the data: with row names
# written and a plain header, the header line has one field fewer than each
# data row, which misaligns the columns for most CSV readers.
write.table(
  data,
  file = "Ali_Harb_Project1.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

# Read the file back to verify that it round-trips.
Read_csv <- read.csv("Ali_Harb_Project1.csv")

head(Read_csv)

##                 names states scores prerating avg_preRating
## 1            GARY HUA     ON    6.0      1794          1605
## 2     DAKSHESH DARURI     MI    6.0      1553          1469
## 3        ADITYA BAJAJ     MI    6.0      1384          1564
## 4 PATRICK H SCHILLING     MI    5.5      1716          1574
## 5          HANSHI ZUO     MI    5.5      1655          1501
## 6         HANSEN SONG     OH    5.0      1686          1519

Project1

Ali Harb

September 25, 2016

Introduction

Load Required Libraries

Load Data