Data_607_Project

Load required libraries

library(stringr)

Load the data file and have a look at the first few lines

tournamentinfo <- read.delim("https://raw.githubusercontent.com/ezaccountz/-ezaccountz-Data_607_Project_1/master/tournamentinfo.txt", header = FALSE)
tournamentinfo <- unlist(tournamentinfo)
head(tournamentinfo)

##                                                                                        V11 
##  ----------------------------------------------------------------------------------------- 
##                                                                                        V12 
##  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|  
##                                                                                        V13 
##  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  |  
##                                                                                        V14 
##  ----------------------------------------------------------------------------------------- 
##                                                                                        V15 
##      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| 
##                                                                                        V16 
##     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
## 131 Levels: ----------------------------------------------------------------------------------------- ...

Remove the header

tournamentinfo <- tournamentinfo[c(5:length(tournamentinfo))]
head(tournamentinfo)

##                                                                                       V15 
##     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| 
##                                                                                       V16 
##    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
##                                                                                       V17 
## ----------------------------------------------------------------------------------------- 
##                                                                                       V18 
##     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7| 
##                                                                                       V19 
##    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    | 
##                                                                                      V110 
## ----------------------------------------------------------------------------------------- 
## 131 Levels: ----------------------------------------------------------------------------------------- ...

Extract the lines with information about a player’s pair number, name, total points and opponents and game results

name_score <- tournamentinfo[str_detect(tournamentinfo, "\\d+\\s\\|\\s[:alpha:]+")]
head(name_score)

##                                                                                       V15 
##     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| 
##                                                                                       V18 
##     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7| 
##                                                                                      V111 
##     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12| 
##                                                                                      V114 
##     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1| 
##                                                                                      V117 
##     5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17| 
##                                                                                      V120 
##     6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21| 
## 131 Levels: ----------------------------------------------------------------------------------------- ...

Extract the lines with information about a player’s USCF ID, state, pre-rating and post rating

state_rating <- tournamentinfo[str_detect(tournamentinfo, "[:alpha:]+\\s\\|\\s\\d+")]
head(state_rating)

##                                                                                       V16 
##    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
##                                                                                       V19 
##    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    | 
##                                                                                      V112 
##    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
##                                                                                      V115 
##    MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    | 
##                                                                                      V118 
##    MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    | 
##                                                                                      V121 
##    OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    | 
## 131 Levels: ----------------------------------------------------------------------------------------- ...

Extract the pair number

pair_num <- str_extract(str_extract(name_score, "\\d+\\s\\|\\s[:alpha:]+"),"\\d+")
head(pair_num)

## [1] "1" "2" "3" "4" "5" "6"

To obtain the players’ names, extract the string from the first “| |” area, then remove the “|”s and trim the string

player_name <- str_extract(name_score,"(\\|).+?\\|")
player_name <- str_remove_all(player_name, "[\\|]")
player_name <- str_trim(player_name)
head(player_name)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

Extract the total points from the second “| |”

name_score <- str_remove(name_score, ".*?\\|.*?\\|")
total_points <- str_extract(name_score, ".*?\\|")
total_points <- as.numeric(str_remove_all(total_points, "[\\s\\|]"))
head(total_points)

## [1] 6.0 6.0 6.0 5.5 5.5 5.0

Extract the opponents’ pair numbers and the game result recurring from the 3rd to 9th “| |”

opponents <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(opponents) <- str_c("opponent",1:7)

game_result <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(game_result) <- str_c("opponent",1:7)

for(i in c(1:7)) {
  name_score <- str_remove(name_score, ".*?\\|")
  opponents[,i] <- str_extract(name_score, ".*?\\|")
  game_result[,i] <- str_extract(opponents[,i], "[:alpha:]")
  opponents[,i] <- as.numeric(str_remove_all(opponents[,i], "[[:alpha:]\\s\\|]"))
}

head(opponents)

##   opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1        39        21        18        14         7        12         4
## 2        63        58         4        17        16        20         7
## 3         8        61        25        21        11        13        12
## 4        23        28         2        26         5        19         1
## 5        45        37        12        13         4        14        17
## 6        34        29        11        35        10        27        21

head(game_result)

##   opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1         W         W         W         W         W         D         D
## 2         W         W         L         W         W         W         W
## 3         L         W         W         W         W         W         W
## 4         W         D         W         W         D         W         D
## 5         W         W         D         D         D         W         W
## 6         W         D         L         W         D         W         W

Now we go to our other set of extracted lines. Extract the state information

state <- str_extract(state_rating, ".*?\\|")
state <- str_remove_all(state, "[\\s\\|]")
head(state)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

Extract the USCF ID, which is within the first “|” and the first “/”

state_rating <- str_remove(state_rating, ".*?\\|")
uscf_id <- str_extract(state_rating, ".*?\\/")
uscf_id <- str_remove_all(uscf_id, "[\\s\\/]")
head(uscf_id)

## [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"

Extract the pre rating, which is within the string after the first “/” and endings with the rating (number) followed by a space or character

state_rating <- str_remove(state_rating, ".*?\\/")
pre_rating <- str_extract(state_rating, ".*?\\d+[[:alpha:]\\s]")
pre_rating <- as.numeric(str_extract(pre_rating,"\\d+"))
head(pre_rating)

## [1] 1794 1553 1384 1716 1655 1686

Extract the post rating, which is within the string after “the first”->" and endings with the rating (number) followed by a space

state_rating <- str_remove(state_rating, ".*?\\-\\>")
post_rating <- str_extract(state_rating,".*?\\d+[[:alpha:]\\s]")
post_rating <- as.numeric(str_extract(post_rating,"\\d+"))
head(post_rating)

## [1] 1817 1663 1640 1744 1690 1687

According to the pair number of the opponents, we find the pre ratings of the opponents and store them in a table

opponents_rating <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(opponents_rating) <- str_c("opponent",1:7)
for (i in (1:7)) {
  for (j in (1: length(pair_num)))
    opponents_rating[j,i] <- ifelse(is.na(opponents[j,i]),NA, as.numeric(pre_rating[pair_num == opponents[j,i]]))
}
head(opponents_rating)

##   opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1      1436      1563      1600      1610      1649      1663      1716
## 2      1175       917      1716      1629      1604      1595      1649
## 3      1641       955      1745      1563      1712      1666      1663
## 4      1363      1507      1553      1579      1655      1564      1794
## 5      1242       980      1663      1666      1716      1610      1629
## 6      1399      1602      1712      1438      1365      1552      1563

Finally we calculate the average pre rating of the opponents, rounded to the nearest integer

average_rating <- round(apply(opponents_rating, 1, mean,na.rm=TRUE),0)
head(average_rating)

## [1] 1605 1469 1564 1574 1501 1519

Combine all rate into a detailed table

detailed_data <- data.frame(pair_num,player_name,uscf_id,state,total_points,pre_rating, post_rating, 
                           opponents$opponent1,opponents_rating$opponent1,game_result$opponent1,
                           opponents$opponent2,opponents_rating$opponent2,game_result$opponent2,
                           opponents$opponent3,opponents_rating$opponent3,game_result$opponent3,
                           opponents$opponent4,opponents_rating$opponent4,game_result$opponent4,
                           opponents$opponent5,opponents_rating$opponent5,game_result$opponent5,
                           opponents$opponent6,opponents_rating$opponent6,game_result$opponent6,
                           opponents$opponent7,opponents_rating$opponent7,game_result$opponent7,
                           average_rating)
colnames(detailed_data) <- c("Pair Number", "Name", "USCF ID", "State", "Total Points", "Pre Rating", "Post Rating",
                             "Opponent 1 Pair Number", "Opponent 1 Pre Rating", "Game Result vs.Opponent 1",
                             "Opponent 2 Pair Number", "Opponent 2 Pre Rating", "Game Result vs.Opponent 2",
                             "Opponent 3 Pair Number", "Opponent 3 Pre Rating", "Game Result vs.Opponent 3",
                             "Opponent 4 Pair Number", "Opponent 4 Pre Rating", "Game Result vs.Opponent 4",
                             "Opponent 5 Pair Number", "Opponent 5 Pre Rating", "Game Result vs.Opponent 5",
                             "Opponent 6 Pair Number", "Opponent 6 Pre Rating", "Game Result vs.Opponent 6",
                             "Opponent 7 Pair Number", "Opponent 7 Pre Rating", "Game Result vs.Opponent 7",
                             "Average Pre Rating of Opponents"
                            )
head(detailed_data)

##   Pair Number                Name  USCF ID State Total Points Pre Rating
## 1           1            GARY HUA 15445895    ON          6.0       1794
## 2           2     DAKSHESH DARURI 14598900    MI          6.0       1553
## 3           3        ADITYA BAJAJ 14959604    MI          6.0       1384
## 4           4 PATRICK H SCHILLING 12616049    MI          5.5       1716
## 5           5          HANSHI ZUO 14601533    MI          5.5       1655
## 6           6         HANSEN SONG 15055204    OH          5.0       1686
##   Post Rating Opponent 1 Pair Number Opponent 1 Pre Rating
## 1        1817                     39                  1436
## 2        1663                     63                  1175
## 3        1640                      8                  1641
## 4        1744                     23                  1363
## 5        1690                     45                  1242
## 6        1687                     34                  1399
##   Game Result vs.Opponent 1 Opponent 2 Pair Number Opponent 2 Pre Rating
## 1                         W                     21                  1563
## 2                         W                     58                   917
## 3                         L                     61                   955
## 4                         W                     28                  1507
## 5                         W                     37                   980
## 6                         W                     29                  1602
##   Game Result vs.Opponent 2 Opponent 3 Pair Number Opponent 3 Pre Rating
## 1                         W                     18                  1600
## 2                         W                      4                  1716
## 3                         W                     25                  1745
## 4                         D                      2                  1553
## 5                         W                     12                  1663
## 6                         D                     11                  1712
##   Game Result vs.Opponent 3 Opponent 4 Pair Number Opponent 4 Pre Rating
## 1                         W                     14                  1610
## 2                         L                     17                  1629
## 3                         W                     21                  1563
## 4                         W                     26                  1579
## 5                         D                     13                  1666
## 6                         L                     35                  1438
##   Game Result vs.Opponent 4 Opponent 5 Pair Number Opponent 5 Pre Rating
## 1                         W                      7                  1649
## 2                         W                     16                  1604
## 3                         W                     11                  1712
## 4                         W                      5                  1655
## 5                         D                      4                  1716
## 6                         W                     10                  1365
##   Game Result vs.Opponent 5 Opponent 6 Pair Number Opponent 6 Pre Rating
## 1                         W                     12                  1663
## 2                         W                     20                  1595
## 3                         W                     13                  1666
## 4                         D                     19                  1564
## 5                         D                     14                  1610
## 6                         D                     27                  1552
##   Game Result vs.Opponent 6 Opponent 7 Pair Number Opponent 7 Pre Rating
## 1                         D                      4                  1716
## 2                         W                      7                  1649
## 3                         W                     12                  1663
## 4                         W                      1                  1794
## 5                         W                     17                  1629
## 6                         W                     21                  1563
##   Game Result vs.Opponent 7 Average Pre Rating of Opponents
## 1                         D                            1605
## 2                         W                            1469
## 3                         W                            1564
## 4                         D                            1574
## 5                         W                            1501
## 6                         W                            1519

Also, create a summaried data table as requested to be the answer to this project

summarized_data <- data.frame(player_name,state,total_points,pre_rating,average_rating)
colnames(summarized_data) <- c("Name", "State", "Total Points", "Pre Rating","Average Pre Rating of Opponents")
head(summarized_data)

##                  Name State Total Points Pre Rating
## 1            GARY HUA    ON          6.0       1794
## 2     DAKSHESH DARURI    MI          6.0       1553
## 3        ADITYA BAJAJ    MI          6.0       1384
## 4 PATRICK H SCHILLING    MI          5.5       1716
## 5          HANSHI ZUO    MI          5.5       1655
## 6         HANSEN SONG    OH          5.0       1686
##   Average Pre Rating of Opponents
## 1                            1605
## 2                            1469
## 3                            1564
## 4                            1574
## 5                            1501
## 6                            1519

Export the data tables into .csv files

write.csv(summarized_data, file = "summarized_data.csv")
write.csv(detailed_data, file = "detailed_data.csv",row.names=FALSE)

Data_607_Project_1

Euclid Zhang

9/15/2019