DATA 607: Project 1

Abstract

In this project, goal is to create dataset from given a text file with chess tournament results where the information has some structure. I will create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

Data Import

line <- readLines('tournamentinfo.txt')
head(line,10)

 [1] "-----------------------------------------------------------------------------------------" 
 [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
 [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
 [4] "-----------------------------------------------------------------------------------------" 
 [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
 [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
 [7] "-----------------------------------------------------------------------------------------" 
 [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
 [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
[10] "-----------------------------------------------------------------------------------------"

Data Grouping

However, the data is already essentially laid out in a readable table format, it is not useable for analysis. Beacause it contains different syntex. Table has pattern which repeates everytime.

Column Split

Let’s find out how much space each column take to hold the information.

pair_start_point <- 1
pair_end_point <- unname(str_locate_all(pattern = '\\|', line[5])[[1]][1,1]) -1

player_start_point <-pair_end_point+2
player_end_point <- unname(str_locate_all(pattern = '\\|', line[5])[[1]][2,1]) -1

total_start_point <- player_end_point+2
total_end_point <- unname(str_locate_all(pattern = '\\|', line[5])[[1]][3,1])-1

opp_start_point<-total_end_point+2
end_point<-unname(str_locate_all(pattern = '\\|', line[5])[[1]][10,1])

raw Split

Each player respresents 2 line of record which reapeat after 3 lines. I will use sequence which will help to fetch record from text file and will skip 3 line and fatch it again.

playername_line = 5
uscf_id_line = 6
total_line = 196
repeat_pattern=3

group1 <- line[seq(playername_line, total_line, repeat_pattern)]
group2 <- line[seq(uscf_id_line, total_line, repeat_pattern)]

we need 5 differnt columns 1. player name 2. player state 3. total point 4. Pre rating 5. avg.rating

Player Name

fetch_name  <- substr(group1, player_start_point, player_end_point)
PlayerName <- str_to_title(str_trim(fetch_name))
PlayerName

##  [1] "Gary Hua"                   "Dakshesh Daruri"           
##  [3] "Aditya Bajaj"               "Patrick H Schilling"       
##  [5] "Hanshi Zuo"                 "Hansen Song"               
##  [7] "Gary Dee Swathell"          "Ezekiel Houghton"          
##  [9] "Stefano Lee"                "Anvit Rao"                 
## [11] "Cameron William Mc Leman"   "Kenneth J Tack"            
## [13] "Torrance Henry Jr"          "Bradley Shaw"              
## [15] "Zachary James Houghton"     "Mike Nikitin"              
## [17] "Ronald Grzegorczyk"         "David Sundeen"             
## [19] "Dipankar Roy"               "Jason Zheng"               
## [21] "Dinh Dang Bui"              "Eugene L Mcclure"          
## [23] "Alan Bui"                   "Michael R Aldrich"         
## [25] "Loren Schwiebert"           "Max Zhu"                   
## [27] "Gaurav Gidwani"             "Sofia Adina Stanescu-Bellu"
## [29] "Chiedozie Okorie"           "George Avery Jones"        
## [31] "Rishi Shetty"               "Joshua Philip Mathews"     
## [33] "Jade Ge"                    "Michael Jeffery Thomas"    
## [35] "Joshua David Lee"           "Siddharth Jha"             
## [37] "Amiyatosh Pwnanandam"       "Brian Liu"                 
## [39] "Joel R Hendon"              "Forest Zhang"              
## [41] "Kyle William Murphy"        "Jared Ge"                  
## [43] "Robert Glen Vasey"          "Justin D Schilling"        
## [45] "Derek Yan"                  "Jacob Alexander Lavalley"  
## [47] "Eric Wright"                "Daniel Khain"              
## [49] "Michael J Martin"           "Shivam Jha"                
## [51] "Tejas Ayyagari"             "Ethan Guo"                 
## [53] "Jose C Ybarra"              "Larry Hodge"               
## [55] "Alex Kong"                  "Marisa Ricci"              
## [57] "Michael Lu"                 "Viraj Mohile"              
## [59] "Sean M Mc Cormick"          "Julia Shen"                
## [61] "Jezzel Farkas"              "Ashwin Balaji"             
## [63] "Thomas Joseph Hosmer"       "Ben Li"

Player State

fetch_state <- substr(group2, pair_start_point, pair_end_point)
State <- str_trim(fetch_state)
State

##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"

# start our df
chess <- data.frame(PlayerName, State)
chess

##                    PlayerName State
## 1                    Gary Hua    ON
## 2             Dakshesh Daruri    MI
## 3                Aditya Bajaj    MI
## 4         Patrick H Schilling    MI
## 5                  Hanshi Zuo    MI
## 6                 Hansen Song    OH
## 7           Gary Dee Swathell    MI
## 8            Ezekiel Houghton    MI
## 9                 Stefano Lee    ON
## 10                  Anvit Rao    MI
## 11   Cameron William Mc Leman    MI
## 12             Kenneth J Tack    MI
## 13          Torrance Henry Jr    MI
## 14               Bradley Shaw    MI
## 15     Zachary James Houghton    MI
## 16               Mike Nikitin    MI
## 17         Ronald Grzegorczyk    MI
## 18              David Sundeen    MI
## 19               Dipankar Roy    MI
## 20                Jason Zheng    MI
## 21              Dinh Dang Bui    ON
## 22           Eugene L Mcclure    MI
## 23                   Alan Bui    ON
## 24          Michael R Aldrich    MI
## 25           Loren Schwiebert    MI
## 26                    Max Zhu    ON
## 27             Gaurav Gidwani    MI
## 28 Sofia Adina Stanescu-Bellu    MI
## 29           Chiedozie Okorie    MI
## 30         George Avery Jones    ON
## 31               Rishi Shetty    MI
## 32      Joshua Philip Mathews    ON
## 33                    Jade Ge    MI
## 34     Michael Jeffery Thomas    MI
## 35           Joshua David Lee    MI
## 36              Siddharth Jha    MI
## 37       Amiyatosh Pwnanandam    MI
## 38                  Brian Liu    MI
## 39              Joel R Hendon    MI
## 40               Forest Zhang    MI
## 41        Kyle William Murphy    MI
## 42                   Jared Ge    MI
## 43          Robert Glen Vasey    MI
## 44         Justin D Schilling    MI
## 45                  Derek Yan    MI
## 46   Jacob Alexander Lavalley    MI
## 47                Eric Wright    MI
## 48               Daniel Khain    MI
## 49           Michael J Martin    MI
## 50                 Shivam Jha    MI
## 51             Tejas Ayyagari    MI
## 52                  Ethan Guo    MI
## 53              Jose C Ybarra    MI
## 54                Larry Hodge    MI
## 55                  Alex Kong    MI
## 56               Marisa Ricci    MI
## 57                 Michael Lu    MI
## 58               Viraj Mohile    MI
## 59          Sean M Mc Cormick    MI
## 60                 Julia Shen    MI
## 61              Jezzel Farkas    ON
## 62              Ashwin Balaji    MI
## 63       Thomas Joseph Hosmer    MI
## 64                     Ben Li    MI

Total Points

point <- as.numeric(substr(group1, total_start_point, total_end_point))
chess$TotalPoints <- point
point

##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0

Pre-Rating

pRating <- substr(group2, player_start_point, player_end_point)
pRating <- str_extract(pRating, ': * [[:digit:]]{2,}') %>% parse_number()
chess$PreRating <- pRating
pRating

##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163

Avg Opponent’s Pre-Rating

fetch_opp <- substr(group1, opp_start_point, end_point)
fetch_opp <- as.matrix(str_extract_all(fetch_opp, '\\b\\d{1,}'))

calculate_rating <- function(x, y) {
    temp <- x[y]
   
    for (i in temp){
        rate <- 0
        k <- 0
        for(j in i) {
            k <- k + 1
            rate <- rate + chess$PreRating[as.numeric(j)]
        }
        rate <- round(rate / k)
       
    }
    return(rate)
}

chess$AvgOppPreRating <- apply(fetch_opp, 1, calculate_rating)

Export

datatable(chess)

# export
write.csv(chess, "chess.csv", row.names=FALSE)

Conclusion

The data was already essentially laid out in a readable table format, we were able to use see a pattern and split the data.