Data 607 Project 1

Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

The original datafile will be hosted in Github for consumption.

url <- "https://raw.githubusercontent.com/sortega7878/DATA607PROJ1/master/tournamentinfo.txt"
download_file <- "tournamentinfo.txt" # name of the file on the local machine after download

# the file will be downloaded to the working directory
downloader::download(url, download_file)

# read the file into a vector
conn <- file(download_file, open="r")
tournament.info.raw <- readLines(conn, warn = FALSE)

close(conn)

tournament.info <- str_replace_all(str_replace_all(tournament.info.raw, "->", ">>"), "-{3,}", "")

tournament.info <- tournament.info[tournament.info  != ""]

tournament.info <- tournament.info[-(1:2)]

# determine the format by looking at the contents of the first field in the record

tournament.format1 <- tournament.info[str_detect(str_sub(tournament.info, 1, 6), "[0-9]")]

tournament.format2 <- tournament.info[str_detect(str_sub(tournament.info, 1, 6), "[A-Z]{2,2}")]

Displaying the two different lines of records separated in two different vectors.

head(tournament.format1, 5)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"

head(tournament.format2, 5)

## [1] "   ON | 15445895 / R: 1794   >>1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   >>1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   >>1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   >>1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   >>1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

Now that we separate the two lines of the records we need to parse the contents of each with regular expressions.

First line:

format1.df <- data.frame(player.num   = as.numeric(substr(tournament.format1, 1,  6)),
                         player.name  = str_trim(substr(tournament.format1, 8,  40), side="both"),
                         totalpts    = as.numeric(substr(tournament.format1, 42, 46)),
                         round1       = substr(tournament.format1, 48, 52),
                         round2       = substr(tournament.format1, 54, 58),
                         round3       = substr(tournament.format1, 60, 64),
                         round4       = substr(tournament.format1, 66, 70),
                         round5       = substr(tournament.format1, 72, 76),
                         round6       = substr(tournament.format1, 78, 82),
                         round7       = substr(tournament.format1, 84, 88), stringsAsFactors=FALSE)

Second line:

format2.df <- data.frame(player_state  = str_trim(substr(tournament.format2, 1,  6), side="both"),
                         uscf.id       = str_extract(substr(tournament.format2, 8,  40), "\\d+"),
                         prerating    = as.numeric(str_extract(substr(tournament.format2, 8,  40), "(?<=R: ).\\d+(?=)")),
                         postrating   = as.numeric(str_extract(substr(tournament.format2, 8,  40), "(?<=>>).\\d+(?=)")),
                         stringsAsFactors=FALSE)

Combining the results into a single Data Frame

results.df <- cbind(format1.df, format2.df)
head(results.df, 5)

##   player.num         player.name totalpts round1 round2 round3 round4
## 1          1            GARY HUA      6.0  W  39  W  21  W  18  W  14
## 2          2     DAKSHESH DARURI      6.0  W  63  W  58  L   4  W  17
## 3          3        ADITYA BAJAJ      6.0  L   8  W  61  W  25  W  21
## 4          4 PATRICK H SCHILLING      5.5  W  23  D  28  W   2  W  26
## 5          5          HANSHI ZUO      5.5  W  45  W  37  D  12  D  13
##   round5 round6 round7 player_state  uscf.id prerating postrating
## 1  W   7  D  12  D   4           ON 15445895      1794       1817
## 2  W  16  W  20  W   7           MI 14598900      1553       1663
## 3  W  11  W  13  W  12           MI 14959604      1384       1640
## 4  D   5  W  19  D   1           MI 12616049      1716       1744
## 5  D   4  W  14  W  17           MI 14601533      1655       1690

Continuing parsing process

df.player <- select(results.df, player.num:totalpts, player_state:postrating)
head(df.player, 5)

##   player.num         player.name totalpts player_state  uscf.id prerating
## 1          1            GARY HUA      6.0           ON 15445895      1794
## 2          2     DAKSHESH DARURI      6.0           MI 14598900      1553
## 3          3        ADITYA BAJAJ      6.0           MI 14959604      1384
## 4          4 PATRICK H SCHILLING      5.5           MI 12616049      1716
## 5          5          HANSHI ZUO      5.5           MI 14601533      1655
##   postrating
## 1       1817
## 2       1663
## 3       1640
## 4       1744
## 5       1690

Chess Player Matches Normalized Dataset

The logic below creates a normalized dataset of the player’s matches, The data captured in the round 1 - 7 fields will be split to indicate the outcome of the round and the opponent’s player number.

player.matches <-  
    results.df %>% select(player.num, round1:round7) %>%
    melt(id.var=c("player.num"), value.name="outopp")  %>% 
    mutate( round        = as.numeric(str_replace(variable, "round", "")),
            outcome      = str_extract(outopp, "^\\w+"),
            opponent_num = as.numeric(str_extract(outopp, "\\d+$"))) %>%
    select(-c(variable, outopp)) %>%
    inner_join(select(df.player, player.num, prerating), c("opponent_num" = "player.num")) %>%
    select(player.num, round, outcome, opponent_num, prerating) %>%
    arrange(player.num, round)
    
# rename prerating to opponent_prerating
names(player.matches)[names(player.matches) == "prerating"] <- "opponent_prerating"

Transformed dataset

head(player.matches, 5)

##   player.num round outcome opponent_num opponent_prerating
## 1          1     1       W           39               1436
## 2          1     2       W           21               1563
## 3          1     3       W           18               1600
## 4          1     4       W           14               1610
## 5          1     5       W            7               1649

Aggregate the normalized match dataset to create the value for opponents_avg_prerating. Note: In this calculation, only matches played with a Win, Loss, or Draw status are considered

df.player <-  
    player.matches %>%
    group_by(player.num) %>% summarise(opponents_avg_prerating = round(mean(opponent_prerating))) %>%
    inner_join(df.player, by="player.num") %>%
    select(player.name, player_state, prerating, opponents_avg_prerating, totalpts)
head(df.player, 5)

## # A tibble: 5 × 5
##           player.name player_state prerating opponents_avg_prerating
##                 <chr>        <chr>     <dbl>                   <dbl>
## 1            GARY HUA           ON      1794                    1605
## 2     DAKSHESH DARURI           MI      1553                    1469
## 3        ADITYA BAJAJ           MI      1384                    1564
## 4 PATRICK H SCHILLING           MI      1716                    1574
## 5          HANSHI ZUO           MI      1655                    1501
## # ... with 1 more variables: totalpts <dbl>

The final dataframe will be sent to a CSV file for further consumption.

write.csv(df.player, "Chess_Results.csv", row.names=FALSE)

Data 607 Project 1

Sergio Ortega Cruz

September 15, 2018

Project 1