In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players; - Player’s Name - Player’s State - Total Number of Points - Player’s Pre-Rating - Average Pre Chess Rating of Opponents

load library

require(stringr) 
## Loading required package: stringr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidyr)
## Loading required package: tidyr
require(knitr)
## Loading required package: knitr
require(readr)
## Loading required package: readr

load the data

tournamentinfo = readLines("/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/tournamentinfo.txt")
## Warning in readLines("/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/
## data 607/ASSIGNMENTS/tournamentinfo.txt"): incomplete final line found on
## '/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/
## tournamentinfo.txt'

removing dividers and rows

tournamentinfo = str_replace_all(tournamentinfo,"-","")

remove empty column

tourn = read.delim(textConnection(tournamentinfo),header = FALSE, sep = "|", stringsAsFactors = FALSE)

tourn = tourn[,-11]

simplify and add header

tournheader = paste(trimws(as.character(tourn[1,])), trimws(as.character(tourn[2,])))

tournheader = make.names(str_sub(tournheader, 1,11))

tourn = as.tbl(tourn[-1:-2,])
names(tourn) = tournheader

fix positioning of player id and rank

playid = seq(1,nrow(tourn),2)
srank = seq(2,nrow(tourn),2)

tournplayers = tourn[playid,]
stateranks = select(tourn[srank,], State = Pair.Num, Pre.Rating = Player.Name)

bind coloumns then display work done

tournplayers = bind_cols(tournplayers, stateranks)

tournplayers[1:5,]
## # A tibble: 5 x 12
##   Pair.Num Player.Name Total.Pts Round.1 Round.2 Round.3 Round.4 Round.5
##   <chr>    <chr>       <chr>     <chr>   <chr>   <chr>   <chr>   <chr>  
## 1 "    1 " " GARY HUA… "6.0  "   W  39   W  21   W  18   W  14   W   7  
## 2 "    2 " " DAKSHESH… "6.0  "   W  63   W  58   L   4   W  17   W  16  
## 3 "    3 " " ADITYA B… "6.0  "   L   8   W  61   W  25   W  21   W  11  
## 4 "    4 " " PATRICK … "5.5  "   W  23   D  28   W   2   W  26   D   5  
## 5 "    5 " " HANSHI Z… "5.5  "   W  45   W  37   D  12   D  13   D   4  
## # … with 4 more variables: Round.6 <chr>, Round.7 <chr>, State <chr>,
## #   Pre.Rating <chr>

clean pre rating

tournplayers = tournplayers %>% 
  mutate(Pre.Rating = str_replace(Pre.Rating,"\\d+\\s+/ R:\\s+","")) %>% 
  mutate(Pre.Rating = str_replace(Pre.Rating,"^[P].+|>.+","")) %>% 
  mutate(Pre.Rating = as.integer(str_extract(Pre.Rating, "\\d+"))) 

clean player name, points and pair

tournplayers = tournplayers %>% 
  mutate(Total.Pts = as.numeric(Total.Pts),
         Pair.Num = parse_number(Pair.Num),
         Player.Name = trimws(Player.Name))

tidy format

tournplayers = tournplayers %>%
  gather("Round","OppPairNum", 4:10) %>% 
  mutate(OppPairNum = parse_number(OppPairNum))
## Warning: 40 parsing failures.
## row col expected actual
##  37  -- a number      B
##  44  -- a number      B
##  53  -- a number      H
##  56  -- a number      H
## 123  -- a number      B
## ... ... ........ ......
## See problems(...) for more details.

join player pre rating

pratings = select(tournplayers, Pair.Num,OppPreRating = Pre.Rating) %>% distinct()

tournplayers = left_join(tournplayers, pratings, by = c("OppPairNum" = "Pair.Num"))
## Warning: Column `OppPairNum`/`Pair.Num` has different attributes on LHS and
## RHS of join

calculate average opponent rating in final table, preview table

tournplayers = tournplayers %>% 
  group_by(Player.Name) %>% 
  mutate(AvgOppPreRating = round(mean(OppPreRating, na.rm=T)))

result = tournplayers %>% 
  select(Player.Name, Player.State = State, Total.Pts, Pre.Rating, AvgOppPreRating) %>% 
  ungroup() %>% 
  distinct()

result[1:10,]
## # A tibble: 10 x 5
##    Player.Name         Player.State Total.Pts Pre.Rating AvgOppPreRating
##    <chr>               <chr>            <dbl>      <int>           <dbl>
##  1 GARY HUA            "   ON "           6         1794            1605
##  2 DAKSHESH DARURI     "   MI "           6         1553            1469
##  3 ADITYA BAJAJ        "   MI "           6         1384            1564
##  4 PATRICK H SCHILLING "   MI "           5.5       1716            1574
##  5 HANSHI ZUO          "   MI "           5.5       1655            1501
##  6 HANSEN SONG         "   OH "           5         1686            1519
##  7 GARY DEE SWATHELL   "   MI "           5         1649            1372
##  8 EZEKIEL HOUGHTON    "   MI "           5         1641            1468
##  9 STEFANO LEE         "   ON "           5         1411            1523
## 10 ANVIT RAO           "   MI "           5         1365            1554

export csv

write_csv(result, "/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/607-Proj1.csv")