In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players; - Player’s Name - Player’s State - Total Number of Points - Player’s Pre-Rating - Average Pre Chess Rating of Opponents
load library
require(stringr)
## Loading required package: stringr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidyr)
## Loading required package: tidyr
require(knitr)
## Loading required package: knitr
require(readr)
## Loading required package: readr
load the data
tournamentinfo = readLines("/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/tournamentinfo.txt")
## Warning in readLines("/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/
## data 607/ASSIGNMENTS/tournamentinfo.txt"): incomplete final line found on
## '/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/
## tournamentinfo.txt'
removing dividers and rows
tournamentinfo = str_replace_all(tournamentinfo,"-","")
remove empty column
tourn = read.delim(textConnection(tournamentinfo),header = FALSE, sep = "|", stringsAsFactors = FALSE)
tourn = tourn[,-11]
simplify and add header
tournheader = paste(trimws(as.character(tourn[1,])), trimws(as.character(tourn[2,])))
tournheader = make.names(str_sub(tournheader, 1,11))
tourn = as.tbl(tourn[-1:-2,])
names(tourn) = tournheader
fix positioning of player id and rank
playid = seq(1,nrow(tourn),2)
srank = seq(2,nrow(tourn),2)
tournplayers = tourn[playid,]
stateranks = select(tourn[srank,], State = Pair.Num, Pre.Rating = Player.Name)
bind coloumns then display work done
tournplayers = bind_cols(tournplayers, stateranks)
tournplayers[1:5,]
## # A tibble: 5 x 12
## Pair.Num Player.Name Total.Pts Round.1 Round.2 Round.3 Round.4 Round.5
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 " 1 " " GARY HUA… "6.0 " W 39 W 21 W 18 W 14 W 7
## 2 " 2 " " DAKSHESH… "6.0 " W 63 W 58 L 4 W 17 W 16
## 3 " 3 " " ADITYA B… "6.0 " L 8 W 61 W 25 W 21 W 11
## 4 " 4 " " PATRICK … "5.5 " W 23 D 28 W 2 W 26 D 5
## 5 " 5 " " HANSHI Z… "5.5 " W 45 W 37 D 12 D 13 D 4
## # … with 4 more variables: Round.6 <chr>, Round.7 <chr>, State <chr>,
## # Pre.Rating <chr>
clean pre rating
tournplayers = tournplayers %>%
mutate(Pre.Rating = str_replace(Pre.Rating,"\\d+\\s+/ R:\\s+","")) %>%
mutate(Pre.Rating = str_replace(Pre.Rating,"^[P].+|>.+","")) %>%
mutate(Pre.Rating = as.integer(str_extract(Pre.Rating, "\\d+")))
clean player name, points and pair
tournplayers = tournplayers %>%
mutate(Total.Pts = as.numeric(Total.Pts),
Pair.Num = parse_number(Pair.Num),
Player.Name = trimws(Player.Name))
tidy format
tournplayers = tournplayers %>%
gather("Round","OppPairNum", 4:10) %>%
mutate(OppPairNum = parse_number(OppPairNum))
## Warning: 40 parsing failures.
## row col expected actual
## 37 -- a number B
## 44 -- a number B
## 53 -- a number H
## 56 -- a number H
## 123 -- a number B
## ... ... ........ ......
## See problems(...) for more details.
join player pre rating
pratings = select(tournplayers, Pair.Num,OppPreRating = Pre.Rating) %>% distinct()
tournplayers = left_join(tournplayers, pratings, by = c("OppPairNum" = "Pair.Num"))
## Warning: Column `OppPairNum`/`Pair.Num` has different attributes on LHS and
## RHS of join
calculate average opponent rating in final table, preview table
tournplayers = tournplayers %>%
group_by(Player.Name) %>%
mutate(AvgOppPreRating = round(mean(OppPreRating, na.rm=T)))
result = tournplayers %>%
select(Player.Name, Player.State = State, Total.Pts, Pre.Rating, AvgOppPreRating) %>%
ungroup() %>%
distinct()
result[1:10,]
## # A tibble: 10 x 5
## Player.Name Player.State Total.Pts Pre.Rating AvgOppPreRating
## <chr> <chr> <dbl> <int> <dbl>
## 1 GARY HUA " ON " 6 1794 1605
## 2 DAKSHESH DARURI " MI " 6 1553 1469
## 3 ADITYA BAJAJ " MI " 6 1384 1564
## 4 PATRICK H SCHILLING " MI " 5.5 1716 1574
## 5 HANSHI ZUO " MI " 5.5 1655 1501
## 6 HANSEN SONG " OH " 5 1686 1519
## 7 GARY DEE SWATHELL " MI " 5 1649 1372
## 8 EZEKIEL HOUGHTON " MI " 5 1641 1468
## 9 STEFANO LEE " ON " 5 1411 1523
## 10 ANVIT RAO " MI " 5 1365 1554
export csv
write_csv(result, "/Users/cayre/Google Drive/CUNY SPRING 19/COURSES/data 607/ASSIGNMENTS/607-Proj1.csv")