Project Question
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
library(stringr)
library(knitr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Sourcing Data
tournamentinfo<-"https://raw.githubusercontent.com/juanellemarks/JuanelleMarks_DATA607_2018/master/tournamentinfo.text"
textFile <- read.csv(tournamentinfo, header = FALSE)
textFile[c(1:10), ]
## [1] -----------------------------------------------------------------------------------------
## [2] Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## [3] Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## [4] -----------------------------------------------------------------------------------------
## [5] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [6] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [7] -----------------------------------------------------------------------------------------
## [8] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [9] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [10] -----------------------------------------------------------------------------------------
## 131 Levels: 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ...
Data Cleansing
#remove first four rows as they are unecessary to the solution
textFile1<-textFile[-c(1:4),]
head(textFile1,3)
## [1] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [2] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [3] -----------------------------------------------------------------------------------------
## 131 Levels: 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ...
#Convert from factor to character
textFile1 <- as.character(textFile1)
head(textFile1,3)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
Data Manipulation
#extract rows with player information
playerInfo<-textFile1[seq(1, length(textFile1), 3)]
head(playerInfo,3)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
#extract rows with rating information
ratingInfo<-textFile1[seq(2, length(textFile1), 3)]
head(ratingInfo,3)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
#remove unwanted characters from rows
player_id <- as.integer(str_extract(playerInfo, "\\d+"))
playerName <- str_trim(str_extract(playerInfo, "(\\w+\\s){2,3}"))
player_state <- str_extract(ratingInfo, "\\w\\w")
totalPoints<- as.numeric(str_extract(playerInfo, "\\d.\\d"))
pre_Rating<- as.integer(str_extract(str_extract(ratingInfo, "\\D\\d{3,4}\\D"), "\\d+"))
opponent <- str_extract_all(str_extract_all(playerInfo, "\\d+\\|"), "\\d+")
opponent_count<- sapply(opponent, length)
seq.max <- seq_len(max(opponent_count))
z <- t(sapply(opponent, "[", i = seq.max))
textFile2 <- as.data.frame(z)
y <- data.frame(player_id, pre_Rating)
textFile3 <- textFile2
textFile3[] <- y$pre_Rating[match(unlist(textFile3), y$player_id)]
kable(head(textFile3))
| 1436 |
1563 |
1600 |
1610 |
1649 |
1663 |
1716 |
| 1175 |
917 |
1716 |
1629 |
1604 |
1595 |
1649 |
| 1641 |
955 |
1745 |
1563 |
1712 |
1666 |
1663 |
| 1363 |
1507 |
1553 |
1579 |
1655 |
1564 |
1794 |
| 1242 |
980 |
1663 |
1666 |
1716 |
1610 |
1629 |
| 1399 |
1602 |
1712 |
1438 |
1365 |
1552 |
1563 |
# Finding average pre-Chess rating of opponents
textFile3$oppAvg <- round(apply(textFile3, 1, mean, na.rm=TRUE))
kable(head(textFile3))
| 1436 |
1563 |
1600 |
1610 |
1649 |
1663 |
1716 |
1605 |
| 1175 |
917 |
1716 |
1629 |
1604 |
1595 |
1649 |
1469 |
| 1641 |
955 |
1745 |
1563 |
1712 |
1666 |
1663 |
1564 |
| 1363 |
1507 |
1553 |
1579 |
1655 |
1564 |
1794 |
1574 |
| 1242 |
980 |
1663 |
1666 |
1716 |
1610 |
1629 |
1501 |
| 1399 |
1602 |
1712 |
1438 |
1365 |
1552 |
1563 |
1519 |
final_textFile <- data_frame(player_id, playerName, player_state , totalPoints, pre_Rating, textFile3$oppAvg )
colnames(final_textFile) <- c("ID", "Player_Name", "Player_State", "Total_Points", "Pre_Match_Rating", "Opponent's_Average")
kable(head(final_textFile, 10))
| 1 |
GARY HUA |
ON |
6.0 |
1794 |
1605 |
| 2 |
DAKSHESH DARURI |
MI |
6.0 |
1553 |
1469 |
| 3 |
ADITYA BAJAJ |
MI |
6.0 |
1384 |
1564 |
| 4 |
PATRICK H SCHILLING |
MI |
5.5 |
1716 |
1574 |
| 5 |
HANSHI ZUO |
MI |
5.5 |
1655 |
1501 |
| 6 |
HANSEN SONG |
OH |
5.0 |
1686 |
1519 |
| 7 |
GARY DEE SWATHELL |
MI |
5.0 |
1649 |
1372 |
| 8 |
EZEKIEL HOUGHTON |
MI |
5.0 |
1641 |
1468 |
| 9 |
STEFANO LEE |
ON |
5.0 |
1411 |
1523 |
| 10 |
ANVIT RAO |
MI |
5.0 |
1365 |
1554 |
Write to CSV File
write.csv(final_textFile,file="/Users/juanelle/Desktop/MSDS/Data607/Week Three/Project 1/DATA607_ProjectOne_Newtxtfile")