library(stringr)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v readr 1.1.1
## v tibble 1.4.1 v purrr 0.2.4
## v tidyr 0.7.2 v dplyr 0.7.4
## v ggplot2 2.2.1 v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Importing tournament project data
# Download the raw tournament cross-table; readLines() returns one string per line.
tournament.data <- readLines(
  con = "https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/Week4/dataset/tournamentinfo.txt"
)
## Warning in readLines("https://raw.githubusercontent.com/niteen11/MSDS/
## master/DATA607/Week4/dataset/tournamentinfo.txt"): incomplete final line
## found on 'https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/
## Week4/dataset/tournamentinfo.txt'
# Preview the first ten raw lines to see the layout of the file.
head(tournament.data, n = 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
# Check the storage type of the imported lines.
typeof(tournament.data)
## [1] "character"
# Drop the first four lines (the table header and its rules).
tournament.data <- tournament.data[-seq_len(4)]
head(tournament.data, n = 6L)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Keep only non-empty lines. nchar() is already vectorised over a character
# vector, so the type-unstable sapply() wrapper is unnecessary.
tournament.data <- tournament.data[nchar(tournament.data) > 0]
head(tournament.data)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
It appears from the dataset that the information about each player is organized in two consecutive rows. The first row holds the player’s name, total points, and match results. The second row holds the player’s state and USCF information such as the ID and the pre- and post-tournament ratings. So, each player’s information is embedded in two consecutive rows of the dataset, and each pair of rows is followed by a separator line.
# Records repeat every three lines (match row, rating row, separator rule);
# the match row is the first line of each group of three.
player.data <- seq(from = 1, to = length(tournament.data), by = 3)
player.info <- tournament.data[player.data]
head(player.info)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# The rating row (state, USCF ID, pre/post rating) is the second line of each
# group of three.
player.rating.data <- seq(from = 2, to = length(tournament.data), by = 3)
player.rating.info <- tournament.data[player.rating.data]
head(player.rating.info)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Extract player name
# Match the first field made of whitespace, a run of letters/hyphens/spaces,
# optional padding, and a closing pipe: this is the player-name column.
player.name <- str_extract(player.info, "\\s+([[:alpha:]- ]+)\\b\\s*\\|")
# Remove the trailing pipe and surrounding padding. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
player.name <- trimws(gsub("|", "", player.name, fixed = TRUE))
head(player.name)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
Extract player state
# The state code is the first pair of consecutive letters on the rating row.
player.state <- str_extract(
  string = player.rating.info,
  pattern = "[[:alpha:]]{2}"
)
head(player.state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
Extract Player Pre rating score value
# Capture the "R: <rating>" fragment; the optional extra whitespace covers
# three-digit ratings that are padded with an additional space.
player.prerating.score <- str_extract(player.rating.info, ".\\: \\s?[[:digit:]]{3,4}")
# Drop the "R: " prefix (TRUE rather than the reassignable T).
player.prerating.score <- gsub("R: ", "", player.prerating.score, fixed = TRUE)
# The value is already character, so convert straight to numeric;
# as.numeric() tolerates any leftover leading whitespace.
player.prerating.score <- as.numeric(player.prerating.score)
head(player.prerating.score)
## [1] 1794 1553 1384 1716 1655 1686
Extract Players total points
# Total points is the first "<digits>.<digit>" token on the match row.
player.total.points <- as.numeric(
  str_extract(player.info, "[[:digit:]]+\\.[[:digit:]]")
)
head(player.total.points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
Now extract each player’s opponent info
# Opponent pair numbers are the 1-2 digit runs that sit directly before a pipe.
# The lookahead keeps the pipe out of the match, so no second extraction pass
# over a list (which relied on implicit list-to-string coercion) is needed.
player.opponent.info <- str_extract_all(player.info, "[[:digit:]]{1,2}(?=\\|)")
# Convert each player's opponent numbers to numeric so they can be used as indices.
player.opponent.info <- lapply(player.opponent.info, as.numeric)
head(player.opponent.info)
## [[1]]
## [1] 39 21 18 14 7 12 4
##
## [[2]]
## [1] 63 58 4 17 16 20 7
##
## [[3]]
## [1] 8 61 25 21 11 13 12
##
## [[4]]
## [1] 23 28 2 26 5 19 1
##
## [[5]]
## [1] 45 37 12 13 4 14 17
##
## [[6]]
## [1] 34 29 11 35 10 27 21
Let’s calculate each player’s average opponent pre-rating and store the results
# For every player, look up the pre-ratings of the opponents faced (opponent
# numbers index into player.prerating.score) and average them to 2 decimals.
# vapply() replaces the 1:length() loop, which would iterate over c(1, 0) on
# empty input and grew a list one element at a time.
opponent.avg.rating <- vapply(
  player.opponent.info,
  function(opponents) round(mean(player.prerating.score[opponents]), 2),
  numeric(1)
)
# Keep the one-column data frame (column name "unlist.opponent.avg.rating.")
# that the later steps of the script consume.
opponent.avg.rating <- data.frame(unlist(opponent.avg.rating))
head(opponent.avg.rating)
## unlist.opponent.avg.rating.
## 1 1605.29
## 2 1469.29
## 3 1563.57
## 4 1573.57
## 5 1500.86
## 6 1518.71
# Assemble the per-player table; the average opponent rating is rounded to a
# whole number. check.names = FALSE keeps the column names exactly as supplied.
player.df <- data.frame(
  player.name,
  player.state,
  player.total.points,
  player.prerating.score,
  round(opponent.avg.rating, digits = 0),
  check.names = FALSE
)
head(player.df)
## player.name player.state player.total.points
## 1 GARY HUA ON 6.0
## 2 DAKSHESH DARURI MI 6.0
## 3 ADITYA BAJAJ MI 6.0
## 4 PATRICK H SCHILLING MI 5.5
## 5 HANSHI ZUO MI 5.5
## 6 HANSEN SONG OH 5.0
## player.prerating.score unlist.opponent.avg.rating.
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
# Per-column summary of the assembled player table.
summary(object = player.df)
## player.name player.state player.total.points
## ADITYA BAJAJ : 1 MI:55 Min. :1.000
## ALAN BUI : 1 OH: 1 1st Qu.:2.500
## ALEX KONG : 1 ON: 8 Median :3.500
## AMIYATOSH PWNANANDAM: 1 Mean :3.438
## ANVIT RAO : 1 3rd Qu.:4.000
## ASHWIN BALAJI : 1 Max. :6.000
## (Other) :58
## player.prerating.score unlist.opponent.avg.rating.
## Min. : 377 Min. :1107
## 1st Qu.:1227 1st Qu.:1310
## Median :1407 Median :1382
## Mean :1378 Mean :1379
## 3rd Qu.:1583 3rd Qu.:1481
## Max. :1794 Max. :1605
##
# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents. The y aesthetic names the actual column of player.df;
# the bare name opponent.avg.rating is not a column there and resolved to the
# global data frame, which ggplot cannot pick a scale for.
ggplot(
  player.df,
  aes(
    x = player.prerating.score,
    y = unlist.opponent.avg.rating.,
    color = player.state
  )
) +
  geom_point(aes(size = player.total.points, shape = player.state)) +
  ggtitle('Players info: Pre-rating Vs. Opponent Avg. Pre-rating') +
  # The x axis shows the player's own pre-rating, not the opponents' average.
  xlab('Player Pre-rating') +
  ylab('Opponent Avg. Pre-rating')
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
# Export the player table to CSV in the working directory.
write.csv(x = player.df, file = "playerChessData.csv")