Project 1
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments. You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script.
Loading the data set from github repository for file analysis.
data1<- read.csv("https://raw.githubusercontent.com/johnsuh23/DATA-605/master/tournamentinfo.txt", header=F)
head(data1)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 4 -----------------------------------------------------------------------------------------
## 5 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
Removing rows with hyphenated lines and the header lines .
data2<-data1[-c(1:3),]
head(data2)
## [1] -----------------------------------------------------------------------------------------
## [2] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [3] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [4] -----------------------------------------------------------------------------------------
## [5] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [6] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 131 Levels: ----------------------------------------------------------------------------------------- ...
data3<-subset(data2,data2!="-----------------------------------------------------------------------------------------")
head(data3)
## [1] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [2] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [3] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [4] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [5] 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|
## [6] MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## 131 Levels: ----------------------------------------------------------------------------------------- ...
Use string extraction functions to parse out values to create the data columns.
#requires stringr package
library(stringr)
PlayerNameTemp<- str_trim(str_extract(data3,"(\\w+\\s){2,3}"))
PlayerName<-na.omit(PlayerNameTemp)
head(PlayerName)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
PlayerStateTemp<-str_extract(data3,"\\w+")
PlayerState<-subset(c(PlayerStateTemp),c(PlayerStateTemp)=="MI" | c(PlayerStateTemp)=="ON" | c(PlayerStateTemp)=="OH")
head(PlayerState)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
PlayerPointsTemp<-as.numeric(str_extract(data3,"\\d+\\.\\d+"))
PlayerPoints<-na.omit(PlayerPointsTemp)
head(PlayerPoints)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
PreRatingTemp<-as.integer(str_extract(str_extract(data3,"[^\\d]\\d{3,4}[^\\d]"),"\\d+"))
PreRating<-na.omit(PreRatingTemp)
head(PreRating)
## [1] 1794 1553 1384 1716 1655 1686
#Column data to calculate opponent average ratings
PlayerIDTemp <- as.integer(str_extract(data3, "\\d+"))
PlayerID<-subset(c(PlayerIDTemp),c(PlayerIDTemp)<65)
head(PlayerID)
## [1] 1 2 3 4 5 6
OpponentIDTemp<-str_extract_all(str_extract_all(data3,"\\d+\\|"),"\\d+")
OpponentID<-subset(c(OpponentIDTemp),c(OpponentIDTemp)!="0")
head(OpponentID)
## [[1]]
## [1] "39" "21" "18" "14" "7" "12" "4"
##
## [[2]]
## [1] "63" "58" "4" "17" "16" "20" "7"
##
## [[3]]
## [1] "8" "61" "25" "21" "11" "13" "12"
##
## [[4]]
## [1] "23" "28" "2" "26" "5" "19" "1"
##
## [[5]]
## [1] "45" "37" "12" "13" "4" "14" "17"
##
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"
Create function to calculate opponent avg ratings column.
x<-length(data2)
OppAvgRating<-numeric(x/3)
for (i in 1:(x/3)) {
OppAvgRating[i]<- mean(PreRating[as.numeric(unlist(OpponentID[PlayerID[i]]))])
}
head(OppAvgRating)
## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714
Create dataframe from all the columns created.
chess<-data.frame(PlayerName,PlayerState,PlayerPoints,PreRating,OppAvgRating)
Show final output.
#requires kableExtra
library(kableExtra)
kable(chess) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
|
PlayerName
|
PlayerState
|
PlayerPoints
|
PreRating
|
OppAvgRating
|
|
GARY HUA
|
ON
|
6.0
|
1794
|
1605.286
|
|
DAKSHESH DARURI
|
MI
|
6.0
|
1553
|
1469.286
|
|
ADITYA BAJAJ
|
MI
|
6.0
|
1384
|
1563.571
|
|
PATRICK H SCHILLING
|
MI
|
5.5
|
1716
|
1573.571
|
|
HANSHI ZUO
|
MI
|
5.5
|
1655
|
1500.857
|
|
HANSEN SONG
|
OH
|
5.0
|
1686
|
1518.714
|
|
GARY DEE SWATHELL
|
MI
|
5.0
|
1649
|
1372.143
|
|
EZEKIEL HOUGHTON
|
MI
|
5.0
|
1641
|
1468.429
|
|
STEFANO LEE
|
ON
|
5.0
|
1411
|
1523.143
|
|
ANVIT RAO
|
MI
|
5.0
|
1365
|
1554.143
|
|
CAMERON WILLIAM MC
|
MI
|
4.5
|
1712
|
1467.571
|
|
KENNETH J TACK
|
MI
|
4.5
|
1663
|
1506.167
|
|
TORRANCE HENRY JR
|
MI
|
4.5
|
1666
|
1497.857
|
|
BRADLEY SHAW
|
MI
|
4.5
|
1610
|
1515.000
|
|
ZACHARY JAMES HOUGHTON
|
MI
|
4.5
|
1220
|
1483.857
|
|
MIKE NIKITIN
|
MI
|
4.0
|
1604
|
1385.800
|
|
RONALD GRZEGORCZYK
|
MI
|
4.0
|
1629
|
1498.571
|
|
DAVID SUNDEEN
|
MI
|
4.0
|
1600
|
1480.000
|
|
DIPANKAR ROY
|
MI
|
4.0
|
1564
|
1426.286
|
|
JASON ZHENG
|
MI
|
4.0
|
1595
|
1410.857
|
|
DINH DANG BUI
|
ON
|
4.0
|
1563
|
1470.429
|
|
EUGENE L MCCLURE
|
MI
|
4.0
|
1555
|
1300.333
|
|
ALAN BUI
|
ON
|
4.0
|
1363
|
1213.857
|
|
MICHAEL R ALDRICH
|
MI
|
4.0
|
1229
|
1357.000
|
|
LOREN SCHWIEBERT
|
MI
|
3.5
|
1745
|
1363.286
|
|
MAX ZHU
|
ON
|
3.5
|
1579
|
1506.857
|
|
GAURAV GIDWANI
|
MI
|
3.5
|
1552
|
1221.667
|
|
SOFIA ADINA
|
MI
|
3.5
|
1507
|
1522.143
|
|
CHIEDOZIE OKORIE
|
MI
|
3.5
|
1602
|
1313.500
|
|
GEORGE AVERY JONES
|
ON
|
3.5
|
1522
|
1144.143
|
|
RISHI SHETTY
|
MI
|
3.5
|
1494
|
1259.857
|
|
JOSHUA PHILIP MATHEWS
|
ON
|
3.5
|
1441
|
1378.714
|
|
JADE GE
|
MI
|
3.5
|
1449
|
1276.857
|
|
MICHAEL JEFFERY THOMAS
|
MI
|
3.5
|
1399
|
1375.286
|
|
JOSHUA DAVID LEE
|
MI
|
3.5
|
1438
|
1149.714
|
|
SIDDHARTH JHA
|
MI
|
3.5
|
1355
|
1388.167
|
|
AMIYATOSH PWNANANDAM
|
MI
|
3.5
|
980
|
1384.800
|
|
BRIAN LIU
|
MI
|
3.0
|
1423
|
1539.167
|
|
JOEL R HENDON
|
MI
|
3.0
|
1436
|
1429.571
|
|
FOREST ZHANG
|
MI
|
3.0
|
1348
|
1390.571
|
|
KYLE WILLIAM MURPHY
|
MI
|
3.0
|
1403
|
1248.500
|
|
JARED GE
|
MI
|
3.0
|
1332
|
1149.857
|
|
ROBERT GLEN VASEY
|
MI
|
3.0
|
1283
|
1106.571
|
|
JUSTIN D SCHILLING
|
MI
|
3.0
|
1199
|
1327.000
|
|
DEREK YAN
|
MI
|
3.0
|
1242
|
1152.000
|
|
JACOB ALEXANDER LAVALLEY
|
MI
|
3.0
|
377
|
1357.714
|
|
ERIC WRIGHT
|
MI
|
2.5
|
1362
|
1392.000
|
|
DANIEL KHAIN
|
MI
|
2.5
|
1382
|
1355.800
|
|
MICHAEL J MARTIN
|
MI
|
2.5
|
1291
|
1285.800
|
|
SHIVAM JHA
|
MI
|
2.5
|
1056
|
1296.000
|
|
TEJAS AYYAGARI
|
MI
|
2.5
|
1011
|
1356.143
|
|
ETHAN GUO
|
MI
|
2.5
|
935
|
1494.571
|
|
JOSE C YBARRA
|
MI
|
2.0
|
1393
|
1345.333
|
|
LARRY HODGE
|
MI
|
2.0
|
1270
|
1206.167
|
|
ALEX KONG
|
MI
|
2.0
|
1186
|
1406.000
|
|
MARISA RICCI
|
MI
|
2.0
|
1153
|
1414.400
|
|
MICHAEL LU
|
MI
|
2.0
|
1092
|
1363.000
|
|
VIRAJ MOHILE
|
MI
|
2.0
|
917
|
1391.000
|
|
SEAN M MC
|
MI
|
2.0
|
853
|
1319.000
|
|
JULIA SHEN
|
MI
|
1.5
|
967
|
1330.200
|
|
JEZZEL FARKAS
|
ON
|
1.5
|
955
|
1327.286
|
|
ASHWIN BALAJI
|
MI
|
1.0
|
1530
|
1186.000
|
|
THOMAS JOSEPH HOSMER
|
MI
|
1.0
|
1175
|
1350.200
|
|
BEN LI
|
MI
|
1.0
|
1163
|
1263.000
|
Write to CSV for import into SQL database
# please change filepath on local machine
write.csv(chess,"C:\\Users\\jkks9\\Documents\\DATA 607\\chess.csv", row.names = FALSE)
Showing a box plot to see which player theoretically had the toughest opponents to face by having the highest OpponentAvgRating during the tournament; added in PlayerID to dataframe.
The scatterplot shows it was player 1, Gary HUA who also had the highest point totals and highest pre-ranking points as well. Most likely as he progressed through each round if it was single elimination by round, he faced players with better ratings. It appears he won the tournament without the data set saying who won.
#requires ggplot2 package
library(ggplot2)
chessdata2<-data.frame(PlayerID,PlayerName,PlayerPoints,OppAvgRating)
head(chessdata2)
## PlayerID PlayerName PlayerPoints OppAvgRating
## 1 1 GARY HUA 6.0 1605.286
## 2 2 DAKSHESH DARURI 6.0 1469.286
## 3 3 ADITYA BAJAJ 6.0 1563.571
## 4 4 PATRICK H SCHILLING 5.5 1573.571
## 5 5 HANSHI ZUO 5.5 1500.857
## 6 6 HANSEN SONG 5.0 1518.714
ggplot(chessdata2, aes(x=chessdata2$PlayerID, y=chessdata2$OppAvgRating)) +geom_point(size=4, shape=18)
