In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
# load pacakages need to be used
library(stringr)
library(XML)
library(maps)
library(RCurl)
## Loading required package: bitops
library(RMySQL)
## Loading required package: DBI
library(knitr)
library(ggplot2)
tournamentinfo <- read.table("https://raw.githubusercontent.com/YunMai-SPS/DA607-homework/master/tournamentinfo.txt",header = FALSE, sep="\t",quote = "",stringsAsFactors = FALSE)
View(tournamentinfo)
remove “—”lines
chess1 <- data.frame(tournamentinfo[!str_detect(tournamentinfo[,1],"(--+)"),])
head(chess1)
## tournamentinfo..str_detect.tournamentinfo...1..............
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 2 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 3 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 4 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 5 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 6 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
x <- seq(3,length(chess1[,1]),2)
y <- seq(4,length(chess1[,1]),2)
chess2 <- str_c(chess1[x,],chess1[y,],sep="")
chess2_text <- textConnection(chess2)
chess3 <- read.table(chess2_text,sep="|",stringsAsFactors = FALSE)
chess4 <- str_c(chess1[1,],chess1[2,])
chess4_text <- textConnection(chess4)
chess5 <- read.table(chess4_text,sep="|",stringsAsFactors = FALSE)
colnames_change <- c(4:10,14:20)
chess5[colnames_change] <- str_c(chess5[,seq(4, 10, 1)],chess5[,seq(14, 20, 1)], sep=" ")
colnames_change2 <- c(1,3,11)
chess5[colnames_change2] <- c(str_c(chess5[,1],chess5[,11], sep=" "),str_c(chess5[,3],chess5[,13], sep=" "),"state")
chess5[,13] <- chess5[,3]
headoftable <- as.character(chess5[1,])
colnames(chess3) <- headoftable
a <- str_extract(chess3[,12],"(: +)[0-9]{1,}")
pre_rating <- str_replace_all(a,": ","")
pre_rating_num <- as.numeric(pre_rating)
chess3[,21] <- pre_rating_num
colnames(chess3)[colnames(chess3)=="NA"] <- "pre-rating"
b <- str_extract(chess3[,12],"(->|-> )[0-9]{1,}")
post_rating <- str_replace_all(b,"->|-> ","")
post_rating_num <- as.numeric(post_rating)
chess <- cbind(chess3,post_rating_num)
colnames(chess)[colnames(chess)=="post_rating_num"] <- "post-rating"
c <- str_extract(chess3[,12],"[0-9]{1,}")
UCSF_ID <- as.numeric(c)
chess <- cbind(chess,UCSF_ID)
colnames(chess)[colnames(chess)=="UCSF_ID"] <- "UCSF_ID"
head(chess)
## Pair Num Player Name Total Pts Round 1
## 1 1 GARY HUA 6.0 W 39
## 2 2 DAKSHESH DARURI 6.0 W 63
## 3 3 ADITYA BAJAJ 6.0 L 8
## 4 4 PATRICK H SCHILLING 5.5 W 23
## 5 5 HANSHI ZUO 5.5 W 45
## 6 6 HANSEN SONG 5.0 W 34
## Round 2 Round 3 Round 4 Round 5 Round 6 Round 7 state
## 1 W 21 W 18 W 14 W 7 D 12 D 4 ON
## 2 W 58 L 4 W 17 W 16 W 20 W 7 MI
## 3 W 61 W 25 W 21 W 11 W 13 W 12 MI
## 4 D 28 W 2 W 26 D 5 W 19 D 1 MI
## 5 W 37 D 12 D 13 D 4 W 14 W 17 MI
## 6 D 29 L 11 W 35 D 10 W 27 W 21 OH
## USCF ID / Rtg (Pre->Post) Total Pts Round 1 Round 2 Round 3
## 1 15445895 / R: 1794 ->1817 N:2 W B W
## 2 14598900 / R: 1553 ->1663 N:2 B W B
## 3 14959604 / R: 1384 ->1640 N:2 W B W
## 4 12616049 / R: 1716 ->1744 N:2 W B W
## 5 14601533 / R: 1655 ->1690 N:2 B W B
## 6 15055204 / R: 1686 ->1687 N:3 W B W
## Round 4 Round 5 Round 6 Round 7 pre-rating post-rating UCSF_ID
## 1 B W B W 1794 1817 15445895
## 2 W B W B 1553 1663 14598900
## 3 B W B W 1384 1640 14959604
## 4 B W B B 1716 1744 12616049
## 5 W B W B 1655 1690 14601533
## 6 B B W B 1686 1687 15055204
In this project, we will need to study the relationship between the points one player earned and the average pre-rating of opponents one player played. The pre-rating of each player’s opponents will need to be calcualted. As such, opponents’s number will only show in the game wined(W), lost(L), and Draw(D).
for(i in 1:nrow(chess)){
sum_opp <- 0
count <- 0
for(j in c(4:10)){
d <- str_detect(chess[i,j],"[WLD]{1}[[:space:]]+[[:digit:]]+")
e <- grep("[WLD]{1}[[:space:]]+[[:digit:]]+",chess[i,j],value = TRUE)
opp <- as.numeric(str_extract(e,"[[:digit:]]+"))
if (d == TRUE){
count <- count + 1
sum_opp <- sum_opp + chess[opp,"pre-rating"]
}
}
chess[i,24] <- round(sum_opp/count,0)
}
colnames(chess)[24] <- "ave-opp-pre-rating"
colnames(chess)[13:20] <- names(chess[3:10])
name_trim <- str_trim(names(chess))
name_neat <- str_replace(name_trim, "[[:space:]]{2,4}"," ")
colnames(chess) <- name_neat
names(chess)
## [1] "Pair Num" "Player Name"
## [3] "Total Pts" "Round 1"
## [5] "Round 2" "Round 3"
## [7] "Round 4" "Round 5"
## [9] "Round 6" "Round 7"
## [11] "state" "USCF ID / Rtg (Pre->Post)"
## [13] "Total Pts" "Round 1"
## [15] "Round 2" "Round 3"
## [17] "Round 4" "Round 5"
## [19] "Round 6" "Round 7"
## [21] "pre-rating" "post-rating"
## [23] "UCSF_ID" "ave-opp-pre-rating"
chess_sub1 <- chess[,c("Pair Num", "Player Name", "UCSF_ID", "state", "Total Pts", "pre-rating", "ave-opp-pre-rating", "post-rating")]
chess_sub2 <- chess[,c("Player Name","state", "Total Pts", "pre-rating", "ave-opp-pre-rating")]
# Set the working directory
setwd("D:/Program Files/RStudio/DA607_homework/DA607_Project1")
write.csv(chess_sub2, file = "Chess_tournament_Project_Yun.csv", quote = FALSE)
Player Name | state | Total Pts | pre-rating | ave-opp-pre-rating |
---|---|---|---|---|
GARY HUA | ON | 6.0 | 1794 | 1605 |
DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
HANSEN SONG | OH | 5.0 | 1686 | 1519 |
GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372 |
EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468 |
STEFANO LEE | ON | 5.0 | 1411 | 1523 |
ANVIT RAO | MI | 5.0 | 1365 | 1554 |
CAMERON WILLIAM MC LEMAN | MI | 4.5 | 1712 | 1468 |
KENNETH J TACK | MI | 4.5 | 1663 | 1506 |
TORRANCE HENRY JR | MI | 4.5 | 1666 | 1498 |
BRADLEY SHAW | MI | 4.5 | 1610 | 1515 |
ZACHARY JAMES HOUGHTON | MI | 4.5 | 1220 | 1484 |
MIKE NIKITIN | MI | 4.0 | 1604 | 1386 |
RONALD GRZEGORCZYK | MI | 4.0 | 1629 | 1499 |
DAVID SUNDEEN | MI | 4.0 | 1600 | 1480 |
DIPANKAR ROY | MI | 4.0 | 1564 | 1426 |
JASON ZHENG | MI | 4.0 | 1595 | 1411 |
DINH DANG BUI | ON | 4.0 | 1563 | 1470 |
EUGENE L MCCLURE | MI | 4.0 | 1555 | 1300 |
ALAN BUI | ON | 4.0 | 1363 | 1214 |
MICHAEL R ALDRICH | MI | 4.0 | 1229 | 1357 |
LOREN SCHWIEBERT | MI | 3.5 | 1745 | 1363 |
MAX ZHU | ON | 3.5 | 1579 | 1507 |
GAURAV GIDWANI | MI | 3.5 | 1552 | 1222 |
SOFIA ADINA STANESCU-BELLU | MI | 3.5 | 1507 | 1522 |
CHIEDOZIE OKORIE | MI | 3.5 | 1602 | 1314 |
GEORGE AVERY JONES | ON | 3.5 | 1522 | 1144 |
RISHI SHETTY | MI | 3.5 | 1494 | 1260 |
JOSHUA PHILIP MATHEWS | ON | 3.5 | 1441 | 1379 |
JADE GE | MI | 3.5 | 1449 | 1277 |
MICHAEL JEFFERY THOMAS | MI | 3.5 | 1399 | 1375 |
JOSHUA DAVID LEE | MI | 3.5 | 1438 | 1150 |
SIDDHARTH JHA | MI | 3.5 | 1355 | 1388 |
AMIYATOSH PWNANANDAM | MI | 3.5 | 980 | 1385 |
BRIAN LIU | MI | 3.0 | 1423 | 1539 |
JOEL R HENDON | MI | 3.0 | 1436 | 1430 |
FOREST ZHANG | MI | 3.0 | 1348 | 1391 |
KYLE WILLIAM MURPHY | MI | 3.0 | 1403 | 1248 |
JARED GE | MI | 3.0 | 1332 | 1150 |
ROBERT GLEN VASEY | MI | 3.0 | 1283 | 1107 |
JUSTIN D SCHILLING | MI | 3.0 | 1199 | 1327 |
DEREK YAN | MI | 3.0 | 1242 | 1152 |
JACOB ALEXANDER LAVALLEY | MI | 3.0 | 377 | 1358 |
ERIC WRIGHT | MI | 2.5 | 1362 | 1392 |
DANIEL KHAIN | MI | 2.5 | 1382 | 1356 |
MICHAEL J MARTIN | MI | 2.5 | 1291 | 1286 |
SHIVAM JHA | MI | 2.5 | 1056 | 1296 |
TEJAS AYYAGARI | MI | 2.5 | 1011 | 1356 |
ETHAN GUO | MI | 2.5 | 935 | 1495 |
JOSE C YBARRA | MI | 2.0 | 1393 | 1345 |
LARRY HODGE | MI | 2.0 | 1270 | 1206 |
ALEX KONG | MI | 2.0 | 1186 | 1406 |
MARISA RICCI | MI | 2.0 | 1153 | 1414 |
MICHAEL LU | MI | 2.0 | 1092 | 1363 |
VIRAJ MOHILE | MI | 2.0 | 917 | 1391 |
SEAN M MC CORMICK | MI | 2.0 | 853 | 1319 |
JULIA SHEN | MI | 1.5 | 967 | 1330 |
JEZZEL FARKAS | ON | 1.5 | 955 | 1327 |
ASHWIN BALAJI | MI | 1.0 | 1530 | 1186 |
THOMAS JOSEPH HOSMER | MI | 1.0 | 1175 | 1350 |
BEN LI | MI | 1.0 | 1163 | 1263 |
With the table that could be easily read in R, statistic analysis could be done conveniently.
1. Distribution of different variables
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## complete(): tidyr, RCurl
## filter(): dplyr, stats
## lag(): dplyr, stats
## map(): purrr, maps
dif_rating <- chess_sub2[,"ave-opp-pre-rating"]-chess_sub2[,"pre-rating"]
chess_sub3 <- cbind(chess_sub1,dif_rating)
# statistic analysis
summary(chess_sub3[,c('Total Pts', 'pre-rating', 'post-rating', 'ave-opp-pre-rating', 'dif_rating')])
## Total Pts pre-rating post-rating ave-opp-pre-rating
## Min. :1.000 Min. : 377 Min. : 878 Min. :1107
## 1st Qu.:2.500 1st Qu.:1227 1st Qu.:1200 1st Qu.:1310
## Median :3.500 Median :1407 Median :1418 Median :1382
## Mean :3.438 Mean :1378 Mean :1398 Mean :1379
## 3rd Qu.:4.000 3rd Qu.:1583 3rd Qu.:1578 3rd Qu.:1481
## Max. :6.000 Max. :1794 Max. :1817 Max. :1605
## dif_rating
## Min. :-382.000
## 1st Qu.:-172.250
## Median : -68.000
## Mean : 0.125
## 3rd Qu.: 139.750
## Max. : 981.000
ggplot(data = chess_sub3,aes(x = pre_rating_num)) +
geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = pre_rating_num),alpha =0.2, color="blue", fill = "blue") + labs(title = "Distribution of Players' Pre-performance Rating", x = "Player's Pre-rating")
boxplot(chess_sub3[,"pre-rating"], ylab = "Player's Pre-rating")
The distribution of the Players’ pre-performance rating skews to the low end with median close to 1400. One player with very low rating could be considered as outlier.
ggplot(data = chess_sub3,aes(x = post_rating_num)) +
geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = post_rating_num),alpha =0.2, color="green", fill = "green") + labs(title = "Distribution of Players' Post-performance Rating", x = "Player's Post-rating")
The distribution of players’ post-performance rating is multimodal.
Pts <- chess_sub3[,"Total Pts"]
ggplot(data = chess_sub3,aes(x = Pts)) +
geom_histogram(mapping = aes(y=..density..), binwidth = 0.5, color="black", fill="grey") + geom_density(aes(x = Pts),alpha =0.2, color="purple", fill = "purple") + labs(title = "Distribution of Players' Total Points", x = "Players' Total Points")
The distribution of players’ total points is normal distribution.
2.The relationship between difference in rating and score/post-rating
ggplot(data = chess_sub3,aes(x = dif_rating)) +
geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = dif_rating),alpha =0.2, color="red", fill = "red") + labs(title = "Distribution of Difference in Rating", x = "Average of Opponents' Pre-rating - Player's Pre-rating")
boxplot(chess_sub3[,"dif_rating"], ylab = "Average of Opponents' Pre-rating - Player's Pre-rating")
The distribution of the difference of pre-performance rating between the players and their opponents skews to the right. The median is 0, meaning basically palyers play against the opponents with similar rating. The distribution of the difference of players’s rating is close to normal distribution if we do not consider a few of the players whose opponents are much stronger than themselves.
challenging_0 <- "neg"
for(i in 2:nrow(chess_sub3)){
if(chess_sub3[i,"pre-rating"] < chess_sub3[i,"ave-opp-pre-rating"]) {
challenging_2 <- "pos"
}else if(chess_sub3[i,"pre-rating"] > chess_sub3[i,"ave-opp-pre-rating"])
{
challenging_2 <- "neg"
}else
{
challenging_2 <- "equal"
}
challenging_0 <- rbind(challenging_0,challenging_2)
}
chess_sub4 <- cbind(chess_sub3, challenging_0)
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
## --> row.names NOT used
ggplot(data = chess_sub4, aes (x = dif_rating, y = Pts, group = challenging_0)) + geom_point(mapping = aes(color = challenging_0)) + geom_smooth(mapping = aes(x = dif_rating, y = Pts, group = challenging_0)) + labs(title = "Total Points Vs. Diffrence in Players' Rating",x = "Average of Opponents' Pre-rating - Player's Pre-rating", y = "Total Points")
## `geom_smooth()` using method = 'loess'
The line shows the correlation between the total points and the difference in players’ pre-rating. When the difference in rating is positive, i.e., the player played against advanced players, it is likely that they earned less points than those whose opponents rated lower. On the contrary, the player played against the weaker players tended to get more points.
dif_post_pre <- chess_sub3[,"post-rating"] - chess_sub3[,"pre-rating"]
ggplot(data = chess_sub4, aes (x = dif_rating, y = dif_post_pre, group = challenging_0)) + geom_point(mapping = aes(color = challenging_0)) + geom_smooth(mapping = aes(group = challenging_0)) + labs(title = "Post-rating Vs. Diffrence in Players' Rating", x = "Average of Opponents' Pre-rating - Player's Pre-rating", y = "Post-performance Rating - Pre-performance Rating")
## `geom_smooth()` using method = 'loess'
The line shows the correlation between the changes of rating and the difference in players’ pre-performance rating. When the difference in pre-rating is positive, i.e., the player played against advanced players, it is likely that his/her rating increased faster. On the other hand, the player played against the weaker players tended to remain the same ranking or even slightly decreased.