Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.

# load pacakages need to be used
library(stringr)
library(XML)
library(maps)
library(RCurl)

## Loading required package: bitops

library(RMySQL)

## Loading required package: DBI

library(knitr)
library(ggplot2)

Load the chess tournamentinfo table

tournamentinfo <- read.table("https://raw.githubusercontent.com/YunMai-SPS/DA607-homework/master/tournamentinfo.txt",header = FALSE, sep="\t",quote = "",stringsAsFactors = FALSE)
View(tournamentinfo)

remove “—”lines

chess1 <- data.frame(tournamentinfo[!str_detect(tournamentinfo[,1],"(--+)"),])
head(chess1)

##                                  tournamentinfo..str_detect.tournamentinfo...1..............
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 4     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 5      2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 6     MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

Merge info of each case to the same row and convert the .txt file to a table

x <- seq(3,length(chess1[,1]),2)
y <- seq(4,length(chess1[,1]),2)

chess2 <- str_c(chess1[x,],chess1[y,],sep="")

chess2_text <- textConnection(chess2)
chess3 <- read.table(chess2_text,sep="|",stringsAsFactors = FALSE)

Name the columns in the table

chess4 <- str_c(chess1[1,],chess1[2,])
chess4_text <- textConnection(chess4)
chess5 <- read.table(chess4_text,sep="|",stringsAsFactors = FALSE)
colnames_change <- c(4:10,14:20)
chess5[colnames_change] <- str_c(chess5[,seq(4, 10, 1)],chess5[,seq(14, 20, 1)], sep=" ")

colnames_change2 <- c(1,3,11)
chess5[colnames_change2] <- c(str_c(chess5[,1],chess5[,11], sep=" "),str_c(chess5[,3],chess5[,13], sep=" "),"state")
chess5[,13] <- chess5[,3]

headoftable <- as.character(chess5[1,])
colnames(chess3) <- headoftable

Extract pre-/post-rating values and UCSFID and create new columns

a <- str_extract(chess3[,12],"(: +)[0-9]{1,}")
pre_rating <- str_replace_all(a,": ","")
pre_rating_num <- as.numeric(pre_rating)
chess3[,21] <- pre_rating_num 
colnames(chess3)[colnames(chess3)=="NA"] <- "pre-rating"

b <- str_extract(chess3[,12],"(->|-> )[0-9]{1,}")
post_rating <- str_replace_all(b,"->|-> ","")
post_rating_num <- as.numeric(post_rating)
chess <- cbind(chess3,post_rating_num)
colnames(chess)[colnames(chess)=="post_rating_num"] <- "post-rating"

c <- str_extract(chess3[,12],"[0-9]{1,}")
UCSF_ID <- as.numeric(c)
chess <- cbind(chess,UCSF_ID)
colnames(chess)[colnames(chess)=="UCSF_ID"] <- "UCSF_ID"
head(chess)

##    Pair    Num    Player Name                      Total  Pts  Round 1
## 1              1  GARY HUA                                 6.0   W  39
## 2              2  DAKSHESH DARURI                          6.0   W  63
## 3              3  ADITYA BAJAJ                             6.0   L   8
## 4              4  PATRICK H SCHILLING                      5.5   W  23
## 5              5  HANSHI ZUO                               5.5   W  45
## 6              6  HANSEN SONG                              5.0   W  34
##   Round 2 Round 3 Round 4 Round 5 Round 6 Round 7  state
## 1   W  21   W  18   W  14   W   7   D  12   D   4    ON 
## 2   W  58   L   4   W  17   W  16   W  20   W   7    MI 
## 3   W  61   W  25   W  21   W  11   W  13   W  12    MI 
## 4   D  28   W   2   W  26   D   5   W  19   D   1    MI 
## 5   W  37   D  12   D  13   D   4   W  14   W  17    MI 
## 6   D  29   L  11   W  35   D  10   W  27   W  21    OH 
##    USCF ID / Rtg (Pre->Post)        Total  Pts  Round 1 Round 2 Round 3
## 1  15445895 / R: 1794   ->1817            N:2     W       B       W    
## 2  14598900 / R: 1553   ->1663            N:2     B       W       B    
## 3  14959604 / R: 1384   ->1640            N:2     W       B       W    
## 4  12616049 / R: 1716   ->1744            N:2     W       B       W    
## 5  14601533 / R: 1655   ->1690            N:2     B       W       B    
## 6  15055204 / R: 1686   ->1687            N:3     W       B       W    
##   Round 4 Round 5 Round 6 Round 7 pre-rating post-rating  UCSF_ID
## 1   B       W       B       W           1794        1817 15445895
## 2   W       B       W       B           1553        1663 14598900
## 3   B       W       B       W           1384        1640 14959604
## 4   B       W       B       B           1716        1744 12616049
## 5   W       B       W       B           1655        1690 14601533
## 6   B       B       W       B           1686        1687 15055204

Calculate the average pre-rating of each player’s opponents

In this project, we will need to study the relationship between the points one player earned and the average pre-rating of opponents one player played. The pre-rating of each player’s opponents will need to be calcualted. As such, opponents’s number will only show in the game wined(W), lost(L), and Draw(D).

for(i in 1:nrow(chess)){
  sum_opp <- 0
  count <- 0
  for(j in c(4:10)){
    d <- str_detect(chess[i,j],"[WLD]{1}[[:space:]]+[[:digit:]]+")
    e <- grep("[WLD]{1}[[:space:]]+[[:digit:]]+",chess[i,j],value = TRUE)
    opp <- as.numeric(str_extract(e,"[[:digit:]]+"))
    if (d == TRUE){
       count <- count + 1
       sum_opp <- sum_opp + chess[opp,"pre-rating"]
    }
  }
  chess[i,24] <- round(sum_opp/count,0)
}
colnames(chess)[24] <- "ave-opp-pre-rating"
colnames(chess)[13:20] <- names(chess[3:10])

Subset the table to show desire variables

name_trim <- str_trim(names(chess))
name_neat <- str_replace(name_trim, "[[:space:]]{2,4}"," ")
colnames(chess) <- name_neat
names(chess)

##  [1] "Pair Num"                  "Player Name"              
##  [3] "Total Pts"                 "Round 1"                  
##  [5] "Round 2"                   "Round 3"                  
##  [7] "Round 4"                   "Round 5"                  
##  [9] "Round 6"                   "Round 7"                  
## [11] "state"                     "USCF ID / Rtg (Pre->Post)"
## [13] "Total Pts"                 "Round 1"                  
## [15] "Round 2"                   "Round 3"                  
## [17] "Round 4"                   "Round 5"                  
## [19] "Round 6"                   "Round 7"                  
## [21] "pre-rating"                "post-rating"              
## [23] "UCSF_ID"                   "ave-opp-pre-rating"

chess_sub1 <- chess[,c("Pair Num", "Player Name", "UCSF_ID", "state", "Total Pts", "pre-rating", "ave-opp-pre-rating", "post-rating")]
chess_sub2 <- chess[,c("Player Name","state", "Total Pts", "pre-rating", "ave-opp-pre-rating")]

Export as .CSV file

# Set the working directory
setwd("D:/Program Files/RStudio/DA607_homework/DA607_Project1")
write.csv(chess_sub2, file = "Chess_tournament_Project_Yun.csv", quote = FALSE)

Display the table

Player Name	state	Total Pts	pre-rating	ave-opp-pre-rating
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1469
ADITYA BAJAJ	MI	6.0	1384	1564
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1501
HANSEN SONG	OH	5.0	1686	1519
GARY DEE SWATHELL	MI	5.0	1649	1372
EZEKIEL HOUGHTON	MI	5.0	1641	1468
STEFANO LEE	ON	5.0	1411	1523
ANVIT RAO	MI	5.0	1365	1554
CAMERON WILLIAM MC LEMAN	MI	4.5	1712	1468
KENNETH J TACK	MI	4.5	1663	1506
TORRANCE HENRY JR	MI	4.5	1666	1498
BRADLEY SHAW	MI	4.5	1610	1515
ZACHARY JAMES HOUGHTON	MI	4.5	1220	1484
MIKE NIKITIN	MI	4.0	1604	1386
RONALD GRZEGORCZYK	MI	4.0	1629	1499
DAVID SUNDEEN	MI	4.0	1600	1480
DIPANKAR ROY	MI	4.0	1564	1426
JASON ZHENG	MI	4.0	1595	1411
DINH DANG BUI	ON	4.0	1563	1470
EUGENE L MCCLURE	MI	4.0	1555	1300
ALAN BUI	ON	4.0	1363	1214
MICHAEL R ALDRICH	MI	4.0	1229	1357
LOREN SCHWIEBERT	MI	3.5	1745	1363
MAX ZHU	ON	3.5	1579	1507
GAURAV GIDWANI	MI	3.5	1552	1222
SOFIA ADINA STANESCU-BELLU	MI	3.5	1507	1522
CHIEDOZIE OKORIE	MI	3.5	1602	1314
GEORGE AVERY JONES	ON	3.5	1522	1144
RISHI SHETTY	MI	3.5	1494	1260
JOSHUA PHILIP MATHEWS	ON	3.5	1441	1379
JADE GE	MI	3.5	1449	1277
MICHAEL JEFFERY THOMAS	MI	3.5	1399	1375
JOSHUA DAVID LEE	MI	3.5	1438	1150
SIDDHARTH JHA	MI	3.5	1355	1388
AMIYATOSH PWNANANDAM	MI	3.5	980	1385
BRIAN LIU	MI	3.0	1423	1539
JOEL R HENDON	MI	3.0	1436	1430
FOREST ZHANG	MI	3.0	1348	1391
KYLE WILLIAM MURPHY	MI	3.0	1403	1248
JARED GE	MI	3.0	1332	1150
ROBERT GLEN VASEY	MI	3.0	1283	1107
JUSTIN D SCHILLING	MI	3.0	1199	1327
DEREK YAN	MI	3.0	1242	1152
JACOB ALEXANDER LAVALLEY	MI	3.0	377	1358
ERIC WRIGHT	MI	2.5	1362	1392
DANIEL KHAIN	MI	2.5	1382	1356
MICHAEL J MARTIN	MI	2.5	1291	1286
SHIVAM JHA	MI	2.5	1056	1296
TEJAS AYYAGARI	MI	2.5	1011	1356
ETHAN GUO	MI	2.5	935	1495
JOSE C YBARRA	MI	2.0	1393	1345
LARRY HODGE	MI	2.0	1270	1206
ALEX KONG	MI	2.0	1186	1406
MARISA RICCI	MI	2.0	1153	1414
MICHAEL LU	MI	2.0	1092	1363
VIRAJ MOHILE	MI	2.0	917	1391
SEAN M MC CORMICK	MI	2.0	853	1319
JULIA SHEN	MI	1.5	967	1330
JEZZEL FARKAS	ON	1.5	955	1327
ASHWIN BALAJI	MI	1.0	1530	1186
THOMAS JOSEPH HOSMER	MI	1.0	1175	1350
BEN LI	MI	1.0	1163	1263

Analyze the relationship between points and difference in players’ rating

With the table that could be easily read in R, statistic analysis could be done conveniently.

1. Distribution of different variables

library(tidyverse)

## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## complete(): tidyr, RCurl
## filter():   dplyr, stats
## lag():      dplyr, stats
## map():      purrr, maps

dif_rating <- chess_sub2[,"ave-opp-pre-rating"]-chess_sub2[,"pre-rating"]
chess_sub3 <- cbind(chess_sub1,dif_rating)
# statistic analysis 
summary(chess_sub3[,c('Total Pts', 'pre-rating', 'post-rating', 'ave-opp-pre-rating', 'dif_rating')])

##    Total Pts       pre-rating    post-rating   ave-opp-pre-rating
##  Min.   :1.000   Min.   : 377   Min.   : 878   Min.   :1107      
##  1st Qu.:2.500   1st Qu.:1227   1st Qu.:1200   1st Qu.:1310      
##  Median :3.500   Median :1407   Median :1418   Median :1382      
##  Mean   :3.438   Mean   :1378   Mean   :1398   Mean   :1379      
##  3rd Qu.:4.000   3rd Qu.:1583   3rd Qu.:1578   3rd Qu.:1481      
##  Max.   :6.000   Max.   :1794   Max.   :1817   Max.   :1605      
##    dif_rating      
##  Min.   :-382.000  
##  1st Qu.:-172.250  
##  Median : -68.000  
##  Mean   :   0.125  
##  3rd Qu.: 139.750  
##  Max.   : 981.000

ggplot(data = chess_sub3,aes(x = pre_rating_num)) + 
  geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = pre_rating_num),alpha =0.2, color="blue", fill = "blue") + labs(title = "Distribution of Players' Pre-performance Rating", x = "Player's Pre-rating")

boxplot(chess_sub3[,"pre-rating"], ylab = "Player's Pre-rating")

The distribution of the Players’ pre-performance rating skews to the low end with median close to 1400. One player with very low rating could be considered as outlier.

ggplot(data = chess_sub3,aes(x = post_rating_num)) + 
  geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = post_rating_num),alpha =0.2, color="green", fill = "green") + labs(title = "Distribution of Players' Post-performance Rating", x = "Player's Post-rating")

The distribution of players’ post-performance rating is multimodal.

Pts <- chess_sub3[,"Total Pts"]
ggplot(data = chess_sub3,aes(x = Pts)) + 
  geom_histogram(mapping = aes(y=..density..), binwidth = 0.5, color="black", fill="grey") + geom_density(aes(x = Pts),alpha =0.2, color="purple", fill = "purple") + labs(title = "Distribution of Players' Total Points", x = "Players' Total Points")

The distribution of players’ total points is normal distribution.

2.The relationship between difference in rating and score/post-rating

ggplot(data = chess_sub3,aes(x = dif_rating)) + 
  geom_histogram(mapping = aes(y=..density..), binwidth = 50, color="black", fill="grey") + geom_density(aes(x = dif_rating),alpha =0.2, color="red", fill = "red") + labs(title = "Distribution of Difference in Rating", x = "Average of Opponents' Pre-rating - Player's Pre-rating")

boxplot(chess_sub3[,"dif_rating"], ylab = "Average of Opponents' Pre-rating - Player's Pre-rating")

The distribution of the difference of pre-performance rating between the players and their opponents skews to the right. The median is 0, meaning basically palyers play against the opponents with similar rating. The distribution of the difference of players’s rating is close to normal distribution if we do not consider a few of the players whose opponents are much stronger than themselves.

challenging_0 <- "neg"
for(i in 2:nrow(chess_sub3)){
  if(chess_sub3[i,"pre-rating"] < chess_sub3[i,"ave-opp-pre-rating"]) {
  challenging_2 <- "pos"
 }else if(chess_sub3[i,"pre-rating"] > chess_sub3[i,"ave-opp-pre-rating"])
 {
   challenging_2 <- "neg"
 }else
 {
   challenging_2 <- "equal"
 }
  challenging_0 <- rbind(challenging_0,challenging_2)
}

chess_sub4 <- cbind(chess_sub3, challenging_0)

## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
## --> row.names NOT used

ggplot(data = chess_sub4, aes (x = dif_rating, y = Pts, group = challenging_0)) + geom_point(mapping = aes(color = challenging_0)) + geom_smooth(mapping = aes(x = dif_rating, y = Pts, group = challenging_0)) + labs(title = "Total Points Vs. Diffrence in Players' Rating",x = "Average of Opponents' Pre-rating - Player's Pre-rating", y = "Total Points")

## `geom_smooth()` using method = 'loess'

The line shows the correlation between the total points and the difference in players’ pre-rating. When the difference in rating is positive, i.e., the player played against advanced players, it is likely that they earned less points than those whose opponents rated lower. On the contrary, the player played against the weaker players tended to get more points.

dif_post_pre <- chess_sub3[,"post-rating"] - chess_sub3[,"pre-rating"]
ggplot(data = chess_sub4, aes (x = dif_rating, y = dif_post_pre, group = challenging_0)) + geom_point(mapping = aes(color = challenging_0)) + geom_smooth(mapping = aes(group = challenging_0)) + labs(title = "Post-rating Vs. Diffrence in Players' Rating", x = "Average of Opponents' Pre-rating - Player's Pre-rating", y = "Post-performance Rating - Pre-performance Rating")

## `geom_smooth()` using method = 'loess'

The line shows the correlation between the changes of rating and the difference in players’ pre-performance rating. When the difference in pre-rating is positive, i.e., the player played against advanced players, it is likely that his/her rating increased faster. On the other hand, the player played against the weaker players tended to remain the same ranking or even slightly decreased.

DA607_Project1_Chess_Tournament

Yun Mai

February 26, 2017