Chess Tournament Project

Task: Take an unformatted text file and, using regular expressions, organize it
into a CSV with accessible data. Create a column that relies on data from multiple rows.

Our final table is presented here. The code used to accomplish our task is found at the bottom of this document.

##                         Player State_or_Province Tournament_Points Prior_Rating Opponents'_Average_Rating
## 1                    GARY HUA                ON                6.0         1794                   1605.29
## 2             DAKSHESH DARURI                MI                6.0         1553                   1469.29
## 3                ADITYA BAJAJ                MI                6.0         1384                   1563.57
## 4         PATRICK H SCHILLING                MI                5.5         1716                   1573.57
## 5                  HANSHI ZUO                MI                5.5         1655                   1500.86
## 6                 HANSEN SONG                OH                5.0         1686                   1518.71
## 7           GARY DEE SWATHELL                MI                5.0         1649                   1372.14
## 8            EZEKIEL HOUGHTON                MI                5.0         1641                   1468.43
## 9                 STEFANO LEE                ON                5.0         1411                   1523.14
## 10                  ANVIT RAO                MI                5.0         1365                   1554.14
## 11    CAMERON WILLIAM MCLEMAN                MI                4.5         1712                   1467.57
## 12             KENNETH J TACK                MI                4.5         1663                   1506.17
## 13          TORRANCE HENRY JR                MI                4.5         1666                   1497.86
## 14               BRADLEY SHAW                MI                4.5         1610                   1515.00
## 15     ZACHARY JAMES HOUGHTON                MI                4.5         1220                   1483.86
## 16               MIKE NIKITIN                MI                4.0         1604                   1385.80
## 17         RONALD GRZEGORCZYK                MI                4.0         1629                   1498.57
## 18              DAVID SUNDEEN                MI                4.0         1600                   1480.00
## 19               DIPANKAR ROY                MI                4.0         1564                   1426.29
## 20                JASON ZHENG                MI                4.0         1595                   1410.86
## 21              DINH DANG BUI                ON                4.0         1563                   1470.43
## 22           EUGENE L MCCLURE                MI                4.0         1555                   1300.33
## 23                   ALAN BUI                ON                4.0         1363                   1213.86
## 24          MICHAEL R ALDRICH                MI                4.0         1229                   1357.00
## 25           LOREN SCHWIEBERT                MI                3.5         1745                   1363.29
## 26                    MAX ZHU                ON                3.5         1579                   1506.86
## 27             GAURAV GIDWANI                MI                3.5         1552                   1221.67
## 28 SOFIA ADINA STANESCU-BELLU                MI                3.5         1507                   1522.14
## 29           CHIEDOZIE OKORIE                MI                3.5         1602                   1313.50
## 30         GEORGE AVERY JONES                ON                3.5         1522                   1144.14
## 31               RISHI SHETTY                MI                3.5         1494                   1259.86
## 32      JOSHUA PHILIP MATHEWS                ON                3.5         1441                   1378.71
## 33                    JADE GE                MI                3.5         1449                   1276.86
## 34     MICHAEL JEFFERY THOMAS                MI                3.5         1399                   1375.29
## 35           JOSHUA DAVID LEE                MI                3.5         1438                   1149.71
## 36              SIDDHARTH JHA                MI                3.5         1355                   1388.17
## 37       AMIYATOSH PWNANANDAM                MI                3.5          980                   1384.80
## 38                  BRIAN LIU                MI                3.0         1423                   1539.17
## 39              JOEL R HENDON                MI                3.0         1436                   1429.57
## 40               FOREST ZHANG                MI                3.0         1348                   1390.57
## 41        KYLE WILLIAM MURPHY                MI                3.0         1403                   1248.50
## 42                   JARED GE                MI                3.0         1332                   1149.86
## 43          ROBERT GLEN VASEY                MI                3.0         1283                   1106.57
## 44         JUSTIN D SCHILLING                MI                3.0         1199                   1327.00
## 45                  DEREK YAN                MI                3.0         1242                   1152.00
## 46   JACOB ALEXANDER LAVALLEY                MI                3.0          377                   1357.71
## 47                ERIC WRIGHT                MI                2.5         1362                   1392.00
## 48               DANIEL KHAIN                MI                2.5         1382                   1355.80
## 49           MICHAEL J MARTIN                MI                2.5         1291                   1285.80
## 50                 SHIVAM JHA                MI                2.5         1056                   1296.00
## 51             TEJAS AYYAGARI                MI                2.5         1011                   1356.14
## 52                  ETHAN GUO                MI                2.5          935                   1494.57
## 53              JOSE C YBARRA                MI                2.0         1393                   1345.33
## 54                LARRY HODGE                MI                2.0         1270                   1206.17
## 55                  ALEX KONG                MI                2.0         1186                   1406.00
## 56               MARISA RICCI                MI                2.0         1153                   1414.40
## 57                 MICHAEL LU                MI                2.0         1092                   1363.00
## 58               VIRAJ MOHILE                MI                2.0          917                   1391.00
## 59           SEAN M MCCORMICK                MI                2.0          853                   1319.00
## 60                 JULIA SHEN                MI                1.5          967                   1330.20
## 61              JEZZEL FARKAS                ON                1.5          955                   1327.29
## 62              ASHWIN BALAJI                MI                1.0         1530                   1186.00
## 63       THOMAS JOSEPH HOSMER                MI                1.0         1175                   1350.20
## 64                     BEN LI                MI                1.0         1163                   1263.00

We first show the relationship between a player’s rating and the points earned during the tournament.

Next, we run a linear regression of tournament points on prior rating. The relationship is statistically significant according to both the t-statistic and the F-statistic (p-value 9.09e-08). However, at 0.3714, our coefficient of determination is low: prior rating accounts for only a modest share of the variation in points earned during the tournament. We then replot our graph with the regression line included.

## 
## Call:
## lm(formula = as.numeric(final.data[[3]]) ~ as.numeric(final.data[[4]]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86623 -0.56507 -0.06999  0.34002  2.54694 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.4635332  0.6562549  -0.706    0.483    
## as.numeric(final.data[[4]])  0.0028299  0.0004676   6.052 9.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9853 on 62 degrees of freedom
## Multiple R-squared:  0.3714, Adjusted R-squared:  0.3612 
## F-statistic: 36.63 on 1 and 62 DF,  p-value: 9.093e-08

library(stringr)
library(ggplot2)
# Location of the raw tournament report. The absolute path below is specific to
# one machine, so when it does not exist we fall back to a file with the same
# name in the current working directory.
fileName <- 'C:/Users/dawig/Documents/tournamentinfo.txt'
if (!file.exists(fileName)) {
  fileName <- basename(fileName)
}
if (!file.exists(fileName)) {
  stop(
    "Cannot find 'tournamentinfo.txt'; place it in the working directory.",
    call. = FALSE
  )
}
# Read the entire file into one string. The number of characters requested is
# the file size in bytes, which is enough to reach the end of the file.
OurData <- readChar(fileName, file.size(fileName))
# Skip everything before character 375 of the raw text (presumably the report
# header -- confirm against the input file).
split.string <- str_sub(OurData, 375, -1)
# Collapse "MC " to "MC" so that such surnames read as a single word
# (e.g. MCLEMAN in the final table).
split.string <- str_replace_all(split.string, "MC ", "MC")
# Extract all names in order: three, or failing that two, upper-case words
# separated by single spaces. For longer or hyphenated names this matches only
# a prefix, which is all that is needed to find where each record starts.
split.string2 <- unlist(str_extract_all(split.string, "([A-Z]{1,20}[ ]{1}[A-Z]{1,20}[ ]{1}[A-Z]{1,20})|[A-Z]{1,20}[ ]{1}[A-Z]{1,20}"))
n.players <- length(split.string2)
if (n.players == 0) {
  stop("No player names were found in the input text.", call. = FALSE)
}
# Split each entry into a separate row: record i runs from the start of the
# remaining text up to (but not including) the start of the next name.
player.records <- character(n.players)
for (i in seq_len(n.players - 1)) {
  next.start <- str_locate(split.string, split.string2[[i + 1]])[1, 1]
  player.records[i] <- str_sub(split.string, 1, next.start - 1)
  # Drop the consumed record so the following search starts at the next name.
  split.string <- str_sub(split.string, next.start, -1)
}
# Whatever is left after the loop is the last player's record.
player.records[n.players] <- split.string
# One row per player; column 1 holds the raw record text.
split.by.player <- data.frame(V1 = player.records, stringsAsFactors = FALSE)
# Parse every raw record (column 1) into separate fields. All stringr calls are
# vectorized, so whole columns are processed at once for however many rows
# split.by.player holds.
raw.record <- split.by.player[[1]]
# The name ends where the first run of two or more whitespace characters starts.
name.end <- str_locate(raw.record, "\\s{2,25}")[, 1]
# Column 2: the remainder of the record after the name (results, state, rating).
split.by.player[[2]] <- str_sub(raw.record, name.end, -1)
# Column 1: the name only. Stop one character before the whitespace run so no
# trailing blank is kept.
split.by.player[[1]] <- str_sub(raw.record, 1, name.end - 1)
# Column 3: state or province. The surrounding spaces are part of the pattern
# and are trimmed from the stored value.
split.by.player[[3]] <- str_trim(
  str_extract(split.by.player[[2]], " OH | ON | MI ")
)
# Column 4: total points, the first "digit.digit" token in the record.
split.by.player[[4]] <- str_extract(
  split.by.player[[2]], "[[:digit:]]\\.[[:digit:]]"
)
# Column 5: pre-tournament rating, the 3-4 digits following "R:".
rating.field <- str_extract(
  split.by.player[[2]], "R\\: {1,2}[[:digit:]]{3,4}"
)
split.by.player[[5]] <- str_extract(rating.field, "[[:digit:]]{3,4}")
# Pull up to seven opponent numbers out of column 2 and store them in columns
# 6-12. Each pass finds the next "|<result letter> ... <digit>" cell, keeps the
# text from that digit onward, and reads the leading digits as the opponent
# number. Column 2 is advanced to that position so the next pass finds the
# following cell. When no further cell matches, the value (and every later one
# for that row) is NA.
n.rounds <- 7
for (j in seq_len(n.rounds)) {
  # Position of the first digit of the next opponent number (end of the match).
  match.end <- str_locate(
    split.by.player[[2]], "\\|[WLDBU].+?[[:digit:]]"
  )[, 2]
  remainder <- str_sub(split.by.player[[2]], match.end, -1)
  split.by.player[[j + 5]] <- str_extract(remainder, "[[:digit:]]{0,3}")
  split.by.player[[2]] <- remainder
}
# Compute each player's average opponent rating from the opponent numbers in
# columns 6-12 and the ratings in column 5.
#   column 13: number of opponents recorded
#   column 14: sum of those opponents' ratings
#   column 15: column 14 / column 13, rounded to two decimals
opponent.cols <- 6:12
n.players <- nrow(split.by.player)
prior.rating <- as.numeric(split.by.player[[5]])
# Opponent numbers as an integer matrix: one row per player, one column per
# round. Explicit integer row indices replace lookup by character row name.
opponent.index <- matrix(
  as.integer(unlist(split.by.player[opponent.cols], use.names = FALSE)),
  nrow = n.players
)
played <- !is.na(opponent.index)
# An opponent number that does not refer to an existing row yields an NA
# rating, which propagates into that player's sum.
lookup <- opponent.index
lookup[played & (lookup < 1L | lookup > n.players)] <- NA
opponent.rating <- matrix(prior.rating[lookup], nrow = n.players)
# Rounds without an opponent contribute nothing to the sum.
opponent.rating[!played] <- 0
games.played <- rowSums(played)
rating.total <- rowSums(opponent.rating)
split.by.player[[13]] <- games.played
split.by.player[[14]] <- rating.total
# A player with no recorded opponents has no average; store NA rather than
# the NaN that 0 / 0 would give.
opponent.average <- ifelse(
  games.played > 0, rating.total / games.played, NA_real_
)
split.by.player[[15]] <- round(opponent.average, digits = 2)
# Keep only the reporting columns (name, state, points, prior rating and
# opponents' average rating) and give them descriptive headers.
final.data <- setNames(
  split.by.player[c(1, 3, 4, 5, 15)],
  c(
    "Player",
    "State_or_Province",
    "Tournament_Points",
    "Prior_Rating",
    "Opponents'_Average_Rating"
  )
)
# Display the table, then save it as a CSV in the working directory.
final.data
write.csv(final.data, file = "Chess_Tournament_Data.csv")