Title: CUNY SPS MDS Data607_Project1

Author: Charles Ugiagbe

Date: 9/12/2021

Load Required Packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)

Read Tournament Data

# URL of the raw tournament results text file hosted on GitHub.
theurl <- "https://raw.githubusercontent.com/omocharly/DATA607_LABS/main/tournamentinfo.txt"
# readLines() returns a character vector with one element per line of the file.
chess_data.df <- readLines(theurl)
## Warning in readLines(theurl): incomplete final line found on 'https://
## raw.githubusercontent.com/omocharly/DATA607_LABS/main/tournamentinfo.txt'

Take a brief look at the head of the data

head(chess_data.df)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Clean up the header by removing the first 4 rows:

# Drop the four header lines (rule, two column-title rows, rule) so the vector
# starts at the first player's record.
chess_data1.df <- chess_data.df[-seq_len(4)]
head(chess_data1.df)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Create two character vectors to capture the two lines of data for each player from the cleaned-up vector. Each player's information spans three lines, with the third line being the row delimiter:

# Split the cleaned lines into the two record lines kept for each player.
# The input repeats in groups of three:
#   position 1 -> pair number, player name, total points, per-round results
#   position 2 -> state, USCF ID / ratings, per-round W/B entries
#   position 3 -> dashed separator line (discarded)
# Selecting by position replaces the element-by-element loop: nothing is grown
# one entry at a time, and an empty input yields empty vectors instead of the
# spurious NA that iterating over `1:0` would produce.
line.position <- seq_along(chess_data1.df) %% 3
data1.df <- chess_data1.df[line.position == 1]
data2.df <- chess_data1.df[line.position == 2]

# Sample rows from the dataset:
head(data1.df)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
head(data2.df)
## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"
# Break every record line into its "|"-separated fields and strip the padding
# around each field in the same step. simplify = TRUE returns a character
# matrix (one row per player); n = 11 caps the number of pieces per line, the
# last piece being the empty text after the closing "|".
data1.cols.df <- trimws(
  str_split(data1.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)
data2.cols.df <- trimws(
  str_split(data2.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)

Sample rows from the dataset:

head(data1.cols.df)
##      [,1] [,2]                  [,3]  [,4]    [,5]    [,6]    [,7]    [,8]   
## [1,] "1"  "GARY HUA"            "6.0" "W  39" "W  21" "W  18" "W  14" "W   7"
## [2,] "2"  "DAKSHESH DARURI"     "6.0" "W  63" "W  58" "L   4" "W  17" "W  16"
## [3,] "3"  "ADITYA BAJAJ"        "6.0" "L   8" "W  61" "W  25" "W  21" "W  11"
## [4,] "4"  "PATRICK H SCHILLING" "5.5" "W  23" "D  28" "W   2" "W  26" "D   5"
## [5,] "5"  "HANSHI ZUO"          "5.5" "W  45" "W  37" "D  12" "D  13" "D   4"
## [6,] "6"  "HANSEN SONG"         "5.0" "W  34" "D  29" "L  11" "W  35" "D  10"
##      [,9]    [,10]   [,11]
## [1,] "D  12" "D   4" ""   
## [2,] "W  20" "W   7" ""   
## [3,] "W  13" "W  12" ""   
## [4,] "W  19" "D   1" ""   
## [5,] "W  14" "W  17" ""   
## [6,] "W  27" "W  21" ""
head(data2.cols.df)
##      [,1] [,2]                          [,3]  [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] "ON" "15445895 / R: 1794   ->1817" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [2,] "MI" "14598900 / R: 1553   ->1663" "N:2" "B"  "W"  "B"  "W"  "B"  "W" 
## [3,] "MI" "14959604 / R: 1384   ->1640" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [4,] "MI" "12616049 / R: 1716   ->1744" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [5,] "MI" "14601533 / R: 1655   ->1690" "N:2" "B"  "W"  "B"  "W"  "B"  "W" 
## [6,] "OH" "15055204 / R: 1686   ->1687" "N:3" "W"  "B"  "W"  "B"  "B"  "W" 
##      [,10] [,11]
## [1,] "W"   ""   
## [2,] "B"   ""   
## [3,] "W"   ""   
## [4,] "B"   ""   
## [5,] "B"   ""   
## [6,] "B"   ""

We will now create the final data set by merging the two data sets above, so that each row of the final data set holds the information for a single player:

# Build the final data set: one row per player, combining the name/result line
# (data1.cols.df) with the state/rating line (data2.cols.df).
n.players <- nrow(data1.cols.df)
round.names <- paste0("rd", 1:7)

# Pre-tournament rating: the text between "R:" and "->" in the second column of
# data2.cols.df.
rating.text <- data2.cols.df[, 2]
pre.rating.text <- trimws(substr(
  rating.text,
  regexpr("R:", rating.text, fixed = TRUE) + 2,
  regexpr("->", rating.text, fixed = TRUE) - 1
))

# Columns other than the average opponent rating are filled in directly.
# NOTE(review): the "P" -> "." substitution presumably handles ratings written
# with a "P<number>" suffix (none appear in the sample rows shown); it turns the
# suffix into a fractional part so as.integer() keeps only the whole number.
final.dataset <- data.frame(
  id = trimws(data1.cols.df[, 1]),
  name = data1.cols.df[, 2],
  state = data2.cols.df[, 1],
  total.points = as.double(data1.cols.df[, 3]),
  pre.rating = as.integer(sub("P", ".", pre.rating.text, fixed = TRUE)),
  avg.pre.rating.oppo = numeric(n.players),
  stringsAsFactors = FALSE
)

# Opponent pair numbers per round: columns 4-10 hold entries such as "W  39".
# Removing the first letter leaves the opponent's pair number (still padded
# with spaces), or an empty string when the entry carries no number.
round.values <- sub("[[:alpha:]]", "", data1.cols.df[, 4:10, drop = FALSE])
dim(round.values) <- c(n.players, length(round.names))
for (j in seq_along(round.names)) {
  final.dataset[[round.names[j]]] <- round.values[, j]
}

# Average pre-rating of the opponents each player faced. Each pair number is
# looked up against `id` (pair numbers are assumed to be unique); entries
# without an opponent number match nothing and are left out of the mean.
# as.integer() truncates the mean to a whole number; the result is stored in
# the numeric column created above.
opponent.row <- match(trimws(round.values), final.dataset$id)
dim(opponent.row) <- dim(round.values)
final.dataset$avg.pre.rating.oppo <- vapply(
  seq_len(n.players),
  function(p) {
    faced <- final.dataset$pre.rating[opponent.row[p, ]]
    as.numeric(as.integer(mean(faced, na.rm = TRUE)))
  },
  numeric(1)
)

# Sample rows from the dataset:
head(final.dataset)
##   id                name state total.points pre.rating avg.pre.rating.oppo  rd1
## 1  1            GARY HUA    ON          6.0       1794                1605   39
## 2  2     DAKSHESH DARURI    MI          6.0       1553                1469   63
## 3  3        ADITYA BAJAJ    MI          6.0       1384                1563    8
## 4  4 PATRICK H SCHILLING    MI          5.5       1716                1573   23
## 5  5          HANSHI ZUO    MI          5.5       1655                1500   45
## 6  6         HANSEN SONG    OH          5.0       1686                1518   34
##    rd2  rd3  rd4  rd5  rd6  rd7
## 1   21   18   14    7   12    4
## 2   58    4   17   16   20    7
## 3   61   25   21   11   13   12
## 4   28    2   26    5   19    1
## 5   37   12   13    4   14   17
## 6   29   11   35   10   27   21

Select the columns to be written to the CSV file:

# Keep only the columns destined for the CSV export, in output order.
final.csv.dataset <- final.dataset[
  ,
  c("name", "state", "total.points", "pre.rating", "avg.pre.rating.oppo")
]

Analysis and Visualization

Player’s Pre Ratings compared to Opponent’s Average Ratings

# Simple linear regression with the player's pre-rating as the response and the
# average pre-rating of their opponents as the single predictor.
rating <- lm(
  formula = pre.rating ~ avg.pre.rating.oppo,
  data = final.dataset
)
summary(rating)
## 
## Call:
## lm(formula = pre.rating ~ avg.pre.rating.oppo, data = final.dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -988.2 -138.0   50.6  201.0  376.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         516.5114   371.7458   1.389   0.1697  
## avg.pre.rating.oppo   0.6254     0.2687   2.327   0.0232 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.6 on 62 degrees of freedom
## Multiple R-squared:  0.08035,    Adjusted R-squared:  0.06552 
## F-statistic: 5.417 on 1 and 62 DF,  p-value: 0.02322
# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents, with the fitted regression line drawn on top.
# The model `rating` is pre.rating ~ avg.pre.rating.oppo, so the opponent
# average belongs on the x-axis and the player's pre-rating on the y-axis;
# with the axes the other way round the line's intercept and slope would be
# applied to the wrong variable. Axis limits follow the variables they were
# originally chosen for.
plot(
  final.dataset$avg.pre.rating.oppo,
  final.dataset$pre.rating,
  xlim = c(1000, 1800),
  ylim = c(800, 1800),
  main = "PreRating vs Opponent avg Rating",
  xlab = "Opponent Average PreRating",
  ylab = "Player PreRating"
)
# Draw the line with its own call rather than as a stray positional argument
# to plot(), where it only ran as a side effect of argument evaluation.
abline(rating)

Write the data to a CSV file in the current working directory

getwd()
## [1] "C:/Users/omocharly/Documents/CUNY_SPS_MDS"
# Export the selected player columns to a CSV file in the working directory.
# row.names = FALSE keeps the automatic 1..n row labels out of the file, so the
# CSV holds only the selected columns and no extra unnamed leading column.
write.csv(final.csv.dataset, "chess_players_info.csv", row.names = FALSE)