library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
# Location of the raw tournament cross-table (a plain-text file).
theurl <- "https://raw.githubusercontent.com/omocharly/DATA607_LABS/main/tournamentinfo.txt"
# Read the file into a character vector, one element per line.
# NOTE: despite the ".df" suffix, readLines() returns a character vector,
# not a data frame.
chess_data.df <- readLines(theurl)
## Warning in readLines(theurl): incomplete final line found on 'https://
## raw.githubusercontent.com/omocharly/DATA607_LABS/main/tournamentinfo.txt'
# Preview the first raw lines to check the layout before parsing.
head(chess_data.df)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Discard the four leading lines (dashed rule, two column-title lines, dashed
# rule) so that only the per-player record lines remain.
header_rows <- 1:4
chess_data1.df <- chess_data.df[-header_rows]
# Preview the remaining lines.
head(chess_data1.df)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Split the record lines into the two rows that describe each player.
# Once the header is removed the lines repeat in groups of three:
#   position 1 -> first player row  (pair number, name, points, round results)
#   position 2 -> second player row (state, USCF ID / ratings, colours)
#   position 3 -> dashed separator, which is discarded
# Logical indexing on the position within each group replaces the original
# element-by-element loop. That loop grew both vectors one entry at a time and,
# because it iterated over 1:length(x), produced a spurious NA entry when the
# input was empty; this form yields empty vectors instead.
line_position <- seq_along(chess_data1.df) %% 3
data1.df <- chess_data1.df[line_position == 1]
data2.df <- chess_data1.df[line_position == 2]
# Sample rows from the dataset:
head(data1.df)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# Preview the second line of each player record.
head(data2.df)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
# Cut every record line at the literal "|" delimiter (at most 11 pieces per
# line, returned as a character matrix) and strip the padding around each
# field in the same step.
data1.cols.df <- trimws(
  str_split(data1.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)
data2.cols.df <- trimws(
  str_split(data2.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)
# Preview the parsed fields of the first record line.
head(data1.cols.df)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "1" "GARY HUA" "6.0" "W 39" "W 21" "W 18" "W 14" "W 7"
## [2,] "2" "DAKSHESH DARURI" "6.0" "W 63" "W 58" "L 4" "W 17" "W 16"
## [3,] "3" "ADITYA BAJAJ" "6.0" "L 8" "W 61" "W 25" "W 21" "W 11"
## [4,] "4" "PATRICK H SCHILLING" "5.5" "W 23" "D 28" "W 2" "W 26" "D 5"
## [5,] "5" "HANSHI ZUO" "5.5" "W 45" "W 37" "D 12" "D 13" "D 4"
## [6,] "6" "HANSEN SONG" "5.0" "W 34" "D 29" "L 11" "W 35" "D 10"
## [,9] [,10] [,11]
## [1,] "D 12" "D 4" ""
## [2,] "W 20" "W 7" ""
## [3,] "W 13" "W 12" ""
## [4,] "W 19" "D 1" ""
## [5,] "W 14" "W 17" ""
## [6,] "W 27" "W 21" ""
# Preview the parsed fields of the second record line.
head(data2.cols.df)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] "ON" "15445895 / R: 1794 ->1817" "N:2" "W" "B" "W" "B" "W" "B"
## [2,] "MI" "14598900 / R: 1553 ->1663" "N:2" "B" "W" "B" "W" "B" "W"
## [3,] "MI" "14959604 / R: 1384 ->1640" "N:2" "W" "B" "W" "B" "W" "B"
## [4,] "MI" "12616049 / R: 1716 ->1744" "N:2" "W" "B" "W" "B" "W" "B"
## [5,] "MI" "14601533 / R: 1655 ->1690" "N:2" "B" "W" "B" "W" "B" "W"
## [6,] "OH" "15055204 / R: 1686 ->1687" "N:3" "W" "B" "W" "B" "B" "W"
## [,10] [,11]
## [1,] "W" ""
## [2,] "B" ""
## [3,] "W" ""
## [4,] "B" ""
## [5,] "B" ""
## [6,] "B" ""
# Assemble the per-player table directly from the parsed field matrices.
# `avg.pre.rating.oppo` is created here as a zero-filled numeric placeholder;
# it is populated by the step that follows.
n_players <- nrow(data1.cols.df)

# The pre-rating is the text between "R:" and "->" in the rating field.
# A "P" inside that text (presumably a provisional-rating marker) is replaced
# by "." so that as.integer() keeps only the digits in front of it.
rating_field <- data2.cols.df[, 2]
pre_rating_text <- substr(
  rating_field,
  regexpr("R:", rating_field) + 2,
  regexpr("->", rating_field) - 1
)

final.dataset <- data.frame(
  id = trimws(data1.cols.df[, 1]),
  name = data1.cols.df[, 2],
  state = data2.cols.df[, 1],
  total.points = as.double(data1.cols.df[, 3]),
  pre.rating = as.integer(sub("P", ".", trimws(pre_rating_text))),
  avg.pre.rating.oppo = numeric(n_players),
  stringsAsFactors = FALSE
)

# Matrix columns 4-10 hold the seven round results (e.g. "W 39"). Removing the
# first letter leaves the opponent's pair number, still padded with spaces, or
# an empty string when the field holds no number.
for (round_no in 1:7) {
  final.dataset[[paste0("rd", round_no)]] <-
    sub("[[:alpha:]]", "", data1.cols.df[, round_no + 3])
}
# Assign a value to column avg.pre.rating.oppo: the mean pre-rating of the
# opponents each player faced, truncated to a whole number by as.integer().
#
# The round columns are handled together as a players x rounds matrix instead
# of seven copy-pasted lookups inside a 1:nrow() loop:
#   1. trim the opponent pair numbers stored in rd1..rd7;
#   2. look each one up in `id` with match() -- rounds without an opponent
#      number give NA;
#   3. average each row, ignoring the NAs.
# A player with no opponents at all gets NA. The column is stored as integer.
round_cols <- paste0("rd", 1:7)
opponent_ids <- trimws(as.matrix(final.dataset[round_cols]))
opponent_ratings <- final.dataset$pre.rating[
  match(opponent_ids, final.dataset$id)
]
# Restore the players x rounds shape that the vector lookup dropped.
dim(opponent_ratings) <- dim(opponent_ids)
final.dataset$avg.pre.rating.oppo <- as.integer(
  rowMeans(opponent_ratings, na.rm = TRUE)
)
# Sample rows from the dataset:
head(final.dataset)
## id name state total.points pre.rating avg.pre.rating.oppo rd1
## 1 1 GARY HUA ON 6.0 1794 1605 39
## 2 2 DAKSHESH DARURI MI 6.0 1553 1469 63
## 3 3 ADITYA BAJAJ MI 6.0 1384 1563 8
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1573 23
## 5 5 HANSHI ZUO MI 5.5 1655 1500 45
## 6 6 HANSEN SONG OH 5.0 1686 1518 34
## rd2 rd3 rd4 rd5 rd6 rd7
## 1 21 18 14 7 12 4
## 2 58 4 17 16 20 7
## 3 61 25 21 11 13 12
## 4 28 2 26 5 19 1
## 5 37 12 13 4 14 17
## 6 29 11 35 10 27 21
# Keep only the columns that go into the exported file, in output order.
export_cols <- c(
  "name", "state", "total.points", "pre.rating", "avg.pre.rating.oppo"
)
final.csv.dataset <- final.dataset[, export_cols, drop = FALSE]
# Players' pre-ratings compared to their opponents' average pre-ratings
# Fit a simple linear regression with each player's pre-rating as the response
# and the average pre-rating of their opponents as the predictor, then print
# the coefficient table and fit statistics.
rating <- lm(pre.rating~ avg.pre.rating.oppo, data=final.dataset)
summary(rating)
##
## Call:
## lm(formula = pre.rating ~ avg.pre.rating.oppo, data = final.dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -988.2 -138.0 50.6 201.0 376.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 516.5114 371.7458 1.389 0.1697
## avg.pre.rating.oppo 0.6254 0.2687 2.327 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.6 on 62 degrees of freedom
## Multiple R-squared: 0.08035, Adjusted R-squared: 0.06552
## F-statistic: 5.417 on 1 and 62 DF, p-value: 0.02322
# Scatter plot of player pre-rating against average opponent pre-rating, with
# the fitted regression line on top.
# The axes follow the model formula (pre.rating ~ avg.pre.rating.oppo):
# predictor on x, response on y, so that abline(rating) draws the line that
# belongs to these points. Previously the axes were swapped relative to the
# model, and abline() was passed as a positional argument to plot() rather
# than called after it.
plot(
  final.dataset$avg.pre.rating.oppo, final.dataset$pre.rating,
  xlim = c(1000, 1800), ylim = c(800, 1800),
  main = "PreRating vs Opponent avg Rating",
  xlab = "Opponent Average PreRating", ylab = "Player PreRating"
)
abline(rating)
# Print the current working directory; the relative file name passed to
# write.csv() below is resolved against it.
getwd()
## [1] "C:/Users/omocharly/Documents/CUNY_SPS_MDS"
# Export the selected player columns. row.names = FALSE keeps the automatic
# 1..n row index out of the file, so the CSV contains exactly the columns
# chosen for final.csv.dataset and no unnamed leading column.
write.csv(final.csv.dataset, "chess_players_info.csv", row.names = FALSE)