Read in text file as string and import Libraries
library(stringr)
library(readr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Load the whole of tournamentinfo.txt into `mystring`; every extraction step
# below runs its regular expressions over this one value.
mystring <- read_file("tournamentinfo.txt")
Extract Column Names and state names using regular expressions
# Pull the table's header words and every player's state code out of the raw
# text with regular expressions.

# The first 11 word tokens in the text are the header labels.
column_names <- unlist(str_extract_all(mystring, "\\w+"))[1:11]

# A state cell is a run of capitals, exactly one blank, then a pipe
# (e.g. "ON |"). Splitting each hit on whitespace leaves the code first.
states_2 <- str_split(
  unlist(str_extract_all(mystring, "[A-Z]+[:blank:]\\|")),
  "\\s+"
)

# Take the first piece of every match in a single vectorised call. The
# previous version grew `states` inside a loop hard-coded to 64 iterations,
# which fails on a shorter file and ignores extra players in a longer one.
states <- vapply(states_2, `[`, character(1), 1)
states
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
column_names
## [1] "Pair" "Player" "Name" "Total" "Round" "Round" "Round"
## [8] "Round" "Round" "Round" "Round"
Create blank vectors to populate with chess names, aggregated matches played, player starting rank, and player chess points
- Use regular expression to populate these vectors
# Parse the player rows into four parallel vectors:
#   chess_names  - name cell (still space-padded; trimmed in the next step)
#   chess_points - total-points cell (still text)
#   start_ranks  - the digits that follow "R:" on each player's second row
#   matches      - 7 entries per player, flattened in player order; "" when a
#                  round cell carries no opponent number
#
# Each line that contains a pipe is captured from its first pipe to the end
# of the line; the `$` alternative only contributes empty matches.
trial <- unlist(str_extract_all(mystring, "\\|.*|$"))

# Drop the empty matches and the two header rows; what is left is player
# data. This replaces the hard-coded trial[3:130], so the script no longer
# assumes exactly 64 players.
player_rows <- trial[nzchar(trial)]
player_rows <- player_rows[-(1:2)]
splited_trial <- str_split(player_rows, "\\|")

# Spot-check: second cell of a rating row.
splited_trial[[4]][2]
## [1] " 14598900 / R: 1553 ->1663 "

# Each player spans two consecutive rows: odd rows hold name, points and the
# round cells; even rows hold the rating cell.
row_id <- seq_along(splited_trial)
result_rows <- splited_trial[row_id %% 2 == 1]
rating_rows <- splited_trial[row_id %% 2 == 0]

# Vectorised extraction instead of append() inside a loop. `splited_trial`
# is also no longer overwritten while it is being walked.
chess_names <- vapply(result_rows, `[`, character(1), 2)
chess_points <- vapply(result_rows, `[`, character(1), 3)

# ": 1553" -> "1553"
rating_cells <- vapply(rating_rows, `[`, character(1), 2)
start_ranks <- str_extract(
  str_extract(rating_cells, ":\\s+[[:digit:]]+"),
  "[[:digit:]]+"
)

# Cells 4 to 10 are the seven round cells (e.g. "W  39"); the second
# blank-separated token of each is the opponent number.
round_cells <- 4:10
matches <- unlist(lapply(result_rows, function(cells) {
  vapply(str_split(cells[round_cells], "[:blank:]+"), `[`, character(1), 2)
}))
Alternate example of how I could have used vectors and lapply instead of for loops.
# Loop-free variant of the rating extraction, kept for comparison.
# Row selection is derived from the data. The previous `to = 127` bound
# stopped at row 126, so the last player's rating row (128) was dropped and
# only 63 ratings were shown.
row_id <- seq_along(splited_trial)
odds <- splited_trial[row_id %% 2 == 1]
evens <- splited_trial[row_id %% 2 == 0]
# The second cell of every even row holds the rating text.
vector_ranks <- unlist(lapply(evens, `[[`, 2))
# ": 1794" splits into ":" and "1794", so the ratings sit at even positions.
vector_ranks <- unlist(
  str_split(unlist(str_extract(vector_ranks, ":\\s+[[:digit:]]+")), "\\s+")
)
vector_ranks[seq_along(vector_ranks) %% 2 == 0]
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377" "1362" "1382" "1291" "1056"
## [51] "1011" "935" "1393" "1270" "1186" "1153" "1092" "917" "853" "967"
## [61] "955" "1530" "1175" "1163"
Trim and Clean data
# Normalise the parsed vectors: strip padding from the text fields and turn
# the numeric fields into numbers.
chess_names <- str_trim(chess_names)
chess_points <- as.numeric(str_trim(chess_points))
start_ranks <- as.numeric(start_ranks)
# Empty round cells become NA before the numeric conversion.
matches <- as.numeric(replace(matches, matches == "", NA))
Split aggregated matches data into matches by player
- Map ranking values from another df into my matches by player df
- Example: matches by player = 39,21,18,14,7,12,4 becomes player_ranks 1436 1563 1600 1610 1649 1663 1716 (which average to 1605)
# Reshape the flat `matches` vector into one column per player, replace every
# opponent number with that opponent's starting rank, then average per player.
rounds_per_player <- 7

# Consecutive runs of 7 entries in `matches` belong to one player.
matches_by_player <- split(
  matches,
  ceiling(seq_along(matches) / rounds_per_player)
)
element_1 <- as_tibble(matches_by_player)

# Lookup table: pair number -> starting rank. It is sized from `start_ranks`
# rather than a hard-coded 1:64.
df_start_ranks <- data.frame(
  pair_num = seq_along(start_ranks),
  value = start_ranks
)

# Exact lookup with match(). The old grepl("1|2|...|64") guard was a substring
# test that was true for any non-missing number, so it added nothing. Rounds
# with no opponent (NA) stay NA.
temp_df <- as.data.frame(lapply(element_1, function(opponents) {
  df_start_ranks$value[match(opponents, df_start_ranks$pair_num)]
}))

## find means of the rank of opponents
average_opp_rank <- round(colMeans(temp_df, na.rm = TRUE), 0)
Display all the vectors that were created and combine them into a dataframe
# colMeans() returned a vector named after temp_df's columns; as.vector()
# keeps only the values so it can sit alongside the other per-player vectors.
average_opp_rank <- as.vector(average_opp_rank)
# Print the trimmed player names.
chess_names
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
# Starting rank of each player, now numeric.
start_ranks
## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153
## [57] 1092 917 853 967 955 1530 1175 1163
# Total points of each player, now numeric.
chess_points
## [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0
# State code of each player, as extracted at the top of the script.
states
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
# Assemble the per-player vectors into one table and export it as CSV.
# tibble() keeps each column's own type. The earlier cbind() first built a
# character matrix, which turned points, ranks and averages into strings.
# (The str() output printed further down was captured before this change.)
finished_chess_rankings <- tibble(
  chess_names = chess_names,
  states = states,
  chess_points = chess_points,
  start_ranks = start_ranks,
  average_opp_rank = average_opp_rank
)
write.csv(finished_chess_rankings, file = "finished_chess_rankings.csv")
Curious experimentation
- Is there a correlation between a player's original ranking and the average ranking of their opponents, i.e. does ranking influence opponent selection?
##Sort by rank
# Build the plotting table: integer rank columns, rows ordered by starting
# rank.
rankings_for_plot <- finished_chess_rankings
rankings_for_plot$start_ranks <- as.integer(rankings_for_plot$start_ranks)
rankings_for_plot$average_opp_rank <-
  as.integer(rankings_for_plot$average_opp_rank)
# Sort only after the conversion. Ordering the column while it still held
# text compared ranks as strings, which put "1011" ahead of "377".
rankings_for_plot <-
  rankings_for_plot[order(rankings_for_plot$start_ranks), ]
# Retained under its original name; identical to the sorted table above.
rankigns_2 <- arrange(rankings_for_plot, start_ranks)
# NOTE: the str() output shown below was captured before the sort fix, so its
# first rows differ from what a fresh run prints.
str(rankings_for_plot)
## Classes 'tbl_df', 'tbl' and 'data.frame': 64 obs. of 5 variables:
## $ chess_names : chr "TEJAS AYYAGARI" "SHIVAM JHA" "MICHAEL LU" "MARISA RICCI" ...
## $ states : chr "MI" "MI" "MI" "MI" ...
## $ chess_points : chr "2.5" "2.5" "2" "2" ...
## $ start_ranks : int 1011 1056 1092 1153 1163 1175 1186 1199 1220 1229 ...
## $ average_opp_rank: int 1356 1296 1363 1414 1263 1350 1406 1327 1484 1357 ...
library(plyr)
library(ggplot2)
# Scatter plot of starting rank against average opponent rank, overlaid with
# a linear-model smoother.
ggplot(
  data = rankings_for_plot,
  mapping = aes(x = start_ranks, y = average_opp_rank)
) +
  geom_point() +
  geom_smooth(method = "lm")

- There appears to be some overall positive correlation, which would suggest that matches may be designed so that similarly ranked players play each other. However, the points are widely scattered, so this pattern could well have arisen by chance.