Loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Loading text file into a table using string manipulation

# Download the raw cross-table. read.csv() treats the first line of the file
# as a header, so `data` holds the remaining lines; it is expected to have a
# single column with one text line per row.
url = "https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/project1/tournamentinfo.txt"
data = read.csv(url)

# Every record spans two consecutive rows; the third row of each group is
# skipped (presumably a separator line), so records start at rows 1, 4, 7, ...
record_stride = 3
raw_rows = trimws(data[[1]])
stopifnot(length(raw_rows) >= 2)
record_starts = seq(1, length(raw_rows) - 1, by = record_stride)

# Split every row on "|" once, then glue field i of a record's first row to
# field i of its second row, so each record becomes one row of "a|b" cells
# (this is so we can get the data easier later).
fields = strsplit(raw_rows, "[|]")
merged = Map(
  function(top, bottom) paste(top, bottom, sep = "|"),
  fields[record_starts],
  fields[record_starts + 1]
)

# One row per record. The first record supplies the column names and is then
# dropped, leaving 1 row representing 1 person's match data.
results = as.data.frame(do.call(rbind, merged), stringsAsFactors = FALSE)
names(results) = unlist(results[1, ], use.names = FALSE)
results = results[-1, , drop = FALSE]
rownames(results) = seq_len(nrow(results))
head(results)
##   Pair |Num  
## 1      1 |ON 
## 2      2 |MI 
## 3      3 |MI 
## 4      4 |MI 
## 5      5 |MI 
## 6      6 |OH 
##    Player Name                     | USCF ID / Rtg (Pre->Post)       
## 1  GARY HUA                        | 15445895 / R: 1794   ->1817     
## 2  DAKSHESH DARURI                 | 14598900 / R: 1553   ->1663     
## 3  ADITYA BAJAJ                    | 14959604 / R: 1384   ->1640     
## 4  PATRICK H SCHILLING             | 12616049 / R: 1716   ->1744     
## 5  HANSHI ZUO                      | 14601533 / R: 1655   ->1690     
## 6  HANSEN SONG                     | 15055204 / R: 1686   ->1687     
##   Total| Pts  Round|  1   Round|  2   Round|  3   Round|  4   Round|  5  
## 1 6.0  |N:2   W  39|W     W  21|B     W  18|W     W  14|B     W   7|W    
## 2 6.0  |N:2   W  63|B     W  58|W     L   4|B     W  17|W     W  16|B    
## 3 6.0  |N:2   L   8|W     W  61|B     W  25|W     W  21|B     W  11|W    
## 4 5.5  |N:2   W  23|W     D  28|B     W   2|W     W  26|B     D   5|W    
## 5 5.5  |N:2   W  45|B     W  37|W     D  12|B     D  13|W     D   4|B    
## 6 5.0  |N:3   W  34|W     D  29|B     L  11|W     W  35|B     D  10|B    
##   Round|  6   Round|  7  
## 1 D  12|B     D   4|W    
## 2 W  20|W     W   7|B    
## 3 W  13|B     W  12|W    
## 4 W  19|B     D   1|B    
## 5 W  14|W     W  17|B    
## 6 W  27|W     W  21|B

Splitting data into multiple columns

# `rows` and `columns` record the shape of the parsed table and are reused by
# the later sections. `columns` is captured here, before any derived columns
# are appended, so its positions keep referring to the original columns.
rows = dim(results)[1]
columns = names(results)

# The first column holds two "|"-separated pieces. Split it once for all rows
# (instead of re-splitting the whole column on every loop iteration) and trim
# the padding from both pieces so values compare cleanly, e.g. states == "MI".
pair_parts = strsplit(results[[columns[1]]], "[|]")
number = trimws(vapply(pair_parts, `[`, character(1), 1))
states = trimws(vapply(pair_parts, `[`, character(1), 2))

# split column into two columns (number and states)
results$states = states
results$number = number

Creating players dataset

# splitting second column into separate columns
# The second column holds "<player name>|<id> / <rating text>". Split it once
# for all rows rather than re-splitting the whole column inside a loop.
name_parts = strsplit(results[[columns[2]]], "[|]")
player_name = vapply(name_parts, `[`, character(1), 1)
id_and_rating = vapply(name_parts, `[`, character(1), 2)

# Left of "/" is the player id, right of it the rating text.
rating_parts = strsplit(id_and_rating, "[/]")
player_id = trimws(vapply(rating_parts, `[`, character(1), 1))
rating_text = vapply(rating_parts, `[`, character(1), 2)

# The pre-tournament rating is the first run of digits in the rating text.
# Extracting only the digits also discards anything glued to the number (the
# earlier version cut the token at "P" in a second pass whose is.numeric()
# guard was always true on a character vector). Rows without digits give NA.
pre_elo = stringr::str_extract(rating_text, "[0-9]+")

results$player = trimws(player_name)
results$player_id = player_id
results$pre_elo = as.numeric(pre_elo)

# getting points from third column
# Only the part before "|" is needed. The column is split once for all rows;
# as.numeric() accepts the surrounding padding, so no trimming is required.
point_parts = strsplit(results[[columns[3]]], "[|]")
points = as.numeric(vapply(point_parts, `[`, character(1), 1))
results$points = points

# Assemble the per-player table from the columns derived above
player_fields = c(
  "number", "states", "player_id",
  "player", "pre_elo", "points"
)
players = results[player_fields]
# Preview of the cleaned player data
head(players)
##   number states  player_id              player pre_elo points
## 1      1    ON   15445895             GARY HUA    1794    6.0
## 2      2    MI   14598900      DAKSHESH DARURI    1553    6.0
## 3      3    MI   14959604         ADITYA BAJAJ    1384    6.0
## 4      4    MI   12616049  PATRICK H SCHILLING    1716    5.5
## 5      5    MI   14601533           HANSHI ZUO    1655    5.5
## 6      6    OH   15055204          HANSEN SONG    1686    5.0

Creating games dataset

# One row per player together with the seven round columns (positions 4-10 of
# the originally parsed table).
rounds = results[c("player", columns[4:10])]
# rename columns
# Round columns become "1", "2", ... in table order. (The earlier loop ran over
# list[2:8], indexing one past the end of a 7-element vector.)
round_cols = seq_along(rounds)[-1]
names(rounds)[round_cols] = as.character(seq_along(round_cols))
# pivot columns by player
games = pivot_longer(rounds, !player, names_to = "round")

# create new columns from value
# Each cell is "<outcome> <opponent number>|<color>". All cells are split in
# one pass instead of re-splitting the whole column on every iteration.
cells = trimws(games$value)
cell_parts = strsplit(cells, "[|]")
result_text = trimws(vapply(cell_parts, `[`, character(1), 1))
# NA when the cell has nothing after "|"
color = vapply(cell_parts, `[`, character(1), 2)
# Outcome is the first space-delimited token of the result text
outcome = stringr::str_extract(result_text, "^[^ ]+")
# Opponent is the first run of digits; cells with no number give NA. Taking
# exactly one value per cell keeps the vector aligned with the rows of `games`.
opponent = stringr::str_extract(result_text, "[0-9]+")

# `number` holds the opponent's pair number so it can be joined to players
games$number = opponent
games$color = color
games$outcome = outcome
games$value = NULL
# Clean dataset
head(games)
## # A tibble: 6 x 5
##   player   round number color outcome
##   <chr>    <chr> <chr>  <chr> <chr>  
## 1 GARY HUA 1     39     W     W      
## 2 GARY HUA 2     21     B     W      
## 3 GARY HUA 3     18     W     W      
## 4 GARY HUA 4     14     B     W      
## 5 GARY HUA 5     7      W     W      
## 6 GARY HUA 6     12     B     D

Average elo of opponents

# Attach each opponent's pre-tournament rating to the games they appear in,
# joining on the opponent's pair number.
opponent_elo = merge(
  x = games[c("number", "player")],
  y = players[c("number", "pre_elo")],
  by = "number"
)

# Rounded mean opponent rating per player
opp_elo = summarize(
  group_by(opponent_elo, player),
  avg_opp_elo = round(mean(pre_elo, na.rm = TRUE))
)
# players and their opponents average pre_elo
head(opp_elo)
## # A tibble: 6 x 2
##   player               avg_opp_elo
##   <chr>                      <dbl>
## 1 ADITYA BAJAJ                1564
## 2 ALAN BUI                    1214
## 3 ALEX KONG                   1406
## 4 AMIYATOSH PWNANANDAM        1385
## 5 ANVIT RAO                   1554
## 6 ASHWIN BALAJI               1186

Simple linear regression predictions

# Regression predictions
# Fit total points as a linear function of pre-tournament rating. The formula
# names the data-frame columns directly so that predict() really evaluates
# `newdata`; fitting `y ~ x` on free-standing vectors made predict() fall back
# to the global `x` and silently ignore `newdata`.
data = players[c("pre_elo", "points")]
regression = lm(points ~ pre_elo, data = data)
prediction = predict(regression, newdata = players["pre_elo"])
players$prediction = prediction
# Positive difference = scored more points than the rating predicts
players$difference = players$points - players$prediction
# Highest scoring player based on predictions
# which() keeps every tied row and ignores NA comparisons.
best = which(players$difference == max(players$difference, na.rm = TRUE))
players[best, ]
##   number states  player_id       player pre_elo points prediction difference
## 3      3    MI   14959604  ADITYA BAJAJ    1384      6   3.453065   2.546935