Loading packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Loading text file into a table using string manipulation
# Source: a pipe-delimited cross table in which every record (the header and
# each player) occupies two consecutive lines, with dashed lines in between.
url <- "https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/project1/tournamentinfo.txt"

# Read raw lines rather than read.csv(): the file is not CSV, so a comma or a
# double quote inside any line would otherwise be interpreted as CSV syntax.
raw_lines <- trimws(readLines(url, warn = FALSE))

# Drop blank lines and dashed separator lines explicitly instead of assuming a
# fixed three-line stride. What remains is the two-line header followed by one
# two-line record per player.
record_lines <- raw_lines[!grepl("^-*$", raw_lines)]
stopifnot(length(record_lines) >= 2, length(record_lines) %% 2 == 0)

# Split the first and second line of every record into their "|" fields.
first_line_idx <- seq(1, length(record_lines), by = 2)
top_fields <- strsplit(record_lines[first_line_idx], "[|]")
bottom_fields <- strsplit(record_lines[first_line_idx + 1], "[|]")

# Every record must have the same number of fields on both of its lines,
# otherwise the table below would be silently misaligned.
stopifnot(
  lengths(top_fields) == lengths(bottom_fields),
  length(unique(lengths(top_fields))) == 1
)

# Concatenate the two lines field by field ("top|bottom") so that each record
# becomes a single row; the pieces are separated again in later steps.
merged_rows <- Map(
  function(top, bottom) paste(top, bottom, sep = "|"),
  top_fields,
  bottom_fields
)
cells <- do.call(rbind, merged_rows)

# The first merged record is the header; the remaining rows are the players.
results <- as.data.frame(cells[-1, , drop = FALSE], stringsAsFactors = FALSE)
names(results) <- cells[1, ]
rownames(results) <- NULL

# prints resulting chart with 1 row representing 1 person's match data
head(results)
## Pair |Num
## 1 1 |ON
## 2 2 |MI
## 3 3 |MI
## 4 4 |MI
## 5 5 |MI
## 6 6 |OH
## Player Name | USCF ID / Rtg (Pre->Post)
## 1 GARY HUA | 15445895 / R: 1794 ->1817
## 2 DAKSHESH DARURI | 14598900 / R: 1553 ->1663
## 3 ADITYA BAJAJ | 14959604 / R: 1384 ->1640
## 4 PATRICK H SCHILLING | 12616049 / R: 1716 ->1744
## 5 HANSHI ZUO | 14601533 / R: 1655 ->1690
## 6 HANSEN SONG | 15055204 / R: 1686 ->1687
## Total| Pts Round| 1 Round| 2 Round| 3 Round| 4 Round| 5
## 1 6.0 |N:2 W 39|W W 21|B W 18|W W 14|B W 7|W
## 2 6.0 |N:2 W 63|B W 58|W L 4|B W 17|W W 16|B
## 3 6.0 |N:2 L 8|W W 61|B W 25|W W 21|B W 11|W
## 4 5.5 |N:2 W 23|W D 28|B W 2|W W 26|B D 5|W
## 5 5.5 |N:2 W 45|B W 37|W D 12|B D 13|W D 4|B
## 6 5.0 |N:3 W 34|W D 29|B L 11|W W 35|B D 10|B
## Round| 6 Round| 7
## 1 D 12|B D 4|W
## 2 W 20|W W 7|B
## 3 W 13|B W 12|W
## 4 W 19|B D 1|B
## 5 W 14|W W 17|B
## 6 W 27|W W 21|B
Splitting data into multiple columns
# `rows` and `columns` describe the table as parsed, before any derived
# columns are appended; later steps index the original columns through them.
rows <- nrow(results)
columns <- names(results)

# Each cell of the first column has the form "<pair number> |<state>".
# Split the whole column once instead of once per row.
pair_state <- strsplit(results[[columns[1]]], "[|]")

# Take the piece before / after the "|" for every row. Both pieces are trimmed
# so that the values compare cleanly (e.g. states == "ON").
number <- trimws(vapply(pair_state, function(piece) piece[1], character(1)))
states <- trimws(vapply(pair_state, function(piece) piece[2], character(1)))

# split column into two columns (number and states)
results$states <- states
results$number <- number
Creating players dataset
# Second original column: "<player name> | <USCF id> / R: <pre> -><post>".
# Split the whole column once; the pieces are then pulled out per row.
name_and_rating <- strsplit(results[[columns[2]]], "[|]")
player_name <- vapply(name_and_rating, function(piece) piece[1], character(1))

# The part after the "|" is "<USCF id> / R: <pre> -><post>".
id_and_rating <- strsplit(
  vapply(name_and_rating, function(piece) piece[2], character(1)),
  "[/]"
)
player_id <- trimws(vapply(id_and_rating, function(piece) piece[1], character(1)))
rating_text <- vapply(id_and_rating, function(piece) piece[2], character(1))

# The pre-tournament rating is the first run of digits in the rating text.
# Anything that follows it (a provisional suffix such as "P17", the arrow and
# the post rating) is discarded. Rows without any digit become NA.
pre_elo <- ifelse(
  grepl("[0-9]", rating_text),
  sub("^[^0-9]*([0-9]+).*$", "\\1", rating_text),
  NA_character_
)

results$player <- trimws(player_name)
results$player_id <- player_id
results$pre_elo <- as.numeric(pre_elo)

# Third original column: "<total points> |<second-line value>"; keep the points.
points_text <- vapply(
  strsplit(results[[columns[3]]], "[|]"),
  function(piece) piece[1],
  character(1)
)
points <- as.numeric(trimws(points_text))
results$points <- points

# create df filled with player information
# (`number` and `states` are the columns derived from the first original column)
players <- results[c("number", "states", "player_id", "player", "pre_elo", "points")]

# Clean dataset
head(players)
## number states player_id player pre_elo points
## 1 1 ON 15445895 GARY HUA 1794 6.0
## 2 2 MI 14598900 DAKSHESH DARURI 1553 6.0
## 3 3 MI 14959604 ADITYA BAJAJ 1384 6.0
## 4 4 MI 12616049 PATRICK H SCHILLING 1716 5.5
## 5 5 MI 14601533 HANSHI ZUO 1655 5.5
## 6 6 OH 15055204 HANSEN SONG 1686 5.0
Creating games dataset
# Original columns 4-10 hold the seven round cells, each shaped like
# "<outcome> <opponent number>|<color>".
rounds <- results[c("player", columns[4:10])]

# rename the round columns to "1", "2", ... so the pivot yields the round number
names(rounds)[-1] <- as.character(seq_len(ncol(rounds) - 1))

# pivot columns by player: one row per (player, round), raw cell in `value`
games <- pivot_longer(rounds, !player, names_to = "round")

# Split every cell once (not once per row) into the result part and the color.
cell_parts <- strsplit(trimws(games[["value"]]), "[|]")
color <- vapply(cell_parts, function(piece) piece[2], character(1))

# The result part is "<outcome letter> <opponent number>"; tokenise on spaces.
result_tokens <- strsplit(
  trimws(vapply(cell_parts, function(piece) piece[1], character(1))),
  " "
)
outcome <- vapply(result_tokens, function(tokens) tokens[1], character(1))

# Opponent = first token containing a digit. Rounds without an opponent get NA.
# Exactly one value is produced per row so the vector stays aligned with games.
opponent <- vapply(
  result_tokens,
  function(tokens) {
    numeric_tokens <- grep("[0-9]+", tokens, value = TRUE)
    if (length(numeric_tokens) == 0) NA_character_ else numeric_tokens[1]
  },
  character(1)
)

games$number <- opponent
games$color <- color
games$outcome <- outcome
games$value <- NULL

# Clean dataset
head(games)
## # A tibble: 6 x 5
## player round number color outcome
## <chr> <chr> <chr> <chr> <chr>
## 1 GARY HUA 1 39 W W
## 2 GARY HUA 2 21 B W
## 3 GARY HUA 3 18 W W
## 4 GARY HUA 4 14 B W
## 5 GARY HUA 5 7 W W
## 6 GARY HUA 6 12 B D
Average pre-tournament Elo rating of each player's opponents
# Attach to every game the pre-tournament rating of the opponent: `number` in
# `games` is the opponent's pair number, `number` in `players` is the player's own.
opponent_elo <- merge(
  x = games[c("number", "player")],
  y = players[c("number", "pre_elo")],
  by = "number"
)

# One row per player: rounded mean rating of the opponents they faced.
games_by_player <- group_by(opponent_elo, player)
opp_elo <- summarize(
  games_by_player,
  avg_opp_elo = round(mean(pre_elo, na.rm = TRUE))
)

# players and their opponents average pre_elo
head(opp_elo)
## # A tibble: 6 x 2
## player avg_opp_elo
## <chr> <dbl>
## 1 ADITYA BAJAJ 1564
## 2 ALAN BUI 1214
## 3 ALEX KONG 1406
## 4 AMIYATOSH PWNANANDAM 1385
## 5 ANVIT RAO 1554
## 6 ASHWIN BALAJI 1186
Simple linear regression predictions
# Regression predictions: model total points as a linear function of the
# pre-tournament rating.
data <- players[c("pre_elo", "points")]

# Fit on named columns of `data`. The formula must refer to column names so
# that predict() can look the predictor up in `newdata`.
regression <- lm(points ~ pre_elo, data = data)
prediction <- predict(regression, newdata = players["pre_elo"])

players$prediction <- prediction
# Positive difference = the player scored more points than the rating predicts.
players$difference <- players$points - players$prediction

# Highest scoring player(s) relative to the prediction. Ties are all returned,
# and rows with a missing difference are ignored.
best_rows <- which(players$difference == max(players$difference, na.rm = TRUE))
players[best_rows, ]
## number states player_id player pre_elo points prediction difference
## 3 3 MI 14959604 ADITYA BAJAJ 1384 6 3.453065 2.546935