Attach Libraries

library(dplyr)
library(stringr)

Load Data

# Load the raw tournament report as a character vector, one element per line.

# Option 1 (Jack Wright): read a local copy of the text file.
# list.files(path = "C:/data", full.names = TRUE)
# chess_data <- readLines("C:/data/tournamentinfo.txt")

# Option 2 (Cameron Smith): read the file straight from GitHub. Preferred,
# because it pulls the data from the web and needs no local copy.
tournament_url <- "https://raw.githubusercontent.com/cwestsmith/cuny-msds/master/datasets/tournamentinfo.txt"
chess_data <- readLines(tournament_url)

Extract Relevant Data

# Extract player names (originally by Jack Wright).
#
# Assumes each player's first line has the form
#   <pair number> | <PLAYER NAME> | <points> | ...
# (TODO confirm against the data file). The name is taken as the whole second
# field, trimmed of padding, so names with any number of words, initials or
# hyphens are captured in full. The previous pattern stopped after roughly
# three word tokens and could not cross a hyphen, which truncated longer names.
name_match <- "^\\s*\\d+\\s*\\|\\s*([^|]+?)\\s*\\|"
name <- str_match(chess_data, name_match)[, 2]

# Lines that are not player lines yield NA; keep only real names
name <- name[!is.na(name)]


# Extract each player's state / province code.
#
# Assumes the second line of every player entry starts with a two-letter
# upper-case code followed by "|" (TODO confirm against the data file).
# Matching any two-letter code instead of a fixed list (MI, ON, OH) keeps this
# working for other tournaments, and capturing only the letters avoids the
# leading/trailing spaces the previous pattern left in the values.
state_match <- "^\\s*([A-Z]{2})\\s*\\|"
has_state <- str_subset(chess_data, state_match)

# Column 2 of the match matrix is the capture group, i.e. the bare code
state <- str_match(has_state, state_match)[, 2]


# Extract total points: the first "digit.digit" token on each line that has one
point_match <- "\\d\\.\\d"
has_points <- chess_data[which(str_detect(chess_data, point_match))]
points <- as.numeric(str_extract(has_points, point_match))



# Extract each player's pre-tournament rating.
#
# Step 1: grab every "R:" marker together with the one to three spaces and the
#         digits that follow it (anything after the digits is left out).
# Step 2: keep only the digits of each token and convert them to numbers.
pre_rating_match <- "(R:\\s{1,3})(\\d+)"
has_pre <- unlist(str_extract_all(chess_data, pre_rating_match))

# Every token holds exactly one run of digits, so a single extract suffices
pre_rating <- as.numeric(str_extract(has_pre, "\\d+"))


# Combine the per-player vectors into one table, one row per player
player_info <- data.frame(
  name = name,
  state = state,
  PreRating = pre_rating,
  points = points
)

Getting Opponent’s ELO

The way I tackled this was to get, for each player, a vector of their opponents’ pair numbers. I then created a new data frame, which I filled by using a “for” loop to look up each opponent’s ID, pull that opponent’s ELO rating, and place it in the new data frame at the corresponding location. This left me with a data frame in which each opponent ID is replaced by that opponent’s ELO rating.

I then took the mean of each column to get the average ELO rating of each player’s opponents.

# Extract, for every player, the pair numbers of the opponents they played.

# Keep everything from the first "|" that is directly followed by a digit to
# the end of the line. Presumably only a player's first line matches, starting
# at the total-points field (TODO confirm against the data file).
opponent_match <- "\\|[0-9].*"
has_opponent <- unlist(str_extract_all(chess_data, opponent_match))

# A result code H, U, B or X followed by blanks has no opponent number. Fill
# the blank with the placeholder "00" so every player yields one value per
# round and the values line up by round.
replace_match <- "(?<=(H|U|B|X)  )\\s{2}"
corrected_draw <- str_replace_all(has_opponent, replace_match, "00")

# One- or two-digit numbers preceded by a space are the opponent pair numbers
op_number_match <- " \\d{1,2}"
opponents <- str_extract_all(corrected_draw, op_number_match)

# matrix() would silently recycle values if players had differing numbers of
# entries, so fail loudly instead of building a misaligned table.
n_rounds <- max(lengths(opponents))
stopifnot(all(lengths(opponents) == n_rounds))

# One row per player, one column per round
op_frame <- data.frame(matrix(unlist(opponents), ncol = n_rounds, byrow = TRUE))

# Derive the column names from the number of rounds instead of hard-coding 7
names(op_frame) <- paste0("opponent", seq_len(n_rounds))





# Build a table of opponents' pre-tournament ratings and average it per player.
#
# Layout is wide: one column per player, one row per round, so that a column
# mean is that player's average opponent rating. Dimensions are taken from
# op_frame rather than hard-coded (previously 7 rounds x 64 players).
n_players <- nrow(op_frame)
n_rounds <- ncol(op_frame)
opponent_sub <- data.frame(matrix(NA_real_, nrow = n_rounds, ncol = n_players))
names(opponent_sub) <- name

# Fill one player (column) per iteration.
for (i in seq_len(n_players)) {
  # as.character() first so factor columns (R < 4.0 defaults) convert by label
  opponent_ids <- as.numeric(
    as.character(unlist(op_frame[i, ], use.names = FALSE))
  )

  # ID 0 (the "00" placeholder) means no opponent that round; indexing with NA
  # yields NA, whereas indexing with 0 would drop the element entirely.
  opponent_ids[opponent_ids %in% 0] <- NA

  # Opponent IDs are used as positions in pre_rating
  opponent_sub[[i]] <- pre_rating[opponent_ids]
}

# Columns are already numeric, so no type conversion is needed before averaging.
# Rounds without an opponent are NA and are excluded from the mean.
opponent_average <- round(colMeans(opponent_sub, na.rm = TRUE), 0)

# Append the average opponent rating as a new column of the player table
processed_data <- cbind(player_info, opponent_average = opponent_average)

# Preview the first rows (sample output shown below)
head(processed_data)
##                                    name state PreRating points opponent_average
## GARY HUA                       GARY HUA   ON       1794    6.0             1605
## DAKSHESH DARURI         DAKSHESH DARURI   MI       1553    6.0             1469
## ADITYA BAJAJ               ADITYA BAJAJ   MI       1384    6.0             1564
## PATRICK H SCHILLING PATRICK H SCHILLING   MI       1716    5.5             1574
## HANSHI ZUO                   HANSHI ZUO   MI       1655    5.5             1501
## HANSEN SONG                 HANSEN SONG   OH       1686    5.0             1519

# Save the result; the row names are written as the first column
write.csv(processed_data, file = "chess_data.csv")