library(dplyr)
library(stringr)
# Load the tournament cross table as a character vector, one element per line.
# Jack Wright's original version read a local copy of the file:
#   list.files(path = "C:/data", full.names = TRUE)
#   chess_data <- readLines("C:/data/tournamentinfo.txt")
# Cameron Smith's version, used here, pulls the same file from GitHub instead.
tournament_url <- "https://raw.githubusercontent.com/cwestsmith/cuny-msds/master/datasets/tournamentinfo.txt"
chess_data <- readLines(con = tournament_url)
# Extract name, state, pre-tournament rating and total points per player.
#
# NOTE(review): assumes each player occupies two rows of the cross table --
# a "player row" that starts with the pair number
#   <pair> | <name> | <total points> | <round results ...>
# and a "rating row" that starts with the two-letter state code
#   <state> | <USCF id> / R: <pre-rating> -> <post-rating> | ...
# Confirm against tournamentinfo.txt if the layout ever changes.
#
# Selecting the two row types by their leading field (instead of scanning
# every line with four unrelated patterns) keeps the vectors aligned
# player-by-player and removes the hard-coded list of states.
player_rows <- str_subset(chess_data, "^\\s*\\d+\\s*\\|")
rating_rows <- str_subset(chess_data, "^\\s*[A-Z]{2}\\s*\\|")
stopifnot(
  length(player_rows) > 0,
  length(player_rows) == length(rating_rows)
)

# Split each player row on "|": field 2 is the name, field 3 the points.
# Taking the whole field keeps names with more than three words, hyphens or
# apostrophes intact (the previous three-token \w+ pattern truncated them).
player_fields <- str_split_fixed(player_rows, "\\|", 4)
name <- str_trim(player_fields[, 2])
points <- as.numeric(str_trim(player_fields[, 3]))

# State code without the surrounding blanks of the fixed-width column.
state <- str_extract(rating_rows, "[A-Z]{2}")

# Pre-tournament rating: the digits right after "R:"; anything that follows
# them on the row is ignored because the capture group stops at the first
# non-digit.
pre_rating <- as.numeric(str_match(rating_rows, "R:\\s*(\\d+)")[, 2])

# Fail loudly instead of building a table with missing key fields.
stopifnot(!anyNA(points), !anyNA(pre_rating), !anyNA(state))

# create data frame with relevant player info
player_info <- data.frame(
  "name" = name,
  "state" = state,
  "PreRating" = pre_rating,
  "points" = points
)
# Approach: first build a vector of opponent pair numbers for each player.
# Then create a new matrix and fill it with a "for" loop that looks up each
# opponent's ID, pulls that opponent's ELO rating, and stores it at the same
# position in the new matrix. The result is a matrix in which every opponent
# ID has been replaced by that opponent's ELO rating.
# Taking the mean of each column then gives the average ELO of a player's
# opponents.
# Extract the opponents each player faced, one row per player and one column
# per round.

# Keep the tail of every line from the first "|<digit>" onwards; only lines
# containing that sequence yield a match, and ".*" runs to the end of the
# line, so there is at most one string per line.
opponent_match <- "\\|[0-9].*"
has_opponent <- str_extract_all(chess_data, opponent_match) %>% unlist()

# Rounds with result code H, U, B or X carry no opponent number. The
# lookbehind (?<=...) anchors just after "<code><space>" and the next two
# blanks are overwritten with "00", so every round contributes exactly one
# number and 0 marks "no opponent".
replace_match <- "(?<=(H|U|B|X) )\\s{2}"
corrected_draw <- str_replace_all(has_opponent, replace_match, "00")

# A blank followed by one or two digits is an opponent's pair number. The
# extracted strings keep their leading blank and are converted to numbers
# by the code that consumes op_frame.
op_number_match <- " \\d{1,2}"
opponents <- str_extract_all(corrected_draw, op_number_match)

# Every player must have the same number of rounds; otherwise matrix() would
# silently recycle values and shift opponents into the wrong rows.
rounds_per_player <- lengths(opponents)
stopifnot(
  length(opponents) > 0,
  all(rounds_per_player == rounds_per_player[1])
)
n_rounds <- rounds_per_player[1]

# create data frame with opponents
op_frame <- data.frame(
  matrix(unlist(opponents), ncol = n_rounds, byrow = TRUE)
)
# Column names follow the number of rounds actually found: opponent1,
# opponent2, ...
names(op_frame) <- paste0("opponent", seq_len(n_rounds))
# Replace every opponent pair number with that opponent's pre-tournament
# rating, average the ratings per player and attach the result to
# player_info.
#
# Pair numbers index pre_rating directly (the n-th entry of pre_rating is
# looked up for pair number n). 0 is the placeholder for a round without an
# opponent; it becomes NA so that the mean ignores it. The dimensions come
# from op_frame rather than fixed player and round counts.
stopifnot(nrow(op_frame) == length(name))

# Numeric matrix of pair numbers: one row per player, one column per round.
opponent_ids <- matrix(
  as.numeric(as.matrix(op_frame)),
  nrow = nrow(op_frame)
)
# which() drops NA comparisons, so unparsable entries simply stay NA instead
# of aborting the script the way a scalar if (val == 0) would.
opponent_ids[which(opponent_ids == 0)] <- NA

# Vectorised lookup; an NA or out-of-range index yields NA.
opponent_ratings <- matrix(
  pre_rating[as.vector(opponent_ids)],
  nrow = nrow(opponent_ids)
)

# opponent_sub is kept wide (one column per player, one row per round) for
# easy averaging with colMeans().
opponent_sub <- data.frame(t(opponent_ratings))
names(opponent_sub) <- name

opponent_average <- round(colMeans(opponent_sub, na.rm = TRUE), 0)
processed_data <- cbind(player_info, opponent_average)
head(processed_data)
## name state PreRating points opponent_average
## GARY HUA GARY HUA ON 1794 6.0 1605
## DAKSHESH DARURI DAKSHESH DARURI MI 1553 6.0 1469
## ADITYA BAJAJ ADITYA BAJAJ MI 1384 6.0 1564
## PATRICK H SCHILLING PATRICK H SCHILLING MI 1716 5.5 1574
## HANSHI ZUO HANSHI ZUO MI 1655 5.5 1501
## HANSEN SONG HANSEN SONG OH 1686 5.0 1519
# Export the final table. The row names of processed_data are the player
# names (taken from the named opponent_average vector when it was bound on),
# which duplicates the `name` column, so they are left out of the file.
write.csv(processed_data, "chess_data.csv", row.names = FALSE)