# Load the packages used by this script.
library(RCurl)
## Loading required package: bitops
library(stringr)

# Location of the raw tournament cross-table (plain text).
x <- "https://raw.githubusercontent.com/miasiracusa/Data607/master/project1/tournamentinfo.txt"

# Read the file into a data frame: no header row is declared, and text is
# kept as character instead of being converted to factors.
first_data <- read.delim(
  file = x,
  header = FALSE,
  stringsAsFactors = FALSE
)

# Preview the first few rows.
head(first_data)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 4 -----------------------------------------------------------------------------------------
## 5 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
I want to work with the data as two sets: the first row of each competitor’s record will populate the first set, and the second row will populate the second set. I don’t need the headers, so I remove them.
# Drop the first three rows (the header lines). Because the data frame has a
# single column, this row subset comes back as a plain character vector.
header_rows <- 1:3
second_data <- first_data[-header_rows, ]
# Preview the remaining lines.
head(second_data)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [3] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
Now, I’ll split the data into 2 sets and combine them back together using cbind.
# Records repeat every three lines: a separator, the player's first row, and
# the player's second row. Select by position within that cycle rather than
# with seq(from, length, by), which raises an error when there are too few
# lines to reach the starting index.
line_pos <- seq_along(second_data) %% 3
# First row of each record (positions 2, 5, 8, ...).
mid1 <- second_data[line_pos == 2]
# Second row of each record (positions 3, 6, 9, ...).
mid2 <- second_data[line_pos == 0]
# Pair the two rows side by side, one record per matrix row.
full_data <- cbind(mid1, mid2)
head(full_data)
## mid1
## [1,] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2,] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3,] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4,] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5,] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6,] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
## mid2
## [1,] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2,] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3,] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4,] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5,] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6,] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
We are asked for each player’s name, state, total number of points, pre-rating, and the average pre-rating of their opponents, so I extract only those fields.
# Name: the text between the first and second "|" of the first row, with the
# surrounding padding excluded. Taking the whole field keeps names of any
# length (e.g. a four-word or hyphenated name), which a fixed
# two-to-three-word pattern would cut short.
competitorname <- str_match(mid1, "\\|\\s*([^|]+?)\\s*\\|")[, 2]
# State: the first run of letters on the second row (a two-letter
# abbreviation).
competitorstate <- str_extract(mid2, "[[:alpha:]]+")
# Pre-rating: the digits that directly follow "R:". Anchoring on the label
# avoids matching any other 3-4 digit run on the line; anything after the
# digits (such as a letter suffix) is not captured.
rating_pre <- as.numeric(str_match(mid2, "R:\\s*(\\d+)")[, 2])
# Total points: the first value of the form "d.d" on the first row.
totalpoints <- as.numeric(str_extract(mid1, "\\d\\.\\d"))
# Opponents: every run of digits immediately followed by "|". The lookahead
# keeps the "|" out of the match, so one pass yields a list with one
# character vector of opponent numbers per player. Feeding a list back into
# str_extract_all() is what triggered the "argument is not an atomic vector;
# coercing" warning, because the list had to be coerced to strings first.
opponent <- str_extract_all(mid1, "\\d+(?=\\|)")
# Average pre-rating of each player's opponents, rounded to a whole number.
# Each opponent number is used as a position into rating_pre. vapply() builds
# the result at its final size and returns an empty vector for empty input,
# whereas a 1:length() loop would still run and append a stray NaN.
# A player with no listed opponents gets NaN (the mean of an empty vector).
avgpre_opponent <- vapply(
  opponent,
  function(opp_ids) round(mean(rating_pre[as.numeric(opp_ids)])),
  numeric(1)
)
I create the final data frame and create the csv file.
# Assemble one row per player, then save the table as CSV without row names.
chess_data <- data.frame(
  competitorname = competitorname,
  competitorstate = competitorstate,
  rating_pre = rating_pre,
  totalpoints = totalpoints,
  avgpre_opponent = avgpre_opponent
)
write.csv(chess_data, file = "chess_data.csv", row.names = FALSE)