data607 project 1 msiracusa

#load necessary packages
library(RCurl)

## Loading required package: bitops

library(stringr)

# Location of the raw tournament cross-table on GitHub
x <- "https://raw.githubusercontent.com/miasiracusa/Data607/master/project1/tournamentinfo.txt"

# Read the file into a data frame with no header row; as the preview below
# shows, each line of the text file ends up in the single column V1
first_data <- read.delim(file = x, header = FALSE, stringsAsFactors = FALSE)

# Preview the first few rows
head(first_data)

##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |

I want to work with the data as two sets: the first row of each competitor's record will populate the first set, and the second row will populate the second set. The header rows are not needed, so I remove them.

# Drop the first three lines (the opening dashed rule and the two header
# rows). Row-subsetting the one-column data frame yields a character vector,
# and the dashed rule that closes the header remains as its first element.
second_data <- first_data[-seq_len(3), ]
head(second_data)

## [1] "-----------------------------------------------------------------------------------------"
## [2] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [3] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [6] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

Now, I’ll split the data into 2 sets and combine them back together using cbind.

# After the leading dashed rule, each record occupies three lines:
# summary row, detail row, dashed rule. Select the first two by position.
mid1 <- second_data[seq(from = 2, to = length(second_data), by = 3)]
mid2 <- second_data[seq(from = 3, to = length(second_data), by = 3)]

# Bind the two row sets side by side: one matrix row per player
full_data <- cbind(mid1, mid2)
head(full_data)

##      mid1                                                                                       
## [1,] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2,] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3,] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4,] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5,] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6,] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
##      mid2                                                                                       
## [1,] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2,] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3,] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4,] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5,] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6,] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Since we are asked to report each player's Name, State, Total Number of Points, Pre-Rating, and the Average Pre-Rating of their Opponents, I extract only those fields.

# Player name: the second "|"-delimited field of the first row. Taking the
# whole field (then trimming) keeps names with more than three words or with
# hyphens intact; a "two or three alphabetic words" pattern cuts those short.
competitorname <- str_trim(str_split_fixed(mid1, fixed("|"), 3)[, 2])

# State: the first "|"-delimited field of the second row, a two-letter
# abbreviation once the padding is trimmed
competitorstate <- str_trim(str_split_fixed(mid2, fixed("|"), 2)[, 1])

# Pre-rating: the run of digits that follows "R:". Anchoring on the label
# avoids picking up the USCF ID or the post-rating, and anything non-numeric
# after the rating is ignored because the capture stops at the first non-digit.
rating_pre <- as.numeric(str_match(mid2, "R:\\s*(\\d+)")[, 2])

# Total points, formatted as digits, a dot, and one decimal digit
totalpoints <- as.numeric(str_extract(mid1, "\\d+\\.\\d"))

# Opponent pair numbers: digits that sit directly before a "|". The lookahead
# leaves the "|" out of the match, so one pass over the character vector is
# enough. (Feeding the list returned by str_extract_all() back into
# str_extract_all() is what triggers the "argument is not an atomic vector;
# coercing" warning: the list is not a character vector and gets coerced.)
# Result: a list with one character vector of opponent numbers per player.
opponent <- str_extract_all(mid1, "\\d+(?=\\|)")

# Average pre-rating of each player's opponents, rounded to a whole number.
# Each element of `opponent` holds the pair numbers a player faced; those
# numbers index directly into `rating_pre`. vapply() returns a numeric vector
# with one value per player without growing a vector inside a loop, and it
# yields an empty vector (rather than iterating over 1:0) when there are no
# players. A player with no recorded opponents gets NaN.
avgpre_opponent <- vapply(
  opponent,
  function(opponent_ids) {
    round(mean(rating_pre[as.numeric(unlist(opponent_ids))]))
  },
  numeric(1)
)

I create the final data frame and create the csv file.

# Assemble one row per player from the extracted vectors
chess_data <- data.frame(
  competitorname = competitorname,
  competitorstate = competitorstate,
  rating_pre = rating_pre,
  totalpoints = totalpoints,
  avgpre_opponent = avgpre_opponent
)

# Write the result to the working directory, omitting row numbers
write.csv(x = chess_data, file = "chess_data.csv", row.names = FALSE)