R Markdown

#load required libraries
library(RCurl)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

chesstxt <- getURL("https://raw.githubusercontent.com/Shayaeng/Data607/main/Project1/chess.txt")
chess_raw <- readLines(textConnection(chesstxt))

chess_filtered <- c()

#First, let's get rid of those '---' lines
for (line in chess_raw) {
  if (!grepl("^-+$", line)) {
    chess_filtered <- c(chess_filtered, line)
  }
}

#Now we'll remove the first two lines
chess_filtered <- chess_filtered[-(1:2)]

#Next, let's combine each of the two lines per record
chess_combined <- c()
for (i in seq(1, length(chess_filtered), by = 2)) {
  bottom_part <- chess_filtered[i]
  top_part <- chess_filtered[i + 1]
  combined_line <- paste(bottom_part, top_part)
  chess_combined <- c(chess_combined, combined_line)
}
  
#for (line in chess_combined) {
# print(line)
#}

#split the lines by their records
chess_split <- c()
for (line in chess_combined) {
  split_line <- strsplit(line, split = '[/]|->|\\|')
  trimmed_line <- lapply(split_line, str_trim)
  chess_split <- c(chess_split, trimmed_line)
}

#convert to data frame
chess_df <- data.frame(do.call(rbind, chess_split))
colnames(chess_df) <- c('PairId','PlayerName', 'Points', 'Round1', 'Round2', 'Round3', 'Round4', 'Round5', 'Round6', 'Round7', 'State', 'ChessId', 'PreRating', 'PostRating', '?', 'Result1', 'Result2', 'Result3', 'Result4', 'Result5', 'Result6', 'Result7')
chess_df$PreRating <- sub("^[^[:space:]]*\\s*", "", chess_df$PreRating)

#subset relevant columns
subset_df <- chess_df[, c('PairId','PlayerName', 'State', 'Points', 'Round1', 'Round2', 'Round3', 'Round4', 'Round5', 'Round6', 'Round7', 'PreRating')]

#trim the extra characters
subset_df$PreRating <- sub("P.*", "", subset_df$PreRating)
subset_df[c('Round1', 'Round2','Round3', 'Round4', 'Round5', 'Round6', 'Round7')] <- lapply(subset_df[c('Round1', 'Round2','Round3', 'Round4', 'Round5', 'Round6', 'Round7')], function(x) sub("^[^[:space:]]*\\s*", "", x))

#convert the columns
subset_df[c('PairId', 'Round1', 'Round2', 'Round3', 'Round4', 'Round5', 'Round6', 'Round7')] <- lapply(subset_df[c('PairId', 'Round1', 'Round2', 'Round3', 'Round4', 'Round5', 'Round6', 'Round7')], as.integer)
subset_df[c('Points', 'PreRating')] <- lapply(subset_df[c('Points', 'PreRating')], as.numeric)

#add opponents rating
subset_df <- subset_df %>%
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round1' = 'PairId')) %>%
  rename(OppRound1 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round2' = 'PairId')) %>%
  rename(OppRound2 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round3' = 'PairId')) %>%
  rename(OppRound3 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round4' = 'PairId')) %>%
  rename(OppRound4 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round5' = 'PairId')) %>%
  rename(OppRound5 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round6' = 'PairId')) %>%
  rename(OppRound6 = PreRating.y, PreRating = PreRating.x) %>%
  
  left_join(subset_df[, c('PairId', 'PreRating')], by = c('Round7' = 'PairId')) %>%
  rename(OppRound7 = PreRating.y, PreRating = PreRating.x)

#add average rating
subset_df <- subset_df %>%
  rowwise() %>%
  mutate(OppAverage = round(mean(c(`OppRound1`, `OppRound2`, `OppRound3`, `OppRound4`, `OppRound5`, `OppRound6`, `OppRound7`), na.rm = TRUE)))

#select and reorder columns
chess_final <- subset_df[, c('PlayerName', 'State', 'Points', 'PreRating', 'OppAverage')]

#create the .csv file
write.csv(chess_final, file = "chess_final.csv", row.names = FALSE)