In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
#load the data
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(readr, quietly = TRUE)
library(stringr, quietly = TRUE)
# Location of the raw tournament results (plain text, fixed-width layout).
chess_raw <- "https://raw.githubusercontent.com/nk014914/Data-607/main/project_1_tournamentinfo.txt"
# Read the file into a character vector, one element per line of text.
df <- read_lines(file = chess_raw)
# Preview the first six lines to confirm the layout.
head(df, n = 6L)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Locate the "|" delimiters on the first player row (line 5 of the file).
# Their character positions are reused below to slice every row by position.
pipe_starts <- str_locate_all(string = df[5], pattern = "\\|")[[1]][1:3, "start"]
x0 <- 0                  # start of the first field
x1 <- pipe_starts[[1]]   # position of the 1st "|"
x2 <- pipe_starts[[2]]   # position of the 2nd "|"
x3 <- pipe_starts[[3]]   # position of the 3rd "|"
x4 <- max(nchar(df))     # width of the longest line in the file
# Split the file into the two kinds of player rows.
#
# The lines previewed above show a 4-line header followed by a name/score row
# (line 5) and a state/rating row (line 6); the stride of 3 means each player
# takes three lines in total. The player count is derived from the length of
# `df` instead of a hard-coded last line number, so the script keeps working
# when the tournament has more or fewer players. A player is only counted
# when both of its rows are present, which keeps `group1` and `group2` the
# same length.
header_lines <- 4
lines_per_player <- 3
n_players <- max(
  (length(df) - (header_lines + 2)) %/% lines_per_player + 1,
  0
)
# Line numbers of the name/score rows (5, 8, 11, ...).
seq1 <- seq(header_lines + 1, by = lines_per_player, length.out = n_players)
# Line numbers of the state/rating rows (6, 9, 12, ...).
seq2 <- seq1 + 1
group1 <- df[seq1]
group2 <- df[seq2]
# Slice each field out of the fixed-width rows using the delimiter positions.

# Player name: between the 1st and 2nd "|" of the name/score row.
name <- substr(group1, start = x1 + 1, stop = x2 - 2)
PlayerName <- str_trim(name)

# Player state: everything before the 1st "|" of the state/rating row.
state <- substr(group2, start = x0, stop = x1 - 1)
State <- str_trim(state)

# Total points: between the 2nd and 3rd "|" of the name/score row.
totalpts <- substr(group1, start = x2 + 1, stop = x3 - 1)

# Pre-rating: the digits that follow the colon in the rating field
# (e.g. ": 1794"); the colon is stripped when the column is built below.
pre <- str_extract(
  substr(group2, start = x1 + 1, stop = x2 - 1),
  ": *\\d{2,}"
)

# Assemble the result table. Points are kept as text with one decimal place;
# the pre-rating becomes an integer.
chess_scores <- data.frame(
  PlayerName,
  State,
  TotalPts = sprintf("%.1f", as.numeric(totalpts)),
  PreRating = as.integer(str_extract(pre, "\\d{2,}"))
)
# Opponent pair numbers: every run of digits in the round columns (everything
# after the 3rd "|" of the name/score row). The result is a one-column list
# matrix with one character vector of opponent numbers per player.
opp <- as.matrix(
  str_extract_all(
    substr(group1, start = x3 + 1, stop = x4),
    "\\b\\d{1,}"
  )
)
#' Average pre-rating of a player's opponents
#'
#' Looks up each opponent's pre-rating in the global `chess_scores$PreRating`
#' (indexed by pair number) and returns the mean, rounded to the nearest
#' whole number.
#'
#' @param y A list (or vector) of opponent pair numbers, e.g. one row of the
#'   `opp` list matrix as passed by `apply()`. Numbers may be character or
#'   numeric.
#' @param z Optional index selecting which elements of `y` to use. When
#'   omitted, all of `y` is used.
#' @return A single numeric value: the rounded mean pre-rating of all
#'   selected opponents, or `NA_real_` when there are no opponents.
avgoppprerate <- function(y, z) {
  # `apply()` calls this with a single argument, so `z` is normally missing.
  selected <- if (missing(z)) y else y[z]
  opp_ids <- as.numeric(unlist(selected, use.names = FALSE))

  # No opponents (or nothing selected): report a missing value instead of
  # failing or dividing by zero.
  if (length(opp_ids) == 0L) {
    return(NA_real_)
  }

  round(mean(chess_scores$PreRating[opp_ids]))
}
# Compute the average opponent pre-rating for each player: one call per row
# of `opp`, with the result stored as a new column.
chess_scores[["AvgOppPreRating"]] <- apply(X = opp, MARGIN = 1, FUN = avgoppprerate)
# Preview the finished table.
head(chess_scores, n = 6L)
## PlayerName State TotalPts PreRating AvgOppPreRating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
# Write the result table to a CSV file.
# Row names are omitted so the file contains exactly the five data columns
# (no unnamed index column), which keeps it directly importable into a
# SQL table.
write.csv(chess_scores, "chesstournamentinfo.csv", row.names = FALSE)