In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(stringr)
library(tibble)
library(knitr)
raw_data_path<-("https://raw.githubusercontent.com/amedina613/Project-1-Adriana/main/chess_tour.txt")
raw_data <- readLines(raw_data_path)
print(raw_data[1:6])
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
raw_data = str_replace_all(raw_data,"-","")
print(raw_data [1:6])
## [1] ""
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre>Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] ""
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W |"
r_data_set = read.delim(textConnection(raw_data),header = F, sep = "|", stringsAsFactors = F)
r_data_set = r_data_set[,-11]
header = paste(trimws(as.character(r_data_set[1,])), trimws(as.character(r_data_set[2,])))
header = make.names(str_sub(header, 1,11))
r_data_set = as_tibble(r_data_set[-1:-2,])
names(r_data_set) = header
player_id = seq(1,nrow(r_data_set),2)
states_ranks_id = seq(2,nrow(r_data_set),2)
chess_players = r_data_set[player_id,]
state_ranks = select(r_data_set[states_ranks_id,], State = Pair.Num, Pre.Rating = Player.Name)
pattern <- ".*R:\\s*([0-9]{3,4}).*"
state_ranks$Pre.Rating <- gsub(pattern, "\\1", state_ranks$Pre.Rating)
chess_players = bind_cols(chess_players, state_ranks)
print(chess_players[1:5,])
## # A tibble: 5 × 12
## Pair.Num Player.Name Total.Pts Round.1 Round.2 Round.3 Round.4 Round.5 Round.6
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 " 1 " " GARY HUA… "6.0 " W 39 W 21 W 18 W 14 W 7 D 12
## 2 " 2 " " DAKSHESH… "6.0 " W 63 W 58 L 4 W 17 W 16 W 20
## 3 " 3 " " ADITYA B… "6.0 " L 8 W 61 W 25 W 21 W 11 W 13
## 4 " 4 " " PATRICK … "5.5 " W 23 D 28 W 2 W 26 D 5 W 19
## 5 " 5 " " HANSHI Z… "5.5 " W 45 W 37 D 12 D 13 D 4 W 14
## # ℹ 3 more variables: Round.7 <chr>, State <chr>, Pre.Rating <chr>
pivot_longer from tidyr allows me too transpose the Round #s and pivot the info from a wide format to a long format, meaning I am increasing the number of rows and decreasing the number of columns.
chess_players <- chess_players %>%
pivot_longer(cols = starts_with("Round"), names_to = "Round", values_to = "Opponent.Num")
chess_players <- chess_players %>%
mutate(Opponent.Num = str_extract(Opponent.Num, "\\d+"))
str(chess_players)
## tibble [448 × 7] (S3: tbl_df/tbl/data.frame)
## $ Pair.Num : chr [1:448] " 1 " " 1 " " 1 " " 1 " ...
## $ Player.Name : chr [1:448] " GARY HUA " " GARY HUA " " GARY HUA " " GARY HUA " ...
## $ Total.Pts : chr [1:448] "6.0 " "6.0 " "6.0 " "6.0 " ...
## $ State : chr [1:448] " ON " " ON " " ON " " ON " ...
## $ Pre.Rating : chr [1:448] "1794" "1794" "1794" "1794" ...
## $ Round : chr [1:448] "Round.1" "Round.2" "Round.3" "Round.4" ...
## $ Opponent.Num: chr [1:448] "39" "21" "18" "14" ...
chess_players = chess_players %>%
mutate(Total.Pts = as.numeric(Total.Pts),
Pair.Num = parse_number(Pair.Num),
Player.Name = trimws(Player.Name),
Opponent.Num = as.numeric(Opponent.Num),
Pre.Rating = as.numeric(Pre.Rating))
chess_players <- chess_players %>%
mutate(Opponent.Rating = Pre.Rating[match(Opponent.Num, Pair.Num)])
print(chess_players[1:10,])
## # A tibble: 10 × 8
## Pair.Num Player.Name Total.Pts State Pre.Rating Round Opponent.Num
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 GARY HUA 6 " ON " 1794 Round.1 39
## 2 1 GARY HUA 6 " ON " 1794 Round.2 21
## 3 1 GARY HUA 6 " ON " 1794 Round.3 18
## 4 1 GARY HUA 6 " ON " 1794 Round.4 14
## 5 1 GARY HUA 6 " ON " 1794 Round.5 7
## 6 1 GARY HUA 6 " ON " 1794 Round.6 12
## 7 1 GARY HUA 6 " ON " 1794 Round.7 4
## 8 2 DAKSHESH DARURI 6 " MI " 1553 Round.1 63
## 9 2 DAKSHESH DARURI 6 " MI " 1553 Round.2 58
## 10 2 DAKSHESH DARURI 6 " MI " 1553 Round.3 4
## # ℹ 1 more variable: Opponent.Rating <dbl>
chess_players <- chess_players %>%
group_by(Player.Name) %>%
mutate(Avg.pcr = mean(Opponent.Rating, na.rm = TRUE))
print(chess_players[1:10,])
## # A tibble: 10 × 9
## # Groups: Player.Name [2]
## Pair.Num Player.Name Total.Pts State Pre.Rating Round Opponent.Num
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
## 1 1 GARY HUA 6 " ON " 1794 Round.1 39
## 2 1 GARY HUA 6 " ON " 1794 Round.2 21
## 3 1 GARY HUA 6 " ON " 1794 Round.3 18
## 4 1 GARY HUA 6 " ON " 1794 Round.4 14
## 5 1 GARY HUA 6 " ON " 1794 Round.5 7
## 6 1 GARY HUA 6 " ON " 1794 Round.6 12
## 7 1 GARY HUA 6 " ON " 1794 Round.7 4
## 8 2 DAKSHESH DARURI 6 " MI " 1553 Round.1 63
## 9 2 DAKSHESH DARURI 6 " MI " 1553 Round.2 58
## 10 2 DAKSHESH DARURI 6 " MI " 1553 Round.3 4
## # ℹ 2 more variables: Opponent.Rating <dbl>, Avg.pcr <dbl>
This line was entered after feedback: my player’s data was repeating itself 7 times per player, representing each round.
round_1_data <- chess_players %>%
filter(Round == "Round.1")
columns_keep <- c("Player.Name", "State", "Total.Pts", "Pre.Rating", "Avg.pcr")
final_chess_data<- round_1_data[columns_keep]
print(final_chess_data[1:10,])
## # A tibble: 10 × 5
## # Groups: Player.Name [10]
## Player.Name State Total.Pts Pre.Rating Avg.pcr
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 GARY HUA " ON " 6 1794 1605.
## 2 DAKSHESH DARURI " MI " 6 1553 1469.
## 3 ADITYA BAJAJ " MI " 6 1384 1564.
## 4 PATRICK H SCHILLING " MI " 5.5 1716 1574.
## 5 HANSHI ZUO " MI " 5.5 1655 1501.
## 6 HANSEN SONG " OH " 5 1686 1519.
## 7 GARY DEE SWATHELL " MI " 5 1649 1372.
## 8 EZEKIEL HOUGHTON " MI " 5 1641 1468.
## 9 STEFANO LEE " ON " 5 1411 1523.
## 10 ANVIT RAO " MI " 5 1365 1554.
write.csv(final_chess_data, file = "~/Documents/GitHub/Project-1-Adriana/final_chess_data.csv", row.names = FALSE)