Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
First, we will load the packages needed and the url file from github.
# load packages
library(stringr)
library(dplyr)
library(tidyr)
# specify the url file
urlfile = "https://raw.githubusercontent.com/kristinlussi/607Project1/main/tournamentinfo.txt"
# read the .txt file
chess_data <- read.delim(urlfile, header = FALSE)
Next, we will manipulate the data into a data frame with multiple columns.
# flatten the data into a vector and then convert into a matrix
chess_data <- matrix(unlist(chess_data), byrow = TRUE)
# remove the "---" rows and create two tables
# first table has Pair Number, Player Name, Total Points, and Round Scores
chess_data1 <- chess_data[seq(5, length(chess_data), 3)]
# second table has state, USCF #, rating (pre -> post), etc
chess_data2 <- chess_data[seq(6, length(chess_data), 3)]
# change the tables into data frames
chess_data1 <- data.frame(chess_data1)
chess_data2 <- data.frame(chess_data2)
# separate the column into columns: Pair Number (Primary key), Name,
# Total Points, and each round
chess_data_df <- separate(
chess_data1,
col = chess_data1,
into = c("Player_Number", "Name", "Total_Points", "Round1",
"Round2", "Round3", "Round4", "Round5", "Round6", "Round7"),
sep = "\\|",
remove = TRUE
)
# Change the primary key from character to integer
chess_data_df$Player_Number <- as.integer(chess_data_df$Player_Number)
# separate the column into columns: State, USCF_ID
# (where the pre_rating is located), and Rating
chess_data_df2 <- separate(
chess_data2,
col = chess_data2,
into = c("State", "USCF_ID", "RATING"),
sep = "\\|",
remove = TRUE
)
# create a primary key for the second data frame to match up with the first data frame
chess_data_df2 <- chess_data_df2 %>%
mutate(Player_Number = row_number()) %>%
select(Player_Number, everything())
# merge the two data frames by the primary key (Player_Number)
chess_data <- merge(chess_data_df, chess_data_df2, by = "Player_Number")
# separate the USCF_ID column into pre_rating and post_rating columns
chess_data <- chess_data %>%
separate(USCF_ID, into = c("RATING", "Post_Rating"), sep = " ->", extra = "merge") %>%
mutate(Post_Rating = gsub(".*R: (\\d+).*", "\\1", Post_Rating)) %>%
separate(RATING, into = c("RATING", "Pre_Rating"), sep = ": ", extra = "merge")
# change the Total_Points column to numeric
chess_data$Total_Points <- as.numeric(chess_data$Total_Points)
Here, to assist with calculating the opponent average rating for each player, we will create a second data frame that contains the Player Number and their pre rating to reference back to. We will then make a function which will create a new column for each round, matching each opponent with their corresponding pre rating. These new columns are then used to calculate the mean opponent pre rating for each row (player).
# select only the columns needed for opponent average pre rating calculations
chess_data <- chess_data %>%
select(Player_Number, Name, State, Total_Points, Pre_Rating,
Round1, Round2, Round3, Round4, Round5, Round6, Round7)
# extract only the player number from each round
chess_data$Round1 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round1))
chess_data$Round2 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round2))
chess_data$Round3 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round3))
chess_data$Round4 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round4))
chess_data$Round5 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round5))
chess_data$Round6 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round6))
chess_data$Round7 <- as.numeric(gsub("[^0-9]+", "", chess_data$Round7))
# extract only the first four characters from the Pre_Rating
# column to get the Pre_Rating for each player
chess_data$Pre_Rating <- substr(chess_data$Pre_Rating, 1, 4) %>%
as.numeric(chess_data$Pre_Rating) # change the Pre_Rating column to numeric
# create a different data frame for player_numbers and pre_ratings
player_id_rating <- data.frame(chess_data$Player_Number, chess_data$Pre_Rating)
# rename the colum names in the new data frame
colnames(player_id_rating) <- c("Player_Number", "Opponent_Rating")
# create a function that matches the pre rating to each opponent based on the player number
merge_round_ratings <- function(chess_data, round_column) {
chess_data %>%
left_join(player_id_rating, by = setNames("Player_Number", round_column)) %>%
rename_with(~paste0(round_column, "_Rating"), "Opponent_Rating")
}
# specify each round
rounds <- c("Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
# apply the function for each round
for (round_column in rounds) {
chess_data <- merge_round_ratings(chess_data, round_column)
}
# create a column for average opponent rating and
# calculate the average opponent rating for each row
chess_data <- chess_data %>%
rowwise() %>%
mutate(
Opponent_Avg_Rating = mean(c(Round1_Rating, Round2_Rating,
Round3_Rating, Round4_Rating, Round5_Rating,
Round6_Rating, Round7_Rating), na.rm = TRUE)
)
Here, we will select only the Name, State, Total Points, Pre Rating, and Opponent Average Pre Rating columns for our final data frame to prepare for export.
# select only the columns required for the .csv file
chess_data <- chess_data %>%
select(Name, State, Total_Points, Pre_Rating, Opponent_Avg_Rating) %>%
as.data.frame()
# show a glimpse of the data frame
head(chess_data)
## Name State Total_Points Pre_Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## Opponent_Avg_Rating
## 1 1605.286
## 2 1469.286
## 3 1563.571
## 4 1573.571
## 5 1500.857
## 6 1518.714
Finally, we can write the .csv file with the following code:
write.csv(chess_data, file =
"/Users/kristinlussi/Documents/DATA607/Project1/chess_data_table.csv",
row.names = FALSE, sep = ",")
The above .csv file will be saved in the specified file location with the name “chess_data_table.csv”.