In this project, I use R Markdown to generate a CSV file from a provided text document containing chess tournament results. I first transform the raw text into a structured data frame and then write that data frame to a CSV file.
# Packages ----
# The code below calls stringr functions (str_split, str_trim, str_extract,
# str_extract_all) and rebus helpers (%R%, one_or_more, optional, SPACE,
# DIGIT). Attach both explicitly so the script runs on its own; this is
# harmless if they are already attached.
library(stringr)
library(rebus)

# Import data ----
# Read the raw tournament report as a character vector, one element per line.
df <- readLines("https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/607/tournamentinfo.txt")
head(df)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Separate the report into two character matrices ----
# First matrix: starting at line 5, take every third line up to the
# second-to-last line of the file, then split each selected line on "|" so
# that every field becomes its own column.
df_1 <- str_split(
  string = df[seq(from = 5, to = length(df) - 1, by = 3)],
  pattern = "\\|",
  simplify = TRUE
)
head(df_1)
## [,1] [,2] [,3] [,4] [,5]
## [1,] " 1 " " GARY HUA " "6.0 " "W 39" "W 21"
## [2,] " 2 " " DAKSHESH DARURI " "6.0 " "W 63" "W 58"
## [3,] " 3 " " ADITYA BAJAJ " "6.0 " "L 8" "W 61"
## [4,] " 4 " " PATRICK H SCHILLING " "5.5 " "W 23" "D 28"
## [5,] " 5 " " HANSHI ZUO " "5.5 " "W 45" "W 37"
## [6,] " 6 " " HANSEN SONG " "5.0 " "W 34" "D 29"
## [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] "W 18" "W 14" "W 7" "D 12" "D 4" ""
## [2,] "L 4" "W 17" "W 16" "W 20" "W 7" ""
## [3,] "W 25" "W 21" "W 11" "W 13" "W 12" ""
## [4,] "W 2" "W 26" "D 5" "W 19" "D 1" ""
## [5,] "D 12" "D 13" "D 4" "W 14" "W 17" ""
## [6,] "L 11" "W 35" "D 10" "W 27" "W 21" ""
# Second matrix: starting at line 6, take every third line through the end of
# the file, then split each selected line on "|" into one column per field.
df_2 <- str_split(
  string = df[seq(from = 6, to = length(df), by = 3)],
  pattern = "\\|",
  simplify = TRUE
)
head(df_2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] " ON " " 15445895 / R: 1794 ->1817 " "N:2 " "W " "B "
## [2,] " MI " " 14598900 / R: 1553 ->1663 " "N:2 " "B " "W "
## [3,] " MI " " 14959604 / R: 1384 ->1640 " "N:2 " "W " "B "
## [4,] " MI " " 12616049 / R: 1716 ->1744 " "N:2 " "W " "B "
## [5,] " MI " " 14601533 / R: 1655 ->1690 " "N:2 " "B " "W "
## [6,] " OH " " 15055204 / R: 1686 ->1687 " "N:3 " "W " "B "
## [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] "W " "B " "W " "B " "W " ""
## [2,] "B " "W " "B " "W " "B " ""
## [3,] "W " "B " "W " "B " "W " ""
## [4,] "W " "B " "W " "B " "B " ""
## [5,] "B " "W " "B " "W " "B " ""
## [6,] "W " "B " "B " "W " "B " ""
# Get name ----
# The second column of df_1 holds the player name padded with spaces;
# strip the padding on both sides.
name <- str_trim(df_1[, 2], side = "both")
head(name)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
# Total points: third column of df_1, padding removed.
# Converted to numeric (as is done for pre_rating) so the column supports
# arithmetic and is written to the CSV as a number rather than a quoted string.
total_points <- as.numeric(str_trim(df_1[, 3]))
head(total_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
# Player state/province code: first column of df_2, padding removed.
State <- str_trim(df_2[, 1], side = "both")
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Pre-tournament rating ----
# Pattern for the "R: <digits>" part of the second column of df_2, allowing
# an optional trailing "P" after the digits.
rating <- "R:" %R% one_or_more(SPACE) %R% one_or_more(DIGIT) %R% optional("P")
# Step 1: isolate the "R: ..." fragment; step 2: keep only its first run of
# digits; step 3: convert that text to a number.
rating_field <- str_extract(df_2[, 2], pattern = rating)
rating_digits <- str_extract(rating_field, pattern = "\\d+")
pre_rating <- as.numeric(rating_digits)
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686
# Assemble the per-player table. Avg_Opp_Pre_Rating starts out as an NA
# placeholder column; it is filled in after the opponents are extracted.
results <- data.frame(
  Player_Name = name,
  Player_State = State,
  Total_Points = total_points,
  Pre_Rating = pre_rating,
  Avg_Opp_Pre_Rating = NA
)
head(results)
## Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Pre_Rating
## 1 GARY HUA ON 6.0 1794 NA
## 2 DAKSHESH DARURI MI 6.0 1553 NA
## 3 ADITYA BAJAJ MI 6.0 1384 NA
## 4 PATRICK H SCHILLING MI 5.5 1716 NA
## 5 HANSHI ZUO MI 5.5 1655 NA
## 6 HANSEN SONG OH 5.0 1686 NA
# Store opponents' pair numbers ----
# Columns 4:10 of df_1 hold the seven round results (e.g. "W 39").
# Start from that slice so the 7-column layout is kept as-is, then overwrite
# every cell with its first run of digits. str_extract() returns exactly one
# value per cell (NA when the cell contains no digits), so the result never
# depends on how many matches a cell happens to contain -- unlike
# str_extract_all(simplify = TRUE), whose column count varies with the data.
opponents <- df_1[, 4:10, drop = FALSE]
opponents[] <- str_extract(as.vector(opponents), "\\d+")
head(opponents)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "39" "21" "18" "14" "7" "12" "4"
## [2,] "63" "58" "4" "17" "16" "20" "7"
## [3,] "8" "61" "25" "21" "11" "13" "12"
## [4,] "23" "28" "2" "26" "5" "19" "1"
## [5,] "45" "37" "12" "13" "4" "14" "17"
## [6,] "34" "29" "11" "35" "10" "27" "21"
# Calculate average opponents' pre-rating ----
# opponents stores pair numbers as text. Make an integer copy with the same
# shape to use as positions into pre_rating; cells without a number become NA.
opp_index <- opponents
storage.mode(opp_index) <- "integer"

# Look up every opponent's pre-rating at once and restore the
# players-by-rounds layout (an NA index yields an NA rating).
opp_rating <- matrix(
  pre_rating[as.vector(opp_index)],
  nrow = nrow(opp_index),
  ncol = ncol(opp_index)
)

# Row-wise mean over the non-NA ratings, then round() to a whole number.
# Being vectorised, this avoids the 1:nrow(results) loop, which would iterate
# over c(1, 0) if results had zero rows.
results$Avg_Opp_Pre_Rating <- round(rowMeans(opp_rating, na.rm = TRUE))

# View final data frame
head(results)
## Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Pre_Rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
# Create CSV file ----
# row.names = FALSE: by default write.csv() prepends an unnamed column holding
# the data frame's row names (1, 2, 3, ...), which is not tournament data and
# would show up as a spurious extra column when the file is imported elsewhere.
write.csv(results, file = "tournamentinfo.csv", row.names = FALSE)
In conclusion, I learned how to use R Markdown to generate a CSV file. Because CSV files have a simple, easy-to-understand structure, they are well suited to analyzing large datasets. A next step worth exploring is importing the CSV file into a SQL database to verify that it loads correctly.