Project 1

Overview

In this project, I create an R Markdown file that turns the provided text document of chess tournament results into a CSV file. The raw text is first transformed into a structured data frame, which is then written out as the CSV file.

# Load packages: stringr supplies str_split()/str_trim()/str_extract*(), and
# rebus supplies the %R% operator and the regex helpers used further down
library(stringr)
library(rebus)

# Import data: read the raw tournament table as a character vector, one
# element per line of the text file
df <- readLines("https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/607/tournamentinfo.txt")
head(df)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
# Each player occupies two consecutive lines followed by a separator line.
# Take the first line of every record (starting at line 5, every 3rd line)
# and split it on "|" into a character matrix, one row per player
df_1 <- str_split(
  df[seq(5, length(df) - 1, by = 3)],
  pattern = "\\|",
  simplify = TRUE
)
head(df_1)
##      [,1]     [,2]                                [,3]    [,4]    [,5]   
## [1,] "    1 " " GARY HUA                        " "6.0  " "W  39" "W  21"
## [2,] "    2 " " DAKSHESH DARURI                 " "6.0  " "W  63" "W  58"
## [3,] "    3 " " ADITYA BAJAJ                    " "6.0  " "L   8" "W  61"
## [4,] "    4 " " PATRICK H SCHILLING             " "5.5  " "W  23" "D  28"
## [5,] "    5 " " HANSHI ZUO                      " "5.5  " "W  45" "W  37"
## [6,] "    6 " " HANSEN SONG                     " "5.0  " "W  34" "D  29"
##      [,6]    [,7]    [,8]    [,9]    [,10]   [,11]
## [1,] "W  18" "W  14" "W   7" "D  12" "D   4" ""   
## [2,] "L   4" "W  17" "W  16" "W  20" "W   7" ""   
## [3,] "W  25" "W  21" "W  11" "W  13" "W  12" ""   
## [4,] "W   2" "W  26" "D   5" "W  19" "D   1" ""   
## [5,] "D  12" "D  13" "D   4" "W  14" "W  17" ""   
## [6,] "L  11" "W  35" "D  10" "W  27" "W  21" ""
# Second line of every player record (starting at line 6, every 3rd line),
# split on "|" into a character matrix, one row per player
df_2 <- str_split(
  df[seq(6, length(df), by = 3)],
  pattern = "\\|",
  simplify = TRUE
)
head(df_2)
##      [,1]     [,2]                                [,3]    [,4]    [,5]   
## [1,] "   ON " " 15445895 / R: 1794   ->1817     " "N:2  " "W    " "B    "
## [2,] "   MI " " 14598900 / R: 1553   ->1663     " "N:2  " "B    " "W    "
## [3,] "   MI " " 14959604 / R: 1384   ->1640     " "N:2  " "W    " "B    "
## [4,] "   MI " " 12616049 / R: 1716   ->1744     " "N:2  " "W    " "B    "
## [5,] "   MI " " 14601533 / R: 1655   ->1690     " "N:2  " "B    " "W    "
## [6,] "   OH " " 15055204 / R: 1686   ->1687     " "N:3  " "W    " "B    "
##      [,6]    [,7]    [,8]    [,9]    [,10]   [,11]
## [1,] "W    " "B    " "W    " "B    " "W    " ""   
## [2,] "B    " "W    " "B    " "W    " "B    " ""   
## [3,] "W    " "B    " "W    " "B    " "W    " ""   
## [4,] "W    " "B    " "W    " "B    " "B    " ""   
## [5,] "B    " "W    " "B    " "W    " "B    " ""   
## [6,] "W    " "B    " "B    " "W    " "B    " ""
# Player names: column 2 of the first-line matrix, with padding removed
name <- str_trim(df_1[, 2])
head(name)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
# Total points: column 3 of the first-line matrix. Converted to numeric (as
# is done for pre_rating) so the column can be sorted and summed and is
# written to the CSV as a number rather than a quoted string
total_points <- as.numeric(str_trim(df_1[, 3]))
head(total_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
# Player state code: column 1 of the second-line matrix, padding removed
State <- str_trim(df_2[, 1])
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Pattern for the pre-rating field: "R:", one or more spaces, the rating
# digits, and an optional trailing "P"
rating <- "R:" %R%
  one_or_more(SPACE) %R%
  one_or_more(DIGIT) %R%
  optional("P")
# Pull out the "R: ####" field, keep only its first run of digits, and
# convert the result to numeric
pre_rating <- as.numeric(
  str_extract(str_extract(df_2[, 2], pattern = rating), pattern = "\\d+")
)
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686
# Assemble the output table; Avg_Opp_Pre_Rating starts as an NA placeholder
# and is filled in once the opponents' ratings have been looked up
results <- data.frame(
  Player_Name = name,
  Player_State = State,
  Total_Points = total_points,
  Pre_Rating = pre_rating,
  Avg_Opp_Pre_Rating = NA
)
head(results)
##           Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Pre_Rating
## 1            GARY HUA           ON          6.0       1794                 NA
## 2     DAKSHESH DARURI           MI          6.0       1553                 NA
## 3        ADITYA BAJAJ           MI          6.0       1384                 NA
## 4 PATRICK H SCHILLING           MI          5.5       1716                 NA
## 5          HANSHI ZUO           MI          5.5       1655                 NA
## 6         HANSEN SONG           OH          5.0       1686                 NA
# Store opponents' pair numbers: pull the digits out of each of the seven
# round columns (cells without a number come back as ""), then reshape the
# flattened result to one row per player and one column per round
opponents <- str_extract_all(df_1[, 4:10], "\\d+", simplify = TRUE)
opponents <- matrix(opponents, ncol = 7)
head(opponents)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "39" "21" "18" "14" "7"  "12" "4" 
## [2,] "63" "58" "4"  "17" "16" "20" "7" 
## [3,] "8"  "61" "25" "21" "11" "13" "12"
## [4,] "23" "28" "2"  "26" "5"  "19" "1" 
## [5,] "45" "37" "12" "13" "4"  "14" "17"
## [6,] "34" "29" "11" "35" "10" "27" "21"
# Calculate average opponents' pre-rating.
# Each entry of `opponents` is a pair number used as an index into
# pre_rating; rounds without an opponent convert to NA and are dropped by
# na.rm = TRUE. seq_len() keeps this safe when `results` has no rows
# (1:nrow() would iterate over c(1, 0)), and vapply() fills the whole column
# in one assignment with a guaranteed numeric result.
results$Avg_Opp_Pre_Rating <- vapply(
  seq_len(nrow(results)),
  function(i) {
    round(mean(pre_rating[as.numeric(opponents[i, ])], na.rm = TRUE))
  },
  numeric(1)
)
# View final data frame
head(results)
##           Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Pre_Rating
## 1            GARY HUA           ON          6.0       1794               1605
## 2     DAKSHESH DARURI           MI          6.0       1553               1469
## 3        ADITYA BAJAJ           MI          6.0       1384               1564
## 4 PATRICK H SCHILLING           MI          5.5       1716               1574
## 5          HANSHI ZUO           MI          5.5       1655               1501
## 6         HANSEN SONG           OH          5.0       1686               1519
# Create CSV file; row.names = FALSE keeps R's automatic row numbers from
# being written as an extra unnamed first column
write.csv(results, file = "tournamentinfo.csv", row.names = FALSE)

Conclusion

In conclusion, I have learned how to use an R Markdown file to generate a CSV file. CSV files have a simple, easy-to-read structure, which makes them well suited to analyzing large datasets. One area for further exploration is importing the CSV file into a SQL database to validate that it works as expected.