In this project, I was given a text file with chess tournament results where the information had some structure. My goal was to create an R Markdown file that generates a CSV file with the following information for all players:

My process for this project was to read in tournament data, tidy it, aggregate the opponent pre-scores, and return a CSV file.

Here is a GitHub repository with links to the original text file and the CSV file I created.


1. Read in the file.

df <- read.table("C:/Users/Kavya/Desktop/Education/MS Data Science/DATA 607 (Data Acquisition and Management)/Projects/Project 01/tournamentinfo.txt",sep = "|", fill = TRUE)

2. Examine the data.

names(df) <- c("Pair_Num", "Player_Name", "Total_Pts", "Round_1", "Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7", "NA")

head(df)
##                                                                                    Pair_Num
## 1 -----------------------------------------------------------------------------------------
## 2                                                                                     Pair 
## 3                                                                                     Num  
## 4 -----------------------------------------------------------------------------------------
## 5                                                                                        1 
## 6                                                                                       ON 
##                         Player_Name Total_Pts Round_1 Round_2 Round_3
## 1                                                                    
## 2  Player Name                          Total   Round   Round   Round
## 3  USCF ID / Rtg (Pre->Post)             Pts      1       2       3  
## 4                                                                    
## 5  GARY HUA                             6.0     W  39   W  21   W  18
## 6  15445895 / R: 1794   ->1817          N:2     W       B       W    
##   Round_4 Round_5 Round_6 Round_7 NA
## 1                                 NA
## 2   Round   Round   Round   Round NA
## 3     4       5       6       7   NA
## 4                                 NA
## 5   W  14   W   7   D  12   D   4 NA
## 6   B       W       B       W     NA

3. Reshape the data.

df <- distinct(df)

df <- df[ -c(1:3), ]
df2 <- df %>% gather(Round, Outcome, Round_1, Round_2, Round_3, Round_4, Round_5, Round_6, Round_7)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(df2)
##   Pair_Num                       Player_Name Total_Pts NA   Round Outcome
## 1       1   GARY HUA                             6.0   NA Round_1   W  39
## 2      ON   15445895 / R: 1794   ->1817          N:2   NA Round_1   W    
## 3       2   DAKSHESH DARURI                      6.0   NA Round_1   W  63
## 4      MI   14598900 / R: 1553   ->1663          N:2   NA Round_1   B    
## 5       3   ADITYA BAJAJ                         6.0   NA Round_1   L   8
## 6      MI   14959604 / R: 1384   ->1640          N:2   NA Round_1   W

4. Separate the data – Part 1.

state <- grep("[[:alpha:]]+", df2$Pair_Num, value=T)

pair_number <- grep("[[:digit:]]+", df2$Pair_Num, value=T)

id <- grep("[[:digit:]]+", df2$Player_Name, value=T)

Player_Name <- grep("[[:alpha:]]+[[:space:]][[:alpha:]]+", df2$Player_Name, value=T)

df3 <- data.frame(state, pair_number, id, Player_Name)

head(df3)
##    state pair_number                                id
## 1    ON           1   15445895 / R: 1794   ->1817     
## 2    MI           2   14598900 / R: 1553   ->1663     
## 3    MI           3   14959604 / R: 1384   ->1640     
## 4    MI           4   12616049 / R: 1716   ->1744     
## 5    MI           5   14601533 / R: 1655   ->1690     
## 6    OH           6   15055204 / R: 1686   ->1687     
##                         Player_Name
## 1  GARY HUA                        
## 2  DAKSHESH DARURI                 
## 3  ADITYA BAJAJ                    
## 4  PATRICK H SCHILLING             
## 5  HANSHI ZUO                      
## 6  HANSEN SONG

5. Join the separated data to the main dataframe.

joined <- distinct(left_join(df3, df2, by="Player_Name"))
## Warning: Column `Player_Name` joining factors with different levels,
## coercing to character vector
joined <- joined[ , -2 ]

head(joined)
##    state                                id
## 1    ON   15445895 / R: 1794   ->1817     
## 2    ON   15445895 / R: 1794   ->1817     
## 3    ON   15445895 / R: 1794   ->1817     
## 4    ON   15445895 / R: 1794   ->1817     
## 5    ON   15445895 / R: 1794   ->1817     
## 6    ON   15445895 / R: 1794   ->1817     
##                         Player_Name Pair_Num Total_Pts NA   Round Outcome
## 1  GARY HUA                               1      6.0   NA Round_1   W  39
## 2  GARY HUA                               1      6.0   NA Round_2   W  21
## 3  GARY HUA                               1      6.0   NA Round_3   W  18
## 4  GARY HUA                               1      6.0   NA Round_4   W  14
## 5  GARY HUA                               1      6.0   NA Round_5   W   7
## 6  GARY HUA                               1      6.0   NA Round_6   D  12

6. Separate the data – Part 2.

In this step, I applied a series of further separations to the main dataframe using regular expressions to pull out the player’s Outcome, Opponent, ID, Pre-Score, and Post-Score.

joined1 <- joined %>%
  separate(Outcome, c("Outcome", "Opponent"), sep = "[[:space:]]+") %>%
  separate(id, c("id", "score"), sep = "[\\/]")
joined2 <- joined1 %>%
  separate(score, c("Pre-Score", "Post-Score"), sep = "[\\-]")
joined3 <- joined2 %>%
  separate(`Pre-Score`, c("Pre-Score2", "Pre-Score"), sep = "[\\:]")
joined4 <- joined3 %>%
  separate(`Pre-Score`, c("Pre-Score", "Pre-Score2"), sep = "[[:alpha:]]+")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 378 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
joined5 <- joined4 %>%
  separate(`Post-Score`, c("Post-Score2", "Post-Score"), sep = "[\\>]")
joined6 <- joined5 %>%
  separate(`Post-Score`, c("Post-Score", "Post-Score2"), sep = "[[:alpha:]]+")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 392 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
final_join <- joined6[ , c(-5,-6) ]

head(final_join)
##    state         id Pre-Score Post-Score                       Player_Name
## 1    ON   15445895    1794     1817       GARY HUA                        
## 2    ON   15445895    1794     1817       GARY HUA                        
## 3    ON   15445895    1794     1817       GARY HUA                        
## 4    ON   15445895    1794     1817       GARY HUA                        
## 5    ON   15445895    1794     1817       GARY HUA                        
## 6    ON   15445895    1794     1817       GARY HUA                        
##   Pair_Num Total_Pts NA   Round Outcome Opponent
## 1       1      6.0   NA Round_1       W       39
## 2       1      6.0   NA Round_2       W       21
## 3       1      6.0   NA Round_3       W       18
## 4       1      6.0   NA Round_4       W       14
## 5       1      6.0   NA Round_5       W        7
## 6       1      6.0   NA Round_6       D       12

7. Create a reference dataframe.

opponent_info <- distinct((data.frame(final_join$`Pre-Score`, final_join$Pair_Num)))

names(opponent_info) <- c("Opponent_Score", "Opponent")

head(opponent_info)
##   Opponent_Score Opponent
## 1        1794          1 
## 2        1553          2 
## 3        1384          3 
## 4        1716          4 
## 5        1655          5 
## 6        1686          6

8. Make the variables numeric.

final_join$Pair_Num <- as.numeric(final_join$Pair_Num)

final_join <- mutate(final_join, Pair_Num = Pair_Num - 1)
## Warning: package 'bindrcpp' was built under R version 3.4.3
final_join$Opponent <- as.numeric(final_join$Opponent)

opponent_info$Opponent <- as.numeric(opponent_info$Opponent)

opponent_info <- mutate(opponent_info, Opponent = Opponent - 1)

9. Add the opponent’s pre-score to each round.

merged <- full_join(final_join, opponent_info[, c("Opponent", "Opponent_Score")], by="Opponent")

names(merged) <- c("Player_Name", "Player_ID", "Pre_Score", "Post_Score", "Player_State", "Pair_Num", "Total_Pts", "NA", "Round", "Outcome", "Opponent", "Opponent_Score")

head(merged)
##   Player_Name  Player_ID Pre_Score Post_Score
## 1         ON   15445895    1794     1817     
## 2         ON   15445895    1794     1817     
## 3         ON   15445895    1794     1817     
## 4         ON   15445895    1794     1817     
## 5         ON   15445895    1794     1817     
## 6         ON   15445895    1794     1817     
##                        Player_State Pair_Num Total_Pts NA   Round Outcome
## 1  GARY HUA                                1     6.0   NA Round_1       W
## 2  GARY HUA                                1     6.0   NA Round_2       W
## 3  GARY HUA                                1     6.0   NA Round_3       W
## 4  GARY HUA                                1     6.0   NA Round_4       W
## 5  GARY HUA                                1     6.0   NA Round_5       W
## 6  GARY HUA                                1     6.0   NA Round_6       D
##   Opponent Opponent_Score
## 1       39           1436
## 2       21           1563
## 3       18        1600   
## 4       14        1610   
## 5        7        1649   
## 6       12        1663

10. Aggregate the opponent’s pre-scores.

merged$Opponent_Score <- as.character(merged$Opponent_Score)

merged$Opponent_Score <- as.numeric(merged$Opponent_Score)
aggregated <- aggregate(merged[ , 12], list(merged$Pair_Num), mean)

head(aggregated)
##   Group.1        x
## 1       1 1605.286
## 2       2 1469.286
## 3       3 1563.571
## 4       4 1573.571
## 5       5 1500.857
## 6       6 1518.714

11. Create a final dataframe.

final <- distinct(data.frame(merged$Pair_Num, merged$Player_Name, merged$Player_State, merged$Total_Pts, merged$Pre_Score))

final <- merge(final, aggregated, by.x = "merged.Pair_Num", by.y = 1, all.x = TRUE)

names(final) <- c("Pair_Num", "Player_Name", "Player_State", "Total_Pts", "Pre_Score", "Avg_Opponent_Score")

head(final)
##   Pair_Num Player_Name                      Player_State Total_Pts
## 1        1         ON   GARY HUA                             6.0  
## 2        2         MI   DAKSHESH DARURI                      6.0  
## 3        3         MI   ADITYA BAJAJ                         6.0  
## 4        4         MI   PATRICK H SCHILLING                  5.5  
## 5        5         MI   HANSHI ZUO                           5.5  
## 6        6         OH   HANSEN SONG                          5.0  
##   Pre_Score Avg_Opponent_Score
## 1   1794              1605.286
## 2   1553              1469.286
## 3   1384              1563.571
## 4   1716              1573.571
## 5   1655              1500.857
## 6   1686              1518.714

12. Write the dataframe into a CSV file.

write.csv(final, "tournamentinfo.csv")