In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be:
Gary Hua, ON, 6.0, 1794, 1605

1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

Load packages

library(tidyverse)

Read the data file from github

#provide the github data path
dt_path <- "https://raw.githubusercontent.com/Naik-Khyati/607_p1_chess/main/data/tournamentinfo.txt"

# read the txt data file
chess_raw_data <-  read.csv(dt_path, header = FALSE, sep = "|")

head(chess_raw_data,12)
##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2                                                                                      Pair 
## 3                                                                                      Num  
## 4  -----------------------------------------------------------------------------------------
## 5                                                                                         1 
## 6                                                                                        ON 
## 7  -----------------------------------------------------------------------------------------
## 8                                                                                         2 
## 9                                                                                        MI 
## 10 -----------------------------------------------------------------------------------------
## 11                                                                                        3 
## 12                                                                                       MI 
##                                   V2    V3    V4    V5    V6    V7    V8    V9
## 1                                                                             
## 2   Player Name                      Total Round Round Round Round Round Round
## 3   USCF ID / Rtg (Pre->Post)         Pts    1     2     3     4     5     6  
## 4                                                                             
## 5   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7 D  12
## 6   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W     B    
## 7                                                                             
## 8   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16 W  20
## 9   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B     W    
## 10                                                                            
## 11  ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11 W  13
## 12  14959604 / R: 1384   ->1640      N:2   W     B     W     B     W     B    
##      V10 V11
## 1         NA
## 2  Round  NA
## 3    7    NA
## 4         NA
## 5  D   4  NA
## 6  W      NA
## 7         NA
## 8  W   7  NA
## 9  B      NA
## 10        NA
## 11 W  12  NA
## 12 W      NA

Remove the first 4 rows from the dataframe as those are not required

chess_dt <- chess_raw_data %>% slice(-1:-4)
head(chess_dt)
##                                                                                          V1
## 1                                                                                        1 
## 2                                                                                       ON 
## 3 -----------------------------------------------------------------------------------------
## 4                                                                                        2 
## 5                                                                                       MI 
## 6 -----------------------------------------------------------------------------------------
##                                  V2    V3    V4    V5    V6    V7    V8    V9
## 1  GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7 D  12
## 2  15445895 / R: 1794   ->1817      N:2   W     B     W     B     W     B    
## 3                                                                            
## 4  DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16 W  20
## 5  14598900 / R: 1553   ->1663      N:2   B     W     B     W     B     W    
## 6                                                                            
##     V10 V11
## 1 D   4  NA
## 2 W      NA
## 3        NA
## 4 W   7  NA
## 5 B      NA
## 6        NA

Remove rows with — as they do not contain any data

#delete rows with ----
chess_dt <- chess_dt[!grepl("---", chess_dt[ ,1]), ]
head(chess_dt)
##       V1                                V2    V3    V4    V5    V6    V7    V8
## 1     1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 2    ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
## 4     2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 5    MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B    
## 7     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 8    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B     W    
##      V9   V10 V11
## 1 D  12 D   4  NA
## 2 B     W      NA
## 4 W  20 W   7  NA
## 5 W     B      NA
## 7 W  13 W  12  NA
## 8 B     W      NA

Check if column named V11 contains any information/data

unique(chess_dt$V11)
## [1] NA

Since column V11 does not contan any data, remove it from the dataframe

chess_dt <- chess_dt %>% select(-V11)
head(chess_dt)
##       V1                                V2    V3    V4    V5    V6    V7    V8
## 1     1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 2    ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
## 4     2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 5    MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B    
## 7     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 8    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B     W    
##      V9   V10
## 1 D  12 D   4
## 2 B     W    
## 4 W  20 W   7
## 5 W     B    
## 7 W  13 W  12
## 8 B     W

Perform further data manipulations

Essentially our data is stored in alternate rows. Hence, i will be saving records in even rows as a data frame and records in odd rows into another data frame.

Also, I will only keep relevant columns in the data frame that i will require int he final data frame.

chess_dt_odd <- chess_dt[seq(1, nrow(chess_dt), 2), ]

#  select (V2,V3) %>% rename (name = V2, tot_pts = V3)
chess_dt_even <- chess_dt[seq(2, nrow(chess_dt), 2), ] %>% select (V1,V2) %>% rename (state = V1, pre_rt =V2)

Combine odd and even dataframes

chess_dt_comb <- cbind(chess_dt_odd, chess_dt_even)
head(chess_dt_comb)
##        V1                                V2    V3    V4    V5    V6    V7    V8
## 1      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 4      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 7      3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 10     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26 D   5
## 13     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13 D   4
## 16     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35 D  10
##       V9   V10  state                            pre_rt
## 1  D  12 D   4    ON   15445895 / R: 1794   ->1817     
## 4  W  20 W   7    MI   14598900 / R: 1553   ->1663     
## 7  W  13 W  12    MI   14959604 / R: 1384   ->1640     
## 10 W  19 D   1    MI   12616049 / R: 1716   ->1744     
## 13 W  14 W  17    MI   14601533 / R: 1655   ->1690     
## 16 W  27 W  21    OH   15055204 / R: 1686   ->1687

Data cleaning operations on multiple columns in the dataframe

#split the pre rating column by character R:
chess_dt_comb <- separate(chess_dt_comb, pre_rt, c("uscf_id", "rating" ), "R:")

#cleaning the rating column further to keep only digits 
chess_dt_comb$pre_rating <- str_extract(chess_dt_comb$rating, "\\d+\\d")

# Convert pre_rating column to numeric
chess_dt_comb$pre_rating <- as.numeric(chess_dt_comb$pre_rating)

# Convert V3 column to numeric
chess_dt_comb$V3 <- as.numeric(chess_dt_comb$V3)

# Remove white space from column V1
chess_dt_comb$V1 <- trimws(chess_dt_comb$V1)

# Remove white space from column V2
chess_dt_comb$V2 <-  trimws(chess_dt_comb$V2)

# Remove white space from column state
chess_dt_comb$state <-  trimws(chess_dt_comb$state)

To get average opponent rating, first create a dataframe name opp_data in long form using gather function

opp_data <- chess_dt_comb %>%
  select(V1,V4:V10) %>%
  gather(key='opp_nbr',value="opp_id",V4:V10) %>% arrange(V1)
glimpse(opp_data)
## Rows: 448
## Columns: 3
## $ V1      <chr> "1", "1", "1", "1", "1", "1", "1", "10", "10", "10", "10", "10…
## $ opp_nbr <chr> "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V4", "V5", "V6", "…
## $ opp_id  <chr> "W  39", "W  21", "W  18", "W  14", "W   7", "D  12", "D   4",…

Perform further data manipulations on opponent id column

# keep only numeric values in opp_id column
opp_data$opp_id <- str_extract(opp_data$opp_id,"[0-9]+")

Note that opponent id column have NAs in it

opp_data %>%
  select(V1,opp_id) %>% 
  summarise_all(funs(sum(is.na(.))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
##   V1 opp_id
## 1  0     40

Remove rows with na as opponent id

opp_data <- opp_data %>% drop_na(opp_id)

Check if there are no NAs in the opp_id column

opp_data %>%
  select(V1,opp_id) %>%  
  summarise_all(funs(sum(is.na(.)))) 
##   V1 opp_id
## 1  0      0

Perform further data manipulation on opp_data to fetch average pre rating for the opponents

# Left join opp_data with chess_dt_comb to fetch the average pre rating for opponents
opp_data <- opp_data %>%
  left_join(chess_dt_comb, c("opp_id" = "V1")) %>% select (V1, pre_rating) %>%
  group_by (V1) %>%
  summarize(avg_opp_rating=  round(mean(pre_rating),0))

glimpse(opp_data)
## Rows: 64
## Columns: 2
## $ V1             <chr> "1", "10", "11", "12", "13", "14", "15", "16", "17", "1…
## $ avg_opp_rating <dbl> 1605, 1554, 1468, 1506, 1498, 1515, 1484, 1386, 1499, 1…

Create a chess_dt_final dataframe by left join operation on chess_dt_comb with opp_data. Also rename columns V2 and V3 appropriately.

chess_dt_final <- chess_dt_comb %>%
  left_join(opp_data, c("V1" = "V1")) %>% select (V2, state, V3, pre_rating,avg_opp_rating) %>% rename(player_name = V2, tot_points = V3)
head(chess_dt_final,100)
##                   player_name state tot_points pre_rating avg_opp_rating
## 1                    GARY HUA    ON        6.0       1794           1605
## 2             DAKSHESH DARURI    MI        6.0       1553           1469
## 3                ADITYA BAJAJ    MI        6.0       1384           1564
## 4         PATRICK H SCHILLING    MI        5.5       1716           1574
## 5                  HANSHI ZUO    MI        5.5       1655           1501
## 6                 HANSEN SONG    OH        5.0       1686           1519
## 7           GARY DEE SWATHELL    MI        5.0       1649           1372
## 8            EZEKIEL HOUGHTON    MI        5.0       1641           1468
## 9                 STEFANO LEE    ON        5.0       1411           1523
## 10                  ANVIT RAO    MI        5.0       1365           1554
## 11   CAMERON WILLIAM MC LEMAN    MI        4.5       1712           1468
## 12             KENNETH J TACK    MI        4.5       1663           1506
## 13          TORRANCE HENRY JR    MI        4.5       1666           1498
## 14               BRADLEY SHAW    MI        4.5       1610           1515
## 15     ZACHARY JAMES HOUGHTON    MI        4.5       1220           1484
## 16               MIKE NIKITIN    MI        4.0       1604           1386
## 17         RONALD GRZEGORCZYK    MI        4.0       1629           1499
## 18              DAVID SUNDEEN    MI        4.0       1600           1480
## 19               DIPANKAR ROY    MI        4.0       1564           1426
## 20                JASON ZHENG    MI        4.0       1595           1411
## 21              DINH DANG BUI    ON        4.0       1563           1470
## 22           EUGENE L MCCLURE    MI        4.0       1555           1300
## 23                   ALAN BUI    ON        4.0       1363           1214
## 24          MICHAEL R ALDRICH    MI        4.0       1229           1357
## 25           LOREN SCHWIEBERT    MI        3.5       1745           1363
## 26                    MAX ZHU    ON        3.5       1579           1507
## 27             GAURAV GIDWANI    MI        3.5       1552           1222
## 28 SOFIA ADINA STANESCU-BELLU    MI        3.5       1507           1522
## 29           CHIEDOZIE OKORIE    MI        3.5       1602           1314
## 30         GEORGE AVERY JONES    ON        3.5       1522           1144
## 31               RISHI SHETTY    MI        3.5       1494           1260
## 32      JOSHUA PHILIP MATHEWS    ON        3.5       1441           1379
## 33                    JADE GE    MI        3.5       1449           1277
## 34     MICHAEL JEFFERY THOMAS    MI        3.5       1399           1375
## 35           JOSHUA DAVID LEE    MI        3.5       1438           1150
## 36              SIDDHARTH JHA    MI        3.5       1355           1388
## 37       AMIYATOSH PWNANANDAM    MI        3.5        980           1385
## 38                  BRIAN LIU    MI        3.0       1423           1539
## 39              JOEL R HENDON    MI        3.0       1436           1430
## 40               FOREST ZHANG    MI        3.0       1348           1391
## 41        KYLE WILLIAM MURPHY    MI        3.0       1403           1248
## 42                   JARED GE    MI        3.0       1332           1150
## 43          ROBERT GLEN VASEY    MI        3.0       1283           1107
## 44         JUSTIN D SCHILLING    MI        3.0       1199           1327
## 45                  DEREK YAN    MI        3.0       1242           1152
## 46   JACOB ALEXANDER LAVALLEY    MI        3.0        377           1358
## 47                ERIC WRIGHT    MI        2.5       1362           1392
## 48               DANIEL KHAIN    MI        2.5       1382           1356
## 49           MICHAEL J MARTIN    MI        2.5       1291           1286
## 50                 SHIVAM JHA    MI        2.5       1056           1296
## 51             TEJAS AYYAGARI    MI        2.5       1011           1356
## 52                  ETHAN GUO    MI        2.5        935           1495
## 53              JOSE C YBARRA    MI        2.0       1393           1345
## 54                LARRY HODGE    MI        2.0       1270           1206
## 55                  ALEX KONG    MI        2.0       1186           1406
## 56               MARISA RICCI    MI        2.0       1153           1414
## 57                 MICHAEL LU    MI        2.0       1092           1363
## 58               VIRAJ MOHILE    MI        2.0        917           1391
## 59          SEAN M MC CORMICK    MI        2.0        853           1319
## 60                 JULIA SHEN    MI        1.5        967           1330
## 61              JEZZEL FARKAS    ON        1.5        955           1327
## 62              ASHWIN BALAJI    MI        1.0       1530           1186
## 63       THOMAS JOSEPH HOSMER    MI        1.0       1175           1350
## 64                     BEN LI    MI        1.0       1163           1263

Write the csv into the working directory. This csv can be loaded into database for further analysis.

write.csv(chess_dt_final,"chess_tournmnt_clean_data.csv",row.names = FALSE)