To read the txt file:

df<- read.delim(url, sep="|", header=FALSE,skip=4)
head(df)

##                                                                                          V1
## 1                                                                                        1 
## 2                                                                                       ON 
## 3 -----------------------------------------------------------------------------------------
## 4                                                                                        2 
## 5                                                                                       MI 
## 6 -----------------------------------------------------------------------------------------
##                                  V2    V3    V4    V5    V6    V7    V8    V9
## 1  GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7 D  12
## 2  15445895 / R: 1794   ->1817      N:2   W     B     W     B     W     B    
## 3                                                                            
## 4  DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16 W  20
## 5  14598900 / R: 1553   ->1663      N:2   B     W     B     W     B     W    
## 6                                                                            
##     V10 V11
## 1 D   4  NA
## 2 W      NA
## 3        NA
## 4 W   7  NA
## 5 B      NA
## 6        NA

To transform the data into the correct dataframe:

rows_to_remove<- seq(from= 0, to = nrow(df), by=3) #removing extra separator lines
clean_df<- df[-rows_to_remove, ] #anything before the "," indicates the row
head(clean_df)

##       V1                                V2    V3    V4    V5    V6    V7    V8
## 1     1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 2    ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
## 4     2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 5    MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B    
## 7     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 8    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B     W    
##      V9   V10 V11
## 1 D  12 D   4  NA
## 2 B     W      NA
## 4 W  20 W   7  NA
## 5 W     B      NA
## 7 W  13 W  12  NA
## 8 B     W      NA

#filter out every second row and merge two data frames together into one:
second_row<- seq(2, nrow(clean_df), by =2)
second_row_df<-clean_df[second_row, ]
head(data.frame(second_row_df))

##        V1                                V2    V3    V4    V5    V6    V7    V8
## 2     ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
## 5     MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B    
## 8     MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B     W    
## 11    MI   12616049 / R: 1716   ->1744      N:2   W     B     W     B     W    
## 14    MI   14601533 / R: 1655   ->1690      N:2   B     W     B     W     B    
## 17    OH   15055204 / R: 1686   ->1687      N:3   W     B     W     B     B    
##       V9   V10 V11
## 2  B     W      NA
## 5  W     B      NA
## 8  B     W      NA
## 11 B     B      NA
## 14 W     B      NA
## 17 W     B      NA

first_row_df<-clean_df[-second_row, ]

head(first_row_df)

##        V1                                V2    V3    V4    V5    V6    V7    V8
## 1      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 4      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 7      3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 10     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26 D   5
## 13     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13 D   4
## 16     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35 D  10
##       V9   V10 V11
## 1  D  12 D   4  NA
## 4  W  20 W   7  NA
## 7  W  13 W  12  NA
## 10 W  19 D   1  NA
## 13 W  14 W  17  NA
## 16 W  27 W  21  NA

#to combine the two dataframes:
com_df<- cbind(first_row_df, second_row_df)
head(com_df)

##        V1                                V2    V3    V4    V5    V6    V7    V8
## 1      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 4      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 7      3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 10     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26 D   5
## 13     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13 D   4
## 16     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35 D  10
##       V9   V10 V11     V1                                V2    V3    V4    V5
## 1  D  12 D   4  NA    ON   15445895 / R: 1794   ->1817      N:2   W     B    
## 4  W  20 W   7  NA    MI   14598900 / R: 1553   ->1663      N:2   B     W    
## 7  W  13 W  12  NA    MI   14959604 / R: 1384   ->1640      N:2   W     B    
## 10 W  19 D   1  NA    MI   12616049 / R: 1716   ->1744      N:2   W     B    
## 13 W  14 W  17  NA    MI   14601533 / R: 1655   ->1690      N:2   B     W    
## 16 W  27 W  21  NA    OH   15055204 / R: 1686   ->1687      N:3   W     B    
##       V6    V7    V8    V9   V10 V11
## 1  W     B     W     B     W      NA
## 4  B     W     B     W     B      NA
## 7  W     B     W     B     W      NA
## 10 W     B     W     B     B      NA
## 13 B     W     B     W     B      NA
## 16 W     B     B     W     B      NA

remove extra columns and rename existing columns:

com_df<- com_df %>%
  subset(select=c(1:10, 12:13))

colnames(com_df)<- c("PairID", "Name", "Points", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7", "State", "USCF_ID")
head(com_df)

##    PairID                              Name Points Round1 Round2 Round3 Round4
## 1      1   GARY HUA                          6.0    W  39  W  21  W  18  W  14
## 4      2   DAKSHESH DARURI                   6.0    W  63  W  58  L   4  W  17
## 7      3   ADITYA BAJAJ                      6.0    L   8  W  61  W  25  W  21
## 10     4   PATRICK H SCHILLING               5.5    W  23  D  28  W   2  W  26
## 13     5   HANSHI ZUO                        5.5    W  45  W  37  D  12  D  13
## 16     6   HANSEN SONG                       5.0    W  34  D  29  L  11  W  35
##    Round5 Round6 Round7  State                           USCF_ID
## 1   W   7  D  12  D   4    ON   15445895 / R: 1794   ->1817     
## 4   W  16  W  20  W   7    MI   14598900 / R: 1553   ->1663     
## 7   W  11  W  13  W  12    MI   14959604 / R: 1384   ->1640     
## 10  D   5  W  19  D   1    MI   12616049 / R: 1716   ->1744     
## 13  D   4  W  14  W  17    MI   14601533 / R: 1655   ->1690     
## 16  D  10  W  27  W  21    OH   15055204 / R: 1686   ->1687

To make the values atomic in each cell

com_df<- com_df %>%
  separate_wider_delim('USCF_ID',
                       delim="/",
                       names= c('USCF_ID',
                                'Pre_rating'))

To extract ONLY the pre rating from the Pre_rating column:

com_df$Pre_rating<- trimws(str_sub(com_df$Pre_rating, 4,8))
print(com_df)

## # A tibble: 64 × 13
##    PairID   Name   Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
##    <chr>    <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>
##  1 "    1 " " GAR… "6.0 … W  39  W  21  W  18  W  14  W   7  D  12  D   4  "   …
##  2 "    2 " " DAK… "6.0 … W  63  W  58  L   4  W  17  W  16  W  20  W   7  "   …
##  3 "    3 " " ADI… "6.0 … L   8  W  61  W  25  W  21  W  11  W  13  W  12  "   …
##  4 "    4 " " PAT… "5.5 … W  23  D  28  W   2  W  26  D   5  W  19  D   1  "   …
##  5 "    5 " " HAN… "5.5 … W  45  W  37  D  12  D  13  D   4  W  14  W  17  "   …
##  6 "    6 " " HAN… "5.0 … W  34  D  29  L  11  W  35  D  10  W  27  W  21  "   …
##  7 "    7 " " GAR… "5.0 … W  57  W  46  W  13  W  11  L   1  W   9  L   2  "   …
##  8 "    8 " " EZE… "5.0 … W   3  W  32  L  14  L   9  W  47  W  28  W  19  "   …
##  9 "    9 " " STE… "5.0 … W  25  L  18  W  59  W   8  W  26  L   7  W  20  "   …
## 10 "   10 " " ANV… "5.0 … D  16  L  19  W  55  W  31  D   6  W  25  W  18  "   …
## # ℹ 54 more rows
## # ℹ 2 more variables: USCF_ID <chr>, Pre_rating <chr>

To extract only PairID from the rounds:

com_df[, 4:10]<- com_df[, 4:10] %>%
  lapply(function(x) trimws(gsub("[^0-9]", "", x)))

#to convert columns to be numeric:
com_df[,c(1, 3:10, 12:13)]<- lapply(com_df[,c(1, 3:10, 12:13)], as.numeric)
print(com_df)

## # A tibble: 64 × 13
##    PairID Name     Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
##     <dbl> <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>
##  1      1 " GARY …    6       39     21     18     14      7     12      4 "   …
##  2      2 " DAKSH…    6       63     58      4     17     16     20      7 "   …
##  3      3 " ADITY…    6        8     61     25     21     11     13     12 "   …
##  4      4 " PATRI…    5.5     23     28      2     26      5     19      1 "   …
##  5      5 " HANSH…    5.5     45     37     12     13      4     14     17 "   …
##  6      6 " HANSE…    5       34     29     11     35     10     27     21 "   …
##  7      7 " GARY …    5       57     46     13     11      1      9      2 "   …
##  8      8 " EZEKI…    5        3     32     14      9     47     28     19 "   …
##  9      9 " STEFA…    5       25     18     59      8     26      7     20 "   …
## 10     10 " ANVIT…    5       16     19     55     31      6     25     18 "   …
## # ℹ 54 more rows
## # ℹ 2 more variables: USCF_ID <dbl>, Pre_rating <dbl>

Alternative way but less accurate:

com_df[, 4:10]<- com_df[, 4:10] %>% lapply(function(x) trimws(str_sub(x, 3,5))) view(com_df)

To create a function in searching the PairID with opponent pre-rating:

fetch<- function(df) {
  lapply(df, function(x) {
  x<- x[!is.na(x)] #to remove NA from matching
  
  if (length(x)==0) {
    return(NA)
  }
  P_rating<- com_df$Pre_rating[com_df$PairID == x]
  
  if (length(P_rating) > 0) {
    return(P_rating)
  } else {
    return(NA)}
  })}
#transformation to make each round PairID in exchange for the pre_rating
com_df[, 4:10]<- lapply(com_df[, 4:10], fetch)
view(com_df)
# need to convert the Rounds back to numeric again:
com_df[, 4:10]<- lapply(com_df[, 4:10], as.numeric)
summary(com_df)

##      PairID          Name               Points          Round1    
##  Min.   : 1.00   Length:64          Min.   :1.000   Min.   : 377  
##  1st Qu.:16.75   Class :character   1st Qu.:2.500   1st Qu.:1263  
##  Median :32.50   Mode  :character   Median :3.500   Median :1430  
##  Mean   :32.50                      Mean   :3.438   Mean   :1392  
##  3rd Qu.:48.25                      3rd Qu.:4.000   3rd Qu.:1596  
##  Max.   :64.00                      Max.   :6.000   Max.   :1794  
##                                                     NA's   :4     
##      Round2         Round3         Round4         Round5         Round6    
##  Min.   : 377   Min.   : 377   Min.   : 377   Min.   : 377   Min.   : 377  
##  1st Qu.:1232   1st Qu.:1215   1st Qu.:1222   1st Qu.:1239   1st Qu.:1249  
##  Median :1407   Median :1407   Median :1407   Median :1437   Median :1430  
##  Mean   :1385   Mean   :1372   Mean   :1378   Mean   :1387   Mean   :1396  
##  3rd Qu.:1591   3rd Qu.:1583   3rd Qu.:1591   3rd Qu.:1600   3rd Qu.:1599  
##  Max.   :1794   Max.   :1794   Max.   :1794   Max.   :1794   Max.   :1794  
##  NA's   :2      NA's   :4      NA's   :2      NA's   :12     NA's   :6     
##      Round7        State              USCF_ID           Pre_rating  
##  Min.   : 377   Length:64          Min.   :10131499   Min.   : 377  
##  1st Qu.:1232   Class :character   1st Qu.:12902535   1st Qu.:1227  
##  Median :1417   Mode  :character   Median :14736764   Median :1407  
##  Mean   :1382                      Mean   :14167956   Mean   :1378  
##  3rd Qu.:1591                      3rd Qu.:15117878   3rd Qu.:1583  
##  Max.   :1794                      Max.   :15771592   Max.   :1794  
##  NA's   :10

To swap the column positions:

com_df<- com_df %>%
  mutate(Avg_op_rating = floor(rowMeans(com_df[1:64, 4:10], na.rm = TRUE)))
print(com_df)

## # A tibble: 64 × 14
##    PairID Name     Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
##     <dbl> <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>
##  1      1 " GARY …    6     1436   1563   1600   1610   1649   1663   1716 "   …
##  2      2 " DAKSH…    6     1175    917   1716   1629   1604   1595   1649 "   …
##  3      3 " ADITY…    6     1641    955   1745   1563   1712   1666   1663 "   …
##  4      4 " PATRI…    5.5   1363   1507   1553   1579   1655   1564   1794 "   …
##  5      5 " HANSH…    5.5   1242    980   1663   1666   1716   1610   1629 "   …
##  6      6 " HANSE…    5     1399   1602   1712   1438   1365   1552   1563 "   …
##  7      7 " GARY …    5     1092    377   1666   1712   1794   1411   1553 "   …
##  8      8 " EZEKI…    5     1384   1441   1610   1411   1362   1507   1564 "   …
##  9      9 " STEFA…    5     1745   1600    853   1641   1579   1649   1595 "   …
## 10     10 " ANVIT…    5     1604   1564   1186   1494   1686   1745   1600 "   …
## # ℹ 54 more rows
## # ℹ 3 more variables: USCF_ID <dbl>, Pre_rating <dbl>, Avg_op_rating <dbl>

To reaarange the columns for CSV export:

final<- com_df %>%
  subset(select=c(Name, State, Points, Pre_rating, Avg_op_rating))
print(final)

## # A tibble: 64 × 5
##    Name                                State    Points Pre_rating Avg_op_rating
##    <chr>                               <chr>     <dbl>      <dbl>         <dbl>
##  1 " GARY HUA                        " "   ON "    6         1794          1605
##  2 " DAKSHESH DARURI                 " "   MI "    6         1553          1469
##  3 " ADITYA BAJAJ                    " "   MI "    6         1384          1563
##  4 " PATRICK H SCHILLING             " "   MI "    5.5       1716          1573
##  5 " HANSHI ZUO                      " "   MI "    5.5       1655          1500
##  6 " HANSEN SONG                     " "   OH "    5         1686          1518
##  7 " GARY DEE SWATHELL               " "   MI "    5         1649          1372
##  8 " EZEKIEL HOUGHTON                " "   MI "    5         1641          1468
##  9 " STEFANO LEE                     " "   ON "    5         1411          1523
## 10 " ANVIT RAO                       " "   MI "    5         1365          1554
## # ℹ 54 more rows

Save to CSV:

write.csv(final, file='Project_Chess.csv', row.names = FALSE)

Lab 5 Project1

Chi Hang(Philip) Cheung

2024-10-03

To read the txt file:

To transform the data into the correct dataframe:

remove extra columns and rename existing columns:

To make the values atomic in each cell

To extract ONLY the pre rating from the Pre_rating column:

To extract only PairID from the rounds:

Alternative way but less accurate:

To create a function in searching the PairID with opponent pre-rating:

To swap the column positions:

To reaarange the columns for CSV export:

Save to CSV: