DATA607. Project#1

#Get the .txt file from GitHub
url <- "https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project1/tournamentinfo.txt"
chess <- read.delim(url, header = FALSE, stringsAsFactors = FALSE)
head(chess)

##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |

#Deliminating "-" sign
library(stringr)

## Warning: package 'stringr' was built under R version 3.4.1

chess_clean <- data.frame(str_replace_all(chess$V1,"-",""))

#Removing blank lines
chess_clean <- data.frame(chess_clean[!apply(chess_clean == "", 1, all),])
head(chess_clean)

##                                             chess_clean..apply.chess_clean........1..all....
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2   Num  | USCF ID / Rtg (Pre>Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 4      ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 5      2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 6      MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

#Merging 2 rows to align with column names and insert result into new data.frame
chess_new <- data.frame(c())
for (i in 1:dim(chess_clean)[1]){
 if (i %% 2 == 1) {
  data1 <- rbind(chess_new$data1, as.character(chess_clean[i,1]))
  data2 <- as.character(chess_clean[i+1,1])
  merged <- data.frame(paste0(data1,data2))
  chess_new <- rbind(chess_new, merged)
  }
}

#Checking dimension of new data.frame to make sure merging is successfully done.
dim(chess_new)

## [1] 65  1

head(chess_new)

##                                                                                                                                                                  paste0.data1..data2.
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|  Num  | USCF ID / Rtg (Pre>Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 2       1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|   ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 3       2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|   MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 4       3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|   MI | 14959604 / R: 1384   >1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 5       4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|   MI | 12616049 / R: 1716   >1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 6       5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|   MI | 14601533 / R: 1655   >1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |

#Still, I need to seperate column by "|". To do that, I set up column names for new data.frame to do separation.
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.4.1

chess_new_col <- c("Pair","Player Name","Total Pts.","Round 1","Round 2","Round 3", "Round 4","Round 5","Round 6","Round 7","State","USCF ID / Rtg (Pre->Post)","Pts","1","2","3","4","5","6","7","Avg.Pre Rating")

chess_new_2 <- separate(chess_new, col = paste0.data1..data2., into = chess_new_col,  sep ="\\|")
nrow(chess_new_2)

## [1] 65

#Note that I have 2 column names as a result of separation. Will have to remove first row to avoid duplication.
chess_new_2 <- chess_new_2[c(2:nrow(chess_new_2)),]

#Note that "USCF ID / Rtg (Pre->Post)" column needs to be separated. Will have to speparate 2 different columns by "/"
chess_new_2["USCF ID / Rtg (Pre->Post)"]

##           USCF ID / Rtg (Pre->Post)
## 2   15445895 / R: 1794   >1817     
## 3   14598900 / R: 1553   >1663     
## 4   14959604 / R: 1384   >1640     
## 5   12616049 / R: 1716   >1744     
## 6   14601533 / R: 1655   >1690     
## 7   15055204 / R: 1686   >1687     
## 8   11146376 / R: 1649   >1673     
## 9   15142253 / R: 1641P17>1657P24  
## 10  14954524 / R: 1411   >1564     
## 11  14150362 / R: 1365   >1544     
## 12  12581589 / R: 1712   >1696     
## 13  12681257 / R: 1663   >1670     
## 14  15082995 / R: 1666   >1662     
## 15  10131499 / R: 1610   >1618     
## 16  15619130 / R: 1220P13>1416P20  
## 17  10295068 / R: 1604   >1613     
## 18  10297702 / R: 1629   >1610     
## 19  11342094 / R: 1600   >1600     
## 20  14862333 / R: 1564   >1570     
## 21  14529060 / R: 1595   >1569     
## 22  15495066 / R: 1563P22>1562     
## 23  12405534 / R: 1555   >1529     
## 24  15030142 / R: 1363   >1371     
## 25  13469010 / R: 1229   >1300     
## 26  12486656 / R: 1745   >1681     
## 27  15131520 / R: 1579   >1564     
## 28  14476567 / R: 1552   >1539     
## 29  14882954 / R: 1507   >1513     
## 30  15323285 / R: 1602P6 >1508P12  
## 31  12577178 / R: 1522   >1444     
## 32  15131618 / R: 1494   >1444     
## 33  14073750 / R: 1441   >1433     
## 34  14691842 / R: 1449   >1421     
## 35  15051807 / R: 1399   >1400     
## 36  14601397 / R: 1438   >1392     
## 37  14773163 / R: 1355   >1367     
## 38  15489571 / R:  980P12>1077P17  
## 39  15108523 / R: 1423   >1439     
## 40  12923035 / R: 1436P23>1413     
## 41  14892710 / R: 1348   >1346     
## 42  15761443 / R: 1403P5 >1341P9   
## 43  14462326 / R: 1332   >1256     
## 44  14101068 / R: 1283   >1244     
## 45  15323504 / R: 1199   >1199     
## 46  15372807 / R: 1242   >1191     
## 47  15490981 / R:  377P3 >1076P10  
## 48  12533115 / R: 1362   >1341     
## 49  14369165 / R: 1382   >1335     
## 50  12531685 / R: 1291P12>1259P17  
## 51  14773178 / R: 1056   >1111     
## 52  15205474 / R: 1011   >1097     
## 53  14918803 / R:  935   >1092     
## 54  12578849 / R: 1393   >1359     
## 55  12836773 / R: 1270   >1200     
## 56  15412571 / R: 1186   >1163     
## 57  14679887 / R: 1153   >1140     
## 58  15113330 / R: 1092   >1079     
## 59  14700365 / R:  917   > 941     
## 60  12841036 / R:  853   > 878     
## 61  14579262 / R:  967   > 984     
## 62  15771592 / R:  955P11> 979P18  
## 63  15219542 / R: 1530   >1535     
## 64  15057092 / R: 1175   >1125     
## 65  15006561 / R: 1163   >1112

tidy <- separate(chess_new_2, col = "USCF ID / Rtg (Pre->Post)", into = c("USCF ID", "Rtg (Pre->Post)"), sep = "\\/")
head(tidy)

##     Pair                       Player Name Total Pts. Round 1 Round 2
## 2     1   GARY HUA                              6.0     W  39   W  21
## 3     2   DAKSHESH DARURI                       6.0     W  63   W  58
## 4     3   ADITYA BAJAJ                          6.0     L   8   W  61
## 5     4   PATRICK H SCHILLING                   5.5     W  23   D  28
## 6     5   HANSHI ZUO                            5.5     W  45   W  37
## 7     6   HANSEN SONG                           5.0     W  34   D  29
##   Round 3 Round 4 Round 5 Round 6 Round 7  State    USCF ID
## 2   W  18   W  14   W   7   D  12   D   4    ON   15445895 
## 3   L   4   W  17   W  16   W  20   W   7    MI   14598900 
## 4   W  25   W  21   W  11   W  13   W  12    MI   14959604 
## 5   W   2   W  26   D   5   W  19   D   1    MI   12616049 
## 6   D  12   D  13   D   4   W  14   W  17    MI   14601533 
## 7   L  11   W  35   D  10   W  27   W  21    OH   15055204 
##         Rtg (Pre->Post)   Pts     1     2     3     4     5     6     7
## 2  R: 1794   >1817      N:2   W     B     W     B     W     B     W    
## 3  R: 1553   >1663      N:2   B     W     B     W     B     W     B    
## 4  R: 1384   >1640      N:2   W     B     W     B     W     B     W    
## 5  R: 1716   >1744      N:2   W     B     W     B     W     B     B    
## 6  R: 1655   >1690      N:2   B     W     B     W     B     W     B    
## 7  R: 1686   >1687      N:3   W     B     W     B     B     W     B    
##   Avg.Pre Rating
## 2               
## 3               
## 4               
## 5               
## 6               
## 7

#More cleaning. "Rtg (Pre->Post)" needs to be separated by ">"
tidy2 <- separate(tidy, col = "Rtg (Pre->Post)", into = c("Rtg-Pre", "Rtg-Post"), sep = "\\>")

#Use regex to extract numeric values only
tidy2$'Rtg-Pre' <- as.numeric(str_extract_all(tidy2$`Rtg-Pre`,"\\b\\d{1,}"))
tidy2$'Rtg-Post' <- as.numeric(str_extract_all(tidy2$`Rtg-Post`,"\\b\\d{1,}"))

#In order to calculate average rating, we need numeric values from Round 1 to 7 and convert them into numeric.
tidy2$'round1_clean' <- as.numeric(str_extract_all(tidy2$`Round 1`,"\\b\\d{1,}"))
tidy2$'round2_clean' <- as.numeric(str_extract_all(tidy2$`Round 2`,"\\b\\d{1,}"))
tidy2$'round3_clean' <- as.numeric(str_extract_all(tidy2$`Round 3`,"\\b\\d{1,}"))
tidy2$'round4_clean' <- as.numeric(str_extract_all(tidy2$`Round 4`,"\\b\\d{1,}"))
tidy2$'round5_clean' <- as.numeric(str_extract_all(tidy2$`Round 5`,"\\b\\d{1,}"))
tidy2$'round6_clean' <- as.numeric(str_extract_all(tidy2$`Round 6`,"\\b\\d{1,}"))
tidy2$'round7_clean' <- as.numeric(str_extract_all(tidy2$`Round 7`,"\\b\\d{1,}"))

#"Rtg-Pre" values matching with each round1 to 7 (numeric values) will be required for average rating calculation. Put these matches in R1 to R7.
tidy2$'R1' <- tidy2$`Rtg-Pre`[tidy2$'round1_clean']
tidy2$'R2' <- tidy2$`Rtg-Pre`[tidy2$'round2_clean']
tidy2$'R3' <- tidy2$`Rtg-Pre`[tidy2$'round3_clean']
tidy2$'R4' <- tidy2$`Rtg-Pre`[tidy2$'round4_clean']
tidy2$'R5' <- tidy2$`Rtg-Pre`[tidy2$'round5_clean']
tidy2$'R6' <- tidy2$`Rtg-Pre`[tidy2$'round6_clean']
tidy2$'R7' <- tidy2$`Rtg-Pre`[tidy2$'round7_clean']

head(tidy2)

##     Pair                       Player Name Total Pts. Round 1 Round 2
## 2     1   GARY HUA                              6.0     W  39   W  21
## 3     2   DAKSHESH DARURI                       6.0     W  63   W  58
## 4     3   ADITYA BAJAJ                          6.0     L   8   W  61
## 5     4   PATRICK H SCHILLING                   5.5     W  23   D  28
## 6     5   HANSHI ZUO                            5.5     W  45   W  37
## 7     6   HANSEN SONG                           5.0     W  34   D  29
##   Round 3 Round 4 Round 5 Round 6 Round 7  State    USCF ID Rtg-Pre
## 2   W  18   W  14   W   7   D  12   D   4    ON   15445895     1794
## 3   L   4   W  17   W  16   W  20   W   7    MI   14598900     1553
## 4   W  25   W  21   W  11   W  13   W  12    MI   14959604     1384
## 5   W   2   W  26   D   5   W  19   D   1    MI   12616049     1716
## 6   D  12   D  13   D   4   W  14   W  17    MI   14601533     1655
## 7   L  11   W  35   D  10   W  27   W  21    OH   15055204     1686
##   Rtg-Post   Pts     1     2     3     4     5     6     7 Avg.Pre Rating
## 2     1817 N:2   W     B     W     B     W     B     W                   
## 3     1663 N:2   B     W     B     W     B     W     B                   
## 4     1640 N:2   W     B     W     B     W     B     W                   
## 5     1744 N:2   W     B     W     B     W     B     B                   
## 6     1690 N:2   B     W     B     W     B     W     B                   
## 7     1687 N:3   W     B     W     B     B     W     B                   
##   round1_clean round2_clean round3_clean round4_clean round5_clean
## 2           39           21           18           14            7
## 3           63           58            4           17           16
## 4            8           61           25           21           11
## 5           23           28            2           26            5
## 6           45           37           12           13            4
## 7           34           29           11           35           10
##   round6_clean round7_clean   R1   R2   R3   R4   R5   R6   R7
## 2           12            4 1436 1563 1600 1610 1649 1663 1716
## 3           20            7 1175  917 1716 1629 1604 1595 1649
## 4           13           12 1641  955 1745 1563 1712 1666 1663
## 5           19            1 1363 1507 1553 1579 1655 1564 1794
## 6           14           17 1242  980 1663 1666 1716 1610 1629
## 7           27           21 1399 1602 1712 1438 1365 1552 1563

#Filter out necessary columns for our purpose.
chess_new_3 <- tidy2[c(1,2,3,11, 13,14,31,32,33,34,35,36,37,23)]

head(chess_new_3)

##     Pair                       Player Name Total Pts.  State Rtg-Pre
## 2     1   GARY HUA                              6.0      ON     1794
## 3     2   DAKSHESH DARURI                       6.0      MI     1553
## 4     3   ADITYA BAJAJ                          6.0      MI     1384
## 5     4   PATRICK H SCHILLING                   5.5      MI     1716
## 6     5   HANSHI ZUO                            5.5      MI     1655
## 7     6   HANSEN SONG                           5.0      OH     1686
##   Rtg-Post   R1   R2   R3   R4   R5   R6   R7 Avg.Pre Rating
## 2     1817 1436 1563 1600 1610 1649 1663 1716               
## 3     1663 1175  917 1716 1629 1604 1595 1649               
## 4     1640 1641  955 1745 1563 1712 1666 1663               
## 5     1744 1363 1507 1553 1579 1655 1564 1794               
## 6     1690 1242  980 1663 1666 1716 1610 1629               
## 7     1687 1399 1602 1712 1438 1365 1552 1563

#Calculate average rating with information we have.
avg_rating <- rowMeans(chess_new_3[,7:13], na.rm = TRUE)
chess_new_3$'Avg.Pre Rating' = round(avg_rating)

#Well, looking at the structure of table, we still need to trim some of the column. Life as a cleaner is almost over!
str(chess_new_3)

## 'data.frame':    64 obs. of  14 variables:
##  $ Pair          : chr  "    1 " "    2 " "    3 " "    4 " ...
##  $ Player Name   : chr  " GARY HUA                        " " DAKSHESH DARURI                 " " ADITYA BAJAJ                    " " PATRICK H SCHILLING             " ...
##  $ Total Pts.    : chr  "6.0  " "6.0  " "6.0  " "5.5  " ...
##  $ State         : chr  "   ON " "   MI " "   MI " "   MI " ...
##  $ Rtg-Pre       : num  1794 1553 1384 1716 1655 ...
##  $ Rtg-Post      : num  1817 1663 1640 1744 1690 ...
##  $ R1            : num  1436 1175 1641 1363 1242 ...
##  $ R2            : num  1563 917 955 1507 980 ...
##  $ R3            : num  1600 1716 1745 1553 1663 ...
##  $ R4            : num  1610 1629 1563 1579 1666 ...
##  $ R5            : num  1649 1604 1712 1655 1716 ...
##  $ R6            : num  1663 1595 1666 1564 1610 ...
##  $ R7            : num  1716 1649 1663 1794 1629 ...
##  $ Avg.Pre Rating: num  1605 1469 1564 1574 1501 ...

chess_new_3$Pair <- trimws(chess_new_3$Pair)
chess_new_3$`Player Name` <- trimws(chess_new_3$`Player Name`)
chess_new_3$`Total Pts.` <- trimws(chess_new_3$`Total Pts.`)
chess_new_3$State <- trimws(chess_new_3$State)

#Looks clean and our cleaning job is done!
str(chess_new_3)

## 'data.frame':    64 obs. of  14 variables:
##  $ Pair          : chr  "1" "2" "3" "4" ...
##  $ Player Name   : chr  "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
##  $ Total Pts.    : chr  "6.0" "6.0" "6.0" "5.5" ...
##  $ State         : chr  "ON" "MI" "MI" "MI" ...
##  $ Rtg-Pre       : num  1794 1553 1384 1716 1655 ...
##  $ Rtg-Post      : num  1817 1663 1640 1744 1690 ...
##  $ R1            : num  1436 1175 1641 1363 1242 ...
##  $ R2            : num  1563 917 955 1507 980 ...
##  $ R3            : num  1600 1716 1745 1553 1663 ...
##  $ R4            : num  1610 1629 1563 1579 1666 ...
##  $ R5            : num  1649 1604 1712 1655 1716 ...
##  $ R6            : num  1663 1595 1666 1564 1610 ...
##  $ R7            : num  1716 1649 1663 1794 1629 ...
##  $ Avg.Pre Rating: num  1605 1469 1564 1574 1501 ...

head(chess_new_3)

##   Pair         Player Name Total Pts. State Rtg-Pre Rtg-Post   R1   R2
## 2    1            GARY HUA        6.0    ON    1794     1817 1436 1563
## 3    2     DAKSHESH DARURI        6.0    MI    1553     1663 1175  917
## 4    3        ADITYA BAJAJ        6.0    MI    1384     1640 1641  955
## 5    4 PATRICK H SCHILLING        5.5    MI    1716     1744 1363 1507
## 6    5          HANSHI ZUO        5.5    MI    1655     1690 1242  980
## 7    6         HANSEN SONG        5.0    OH    1686     1687 1399 1602
##     R3   R4   R5   R6   R7 Avg.Pre Rating
## 2 1600 1610 1649 1663 1716           1605
## 3 1716 1629 1604 1595 1649           1469
## 4 1745 1563 1712 1666 1663           1564
## 5 1553 1579 1655 1564 1794           1574
## 6 1663 1666 1716 1610 1629           1501
## 7 1712 1438 1365 1552 1563           1519

#Select only necessary columns to finalize
chess_final <- chess_new_3[c(2,4,3,5,14)]
head(chess_final)

##           Player Name State Total Pts. Rtg-Pre Avg.Pre Rating
## 2            GARY HUA    ON        6.0    1794           1605
## 3     DAKSHESH DARURI    MI        6.0    1553           1469
## 4        ADITYA BAJAJ    MI        6.0    1384           1564
## 5 PATRICK H SCHILLING    MI        5.5    1716           1574
## 6          HANSHI ZUO    MI        5.5    1655           1501
## 7         HANSEN SONG    OH        5.0    1686           1519

#Export CSV
write.table(chess_final, file = "ahwang-tournamentInfo.csv",row.names=FALSE, na="",col.names=TRUE, sep=",")

DATA607. Project#1

Sang Yoon (Andy) Hwang

February 22, 2018