#Get the .txt file from GitHub
url <- "https://raw.githubusercontent.com/wheremagichappens/an.dy/master/DATA607/Project1/tournamentinfo.txt"
chess <- read.delim(url, header = FALSE, stringsAsFactors = FALSE)
head(chess)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 4 -----------------------------------------------------------------------------------------
## 5 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
#Deliminating "-" sign
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.1
chess_clean <- data.frame(str_replace_all(chess$V1,"-",""))
#Removing blank lines
chess_clean <- data.frame(chess_clean[!apply(chess_clean == "", 1, all),])
head(chess_clean)
## chess_clean..apply.chess_clean........1..all....
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 2 Num | USCF ID / Rtg (Pre>Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 3 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 4 ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W |
## 5 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 6 MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B |
#Merging 2 rows to align with column names and insert result into new data.frame
chess_new <- data.frame(c())
for (i in 1:dim(chess_clean)[1]){
if (i %% 2 == 1) {
data1 <- rbind(chess_new$data1, as.character(chess_clean[i,1]))
data2 <- as.character(chess_clean[i+1,1])
merged <- data.frame(paste0(data1,data2))
chess_new <- rbind(chess_new, merged)
}
}
#Checking dimension of new data.frame to make sure merging is successfully done.
dim(chess_new)
## [1] 65 1
head(chess_new)
## paste0.data1..data2.
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| Num | USCF ID / Rtg (Pre>Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 2 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W |
## 3 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B |
## 4 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 >1640 |N:2 |W |B |W |B |W |B |W |
## 5 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1| MI | 12616049 / R: 1716 >1744 |N:2 |W |B |W |B |W |B |B |
## 6 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17| MI | 14601533 / R: 1655 >1690 |N:2 |B |W |B |W |B |W |B |
#Still, I need to seperate column by "|". To do that, I set up column names for new data.frame to do separation.
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.1
chess_new_col <- c("Pair","Player Name","Total Pts.","Round 1","Round 2","Round 3", "Round 4","Round 5","Round 6","Round 7","State","USCF ID / Rtg (Pre->Post)","Pts","1","2","3","4","5","6","7","Avg.Pre Rating")
chess_new_2 <- separate(chess_new, col = paste0.data1..data2., into = chess_new_col, sep ="\\|")
nrow(chess_new_2)
## [1] 65
#Note that I have 2 column names as a result of separation. Will have to remove first row to avoid duplication.
chess_new_2 <- chess_new_2[c(2:nrow(chess_new_2)),]
#Note that "USCF ID / Rtg (Pre->Post)" column needs to be separated. Will have to speparate 2 different columns by "/"
chess_new_2["USCF ID / Rtg (Pre->Post)"]
## USCF ID / Rtg (Pre->Post)
## 2 15445895 / R: 1794 >1817
## 3 14598900 / R: 1553 >1663
## 4 14959604 / R: 1384 >1640
## 5 12616049 / R: 1716 >1744
## 6 14601533 / R: 1655 >1690
## 7 15055204 / R: 1686 >1687
## 8 11146376 / R: 1649 >1673
## 9 15142253 / R: 1641P17>1657P24
## 10 14954524 / R: 1411 >1564
## 11 14150362 / R: 1365 >1544
## 12 12581589 / R: 1712 >1696
## 13 12681257 / R: 1663 >1670
## 14 15082995 / R: 1666 >1662
## 15 10131499 / R: 1610 >1618
## 16 15619130 / R: 1220P13>1416P20
## 17 10295068 / R: 1604 >1613
## 18 10297702 / R: 1629 >1610
## 19 11342094 / R: 1600 >1600
## 20 14862333 / R: 1564 >1570
## 21 14529060 / R: 1595 >1569
## 22 15495066 / R: 1563P22>1562
## 23 12405534 / R: 1555 >1529
## 24 15030142 / R: 1363 >1371
## 25 13469010 / R: 1229 >1300
## 26 12486656 / R: 1745 >1681
## 27 15131520 / R: 1579 >1564
## 28 14476567 / R: 1552 >1539
## 29 14882954 / R: 1507 >1513
## 30 15323285 / R: 1602P6 >1508P12
## 31 12577178 / R: 1522 >1444
## 32 15131618 / R: 1494 >1444
## 33 14073750 / R: 1441 >1433
## 34 14691842 / R: 1449 >1421
## 35 15051807 / R: 1399 >1400
## 36 14601397 / R: 1438 >1392
## 37 14773163 / R: 1355 >1367
## 38 15489571 / R: 980P12>1077P17
## 39 15108523 / R: 1423 >1439
## 40 12923035 / R: 1436P23>1413
## 41 14892710 / R: 1348 >1346
## 42 15761443 / R: 1403P5 >1341P9
## 43 14462326 / R: 1332 >1256
## 44 14101068 / R: 1283 >1244
## 45 15323504 / R: 1199 >1199
## 46 15372807 / R: 1242 >1191
## 47 15490981 / R: 377P3 >1076P10
## 48 12533115 / R: 1362 >1341
## 49 14369165 / R: 1382 >1335
## 50 12531685 / R: 1291P12>1259P17
## 51 14773178 / R: 1056 >1111
## 52 15205474 / R: 1011 >1097
## 53 14918803 / R: 935 >1092
## 54 12578849 / R: 1393 >1359
## 55 12836773 / R: 1270 >1200
## 56 15412571 / R: 1186 >1163
## 57 14679887 / R: 1153 >1140
## 58 15113330 / R: 1092 >1079
## 59 14700365 / R: 917 > 941
## 60 12841036 / R: 853 > 878
## 61 14579262 / R: 967 > 984
## 62 15771592 / R: 955P11> 979P18
## 63 15219542 / R: 1530 >1535
## 64 15057092 / R: 1175 >1125
## 65 15006561 / R: 1163 >1112
tidy <- separate(chess_new_2, col = "USCF ID / Rtg (Pre->Post)", into = c("USCF ID", "Rtg (Pre->Post)"), sep = "\\/")
head(tidy)
## Pair Player Name Total Pts. Round 1 Round 2
## 2 1 GARY HUA 6.0 W 39 W 21
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58
## 4 3 ADITYA BAJAJ 6.0 L 8 W 61
## 5 4 PATRICK H SCHILLING 5.5 W 23 D 28
## 6 5 HANSHI ZUO 5.5 W 45 W 37
## 7 6 HANSEN SONG 5.0 W 34 D 29
## Round 3 Round 4 Round 5 Round 6 Round 7 State USCF ID
## 2 W 18 W 14 W 7 D 12 D 4 ON 15445895
## 3 L 4 W 17 W 16 W 20 W 7 MI 14598900
## 4 W 25 W 21 W 11 W 13 W 12 MI 14959604
## 5 W 2 W 26 D 5 W 19 D 1 MI 12616049
## 6 D 12 D 13 D 4 W 14 W 17 MI 14601533
## 7 L 11 W 35 D 10 W 27 W 21 OH 15055204
## Rtg (Pre->Post) Pts 1 2 3 4 5 6 7
## 2 R: 1794 >1817 N:2 W B W B W B W
## 3 R: 1553 >1663 N:2 B W B W B W B
## 4 R: 1384 >1640 N:2 W B W B W B W
## 5 R: 1716 >1744 N:2 W B W B W B B
## 6 R: 1655 >1690 N:2 B W B W B W B
## 7 R: 1686 >1687 N:3 W B W B B W B
## Avg.Pre Rating
## 2
## 3
## 4
## 5
## 6
## 7
#More cleaning. "Rtg (Pre->Post)" needs to be separated by ">"
tidy2 <- separate(tidy, col = "Rtg (Pre->Post)", into = c("Rtg-Pre", "Rtg-Post"), sep = "\\>")
#Use regex to extract numeric values only
tidy2$'Rtg-Pre' <- as.numeric(str_extract_all(tidy2$`Rtg-Pre`,"\\b\\d{1,}"))
tidy2$'Rtg-Post' <- as.numeric(str_extract_all(tidy2$`Rtg-Post`,"\\b\\d{1,}"))
#In order to calculate average rating, we need numeric values from Round 1 to 7 and convert them into numeric.
tidy2$'round1_clean' <- as.numeric(str_extract_all(tidy2$`Round 1`,"\\b\\d{1,}"))
tidy2$'round2_clean' <- as.numeric(str_extract_all(tidy2$`Round 2`,"\\b\\d{1,}"))
tidy2$'round3_clean' <- as.numeric(str_extract_all(tidy2$`Round 3`,"\\b\\d{1,}"))
tidy2$'round4_clean' <- as.numeric(str_extract_all(tidy2$`Round 4`,"\\b\\d{1,}"))
tidy2$'round5_clean' <- as.numeric(str_extract_all(tidy2$`Round 5`,"\\b\\d{1,}"))
tidy2$'round6_clean' <- as.numeric(str_extract_all(tidy2$`Round 6`,"\\b\\d{1,}"))
tidy2$'round7_clean' <- as.numeric(str_extract_all(tidy2$`Round 7`,"\\b\\d{1,}"))
#"Rtg-Pre" values matching with each round1 to 7 (numeric values) will be required for average rating calculation. Put these matches in R1 to R7.
tidy2$'R1' <- tidy2$`Rtg-Pre`[tidy2$'round1_clean']
tidy2$'R2' <- tidy2$`Rtg-Pre`[tidy2$'round2_clean']
tidy2$'R3' <- tidy2$`Rtg-Pre`[tidy2$'round3_clean']
tidy2$'R4' <- tidy2$`Rtg-Pre`[tidy2$'round4_clean']
tidy2$'R5' <- tidy2$`Rtg-Pre`[tidy2$'round5_clean']
tidy2$'R6' <- tidy2$`Rtg-Pre`[tidy2$'round6_clean']
tidy2$'R7' <- tidy2$`Rtg-Pre`[tidy2$'round7_clean']
head(tidy2)
## Pair Player Name Total Pts. Round 1 Round 2
## 2 1 GARY HUA 6.0 W 39 W 21
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58
## 4 3 ADITYA BAJAJ 6.0 L 8 W 61
## 5 4 PATRICK H SCHILLING 5.5 W 23 D 28
## 6 5 HANSHI ZUO 5.5 W 45 W 37
## 7 6 HANSEN SONG 5.0 W 34 D 29
## Round 3 Round 4 Round 5 Round 6 Round 7 State USCF ID Rtg-Pre
## 2 W 18 W 14 W 7 D 12 D 4 ON 15445895 1794
## 3 L 4 W 17 W 16 W 20 W 7 MI 14598900 1553
## 4 W 25 W 21 W 11 W 13 W 12 MI 14959604 1384
## 5 W 2 W 26 D 5 W 19 D 1 MI 12616049 1716
## 6 D 12 D 13 D 4 W 14 W 17 MI 14601533 1655
## 7 L 11 W 35 D 10 W 27 W 21 OH 15055204 1686
## Rtg-Post Pts 1 2 3 4 5 6 7 Avg.Pre Rating
## 2 1817 N:2 W B W B W B W
## 3 1663 N:2 B W B W B W B
## 4 1640 N:2 W B W B W B W
## 5 1744 N:2 W B W B W B B
## 6 1690 N:2 B W B W B W B
## 7 1687 N:3 W B W B B W B
## round1_clean round2_clean round3_clean round4_clean round5_clean
## 2 39 21 18 14 7
## 3 63 58 4 17 16
## 4 8 61 25 21 11
## 5 23 28 2 26 5
## 6 45 37 12 13 4
## 7 34 29 11 35 10
## round6_clean round7_clean R1 R2 R3 R4 R5 R6 R7
## 2 12 4 1436 1563 1600 1610 1649 1663 1716
## 3 20 7 1175 917 1716 1629 1604 1595 1649
## 4 13 12 1641 955 1745 1563 1712 1666 1663
## 5 19 1 1363 1507 1553 1579 1655 1564 1794
## 6 14 17 1242 980 1663 1666 1716 1610 1629
## 7 27 21 1399 1602 1712 1438 1365 1552 1563
#Filter out necessary columns for our purpose.
chess_new_3 <- tidy2[c(1,2,3,11, 13,14,31,32,33,34,35,36,37,23)]
head(chess_new_3)
## Pair Player Name Total Pts. State Rtg-Pre
## 2 1 GARY HUA 6.0 ON 1794
## 3 2 DAKSHESH DARURI 6.0 MI 1553
## 4 3 ADITYA BAJAJ 6.0 MI 1384
## 5 4 PATRICK H SCHILLING 5.5 MI 1716
## 6 5 HANSHI ZUO 5.5 MI 1655
## 7 6 HANSEN SONG 5.0 OH 1686
## Rtg-Post R1 R2 R3 R4 R5 R6 R7 Avg.Pre Rating
## 2 1817 1436 1563 1600 1610 1649 1663 1716
## 3 1663 1175 917 1716 1629 1604 1595 1649
## 4 1640 1641 955 1745 1563 1712 1666 1663
## 5 1744 1363 1507 1553 1579 1655 1564 1794
## 6 1690 1242 980 1663 1666 1716 1610 1629
## 7 1687 1399 1602 1712 1438 1365 1552 1563
#Calculate average rating with information we have.
avg_rating <- rowMeans(chess_new_3[,7:13], na.rm = TRUE)
chess_new_3$'Avg.Pre Rating' = round(avg_rating)
#Well, looking at the structure of table, we still need to trim some of the column. Life as a cleaner is almost over!
str(chess_new_3)
## 'data.frame': 64 obs. of 14 variables:
## $ Pair : chr " 1 " " 2 " " 3 " " 4 " ...
## $ Player Name : chr " GARY HUA " " DAKSHESH DARURI " " ADITYA BAJAJ " " PATRICK H SCHILLING " ...
## $ Total Pts. : chr "6.0 " "6.0 " "6.0 " "5.5 " ...
## $ State : chr " ON " " MI " " MI " " MI " ...
## $ Rtg-Pre : num 1794 1553 1384 1716 1655 ...
## $ Rtg-Post : num 1817 1663 1640 1744 1690 ...
## $ R1 : num 1436 1175 1641 1363 1242 ...
## $ R2 : num 1563 917 955 1507 980 ...
## $ R3 : num 1600 1716 1745 1553 1663 ...
## $ R4 : num 1610 1629 1563 1579 1666 ...
## $ R5 : num 1649 1604 1712 1655 1716 ...
## $ R6 : num 1663 1595 1666 1564 1610 ...
## $ R7 : num 1716 1649 1663 1794 1629 ...
## $ Avg.Pre Rating: num 1605 1469 1564 1574 1501 ...
chess_new_3$Pair <- trimws(chess_new_3$Pair)
chess_new_3$`Player Name` <- trimws(chess_new_3$`Player Name`)
chess_new_3$`Total Pts.` <- trimws(chess_new_3$`Total Pts.`)
chess_new_3$State <- trimws(chess_new_3$State)
#Looks clean and our cleaning job is done!
str(chess_new_3)
## 'data.frame': 64 obs. of 14 variables:
## $ Pair : chr "1" "2" "3" "4" ...
## $ Player Name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ Total Pts. : chr "6.0" "6.0" "6.0" "5.5" ...
## $ State : chr "ON" "MI" "MI" "MI" ...
## $ Rtg-Pre : num 1794 1553 1384 1716 1655 ...
## $ Rtg-Post : num 1817 1663 1640 1744 1690 ...
## $ R1 : num 1436 1175 1641 1363 1242 ...
## $ R2 : num 1563 917 955 1507 980 ...
## $ R3 : num 1600 1716 1745 1553 1663 ...
## $ R4 : num 1610 1629 1563 1579 1666 ...
## $ R5 : num 1649 1604 1712 1655 1716 ...
## $ R6 : num 1663 1595 1666 1564 1610 ...
## $ R7 : num 1716 1649 1663 1794 1629 ...
## $ Avg.Pre Rating: num 1605 1469 1564 1574 1501 ...
head(chess_new_3)
## Pair Player Name Total Pts. State Rtg-Pre Rtg-Post R1 R2
## 2 1 GARY HUA 6.0 ON 1794 1817 1436 1563
## 3 2 DAKSHESH DARURI 6.0 MI 1553 1663 1175 917
## 4 3 ADITYA BAJAJ 6.0 MI 1384 1640 1641 955
## 5 4 PATRICK H SCHILLING 5.5 MI 1716 1744 1363 1507
## 6 5 HANSHI ZUO 5.5 MI 1655 1690 1242 980
## 7 6 HANSEN SONG 5.0 OH 1686 1687 1399 1602
## R3 R4 R5 R6 R7 Avg.Pre Rating
## 2 1600 1610 1649 1663 1716 1605
## 3 1716 1629 1604 1595 1649 1469
## 4 1745 1563 1712 1666 1663 1564
## 5 1553 1579 1655 1564 1794 1574
## 6 1663 1666 1716 1610 1629 1501
## 7 1712 1438 1365 1552 1563 1519
#Select only necessary columns to finalize
chess_final <- chess_new_3[c(2,4,3,5,14)]
head(chess_final)
## Player Name State Total Pts. Rtg-Pre Avg.Pre Rating
## 2 GARY HUA ON 6.0 1794 1605
## 3 DAKSHESH DARURI MI 6.0 1553 1469
## 4 ADITYA BAJAJ MI 6.0 1384 1564
## 5 PATRICK H SCHILLING MI 5.5 1716 1574
## 6 HANSHI ZUO MI 5.5 1655 1501
## 7 HANSEN SONG OH 5.0 1686 1519
#Export CSV
write.table(chess_final, file = "ahwang-tournamentInfo.csv",row.names=FALSE, na="",col.names=TRUE, sep=",")