Project 1

The purpose of this project is to take a text file and turn it into a CSV after it has been “cleaned” and the relevant data have been calculated and extracted. The exercise uses a chess tournament text file containing player names, states of origin, win-loss records, pre-match ratings, and post-match ratings.

# Raw tournament cross-table, served as plain text from GitHub
tournamentUrl <- paste0(
  "https://raw.githubusercontent.com/geeman1209/MSDATA2020/",
  "master/DATA607/Project1/tournamentinfo.txt"
)

# Read the file into a character vector, one element per line
rawData <- readLines(tournamentUrl)
## Warning in readLines(tournamentUrl): incomplete final line found on 'https://
## raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project1/
## tournamentinfo.txt'
# Open the spreadsheet-style viewer only in interactive sessions; View() has
# no display to draw on when the document is knitted or run with Rscript
if (interactive()) {
  View(rawData)
}
# Take a quick glimpse of the data (head() never pads a short file with NA)
head(rawData, 10)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"
# Drop the dashed separator rows
lines <- str_detect(string = rawData, pattern = "-----------")
tournieData <- rawData[which(!lines)]

# Strip the two header rows; the column titles get added back later
tournieData2 <- tail(tournieData, n = -2L)

Break down individual data components

This is the most important and involved aspect of the project. My approach is to extract all the relevant components and piece them together into a final data frame. Since the text file dedicates two rows of data to each player, the information can be extracted row by row and then pulled apart further within each row.

# Rows holding each player's pair number, name, total points and per-round
# results: the odd-numbered rows of the two-row player records.
# Logical recycling (TRUE, FALSE, TRUE, ...) picks them out without the
# "wrong sign in 'by'" error that seq(from = 1, to = 0, by = 2) raises when
# there are no rows at all.
id_name_points <- tournieData2[c(TRUE, FALSE)]
head(id_name_points)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
# One sequential ID per player row instead of a hard-coded 1:64, so the
# script keeps working when the tournament has a different number of players
player_id <- seq_along(id_name_points)

# Rows holding each player's state, USCF ID and pre/post ratings: the
# even-numbered rows of the two-row player records.
# Logical recycling (FALSE, TRUE, FALSE, ...) picks them out without the
# error that seq(from = 2, to = length(x), by = 2) raises when fewer than
# two rows are present.
st_ratings <- tournieData2[c(FALSE, TRUE)]
head(st_ratings)
## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"
#Extract the data we need to do further calculations and create a new data frame

# The name is the second pipe-delimited field of each row. Taking the whole
# field (trimmed) keeps hyphens, apostrophes and periods that a regex built
# only from word characters and spaces would cut off, e.g. a hyphenated
# surname.
player_names <- str_trim(str_split_fixed(id_name_points, "\\|", n = 3)[, 2])
head(player_names)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
# Total points look like "6.0"; the dot is escaped so it matches only a
# literal decimal point rather than any character
player_points <- as.numeric(str_extract(id_name_points, "\\d+\\.\\d"))
head(player_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
# State/province code: the first run of two letters in each ratings row
player_state <- str_extract(string = st_ratings, pattern = "[:alpha:]{2}")
head(player_state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Pre-tournament rating: the digits that follow the "R:" label. Anchoring on
# the label yields exactly one value per player row (NA when none is found),
# so preRate always stays aligned with player_id. A provisional suffix such
# as "P17" is left out because only the leading digits are captured.
preRate <- as.integer(str_match(st_ratings, "R:\\s*(\\d+)")[, 2])


# Opponent pair numbers: the digits sitting directly before each "|" in the
# round columns. The look-ahead keeps the "|" out of the match, so a single
# str_extract_all() call on the character vector returns one character
# vector of IDs per player, with no list-to-character coercion warning.
oppId <- str_extract_all(id_name_points, "\\d+(?=\\|)")
# Number of opponents each player actually faced
num.opp <- lengths(oppId)
# Column positions 1..max opponents (7 in this tournament)
max_seq_opp <- seq_len(max(num.opp))

# One row per player, one column per opponent slot, holding opponent IDs;
# players with fewer games are padded with NA. The IDs are later replaced by
# ratings and averaged. vapply() pins the element type and length, where
# sapply() would silently change shape on unexpected input.
mat <- t(vapply(
  oppId,
  function(ids) ids[max_seq_opp],
  character(length(max_seq_opp))
))
DF_opp <- as.data.frame(mat)


# Lookup table from player ID to pre-tournament rating
key <- data.frame(player_id = player_id, preRate = preRate)

# Replace every opponent ID with that opponent's pre-rating, column by
# column; IDs with no entry in the lookup (including NA padding) become NA
Copy_dfOpp <- DF_opp
Copy_dfOpp[] <- lapply(
  DF_opp,
  function(ids) key$preRate[match(ids, key$player_id)]
)
as_tibble(head(Copy_dfOpp))
## # A tibble: 6 x 7
##      V1    V2    V3    V4    V5    V6    V7
##   <int> <int> <int> <int> <int> <int> <int>
## 1  1436  1563  1600  1610  1649  1663  1716
## 2  1175   917  1716  1629  1604  1595  1649
## 3  1641   955  1745  1563  1712  1666  1663
## 4  1363  1507  1553  1579  1655  1564  1794
## 5  1242   980  1663  1666  1716  1610  1629
## 6  1399  1602  1712  1438  1365  1552  1563
# Mean opponent pre-rating per player, rounded to a whole number;
# na.rm = TRUE skips the NA cells of players who faced fewer opponents
Copy_dfOpp$Avg_Rating_Opp <- round(rowMeans(Copy_dfOpp, na.rm = TRUE))
as_tibble(head(Copy_dfOpp))
## # A tibble: 6 x 8
##      V1    V2    V3    V4    V5    V6    V7 Avg_Rating_Opp
##   <int> <int> <int> <int> <int> <int> <int>          <dbl>
## 1  1436  1563  1600  1610  1649  1663  1716           1605
## 2  1175   917  1716  1629  1604  1595  1649           1469
## 3  1641   955  1745  1563  1712  1666  1663           1564
## 4  1363  1507  1553  1579  1655  1564  1794           1574
## 5  1242   980  1663  1666  1716  1610  1629           1501
## 6  1399  1602  1712  1438  1365  1552  1563           1519
# Assemble the final table, naming the columns as they are created
Final_DataFrame <- tibble(
  ID = player_id,
  Names = player_names,
  State = player_state,
  Points = player_points,
  `Pre-Rating` = preRate,
  `Average Opponent Pre-Rating` = Copy_dfOpp$Avg_Rating_Opp
)

Final_DataFrame
## # A tibble: 64 x 6
##       ID Names             State Points `Pre-Rating` `Average Opponent Pre-Rati~
##    <int> <chr>             <chr>  <dbl>        <int>                       <dbl>
##  1     1 GARY HUA          ON       6           1794                        1605
##  2     2 DAKSHESH DARURI   MI       6           1553                        1469
##  3     3 ADITYA BAJAJ      MI       6           1384                        1564
##  4     4 PATRICK H SCHILL~ MI       5.5         1716                        1574
##  5     5 HANSHI ZUO        MI       5.5         1655                        1501
##  6     6 HANSEN SONG       OH       5           1686                        1519
##  7     7 GARY DEE SWATHELL MI       5           1649                        1372
##  8     8 EZEKIEL HOUGHTON  MI       5           1641                        1468
##  9     9 STEFANO LEE       ON       5           1411                        1523
## 10    10 ANVIT RAO         MI       5           1365                        1554
## # ... with 54 more rows

Write CSV File

# Save the cleaned table to the working directory, without row names
write.csv(
  x = Final_DataFrame,
  file = "clean_tournament_info.csv",
  row.names = FALSE
)

Graphical Analysis

# Points scored vs. average opponent pre-rating, each point labelled with the
# player's ID. The label is mapped to the `ID` column of Final_DataFrame
# rather than the global vector `player_id`, so the plot depends only on the
# data frame handed to ggplot().
ggplot(
  Final_DataFrame,
  aes(Points, `Average Opponent Pre-Rating`, color = "player id")
) +
  geom_point(color = "green") +
  geom_text(aes(label = ID), hjust = 0, vjust = 0) +
  theme(legend.justification = c("right", "top"))

# Average opponent pre-rating vs. the player's own pre-rating. The y
# aesthetic is mapped to the `Pre-Rating` column instead of the global
# vector `preRate`.
ggplot(
  Final_DataFrame,
  aes(`Average Opponent Pre-Rating`, `Pre-Rating`, color = "players")
) +
  geom_point(color = "black") +
  theme(legend.justification = c("right", "top"))

Conclusion

This was a harder assignment than first anticipated. It took a lot of online searching to find the best way to approach all the text data and try/test the correct regex to detect and extract the relevant information.

Sources/References:

Solution to find names: https://stackoverflow.com/questions/37307727/regex-to-extract-contact-name-and-email-from-string

A nice guide to cleaning text files: https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf

Stringr cheat sheet: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf