Load required packages

knitr::opts_chunk$set(echo = TRUE)

# RPubs link - 

# Check if the package is installed. If not, install the package
if(!require('stringr')) {
  install.packages('stringr')
  library(stringr)
}
## Loading required package: stringr
# Check if the package is installed. If not, install the package
if(!require('data.table')) {
  install.packages('data.table')
  library(data.table)
}
## Loading required package: data.table
# Check if the package is installed. If not, install the package
if(!require('taRifx')) {
  install.packages('taRifx')
  library(taRifx)
}
## Loading required package: taRifx
## 
## Attaching package: 'taRifx'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last, shift

Read data file

first_time = 1

fileName <- "https://raw.githubusercontent.com/kalyanparthasarathy/CUNY-MSDS/master/tournamentinfo.txt"

conn <- file(fileName,open="r")

linesFromFile <-readLines(conn)
for (i in 5:length(linesFromFile)){
  if( ( (i-5) %% 3) == 0) {
    # Data resides in two lines so we need to combine both lines into single DF and look for the interested columns
    
    # Parse the first line of text (here the first line is the Name line)
    readLine1 <- linesFromFile[i]
    
    # IMPORTANT - Limitation of sub() function with back referencing of RegEx values
    # Only 9 values can be back referenced so need to do two step split for each line of data
    firstTwoColsLine1 <- unlist(
      sub(
        "(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
        , "\\1"
        , readLine1
      )
    )
    
    remainingColsLine1 <- unlist(
      sub(
        "(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
        , "\\2,\\3,\\4,\\5,\\6,\\7,\\8,\\9"
        , readLine1
      )
    )
    
    firstTwoColsLine1 <- unlist(
      sub(
        "(.*)\\|(.*)$"
        , "\\1,\\2"
        , firstTwoColsLine1
      )
    )
    
    parsedLine1 <- paste(firstTwoColsLine1, ",", remainingColsLine1)
    rawDataDF1 <- data.frame(parsedLine1)
    
    names(rawDataDF1) <- c("Data")
    
    out <- strsplit(as.character(rawDataDF1$Data),',')
    rawDataDF1 <- data.frame(rawDataDF1$Data, do.call(rbind, out))
    rawDataDF1$rawDataDF1.Data <- NULL
    names(rawDataDF1) <- c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10")
    
    # Parse the first line of text (here the first line is the Name line)
    readLine2 <- linesFromFile[i+1]
    
    # IMPORTANT - Limitation of sub() function with back referencing of RegEx values
    # Only 9 values can be back referenced so need to do two step split for each line of data
    firstTwoColsLine2 <- unlist(
      sub(
        "(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
        , "\\1"
        , readLine2
      )
    )
    
    remainingColsLine2 <- unlist(
      sub(
        "(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
        , "\\2,\\3,\\4,\\5,\\6,\\7,\\8,\\9"
        , readLine2
      )
    )
    
    firstTwoColsLine2 <- unlist(
      sub(
        "(.*)\\|(.*)$"
        , "\\1,\\2"
        , firstTwoColsLine2
      )
    )
    
    parsedLine2 <- paste(firstTwoColsLine2, ",", remainingColsLine2)
    rawDataDF2 <- data.frame(parsedLine2)
    
    names(rawDataDF2) <- c("Data")
    
    out <- strsplit(as.character(rawDataDF2$Data),',')
    rawDataDF2 <- data.frame(rawDataDF2$Data, do.call(rbind, out))
    rawDataDF2$rawDataDF2.Data <- NULL
    names(rawDataDF2) <- c("C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20")
    
    if(first_time == 1) {
      first_time = 2
      
      allDataDF1 <- rawDataDF1
      allDataDF2 <- rawDataDF2
    }
    else {
      # Merge each row of data into the allDataDF1 & allDataDF2
      allDataDF1 <- rbind(allDataDF1, rawDataDF1)
      allDataDF2 <- rbind(allDataDF2, rawDataDF2)
    }
  }
}
close(conn)

Formatting the data

# Merge both lines of data into sinlge DF
allDataDF <- cbind(allDataDF1, allDataDF2)

# Format the Column 1 - Position - This information is needed to calculate "Avg Pre Chess Rating of Opponents" value
allDataDF$C1 <- sub("[[:space:]]+?([[:digit:]]{1,}).*", "\\1", allDataDF$C1)

# Format the Column 2 - Name
allDataDF$C2 <- tools::toTitleCase(tolower(str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C2)))))

# Format the Column 3 - Total Number of Points
allDataDF$C3 <- str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C3)))

# Format the Column 4 - Round 1
allDataDF$C4 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C4)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C4 <- sub("[[:alpha:]]+", "0", allDataDF$C4)


# Format the Column 5 - Round 2
allDataDF$C5 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C5)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C5 <- sub("[[:alpha:]]+", "0", allDataDF$C5)

# Format the Column 6 - Round 3
allDataDF$C6 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C6)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C6 <- sub("[[:alpha:]]+", "0", allDataDF$C6)

# Format the Column 7 - Round 4
allDataDF$C7 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C7)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C7 <- sub("[[:alpha:]]+", "0", allDataDF$C7)

# Format the Column 8 - Round 5
allDataDF$C8 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C8)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C8 <- sub("[[:alpha:]]+", "0", allDataDF$C8)

# Format the Column 9 - Round 6
allDataDF$C9 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C9)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C9 <- sub("[[:alpha:]]+", "0", allDataDF$C9)

# Format the Column 10 - Round 7
allDataDF$C10 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C10)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C10 <- sub("[[:alpha:]]+", "0", allDataDF$C10)

# Format the Column 11 - State
allDataDF$C11 <- str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C11)))

# Format the Column 12 - Pre-rating
allDataDF$C12 <- str_trim(allDataDF$C12)
allDataDF$C12 <- str_trim(unlist(sub("[[:digit:]]+[[:space:]].[[:space:]]R:[[:space:]]+?([[:alnum:]]+)[[:space:]]{0,1}.*", "\\1", allDataDF$C12)))
allDataDF$C12 <- unlist(sub("([[:digit:]]+)[[:alpha:]][[:digit:]]+", "\\1", allDataDF$C12)) # Remove the values from "P", if anything exists

# Extract only the columns we are interested in
allDataDFNew <- allDataDF[, c(1, 2, 11, 3, 12, 4, 5, 6, 7, 8, 9, 10)]

# Add a new column for the Average Score
allDataDFNew$'Avg Score' <- 0

names(allDataDFNew) <- c("SNo", "Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", "Opp 1", "Opp 2", "Opp 3", "Opp 4", "Opp 5", "Opp 6", "Opp 7", "Average Pre Chess Rating of Opponents")

Perform calculations

# Calculate "Average Pre Chess Rating of Opponents"
for (i in 1:nrow(allDataDFNew)) {
  totalRounds <- as.integer(7)
  totalOppScore <- as.integer(0)
  
  # Opponent 1
  if(unlist(as.integer(allDataDFNew$`Opp 1`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 1`[i])]))

  # Opponent 2
  if(unlist(as.integer(allDataDFNew$`Opp 2`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 2`[i])]))

  # Opponent 3
  if(unlist(as.integer(allDataDFNew$`Opp 3`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 3`[i])]))

  # Opponent 4
  if(unlist(as.integer(allDataDFNew$`Opp 4`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 4`[i])]))

  # Opponent 5
  if(unlist(as.integer(allDataDFNew$`Opp 5`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 5`[i])]))

  # Opponent 6
  if(unlist(as.integer(allDataDFNew$`Opp 6`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 6`[i])]))

  # Opponent 7
  if(unlist(as.integer(allDataDFNew$`Opp 7`[i])) == 0) totalRounds = totalRounds - 1 else totalOppScore = totalOppScore + as.integer(unlist(allDataDFNew$`Player's Pre-Rating`[as.integer(allDataDFNew$`Opp 7`[i])]))

  allDataDFNew$`Average Pre Chess Rating of Opponents`[i] = round(totalOppScore/totalRounds, digits=0)
  
}

Write final output

# Copy only the required columns and create a new Dataframe for final output
allDataDFFinal <- allDataDFNew[, c(2, 3, 4, 5, 13)]

# This is the final formatted and calculated data in a Dataframe.
allDataDFFinal
##                 Player's Name Player's State Total Number of Points
## 1                    Gary Hua             ON                    6.0
## 2             Dakshesh Daruri             MI                    6.0
## 3                Aditya Bajaj             MI                    6.0
## 4         Patrick h Schilling             MI                    5.5
## 5                  Hanshi Zuo             MI                    5.5
## 6                 Hansen Song             OH                    5.0
## 7           Gary Dee Swathell             MI                    5.0
## 8            Ezekiel Houghton             MI                    5.0
## 9                 Stefano Lee             ON                    5.0
## 10                  Anvit Rao             MI                    5.0
## 11   Cameron William Mc Leman             MI                    4.5
## 12             Kenneth j Tack             MI                    4.5
## 13          Torrance Henry Jr             MI                    4.5
## 14               Bradley Shaw             MI                    4.5
## 15     Zachary James Houghton             MI                    4.5
## 16               Mike Nikitin             MI                    4.0
## 17         Ronald Grzegorczyk             MI                    4.0
## 18              David Sundeen             MI                    4.0
## 19               Dipankar Roy             MI                    4.0
## 20                Jason Zheng             MI                    4.0
## 21              Dinh Dang Bui             ON                    4.0
## 22           Eugene l Mcclure             MI                    4.0
## 23                   Alan Bui             ON                    4.0
## 24          Michael r Aldrich             MI                    4.0
## 25           Loren Schwiebert             MI                    3.5
## 26                    Max Zhu             ON                    3.5
## 27             Gaurav Gidwani             MI                    3.5
## 28 Sofia Adina Stanescu-Bellu             MI                    3.5
## 29           Chiedozie Okorie             MI                    3.5
## 30         George Avery Jones             ON                    3.5
## 31               Rishi Shetty             MI                    3.5
## 32      Joshua Philip Mathews             ON                    3.5
## 33                    Jade Ge             MI                    3.5
## 34     Michael Jeffery Thomas             MI                    3.5
## 35           Joshua David Lee             MI                    3.5
## 36              Siddharth Jha             MI                    3.5
## 37       Amiyatosh Pwnanandam             MI                    3.5
## 38                  Brian Liu             MI                    3.0
## 39              Joel r Hendon             MI                    3.0
## 40               Forest Zhang             MI                    3.0
## 41        Kyle William Murphy             MI                    3.0
## 42                   Jared Ge             MI                    3.0
## 43          Robert Glen Vasey             MI                    3.0
## 44         Justin d Schilling             MI                    3.0
## 45                  Derek Yan             MI                    3.0
## 46   Jacob Alexander Lavalley             MI                    3.0
## 47                Eric Wright             MI                    2.5
## 48               Daniel Khain             MI                    2.5
## 49           Michael j Martin             MI                    2.5
## 50                 Shivam Jha             MI                    2.5
## 51             Tejas Ayyagari             MI                    2.5
## 52                  Ethan Guo             MI                    2.5
## 53              Jose c Ybarra             MI                    2.0
## 54                Larry Hodge             MI                    2.0
## 55                  Alex Kong             MI                    2.0
## 56               Marisa Ricci             MI                    2.0
## 57                 Michael Lu             MI                    2.0
## 58               Viraj Mohile             MI                    2.0
## 59          Sean m Mc Cormick             MI                    2.0
## 60                 Julia Shen             MI                    1.5
## 61              Jezzel Farkas             ON                    1.5
## 62              Ashwin Balaji             MI                    1.0
## 63       Thomas Joseph Hosmer             MI                    1.0
## 64                     Ben Li             MI                    1.0
##    Player's Pre-Rating Average Pre Chess Rating of Opponents
## 1                 1794                                  1605
## 2                 1553                                  1469
## 3                 1384                                  1564
## 4                 1716                                  1574
## 5                 1655                                  1501
## 6                 1686                                  1519
## 7                 1649                                  1372
## 8                 1641                                  1468
## 9                 1411                                  1523
## 10                1365                                  1554
## 11                1712                                  1468
## 12                1663                                  1506
## 13                1666                                  1498
## 14                1610                                  1515
## 15                1220                                  1484
## 16                1604                                  1386
## 17                1629                                  1499
## 18                1600                                  1480
## 19                1564                                  1426
## 20                1595                                  1411
## 21                1563                                  1470
## 22                1555                                  1300
## 23                1363                                  1214
## 24                1229                                  1357
## 25                1745                                  1363
## 26                1579                                  1507
## 27                1552                                  1222
## 28                1507                                  1522
## 29                1602                                  1314
## 30                1522                                  1144
## 31                1494                                  1260
## 32                1441                                  1379
## 33                1449                                  1277
## 34                1399                                  1375
## 35                1438                                  1150
## 36                1355                                  1388
## 37                 980                                  1385
## 38                1423                                  1539
## 39                1436                                  1430
## 40                1348                                  1391
## 41                1403                                  1248
## 42                1332                                  1150
## 43                1283                                  1107
## 44                1199                                  1327
## 45                1242                                  1152
## 46                 377                                  1358
## 47                1362                                  1392
## 48                1382                                  1356
## 49                1291                                  1286
## 50                1056                                  1296
## 51                1011                                  1356
## 52                 935                                  1495
## 53                1393                                  1345
## 54                1270                                  1206
## 55                1186                                  1406
## 56                1153                                  1414
## 57                1092                                  1363
## 58                 917                                  1391
## 59                 853                                  1319
## 60                 967                                  1330
## 61                 955                                  1327
## 62                1530                                  1186
## 63                1175                                  1350
## 64                1163                                  1263
# Write to a CSV file
write.csv(allDataDFFinal, file = "DATA607_Project_1_output.csv")