Load required packages
knitr::opts_chunk$set(echo = TRUE)
# RPubs link -
# Check if the package is installed. If not, install the package
if(!require('stringr')) {
install.packages('stringr')
library(stringr)
}
## Loading required package: stringr
# Check if the package is installed. If not, install the package
if(!require('data.table')) {
install.packages('data.table')
library(data.table)
}
## Loading required package: data.table
# Check if the package is installed. If not, install the package
if(!require('taRifx')) {
install.packages('taRifx')
library(taRifx)
}
## Loading required package: taRifx
##
## Attaching package: 'taRifx'
## The following objects are masked from 'package:data.table':
##
## between, first, last, shift
Read data file
first_time = 1
fileName <- "https://raw.githubusercontent.com/kalyanparthasarathy/CUNY-MSDS/master/tournamentinfo.txt"
conn <- file(fileName,open="r")
linesFromFile <-readLines(conn)
for (i in 5:length(linesFromFile)){
if( ( (i-5) %% 3) == 0) {
# Data resides in two lines so we need to combine both lines into single DF and look for the interested columns
# Parse the first line of text (here the first line is the Name line)
readLine1 <- linesFromFile[i]
# IMPORTANT - Limitation of sub() function with back referencing of RegEx values
# Only 9 values can be back referenced so need to do two step split for each line of data
firstTwoColsLine1 <- unlist(
sub(
"(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
, "\\1"
, readLine1
)
)
remainingColsLine1 <- unlist(
sub(
"(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
, "\\2,\\3,\\4,\\5,\\6,\\7,\\8,\\9"
, readLine1
)
)
firstTwoColsLine1 <- unlist(
sub(
"(.*)\\|(.*)$"
, "\\1,\\2"
, firstTwoColsLine1
)
)
parsedLine1 <- paste(firstTwoColsLine1, ",", remainingColsLine1)
rawDataDF1 <- data.frame(parsedLine1)
names(rawDataDF1) <- c("Data")
out <- strsplit(as.character(rawDataDF1$Data),',')
rawDataDF1 <- data.frame(rawDataDF1$Data, do.call(rbind, out))
rawDataDF1$rawDataDF1.Data <- NULL
names(rawDataDF1) <- c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10")
# Parse the first line of text (here the first line is the Name line)
readLine2 <- linesFromFile[i+1]
# IMPORTANT - Limitation of sub() function with back referencing of RegEx values
# Only 9 values can be back referenced so need to do two step split for each line of data
firstTwoColsLine2 <- unlist(
sub(
"(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
, "\\1"
, readLine2
)
)
remainingColsLine2 <- unlist(
sub(
"(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)\\|$"
, "\\2,\\3,\\4,\\5,\\6,\\7,\\8,\\9"
, readLine2
)
)
firstTwoColsLine2 <- unlist(
sub(
"(.*)\\|(.*)$"
, "\\1,\\2"
, firstTwoColsLine2
)
)
parsedLine2 <- paste(firstTwoColsLine2, ",", remainingColsLine2)
rawDataDF2 <- data.frame(parsedLine2)
names(rawDataDF2) <- c("Data")
out <- strsplit(as.character(rawDataDF2$Data),',')
rawDataDF2 <- data.frame(rawDataDF2$Data, do.call(rbind, out))
rawDataDF2$rawDataDF2.Data <- NULL
names(rawDataDF2) <- c("C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20")
if(first_time == 1) {
first_time = 2
allDataDF1 <- rawDataDF1
allDataDF2 <- rawDataDF2
}
else {
# Merge each row of data into the allDataDF1 & allDataDF2
allDataDF1 <- rbind(allDataDF1, rawDataDF1)
allDataDF2 <- rbind(allDataDF2, rawDataDF2)
}
}
}
close(conn)
Formatting the data
# Merge both lines of data into sinlge DF
allDataDF <- cbind(allDataDF1, allDataDF2)
# Format the Column 1 - Position - This information is needed to calculate "Avg Pre Chess Rating of Opponents" value
allDataDF$C1 <- sub("[[:space:]]+?([[:digit:]]{1,}).*", "\\1", allDataDF$C1)
# Format the Column 2 - Name
allDataDF$C2 <- tools::toTitleCase(tolower(str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C2)))))
# Format the Column 3 - Total Number of Points
allDataDF$C3 <- str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C3)))
# Format the Column 4 - Round 1
allDataDF$C4 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C4)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C4 <- sub("[[:alpha:]]+", "0", allDataDF$C4)
# Format the Column 5 - Round 2
allDataDF$C5 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C5)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C5 <- sub("[[:alpha:]]+", "0", allDataDF$C5)
# Format the Column 6 - Round 3
allDataDF$C6 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C6)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C6 <- sub("[[:alpha:]]+", "0", allDataDF$C6)
# Format the Column 7 - Round 4
allDataDF$C7 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C7)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C7 <- sub("[[:alpha:]]+", "0", allDataDF$C7)
# Format the Column 8 - Round 5
allDataDF$C8 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C8)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C8 <- sub("[[:alpha:]]+", "0", allDataDF$C8)
# Format the Column 9 - Round 6
allDataDF$C9 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C9)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C9 <- sub("[[:alpha:]]+", "0", allDataDF$C9)
# Format the Column 10 - Round 7
allDataDF$C10 <- str_trim(unlist(sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", allDataDF$C10)))
# If the value has "Alphabet", change to 0, value is missing for this cell
allDataDF$C10 <- sub("[[:alpha:]]+", "0", allDataDF$C10)
# Format the Column 11 - State
allDataDF$C11 <- str_trim(unlist(sub("[[:space:]]+?([[:alpha:]] )+[[:space:]]+", "\\1", allDataDF$C11)))
# Format the Column 12 - Pre-rating
allDataDF$C12 <- str_trim(allDataDF$C12)
allDataDF$C12 <- str_trim(unlist(sub("[[:digit:]]+[[:space:]].[[:space:]]R:[[:space:]]+?([[:alnum:]]+)[[:space:]]{0,1}.*", "\\1", allDataDF$C12)))
allDataDF$C12 <- unlist(sub("([[:digit:]]+)[[:alpha:]][[:digit:]]+", "\\1", allDataDF$C12)) # Remove the values from "P", if anything exists
# Extract only the columns we are interested in
allDataDFNew <- allDataDF[, c(1, 2, 11, 3, 12, 4, 5, 6, 7, 8, 9, 10)]
# Add a new column for the Average Score
allDataDFNew$'Avg Score' <- 0
names(allDataDFNew) <- c("SNo", "Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", "Opp 1", "Opp 2", "Opp 3", "Opp 4", "Opp 5", "Opp 6", "Opp 7", "Average Pre Chess Rating of Opponents")
Write final output
# Copy only the required columns and create a new Dataframe for final output
allDataDFFinal <- allDataDFNew[, c(2, 3, 4, 5, 13)]
# This is the final formatted and calculated data in a Dataframe.
allDataDFFinal
## Player's Name Player's State Total Number of Points
## 1 Gary Hua ON 6.0
## 2 Dakshesh Daruri MI 6.0
## 3 Aditya Bajaj MI 6.0
## 4 Patrick h Schilling MI 5.5
## 5 Hanshi Zuo MI 5.5
## 6 Hansen Song OH 5.0
## 7 Gary Dee Swathell MI 5.0
## 8 Ezekiel Houghton MI 5.0
## 9 Stefano Lee ON 5.0
## 10 Anvit Rao MI 5.0
## 11 Cameron William Mc Leman MI 4.5
## 12 Kenneth j Tack MI 4.5
## 13 Torrance Henry Jr MI 4.5
## 14 Bradley Shaw MI 4.5
## 15 Zachary James Houghton MI 4.5
## 16 Mike Nikitin MI 4.0
## 17 Ronald Grzegorczyk MI 4.0
## 18 David Sundeen MI 4.0
## 19 Dipankar Roy MI 4.0
## 20 Jason Zheng MI 4.0
## 21 Dinh Dang Bui ON 4.0
## 22 Eugene l Mcclure MI 4.0
## 23 Alan Bui ON 4.0
## 24 Michael r Aldrich MI 4.0
## 25 Loren Schwiebert MI 3.5
## 26 Max Zhu ON 3.5
## 27 Gaurav Gidwani MI 3.5
## 28 Sofia Adina Stanescu-Bellu MI 3.5
## 29 Chiedozie Okorie MI 3.5
## 30 George Avery Jones ON 3.5
## 31 Rishi Shetty MI 3.5
## 32 Joshua Philip Mathews ON 3.5
## 33 Jade Ge MI 3.5
## 34 Michael Jeffery Thomas MI 3.5
## 35 Joshua David Lee MI 3.5
## 36 Siddharth Jha MI 3.5
## 37 Amiyatosh Pwnanandam MI 3.5
## 38 Brian Liu MI 3.0
## 39 Joel r Hendon MI 3.0
## 40 Forest Zhang MI 3.0
## 41 Kyle William Murphy MI 3.0
## 42 Jared Ge MI 3.0
## 43 Robert Glen Vasey MI 3.0
## 44 Justin d Schilling MI 3.0
## 45 Derek Yan MI 3.0
## 46 Jacob Alexander Lavalley MI 3.0
## 47 Eric Wright MI 2.5
## 48 Daniel Khain MI 2.5
## 49 Michael j Martin MI 2.5
## 50 Shivam Jha MI 2.5
## 51 Tejas Ayyagari MI 2.5
## 52 Ethan Guo MI 2.5
## 53 Jose c Ybarra MI 2.0
## 54 Larry Hodge MI 2.0
## 55 Alex Kong MI 2.0
## 56 Marisa Ricci MI 2.0
## 57 Michael Lu MI 2.0
## 58 Viraj Mohile MI 2.0
## 59 Sean m Mc Cormick MI 2.0
## 60 Julia Shen MI 1.5
## 61 Jezzel Farkas ON 1.5
## 62 Ashwin Balaji MI 1.0
## 63 Thomas Joseph Hosmer MI 1.0
## 64 Ben Li MI 1.0
## Player's Pre-Rating Average Pre Chess Rating of Opponents
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
## 7 1649 1372
## 8 1641 1468
## 9 1411 1523
## 10 1365 1554
## 11 1712 1468
## 12 1663 1506
## 13 1666 1498
## 14 1610 1515
## 15 1220 1484
## 16 1604 1386
## 17 1629 1499
## 18 1600 1480
## 19 1564 1426
## 20 1595 1411
## 21 1563 1470
## 22 1555 1300
## 23 1363 1214
## 24 1229 1357
## 25 1745 1363
## 26 1579 1507
## 27 1552 1222
## 28 1507 1522
## 29 1602 1314
## 30 1522 1144
## 31 1494 1260
## 32 1441 1379
## 33 1449 1277
## 34 1399 1375
## 35 1438 1150
## 36 1355 1388
## 37 980 1385
## 38 1423 1539
## 39 1436 1430
## 40 1348 1391
## 41 1403 1248
## 42 1332 1150
## 43 1283 1107
## 44 1199 1327
## 45 1242 1152
## 46 377 1358
## 47 1362 1392
## 48 1382 1356
## 49 1291 1286
## 50 1056 1296
## 51 1011 1356
## 52 935 1495
## 53 1393 1345
## 54 1270 1206
## 55 1186 1406
## 56 1153 1414
## 57 1092 1363
## 58 917 1391
## 59 853 1319
## 60 967 1330
## 61 955 1327
## 62 1530 1186
## 63 1175 1350
## 64 1163 1263
# Write to a CSV file
write.csv(allDataDFFinal, file = "DATA607_Project_1_output.csv")