Project 1 - Create a tidy data set from a poorly structured text file full of data.

So right away the read.table function comes to mind, but I would like to see if I can do the initial parsing manually.

library(stringr)
# Read every line of the tournament report. Passing the path straight to
# readLines() lets R open and release the file itself, so no connection
# object is left behind to be closed later.
tournamentFile = readLines('C:/Users/Exped/Desktop/tournamentinfo.txt')
# Every third line, starting with the first, is a separator line: drop them by position
tournyVector = tournamentFile[-seq(1, length(tournamentFile), 3)]
# The first two lines that remain are the column headers; drop those as well
tournyVector = tournyVector[-(1:2)]
# Here is the resulting object
summary(tournyVector)
##    Length     Class      Mode 
##       128 character character
head(tournyVector)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [4] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [5] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [6] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Now I have a long vector of paired lines (a name line followed by a details line for each player), so I separate them.

# Lines alternate: odd positions go to nameRow, even positions to detailsRow
nameRow = tournyVector[seq(from = 1, to = length(tournyVector), by = 2)]
detailsRow = tournyVector[seq(from = 2, to = length(tournyVector), by = 2)]

Here I define some extraction functions.

#Made some individual functions to extract what I need from the rows containing names
# Extract the player name from each name row.
#
# nameRow: character vector of name lines; the name field is read from
#          columns 8-40.
# Returns a character vector of names with the padding removed on both sides.
extractName_nameRow = function(nameRow){
  nameField = substr(nameRow, start = 8, stop = 40)
  # Column 8 is the blank that follows the '|' delimiter, so trimming only the
  # right side would leave every name with a leading space.
  trimws(nameField, which = 'both')
}

# Extract the total-points field from each name row.
# Takes columns 42-44 of every line, strips trailing blanks and converts the
# text to a number. Returns a list with one numeric value per input line.
extractPoints_nameRow = function(nameRow){
  pointsText = substr(nameRow, start = 42, stop = 44)
  pointsText = trimws(pointsText, which = 'right')
  lapply(pointsText, as.numeric)
}
# Extract the opponent numbers of every round from each name row.
#
# nameRow: character vector of name lines; the round results are read from
#          column 46 onwards.
# Returns a list with one numeric vector per input line, holding one entry per
# round. An entry of 0 stands for a round field that carried no opponent number.
extractRoundsOpp_nameRow = function(nameRow){
  rounds = trimws(substr(nameRow, start = 46, stop = 90), which = 'right')
  # Four consecutive blanks (a round field without an opponent number) become
  # '00' so that the round still parses, as 0.
  rounds = gsub('    ', '00', rounds, fixed = TRUE)
  # '+' takes each run of digits whole, so opponent numbers of 100 or more are
  # not split into separate pieces.
  opponents = regmatches(rounds, gregexpr('[0-9]+', rounds))
  lapply(opponents, as.numeric)
}
#Same thing, only for the row with extra details
# Extract the two-character state field (columns 4-5) from each details row.
# Returns a character vector with trailing blanks removed.
extractState_detailsRow = function(detailsRow){
  stateField = substr(detailsRow, start = 4, stop = 5)
  trimws(stateField, which = 'right')
}

# Extract the pre-tournament rating (columns 23-26) from each details row.
# Leading blanks are stripped before conversion, so ratings shorter than four
# digits parse as well. Returns a list with one numeric value per input line.
extractPreELO_detailsRow = function(detailsRow){
  ratingText = substr(detailsRow, start = 23, stop = 26)
  ratingText = trimws(ratingText, which = 'left')
  lapply(ratingText, as.numeric)
}

Now I apply the extraction functions.

# Run each extractor over its rows; unlist() flattens the results to plain vectors
names = extractName_nameRow(nameRow)
points = unlist(extractPoints_nameRow(nameRow))
states = unlist(extractState_detailsRow(detailsRow))
preElo = unlist(extractPreELO_detailsRow(detailsRow))
# The opponent numbers stay a list: one numeric vector per player
oppVector = extractRoundsOpp_nameRow(nameRow)
# Assemble a data frame from what we have so far
sufficientData = data.frame(names = names, points = points, states = states, preElo = preElo)

This step averages the pre-tournament Elo ratings of each player's opponents and appends the result to the data frame.

# Average the pre-tournament rating of each player's opponents.
# Each element of oppVector holds one player's opponent numbers, and a 0 entry
# is skipped. The opponent number is used directly as a row index into preElo,
# which relies on the rows being in the same order as the numbering.
# vapply() builds the whole result in one pass instead of growing a vector
# inside a loop, and checks that every player yields exactly one number.
opponentPreElo = sufficientData$preElo
averageOpponent = vapply(oppVector, function(opponents){
  played = opponents[opponents != 0]
  # With no opponents at all this is 0/0, i.e. NaN
  sum(opponentPreElo[played]) / length(played)
}, numeric(1))
sufficientData$averageOpponentElo = averageOpponent
sufficientData = setNames(sufficientData,c('Name','Pts','State','PrElo','OpPreAvgELO'))
summary(sufficientData$OpPreAvgELO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1107    1310    1382    1379    1481    1605
write.csv(sufficientData,file = 'C:/Users/Exped/Desktop/chessAverages')