Read in the source file from my computer

library(stringr)

# Read the tournament cross-table as raw text, one vector element per line.
# warn = FALSE: the file does not end with a newline, which made readLines()
# report an "incomplete final line" warning even though every line is read.
raw.data <- readLines(
  "D:/CUNY_SPS_DA/607_Data_Aq/Projet1/source_file/tournamentinfo.txt",
  warn = FALSE
)
# Preview the four header lines and the first two player records.
head(raw.data, 10)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"
nrow <-length(raw.data)  # number of lines read from the file; note this variable shares its name with base::nrow()

Check data from the source file

# Line 5 is the first player's name/results line: pair number, player name,
# total points and one field per round.
raw.data[5]
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
# Line 6 is the same player's second line: location code, USCF ID and the
# pre -> post ratings, followed by one W/B flag per round.
raw.data[6]
## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Get characters from the rows and check the extracted data

# Tokenise the first player's name/results line. Each match is a run of two or
# more letters, digits, dots, asterisks, commas or spaces, so every "|" column
# separator ends a token.
chars1 <- str_extract_all(raw.data[5], "[[:alnum:][:alnum:].*, ]{2,}")[[1]]
print(chars1)
##  [1] "    1 "                           
##  [2] " GARY HUA                        "
##  [3] "6.0  "                            
##  [4] "W  39"                            
##  [5] "W  21"                            
##  [6] "W  18"                            
##  [7] "W  14"                            
##  [8] "W   7"                            
##  [9] "D  12"                            
## [10] "D   4"
chars1[2]  # second token is the (space-padded) player name
## [1] " GARY HUA                        "
length(chars1)
## [1] 10
# Same tokenisation on the location/rating line. Here "/", ":" and "->" also
# break tokens, so the line splits into more pieces than it has columns.
chars2 <- str_extract_all(raw.data[6], "[[:alnum:][:alnum:].*, ]{2,}")[[1]]
print(chars2)
##  [1] "   ON "     " 15445895 " " R"         " 1794   "   "1817     " 
##  [6] "2  "        "W    "      "B    "      "W    "      "B    "     
## [11] "W    "      "B    "      "W    "
chars2[1]  # first token is the (space-padded) location code
## [1] "   ON "
length(chars2)
## [1] 13

Create two tables to store data from the source file

a vector “table1” for the rows holding each player’s id, name and round records - each row is stored as a list element of factor type

a vector “df” for the rows holding each player’s location and ratings - each row is stored as a list element of factor type

# Split the file into the two kinds of player rows and tokenise them.
#
# From line 5 onward the file repeats a three-line record: a name/results row,
# a location/rating row and a dashed separator. Name rows are the lines where
# (line - 2) %% 3 == 0 (5, 8, 11, ...); rating rows are the lines where
# line %% 3 == 0 (6, 9, 12, ...). Separator lines match neither and are skipped.
#
# The rows are selected in one vectorised step instead of a counting loop, so
# nothing is grown element by element and there is no attempt to advance the
# loop index by hand (assigning to `i` inside a for loop has no effect in R).
player_lines <- seq_along(raw.data)
player_lines <- player_lines[player_lines >= 5]
name_lines <- player_lines[(player_lines - 2) %% 3 == 0]
rating_lines <- player_lines[player_lines %% 3 == 0]

# table1: one character vector of fields per name/results row
# (pair number, name, total points, seven round results).
table1 <- str_extract_all(
  raw.data[name_lines],
  "[[:alnum:][:alnum:].*\\-?*//[:alnum:], ]{2,}"
)

# df: one character vector of fields per location/rating row
# (location code, USCF ID, "R", pre-rating, post-rating, ...).
df <- str_extract_all(raw.data[rating_lines], "[[:alnum:][:alnum:].*, ]{2,}")

j <- length(table1)  # number of name/results rows, i.e. number of players
k <- length(df)      # number of location/rating rows

# Spot-check the parsed rows using the last record of each list.
print(j) # number of players found (one name/results row each)
## [1] 64
print(table1[j]) # all fields of the last player's name/results row
## [[1]]
##  [1]    64                              BEN LI                          
##  [3] 1.0                               L  22                            
##  [5] D  30                             L  31                            
##  [7] D  49                             L  46                            
##  [9] L  42                             L  54                            
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54
print(table1[[j]][1]) # last player's ID (pair number, still space-padded)
## [1]    64 
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54
print(table1[[j]][2]) # last player's name (still space-padded)
## [1]  BEN LI                          
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54
#print(df[[k]]) # all fields of the last player's location/rating row
print(df[[k]][1]) # last player's location (field 1 of the rating row)
## [1]    MI 
## Levels:          MI   1163     15006561   R 1112      B     W
print(df[[k]][4]) # last player's pre_Rtg (field 4 of the rating row)
## [1]  1163   
## Levels:          MI   1163     15006561   R 1112      B     W

Construct a data frame to store the data we need from the two vectors

Create vectors for the variables

Clean the data

# Build one vector per output variable from the parsed rows.
#
# table1[[i]] holds the fields of player i's name/results row (pair number,
# name, total, rounds 1-7) and df[[i]] the fields of the location/rating row
# (location is field 1, pre-rating is field 4). All twelve vectors are built
# in vectorised form rather than being grown inside a loop.
#
# As before, every vector ends up as a character vector: numeric-looking
# values are normalised through as.numeric() (e.g. "6.0" becomes "6") but are
# stored as text; the real numeric conversion happens on the data frame later.
player_idx <- seq_len(j)

# Field number `n` of every parsed row in `rows`, as a character vector.
# as.character() also covers rows that were stored as factors.
nth_field <- function(rows, n) {
  vapply(
    rows[player_idx],
    function(fields) as.character(fields[n]),
    character(1)
  )
}

# Remove all spaces and normalise the remaining text as a number. Blank text
# becomes NA. The result is returned as text.
as_number_text <- function(x) {
  as.character(as.numeric(str_replace_all(x, pattern = " ", replacement = "")))
}

Id <- as_number_text(nth_field(table1, 1))

# First run of name characters; the trailing column padding is preserved.
name <- str_extract(nth_field(table1, 2), "\\b[[:alpha:].*\\-?*\\>, ]{2,}")

location <- str_replace_all(
  nth_field(df, 1),
  pattern = " ", replacement = ""
)

# Some rating fields contain more than one digit run (presumably provisional
# ratings with a suffix after the rating - verify against the raw file).
# str_extract_all() returned every run, so assigning the result to a single
# element raised "number of items to replace is not a multiple of replacement
# length" and silently kept only the first run. str_extract() takes that first
# run directly: same value, no warning.
pre_Rtg <- as.character(as.numeric(
  str_extract(nth_field(df, 4), "[:digit:]+")
))

total <- as_number_text(nth_field(table1, 3))

# Fields 4-10 are rounds 1-7. The last two characters of a round field hold
# the opponent's pair number; rounds without an opponent number become NA.
opponent_of <- function(field_no) {
  as_number_text(str_sub(nth_field(table1, field_no), start = -2))
}
round1 <- opponent_of(4)
round2 <- opponent_of(5)
round3 <- opponent_of(6)
round4 <- opponent_of(7)
round5 <- opponent_of(8)
round6 <- opponent_of(9)
round7 <- opponent_of(10)
# Check the first value of each cleaned vector (all still stored as text)
head(name,1)
## [1] "GARY HUA                        "
head(location,1)
## [1] "ON"
head(pre_Rtg,1)
## [1] "1794"
head(total,1)
## [1] "6"
head(round1,1)
## [1] "39"
head(round2,1)
## [1] "21"
head(round3,1)
## [1] "18"
head(round4,1)
## [1] "14"
head(round5,1)
## [1] "7"
head(round6,1)
## [1] "12"
head(round7,1)
## [1] "4"
# Check special values: the first Ids, a hyphenated name, and one pre-rating
# (presumably a field that carries a provisional suffix - verify in the raw file)
head(Id,5)
## [1] "1" "2" "3" "4" "5"
name[28]
## [1] "SOFIA ADINA STANESCU-BELLU      "
pre_Rtg[8]
## [1] "1641"

combining vectors into a data frame

# Assemble the per-player vectors into one data frame, one row per player.
# Columns are named explicitly; the names match the vector names.
mydata <- data.frame(
  Id = Id,
  name = name,
  location = location,
  pre_Rtg = pre_Rtg,
  total = total,
  round1 = round1,
  round2 = round2,
  round3 = round3,
  round4 = round4,
  round5 = round5,
  round6 = round6,
  round7 = round7
)
head(mydata, n = 5)
##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794     6     39     21
## 2  2 DAKSHESH DARURI                        MI    1553     6     63     58
## 3  3 ADITYA BAJAJ                           MI    1384     6      8     61
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5     23     28
## 5  5 HANSHI ZUO                             MI    1655   5.5     45     37
##   round3 round4 round5 round6 round7
## 1     18     14      7     12      4
## 2      4     17     16     20      7
## 3     25     21     11     13     12
## 4      2     26      5     19      1
## 5     12     13      4     14     17
# Expect one row per player and twelve columns.
dim(mydata)
## [1] 64 12
# Inspect the column types before conversion (the recorded run shows every
# column stored as a factor at this point).
str(mydata)
## 'data.frame':    64 obs. of  12 variables:
##  $ Id      : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
##  $ name    : Factor w/ 64 levels "ADITYA BAJAJ                    ",..: 24 12 1 51 28 27 23 21 59 5 ...
##  $ location: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ pre_Rtg : Factor w/ 64 levels "1011","1056",..: 57 37 22 55 50 53 49 48 26 20 ...
##  $ total   : Factor w/ 11 levels "1","1.5","2",..: 11 11 11 10 10 9 9 9 9 9 ...
##  $ round1  : Factor w/ 60 levels "1","10","11",..: 32 56 59 16 38 28 49 23 18 8 ...
##  $ round2  : Factor w/ 62 levels "1","10","11",..: 14 54 57 21 31 22 41 26 10 11 ...
##  $ round3  : Factor w/ 60 levels "1","10","11",..: 9 33 17 11 4 3 5 6 52 48 ...
##  $ round4  : Factor w/ 62 levels "1","10","11",..: 6 9 14 19 5 29 3 62 61 25 ...
##  $ round5  : Factor w/ 52 levels "1","10","11",..: 50 7 3 38 29 2 1 36 17 45 ...
##  $ round6  : Factor w/ 58 levels "1","10","11",..: 4 13 5 11 6 20 58 21 56 18 ...
##  $ round7  : Factor w/ 54 levels "1","10","11",..: 31 52 4 1 8 13 11 10 12 9 ...

Clean up the data types in mydata

  # Convert every numeric-looking column to numeric. Going through
  # as.character() first yields the printed value rather than a factor's
  # internal level code.
  mydata[c("Id", "pre_Rtg", "total", paste0("round", 1:7))] <- lapply(
    mydata[c("Id", "pre_Rtg", "total", paste0("round", 1:7))],
    function(column) as.numeric(as.character(column))
  )
  # The two text columns become plain character vectors.
  mydata[c("name", "location")] <- lapply(
    mydata[c("name", "location")],
    as.character
  )
  
  str(mydata)
## 'data.frame':    64 obs. of  12 variables:
##  $ Id      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ name    : chr  "GARY HUA                        " "DAKSHESH DARURI                 " "ADITYA BAJAJ                    " "PATRICK H SCHILLING             " ...
##  $ location: chr  "ON" "MI" "MI" "MI" ...
##  $ pre_Rtg : num  1794 1553 1384 1716 1655 ...
##  $ total   : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ round1  : num  39 63 8 23 45 34 57 3 25 16 ...
##  $ round2  : num  21 58 61 28 37 29 46 32 18 19 ...
##  $ round3  : num  18 4 25 2 12 11 13 14 59 55 ...
##  $ round4  : num  14 17 21 26 13 35 11 9 8 31 ...
##  $ round5  : num  7 16 11 5 4 10 1 47 26 6 ...
##  $ round6  : num  12 20 13 19 14 27 9 28 7 25 ...
##  $ round7  : num  4 7 12 1 17 21 2 19 20 18 ...
  # Summarise every column. The NA counts reported for the round columns are
  # presumably rounds without an opponent number (bye / unplayed) - verify
  # against the raw file.
  summary(mydata)
##        Id            name             location            pre_Rtg    
##  Min.   : 1.00   Length:64          Length:64          Min.   : 377  
##  1st Qu.:16.75   Class :character   Class :character   1st Qu.:1227  
##  Median :32.50   Mode  :character   Mode  :character   Median :1407  
##  Mean   :32.50                                         Mean   :1378  
##  3rd Qu.:48.25                                         3rd Qu.:1583  
##  Max.   :64.00                                         Max.   :1794  
##                                                                      
##      total           round1          round2          round3     
##  Min.   :1.000   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:2.500   1st Qu.:15.75   1st Qu.:16.25   1st Qu.:16.50  
##  Median :3.500   Median :30.50   Median :31.50   Median :31.50  
##  Mean   :3.438   Mean   :31.50   Mean   :31.60   Mean   :31.68  
##  3rd Qu.:4.000   3rd Qu.:47.25   3rd Qu.:46.75   3rd Qu.:46.25  
##  Max.   :6.000   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##                  NA's   :4       NA's   :2       NA's   :4      
##      round4          round5          round6          round7     
##  Min.   : 1.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:16.25   1st Qu.:14.75   1st Qu.:15.25   1st Qu.:14.25  
##  Median :31.50   Median :28.50   Median :29.50   Median :30.50  
##  Mean   :31.68   Mean   :30.19   Mean   :30.12   Mean   :30.06  
##  3rd Qu.:46.75   3rd Qu.:45.25   3rd Qu.:44.75   3rd Qu.:44.75  
##  Max.   :64.00   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##  NA's   :2       NA's   :12      NA's   :6       NA's   :10

Replace the opponent number in each round column with that opponent's pre_Rtg value

# Replace each opponent pair number in round1..round7 with that opponent's
# pre-tournament rating.
#
# The opponent is found by matching the pair number against mydata$Id instead
# of using it as a row position, so the lookup stays correct even if the rows
# are not stored in pair-number order. Rounds without an opponent (NA) stay NA.
# All seven columns are handled in one vectorised step; pre_Rtg and Id are not
# modified, so the order in which the columns are processed does not matter.
round_cols <- paste0("round", 1:7)
mydata[round_cols] <- lapply(
  mydata[round_cols],
  function(opponent) mydata$pre_Rtg[match(opponent, mydata$Id)]
)
head(mydata)
##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794   6.0   1436   1563
## 2  2 DAKSHESH DARURI                        MI    1553   6.0   1175    917
## 3  3 ADITYA BAJAJ                           MI    1384   6.0   1641    955
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5   1363   1507
## 5  5 HANSHI ZUO                             MI    1655   5.5   1242    980
## 6  6 HANSEN SONG                            OH    1686   5.0   1399   1602
##   round3 round4 round5 round6 round7
## 1   1600   1610   1649   1663   1716
## 2   1716   1629   1604   1595   1649
## 3   1745   1563   1712   1666   1663
## 4   1553   1579   1655   1564   1794
## 5   1663   1666   1716   1610   1629
## 6   1712   1438   1365   1552   1563
# The replacement changes values only: still 64 rows and 12 columns.
dim(mydata)
## [1] 64 12

Calculate each player's average opponent pre-rating

Insert a new column avg in mydata

Sum the opponents' pre-ratings in each row and compute the average for each player

# Average pre-tournament rating of each player's opponents, stored in a new
# column `avg`.
#
# The mean is taken over the rounds that actually have an opponent rating
# (na.rm = TRUE). Adding the seven columns with `+` made the whole average NA
# for any player with an unplayed round, and dividing by a fixed 7 would
# understate the average for such players. Players who played all seven rounds
# get exactly the same value as before.
#
# Missing rounds are NA, not empty strings, so no blank-to-zero replacement is
# needed (and zeros would wrongly drag the average down).
# as.integer() truncates the fractional part, as the original calculation did.
round_cols <- paste0("round", 1:7)
mydata$avg <- as.integer(rowMeans(mydata[round_cols], na.rm = TRUE))

head(mydata$avg)
## [1] 1605 1469 1563 1573 1500 1518
head(mydata)
##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794   6.0   1436   1563
## 2  2 DAKSHESH DARURI                        MI    1553   6.0   1175    917
## 3  3 ADITYA BAJAJ                           MI    1384   6.0   1641    955
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5   1363   1507
## 5  5 HANSHI ZUO                             MI    1655   5.5   1242    980
## 6  6 HANSEN SONG                            OH    1686   5.0   1399   1602
##   round3 round4 round5 round6 round7  avg
## 1   1600   1610   1649   1663   1716 1605
## 2   1716   1629   1604   1595   1649 1469
## 3   1745   1563   1712   1666   1663 1563
## 4   1553   1579   1655   1564   1794 1573
## 5   1663   1666   1716   1610   1629 1500
## 6   1712   1438   1365   1552   1563 1518

Required outcome

# Final result: one record per player with name, location, total points,
# pre-rating and average opponent pre-rating. The name's column padding is
# stripped so the written file holds clean values.
outcome <- data.frame(
  name = trimws(mydata$name),
  location = mydata$location,
  total = mydata$total,
  pre_Rtg = mydata$pre_Rtg,
  avg = mydata$avg
)

# One-column preview of the comma-joined records (names are shown here with
# their original padding).
outcome_string<- data.frame(paste(mydata$name,mydata$location, mydata$total,mydata$pre_Rtg,mydata$avg, sep = ","))
head(outcome_string)
##   paste.mydata.name..mydata.location..mydata.total..mydata.pre_Rtg..
## 1                    GARY HUA                        ,ON,6,1794,1605
## 2                    DAKSHESH DARURI                 ,MI,6,1553,1469
## 3                    ADITYA BAJAJ                    ,MI,6,1384,1563
## 4                  PATRICK H SCHILLING             ,MI,5.5,1716,1573
## 5                  HANSHI ZUO                      ,MI,5.5,1655,1500
## 6                    HANSEN SONG                     ,OH,5,1686,1518
# Write a real comma-separated file: one column per variable, a header row and
# no row names. Writing the one-column `outcome_string` with write.table()
# produced a row-name column, a mangled header and each whole record as a
# single quoted field. The target path is unchanged.
write.csv(outcome, "D:/CUNY_SPS_DA/607_Data_Aq/Projet1/mydata.txt", row.names = FALSE)

Note