607 Project 1

Read in source file from my computer

library(stringr)

# Read the tournament cross table into a character vector, one element per
# line of the text file.
# NOTE(review): absolute path on the author's machine; adjust it before
# running this anywhere else.
raw.data <- readLines("D:/CUNY_SPS_DA/607_Data_Aq/Projet1/source_file/tournamentinfo.txt")

## Warning in readLines("D:/CUNY_SPS_DA/607_Data_Aq/Projet1/source_file/
## tournamentinfo.txt"): incomplete final line found on 'D:/CUNY_SPS_DA/
## 607_Data_Aq/Projet1/source_file/tournamentinfo.txt'

# Show the first 10 lines: four header lines, then three lines per player
# (two data rows followed by a dashed separator).
head(raw.data,10)

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

# Total number of lines read. This variable shares its name with base R's
# nrow() function; it is used below as the upper bound of the line loop.
nrow <-length(raw.data)  # Number of lines in the file

Check data from the source file

# Line 5 is the first row of the first player's record: pair number, name,
# total points and one result field per round.
raw.data[5]

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"

# Line 6 is the second row of the same record: location, USCF ID,
# pre- and post-tournament rating, and one colour field per round.
raw.data[6]

## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Extract the fields from each row and check the extracted data

# Split a first-row line into its fields. The pattern matches runs of at least
# two characters drawn from letters, digits, ".", "*", "," and space; any other
# character (here the "|" column separator) ends a run.
chars1 <- unlist(str_extract_all(raw.data[5], "[[:alnum:][:alnum:].*, ]{2,}"))
chars1

##  [1] "    1 "                           
##  [2] " GARY HUA                        "
##  [3] "6.0  "                            
##  [4] "W  39"                            
##  [5] "W  21"                            
##  [6] "W  18"                            
##  [7] "W  14"                            
##  [8] "W   7"                            
##  [9] "D  12"                            
## [10] "D   4"

chars1[2] #get name

## [1] " GARY HUA                        "

# A first row yields 10 fields: pair number, name, total, seven rounds.
length(chars1)

## [1] 10

# Same pattern on a second-row line. Here "/", ":", "-" and ">" also end a
# run, so the rating text is split into several pieces and the pre-rating
# ends up as element 4 (see the output below).
chars2 <- unlist(str_extract_all(raw.data[6], "[[:alnum:][:alnum:].*, ]{2,}"))
chars2

##  [1] "   ON "     " 15445895 " " R"         " 1794   "   "1817     " 
##  [6] "2  "        "W    "      "B    "      "W    "      "B    "     
## [11] "W    "      "B    "      "W    "

chars2[1] #get location name

## [1] "   ON "

# A second row yields 13 pieces for this player.
length(chars2)

## [1] 13

Create two tables to store data from the source file

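A list “table1” with one element per player, holding the fields of the player’s first row (pair number, name, total points and round results), stored as a factor

A list “df” with one element per player, holding the fields of the player’s second row (location, USCF ID and ratings), stored as a factor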

# Split the player records into two lists with one element per player:
#   table1 - fields of the player's first row (pair number, name, total
#            points, seven round results)
#   df     - fields of the player's second row (location, USCF ID, ratings,
#            colours)
# Records start at line 5 and repeat every three lines (first row, second
# row, dashed separator). First rows are therefore the lines with
# (i - 2) %% 3 == 0, second rows the lines with i %% 3 == 0, and the
# separator lines match neither test and are skipped.

# Line numbers 5..nrow; empty when the file has fewer than 5 lines
# (5:nrow would count backwards in that case).
line_no <- seq_len(max(nrow - 4, 0)) + 4

name_rows <- line_no[(line_no - 2) %% 3 == 0]
info_rows <- line_no[line_no %% 3 == 0]

# The first pattern additionally accepts "-", "?" and "/" so that hyphenated
# names such as "STANESCU-BELLU" stay in one piece.
# data.frame(...)[[1]] keeps each element stored as the single column of a
# one-column data frame (a factor when stringsAsFactors is in effect).
table1 <- lapply(raw.data[name_rows], function(line) {
  data.frame(unlist(str_extract_all(line, "[[:alnum:][:alnum:].*\\-?*//[:alnum:], ]{2,}")))[[1]]
})

df <- lapply(raw.data[info_rows], function(line) {
  data.frame(unlist(str_extract_all(line, "[[:alnum:][:alnum:].*, ]{2,}")))[[1]]
})

j <- length(table1) # number of players (first rows found)
k <- length(df)     # number of second rows found

# Sanity checks on the two lists, using the last player's record.
print(j) # number of player

## [1] 64

# Single-bracket indexing returns a one-element list holding the factor.
print(table1[j]) # last player info

## [[1]]
##  [1]    64                              BEN LI                          
##  [3] 1.0                               L  22                            
##  [5] D  30                             L  31                            
##  [7] D  49                             L  46                            
##  [9] L  42                             L  54                            
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54

# Double-bracket indexing returns the factor itself; element 1 is the pair
# number and element 2 the name (both still padded with spaces).
print(table1[[j]][1]) # last player's ID

## [1]    64 
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54

print(table1[[j]][2]) # last player's name

## [1]  BEN LI                          
## 10 Levels:    64   BEN LI                           1.0   D  30 ... L  54

# In the second-row list, element 1 is the location and element 4 the
# pre-tournament rating.
#print(df[[k]]) # last player info
print(df[[k]][1]) # last player's location

## [1]    MI 
## Levels:          MI   1163     15006561   R 1112      B     W

print(df[[k]][4]) # last player's pre_Rtg

## [1]  1163   
## Levels:          MI   1163     15006561   R 1112      B     W

Construct a data frame to store the data we need from the two lists

Create vectors for the variables

Clean the data

# Build one cleaned vector per output column from the two per-player lists
# (table1 = first rows, df = second rows).
#
# Every vector is character: numeric fields are converted to numbers and
# stored as their text form (so "6.0" becomes "6"), and fields without a
# number become NA. The conversion to real numeric columns happens after the
# data frame is built.
#
# The pre-rating is the FIRST run of digits in its field. Taking all digit
# runs returned two values for some players and raised "number of items to
# replace is not a multiple of replacement length" warnings.
#
# NOTE(review): names keep the column padding after the last letter;
# trimws() would remove it if unpadded names are wanted.

players <- seq_len(j) # stays empty when j is 0 (1:j would give c(1, 0))

# Field number `pos` of every player's entry in `rows`, as text.
field <- function(rows, pos) {
  vapply(rows[players], function(parts) as.character(parts[pos]),
         character(1), USE.NAMES = FALSE)
}

# Remove all spaces, convert to numeric, and return the number as text.
as_number_text <- function(x) {
  as.character(as.numeric(str_replace_all(x, pattern = " ", replacement = "")))
}

# Opponent number of a round field such as "W  39": keep the last two
# characters and convert them like the other numeric fields. Fields with no
# opponent number give NA.
opponent <- function(pos) {
  as_number_text(str_sub(field(table1, pos), start = -2))
}

Id <- as_number_text(field(table1, 1))
name <- str_extract(field(table1, 2), "\\b[[:alpha:].*\\-?*\\>, ]{2,}")
location <- str_replace_all(field(df, 1), pattern = " ", replacement = "")
pre_Rtg <- as.character(as.numeric(str_extract(field(df, 4), "[:digit:]+")))
total <- as_number_text(field(table1, 3))

round1 <- opponent(4)
round2 <- opponent(5)
round3 <- opponent(6)
round4 <- opponent(7)
round5 <- opponent(8)
round6 <- opponent(9)
round7 <- opponent(10)

# Check data: print the first element of every cleaned vector. All of them
# are character strings at this point, hence the quotes in the output.
head(name,1)

## [1] "GARY HUA                        "

head(location,1)

## [1] "ON"

head(pre_Rtg,1)

## [1] "1794"

head(total,1)

## [1] "6"

head(round1,1)

## [1] "39"

head(round2,1)

## [1] "21"

head(round3,1)

## [1] "18"

head(round4,1)

## [1] "14"

head(round5,1)

## [1] "7"

head(round6,1)

## [1] "12"

head(round7,1)

## [1] "4"

# Check special values: the first few ids, a hyphenated name, and one
# particular rating (presumably one whose raw field holds more than a plain
# number - TODO confirm against the source file).
head(Id,5)

## [1] "1" "2" "3" "4" "5"

name[28]

## [1] "SOFIA ADINA STANESCU-BELLU      "

pre_Rtg[8]

## [1] "1641"

combining vectors into a data frame

# Assemble the twelve cleaned vectors into one data frame with one row per
# player; each column is named after the vector it comes from.
mydata <- data.frame(
  Id = Id,
  name = name,
  location = location,
  pre_Rtg = pre_Rtg,
  total = total,
  round1 = round1,
  round2 = round2,
  round3 = round3,
  round4 = round4,
  round5 = round5,
  round6 = round6,
  round7 = round7
)
head(mydata, n = 5)

##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794     6     39     21
## 2  2 DAKSHESH DARURI                        MI    1553     6     63     58
## 3  3 ADITYA BAJAJ                           MI    1384     6      8     61
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5     23     28
## 5  5 HANSHI ZUO                             MI    1655   5.5     45     37
##   round3 round4 round5 round6 round7
## 1     18     14      7     12      4
## 2      4     17     16     20      7
## 3     25     21     11     13     12
## 4      2     26      5     19      1
## 5     12     13      4     14     17

# One row per player, twelve columns.
dim(mydata)

## [1] 64 12

# Inspect the column types. Every column is a factor here (the character
# vectors were converted when the data frame was built), so they still need
# converting to numeric / character.
str(mydata)

## 'data.frame':    64 obs. of  12 variables:
##  $ Id      : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
##  $ name    : Factor w/ 64 levels "ADITYA BAJAJ                    ",..: 24 12 1 51 28 27 23 21 59 5 ...
##  $ location: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ pre_Rtg : Factor w/ 64 levels "1011","1056",..: 57 37 22 55 50 53 49 48 26 20 ...
##  $ total   : Factor w/ 11 levels "1","1.5","2",..: 11 11 11 10 10 9 9 9 9 9 ...
##  $ round1  : Factor w/ 60 levels "1","10","11",..: 32 56 59 16 38 28 49 23 18 8 ...
##  $ round2  : Factor w/ 62 levels "1","10","11",..: 14 54 57 21 31 22 41 26 10 11 ...
##  $ round3  : Factor w/ 60 levels "1","10","11",..: 9 33 17 11 4 3 5 6 52 48 ...
##  $ round4  : Factor w/ 62 levels "1","10","11",..: 6 9 14 19 5 29 3 62 61 25 ...
##  $ round5  : Factor w/ 52 levels "1","10","11",..: 50 7 3 38 29 2 1 36 17 45 ...
##  $ round6  : Factor w/ 58 levels "1","10","11",..: 4 13 5 11 6 20 58 21 56 18 ...
##  $ round7  : Factor w/ 54 levels "1","10","11",..: 31 52 4 1 8 13 11 10 12 9 ...

Clean up the data types in mydata

  # Give every column its final type: name and location become character,
  # all other columns numeric. Converting through as.character() first makes
  # a factor column yield its labels rather than its internal integer codes.
  for (text_col in c("name", "location")) {
    mydata[[text_col]] <- as.character(mydata[[text_col]])
  }
  for (num_col in c("Id", "pre_Rtg", "total", paste0("round", 1:7))) {
    mydata[[num_col]] <- as.numeric(as.character(mydata[[num_col]]))
  }

  str(mydata)

## 'data.frame':    64 obs. of  12 variables:
##  $ Id      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ name    : chr  "GARY HUA                        " "DAKSHESH DARURI                 " "ADITYA BAJAJ                    " "PATRICK H SCHILLING             " ...
##  $ location: chr  "ON" "MI" "MI" "MI" ...
##  $ pre_Rtg : num  1794 1553 1384 1716 1655 ...
##  $ total   : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ round1  : num  39 63 8 23 45 34 57 3 25 16 ...
##  $ round2  : num  21 58 61 28 37 29 46 32 18 19 ...
##  $ round3  : num  18 4 25 2 12 11 13 14 59 55 ...
##  $ round4  : num  14 17 21 26 13 35 11 9 8 31 ...
##  $ round5  : num  7 16 11 5 4 10 1 47 26 6 ...
##  $ round6  : num  12 20 13 19 14 27 9 28 7 25 ...
##  $ round7  : num  4 7 12 1 17 21 2 19 20 18 ...

  # Per-column summary after the type conversion. The "NA's" entries under
  # the round columns count the players with no opponent number in that
  # round.
  summary(mydata)

##        Id            name             location            pre_Rtg    
##  Min.   : 1.00   Length:64          Length:64          Min.   : 377  
##  1st Qu.:16.75   Class :character   Class :character   1st Qu.:1227  
##  Median :32.50   Mode  :character   Mode  :character   Median :1407  
##  Mean   :32.50                                         Mean   :1378  
##  3rd Qu.:48.25                                         3rd Qu.:1583  
##  Max.   :64.00                                         Max.   :1794  
##                                                                      
##      total           round1          round2          round3     
##  Min.   :1.000   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:2.500   1st Qu.:15.75   1st Qu.:16.25   1st Qu.:16.50  
##  Median :3.500   Median :30.50   Median :31.50   Median :31.50  
##  Mean   :3.438   Mean   :31.50   Mean   :31.60   Mean   :31.68  
##  3rd Qu.:4.000   3rd Qu.:47.25   3rd Qu.:46.75   3rd Qu.:46.25  
##  Max.   :6.000   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##                  NA's   :4       NA's   :2       NA's   :4      
##      round4          round5          round6          round7     
##  Min.   : 1.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:16.25   1st Qu.:14.75   1st Qu.:15.25   1st Qu.:14.25  
##  Median :31.50   Median :28.50   Median :29.50   Median :30.50  
##  Mean   :31.68   Mean   :30.19   Mean   :30.12   Mean   :30.06  
##  3rd Qu.:46.75   3rd Qu.:45.25   3rd Qu.:44.75   3rd Qu.:44.75  
##  Max.   :64.00   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##  NA's   :2       NA's   :12      NA's   :6       NA's   :10

Replace the opponent number in each round with that opponent's pre_Rtg value

# Replace the opponent number stored in round1..round7 with that opponent's
# pre-tournament rating. The opponent number is used directly as a row
# position in mydata (this relies on Id running 1, 2, 3, ... in row order).
# A round with no opponent (NA) stays NA.
# Whole columns are looked up at once, which also avoids the 1:j loop
# iterating over c(1, 0) when there are no players.
for (round_col in paste0("round", 1:7)) {
  mydata[[round_col]] <- mydata$pre_Rtg[mydata[[round_col]]]
}
head(mydata)

##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794   6.0   1436   1563
## 2  2 DAKSHESH DARURI                        MI    1553   6.0   1175    917
## 3  3 ADITYA BAJAJ                           MI    1384   6.0   1641    955
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5   1363   1507
## 5  5 HANSHI ZUO                             MI    1655   5.5   1242    980
## 6  6 HANSEN SONG                            OH    1686   5.0   1399   1602
##   round3 round4 round5 round6 round7
## 1   1600   1610   1649   1663   1716
## 2   1716   1629   1604   1595   1649
## 3   1745   1563   1712   1666   1663
## 4   1553   1579   1655   1564   1794
## 5   1663   1666   1716   1610   1629
## 6   1712   1438   1365   1552   1563

# Shape is unchanged: the ratings were written over the existing round
# columns, so no column was added.
dim(mydata)

## [1] 64 12

Calculate the average pre-rating of each player's opponents

insert new column avg in mydata

get sum of scores in each row and calculate the avg of each

# Average pre-tournament rating of each player's opponents, stored in a new
# numeric column `avg`.
#
# Rounds without an opponent are NA in round1..round7. They are left out of
# both the sum and the divisor, so the result is the average over the
# opponents actually faced; adding the seven columns directly would turn the
# whole average into NA as soon as one round is missing. A player with no
# opponents at all gets NA.
# The fractional part is dropped (as.integer truncates, it does not round),
# and the column is stored as double.
round_cols <- paste0("round", 1:7)
opponent_ratings <- mydata[round_cols]
games_played <- rowSums(!is.na(opponent_ratings))

mydata$avg <- as.numeric(as.integer(
  rowSums(opponent_ratings, na.rm = TRUE) / games_played
))

head(mydata$avg)

## [1] 1605 1469 1563 1573 1500 1518

head(mydata)

##   Id                             name location pre_Rtg total round1 round2
## 1  1 GARY HUA                               ON    1794   6.0   1436   1563
## 2  2 DAKSHESH DARURI                        MI    1553   6.0   1175    917
## 3  3 ADITYA BAJAJ                           MI    1384   6.0   1641    955
## 4  4 PATRICK H SCHILLING                    MI    1716   5.5   1363   1507
## 5  5 HANSHI ZUO                             MI    1655   5.5   1242    980
## 6  6 HANSEN SONG                            OH    1686   5.0   1399   1602
##   round3 round4 round5 round6 round7  avg
## 1   1600   1610   1649   1663   1716 1605
## 2   1716   1629   1604   1595   1649 1469
## 3   1745   1563   1712   1666   1663 1563
## 4   1553   1579   1655   1564   1794 1573
## 5   1663   1666   1716   1610   1629 1500
## 6   1712   1438   1365   1552   1563 1518

Required outcome

# Earlier variant that kept the five output fields as separate columns
# (left disabled):
#outcome_string <- data.frame(mydata$name,mydata$location, mydata$total,mydata$pre_Rtg,mydata$avg)
#summary(outcome)
#head(outcome)

# Build one comma-separated text line per player (name, location, total
# points, pre-rating, average opponent pre-rating) and wrap the lines in a
# one-column data frame. The column is not named explicitly, so it gets the
# generated name visible in the header of the output below.
# NOTE(review): names still carry their trailing padding, so each line has
# spaces before the first comma.
outcome_string<- data.frame(paste(mydata$name,mydata$location, mydata$total,mydata$pre_Rtg,mydata$avg, sep = ","))
head(outcome_string)

##   paste.mydata.name..mydata.location..mydata.total..mydata.pre_Rtg..
## 1                    GARY HUA                        ,ON,6,1794,1605
## 2                    DAKSHESH DARURI                 ,MI,6,1553,1469
## 3                    ADITYA BAJAJ                    ,MI,6,1384,1563
## 4                  PATRICK H SCHILLING             ,MI,5.5,1716,1573
## 5                  HANSHI ZUO                      ,MI,5.5,1655,1500
## 6                    HANSEN SONG                     ,OH,5,1686,1518

# Write the result to a text file.
# NOTE(review): only `sep` is set, so write.table() applies its defaults for
# quoting, row names and the header line - confirm that this is the intended
# file layout. The path is absolute and specific to the author's machine.
write.table(outcome_string, "D:/CUNY_SPS_DA/607_Data_Aq/Projet1/mydata.txt", sep="\t")

Note

607 Project 1

Chunmei Zhu

September 21, 2017

Read in source file from my computer

Check data from the source file

Extract the fields from each row and check the extracted data

Create two tables to store data from the source file

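A list “table1” with one element per player, holding the fields of the player’s first row (pair number, name, total points and round results), stored as a factor

A list “df” with one element per player, holding the fields of the player’s second row (location, USCF ID and ratings), stored as a factor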

Construct a data frame to store the data we need from the two lists

Create vectors for the variables

Clean the data

combining vectors into a data frame

Clean up the data types in mydata

Replace the opponent number in each round with that opponent's pre_Rtg value

Calculate the average pre-rating of each player's opponents

insert new column avg in mydata

get sum of scores in each row and calculate the avg of each

Required outcome