library(stringr)
# Read the tournament cross-table: one character string per line of the file.
raw.data <- readLines("D:/CUNY_SPS_DA/607_Data_Aq/Projet1/source_file/tournamentinfo.txt")
## Warning in readLines("D:/CUNY_SPS_DA/607_Data_Aq/Projet1/source_file/
## tournamentinfo.txt"): incomplete final line found on 'D:/CUNY_SPS_DA/
## 607_Data_Aq/Projet1/source_file/tournamentinfo.txt'

# Peek at the layout: a four-line header, then per player a results line,
# a state/rating line and a dashed separator.
head(raw.data, n = 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"

# Total number of lines read; later code iterates from line 5 up to this value.
nrow <- length(raw.data)

# First player's two data lines.
raw.data[5]
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
raw.data[6]
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"

# Trial split of the results line: every run of two or more characters drawn
# from letters, digits, ".", "*", "," and space counts as one field.
chars1 <- str_extract_all(raw.data[5], "[[:alnum:][:alnum:].*, ]{2,}")[[1]]
chars1
## [1] " 1 "
## [2] " GARY HUA "
## [3] "6.0 "
## [4] "W 39"
## [5] "W 21"
## [6] "W 18"
## [7] "W 14"
## [8] "W 7"
## [9] "D 12"
## [10] "D 4"
chars1[2] # second field holds the player name
## [1] " GARY HUA "
length(chars1)
## [1] 10

# Same trial split for the state/rating line.
chars2 <- str_extract_all(raw.data[6], "[[:alnum:][:alnum:].*, ]{2,}")[[1]]
chars2
## [1] " ON " " 15445895 " " R" " 1794 " "1817 "
## [6] "2 " "W " "B " "W " "B "
## [11] "W " "B " "W "
chars2[1] # first field holds the location (state/province) code
## [1] " ON "
length(chars2)
## [1] 13
# Parse every player record starting at line 5 of the file.
# Lines with (line - 2) %% 3 == 0 (5, 8, 11, ...) carry pair number, name,
# total and the seven round results; lines with line %% 3 == 0 (6, 9, 12, ...)
# carry state and rating details. All remaining lines are skipped.
line_no <- 5:nrow
name_lines <- line_no[(line_no - 2) %% 3 == 0]
score_lines <- line_no[line_no %% 3 == 0]

# Split one raw line into fields using `pattern`. Going through data.frame()
# and taking its single column keeps the element type (factor or character,
# depending on the R version's stringsAsFactors default) that the rest of the
# script was written against.
split_fields <- function(line, pattern) {
  data.frame(unlist(str_extract_all(line, pattern)))[[1]]
}

# table1: one field vector per player results line.
table1 <- lapply(
  raw.data[name_lines],
  split_fields,
  pattern = "[[:alnum:][:alnum:].*\\-?*//[:alnum:], ]{2,}"
)

# df: one field vector per player state/rating line.
df <- lapply(
  raw.data[score_lines],
  split_fields,
  pattern = "[[:alnum:][:alnum:].*, ]{2,}"
)

# Record counts, used as loop bounds further down.
j <- length(table1)
k <- length(df)
# Sanity check on the parse: show the record count and the last record.
last_results <- table1[[j]]
last_rating_line <- df[[k]]

print(j) # number of players parsed
## [1] 64
print(table1[j]) # all fields of the last player's results line
## [[1]]
## [1] 64 BEN LI
## [3] 1.0 L 22
## [5] D 30 L 31
## [7] D 49 L 46
## [9] L 42 L 54
## 10 Levels: 64 BEN LI 1.0 D 30 ... L 54
print(last_results[1]) # last player's pair number
## [1] 64
## 10 Levels: 64 BEN LI 1.0 D 30 ... L 54
print(last_results[2]) # last player's name
## [1] BEN LI
## 10 Levels: 64 BEN LI 1.0 D 30 ... L 54
#print(df[[k]]) # all fields of the last player's state/rating line
print(last_rating_line[1]) # last player's location
## [1] MI
## Levels: MI 1163 15006561 R 1112 B W
print(last_rating_line[4]) # last player's pre-tournament rating field
## [1] 1163
## Levels: MI 1163 15006561 R 1112 B W
# Pull the per-player fields out of the parsed records (table1 = results
# lines, df = state/rating lines) into one character vector per variable.
#
# Changes from the element-by-element loop this replaces:
#   * The pre-rating field can look like "1641P17" (provisional rating).
#     Extracting *all* digit runs returned two values for such fields and
#     raised "number of items to replace is not a multiple of replacement
#     length" on assignment. Only the first digit run is taken now, which is
#     the value the old code ended up storing, without the warnings.
#   * Opponent numbers are read as the digit run in the round field instead of
#     its last two characters, so pair numbers above 99 are not truncated.
#   * Every vector is built in one vectorised step instead of being grown
#     inside a loop.
#
# All results stay character vectors, with numbers normalised through
# as.numeric() (e.g. "6.0 " becomes "6"); conversion to numeric columns
# happens after the data frame is assembled.

# k-th field of every record, as character (works for factor or character
# field vectors; a missing field yields NA).
nth_field <- function(records, k) {
  vapply(records, function(rec) as.character(rec[k]), character(1))
}

# Normalise numeric text: "6.0" -> "6", "" or NA -> NA.
as_number_string <- function(x) {
  as.character(as.numeric(x))
}

# Drop every space from a field.
strip_spaces <- function(x) {
  str_replace_all(x, pattern = " ", replacement = "")
}

player_rows <- table1[seq_len(j)]
rating_rows <- df[seq_len(j)]

# Opponent pair number for one round field ("W 39" -> "39"); rounds without
# an opponent contain no digits and become NA.
opponent_number <- function(field_index) {
  as_number_string(str_extract(nth_field(player_rows, field_index), "[0-9]+"))
}

Id <- as_number_string(strip_spaces(nth_field(player_rows, 1)))
# Name keeps the trailing padding present in the source line.
name <- str_extract(nth_field(player_rows, 2), "\\b[[:alpha:].*\\-?*\\>, ]{2,}")
location <- strip_spaces(nth_field(rating_rows, 1))
# First digit run only: "1641P17" -> "1641".
pre_Rtg <- as_number_string(str_extract(nth_field(rating_rows, 4), "[0-9]+"))
total <- as_number_string(strip_spaces(nth_field(player_rows, 3)))

# Round results occupy fields 4..10 of each results line.
round1 <- opponent_number(4)
round2 <- opponent_number(5)
round3 <- opponent_number(6)
round4 <- opponent_number(7)
round5 <- opponent_number(8)
round6 <- opponent_number(9)
round7 <- opponent_number(10)
# Spot-check the first element of each extracted vector.
head(name, n = 1)
## [1] "GARY HUA "
head(location, n = 1)
## [1] "ON"
head(pre_Rtg, n = 1)
## [1] "1794"
head(total, n = 1)
## [1] "6"
head(round1, n = 1)
## [1] "39"
head(round2, n = 1)
## [1] "21"
head(round3, n = 1)
## [1] "18"
head(round4, n = 1)
## [1] "14"
head(round5, n = 1)
## [1] "7"
head(round6, n = 1)
## [1] "12"
head(round7, n = 1)
## [1] "4"

# Spot-check records that exercise special cases: the first pair numbers,
# a hyphenated name (player 28) and the pre-rating of player 8.
head(Id, n = 5)
## [1] "1" "2" "3" "4" "5"
name[28]
## [1] "SOFIA ADINA STANESCU-BELLU "
pre_Rtg[8]
## [1] "1641"
# Assemble the extracted vectors into one data frame, one row per player.
# Column names are spelled out; they match the vector names.
mydata <- data.frame(
  Id = Id,
  name = name,
  location = location,
  pre_Rtg = pre_Rtg,
  total = total,
  round1 = round1,
  round2 = round2,
  round3 = round3,
  round4 = round4,
  round5 = round5,
  round6 = round6,
  round7 = round7
)

# First rows, dimensions and column types of the freshly built frame.
head(mydata, n = 5)
## Id name location pre_Rtg total round1 round2
## 1 1 GARY HUA ON 1794 6 39 21
## 2 2 DAKSHESH DARURI MI 1553 6 63 58
## 3 3 ADITYA BAJAJ MI 1384 6 8 61
## 4 4 PATRICK H SCHILLING MI 1716 5.5 23 28
## 5 5 HANSHI ZUO MI 1655 5.5 45 37
## round3 round4 round5 round6 round7
## 1 18 14 7 12 4
## 2 4 17 16 20 7
## 3 25 21 11 13 12
## 4 2 26 5 19 1
## 5 12 13 4 14 17
dim(mydata)
## [1] 64 12
str(mydata)
## 'data.frame': 64 obs. of 12 variables:
## $ Id : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
## $ name : Factor w/ 64 levels "ADITYA BAJAJ ",..: 24 12 1 51 28 27 23 21 59 5 ...
## $ location: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ pre_Rtg : Factor w/ 64 levels "1011","1056",..: 57 37 22 55 50 53 49 48 26 20 ...
## $ total : Factor w/ 11 levels "1","1.5","2",..: 11 11 11 10 10 9 9 9 9 9 ...
## $ round1 : Factor w/ 60 levels "1","10","11",..: 32 56 59 16 38 28 49 23 18 8 ...
## $ round2 : Factor w/ 62 levels "1","10","11",..: 14 54 57 21 31 22 41 26 10 11 ...
## $ round3 : Factor w/ 60 levels "1","10","11",..: 9 33 17 11 4 3 5 6 52 48 ...
## $ round4 : Factor w/ 62 levels "1","10","11",..: 6 9 14 19 5 29 3 62 61 25 ...
## $ round5 : Factor w/ 52 levels "1","10","11",..: 50 7 3 38 29 2 1 36 17 45 ...
## $ round6 : Factor w/ 58 levels "1","10","11",..: 4 13 5 11 6 20 58 21 56 18 ...
## $ round7 : Factor w/ 54 levels "1","10","11",..: 31 52 4 1 8 13 11 10 12 9 ...
# Give every column its working type: name and location become character,
# everything else numeric. Going through as.character() first converts factor
# columns by their labels rather than by their internal level codes.
text_cols <- c("name", "location")
number_cols <- c("Id", "pre_Rtg", "total", paste0("round", 1:7))

mydata[text_cols] <- lapply(mydata[text_cols], as.character)
mydata[number_cols] <- lapply(
  mydata[number_cols],
  function(column) as.numeric(as.character(column))
)

# Confirm the new column types and look at the value ranges; the NA counts
# are rounds in which a player had no opponent.
str(mydata)
## 'data.frame': 64 obs. of 12 variables:
## $ Id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ name : chr "GARY HUA " "DAKSHESH DARURI " "ADITYA BAJAJ " "PATRICK H SCHILLING " ...
## $ location: chr "ON" "MI" "MI" "MI" ...
## $ pre_Rtg : num 1794 1553 1384 1716 1655 ...
## $ total : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ round1 : num 39 63 8 23 45 34 57 3 25 16 ...
## $ round2 : num 21 58 61 28 37 29 46 32 18 19 ...
## $ round3 : num 18 4 25 2 12 11 13 14 59 55 ...
## $ round4 : num 14 17 21 26 13 35 11 9 8 31 ...
## $ round5 : num 7 16 11 5 4 10 1 47 26 6 ...
## $ round6 : num 12 20 13 19 14 27 9 28 7 25 ...
## $ round7 : num 4 7 12 1 17 21 2 19 20 18 ...
summary(mydata)
## Id name location pre_Rtg
## Min. : 1.00 Length:64 Length:64 Min. : 377
## 1st Qu.:16.75 Class :character Class :character 1st Qu.:1227
## Median :32.50 Mode :character Mode :character Median :1407
## Mean :32.50 Mean :1378
## 3rd Qu.:48.25 3rd Qu.:1583
## Max. :64.00 Max. :1794
##
## total round1 round2 round3
## Min. :1.000 Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.:2.500 1st Qu.:15.75 1st Qu.:16.25 1st Qu.:16.50
## Median :3.500 Median :30.50 Median :31.50 Median :31.50
## Mean :3.438 Mean :31.50 Mean :31.60 Mean :31.68
## 3rd Qu.:4.000 3rd Qu.:47.25 3rd Qu.:46.75 3rd Qu.:46.25
## Max. :6.000 Max. :64.00 Max. :64.00 Max. :64.00
## NA's :4 NA's :2 NA's :4
## round4 round5 round6 round7
## Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.:16.25 1st Qu.:14.75 1st Qu.:15.25 1st Qu.:14.25
## Median :31.50 Median :28.50 Median :29.50 Median :30.50
## Mean :31.68 Mean :30.19 Mean :30.12 Mean :30.06
## 3rd Qu.:46.75 3rd Qu.:45.25 3rd Qu.:44.75 3rd Qu.:44.75
## Max. :64.00 Max. :64.00 Max. :64.00 Max. :64.00
## NA's :2 NA's :12 NA's :6 NA's :10
# Replace each opponent pair number in round1..round7 with that opponent's
# pre-tournament rating. The pair number is used as a row index into pre_Rtg,
# so an NA opponent stays NA. pre_Rtg itself is never modified, which makes a
# whole-column lookup equivalent to doing it one player at a time.
player_rows <- seq_len(j)
for (round_col in paste0("round", 1:7)) {
  opponent_ids <- mydata[[round_col]][player_rows]
  mydata[[round_col]][player_rows] <- mydata$pre_Rtg[opponent_ids]
}

# The round columns now hold ratings instead of pair numbers.
head(mydata)
## Id name location pre_Rtg total round1 round2
## 1 1 GARY HUA ON 1794 6.0 1436 1563
## 2 2 DAKSHESH DARURI MI 1553 6.0 1175 917
## 3 3 ADITYA BAJAJ MI 1384 6.0 1641 955
## 4 4 PATRICK H SCHILLING MI 1716 5.5 1363 1507
## 5 5 HANSHI ZUO MI 1655 5.5 1242 980
## 6 6 HANSEN SONG OH 1686 5.0 1399 1602
## round3 round4 round5 round6 round7
## 1 1600 1610 1649 1663 1716
## 2 1716 1629 1604 1595 1649
## 3 1745 1563 1712 1666 1663
## 4 1553 1579 1655 1564 1794
## 5 1663 1666 1716 1610 1629
## 6 1712 1438 1365 1552 1563
dim(mydata)
## [1] 64 12
# Average pre-tournament rating of the opponents each player actually faced.
#
# Changes from the loop this replaces:
#   * Adding the seven round columns with `+` and dividing by 7 produced NA
#     for every player with at least one unplayed round, because NA
#     propagates through the sum. The average is now the sum of the non-NA
#     ratings divided by the number of games played.
#   * The discarded colSums(mydata[, c(6, 12)]) call and the
#     `mydata[mydata == ""] <- 0` assignment (which matched nothing) are gone.
#   * Round columns are selected by name rather than by position.
#
# The fractional part is still truncated by as.integer(), as before, so
# players with seven games keep exactly the values they had. A player with no
# games at all gets NA.
round_cols <- paste0("round", 1:7)
opponent_ratings <- as.matrix(mydata[, round_cols])

games_played <- rowSums(!is.na(opponent_ratings))
rating_sum <- rowSums(opponent_ratings, na.rm = TRUE)

mydata$avg <- ifelse(
  games_played > 0,
  as.integer(rating_sum / games_played),
  NA_integer_
)

head(mydata$avg)
## [1] 1605 1469 1563 1573 1500 1518
head(mydata)
## Id name location pre_Rtg total round1 round2
## 1 1 GARY HUA ON 1794 6.0 1436 1563
## 2 2 DAKSHESH DARURI MI 1553 6.0 1175 917
## 3 3 ADITYA BAJAJ MI 1384 6.0 1641 955
## 4 4 PATRICK H SCHILLING MI 1716 5.5 1363 1507
## 5 5 HANSHI ZUO MI 1655 5.5 1242 980
## 6 6 HANSEN SONG OH 1686 5.0 1399 1602
## round3 round4 round5 round6 round7 avg
## 1 1600 1610 1649 1663 1716 1605
## 2 1716 1629 1604 1595 1649 1469
## 3 1745 1563 1712 1666 1663 1563
## 4 1553 1579 1655 1564 1794 1573
## 5 1663 1666 1716 1610 1629 1500
## 6 1712 1438 1365 1552 1563 1518
# Build one comma-separated record per player and write the records to disk.
# Field order: name, location, total points, pre-tournament rating, average
# opponent pre-rating, e.g. "GARY HUA,ON,6,1794,1605".
#
# Changes from the previous version:
#   * Names carry the column padding of the source file, which put spaces in
#     front of the first comma; they are trimmed here.
#   * The single column gets an explicit name instead of the long name
#     data.frame() derives from the paste() call.
#   * write.table() previously added row names, surrounding quotes and that
#     derived header, so the file was not plain CSV. The file now contains
#     only the records, one per line, with no header row.
outcome_string <- data.frame(
  record = paste(
    trimws(mydata$name),
    mydata$location,
    mydata$total,
    mydata$pre_Rtg,
    mydata$avg,
    sep = ","
  ),
  stringsAsFactors = FALSE
)
head(outcome_string)

# With a single column `sep` never appears in the output; it is kept only to
# stay close to the original call.
write.table(
  outcome_string,
  "D:/CUNY_SPS_DA/607_Data_Aq/Projet1/mydata.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
Note