In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by summing the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, and 1716, and dividing by the total number of games played.
# Load the raw tournament report; each element holds one line of text
tournament <- readLines(con = "tournamentinfo.txt")
Below we read the file and parse it to extract the data we need. Each player's record spans two lines of the file, so I run two regular expressions, one for each of those lines. This yields two data frames, which are combined at the end to get the required data.
# Each regex is applied to every line of the report. str_match() returns one
# matrix row per input line: column 1 is the full match, the remaining columns
# are the capture groups, and a line that does not match gives an all-NA row.
# stringr is referenced through its namespace so this step does not depend on
# library(stringr) having been attached earlier.

# Regex for the 1st line of a record (the one starting with the player number):
# number | name | points | seven round cells. A round cell is a capital letter,
# whitespace, and an optional number; (\\d*) captures "" when no number follows.
player_pattern <- paste0(
  "(\\d+)\\s\\|\\s*(\\w+\\s\\w+\\s?\\w*?\\W?\\s?\\w*?)\\s*\\|(\\d\\.\\d)\\s+\\|",
  strrep("[A-Z]\\s+(\\d*)\\|", 7)
)
t <- stringr::str_match(tournament, player_pattern)

# Regex for the 2nd line of a record (the one starting with the state):
# two capital letters, then the digits that follow a colon and whitespace.
t1 <- stringr::str_match(tournament, "([A-Z]{2}).*\\:\\s+(\\d*)\\w?.*\\s")

# Getting rid of rows containing NA, i.e. lines that did not match.
# drop = FALSE keeps a matrix even if only a single line matched.
t <- t[stats::complete.cases(t), , drop = FALSE]
t1 <- t1[stats::complete.cases(t1), , drop = FALSE]

# Removing the first column (the full match) by position, so the result does
# not depend on the column names data.frame() would generate, and keeping the
# captured text as character rather than factor.
t <- data.frame(t[, -1, drop = FALSE], stringsAsFactors = FALSE)
t1 <- data.frame(t1[, -1, drop = FALSE], stringsAsFactors = FALSE)

# Naming the data frame columns
names(t) <- c(
  "pnum", "name", "points",
  "game1", "game2", "game3", "game4", "game5", "game6", "game7"
)
names(t1) <- c("state", "prescore")
head(t)
## pnum name points game1 game2 game3 game4 game5 game6
## 1 1 GARY HUA 6.0 39 21 18 14 7 12
## 2 2 DAKSHESH DARURI 6.0 63 58 4 17 16 20
## 3 3 ADITYA BAJAJ 6.0 8 61 25 21 11 13
## 4 4 PATRICK H SCHILLING 5.5 23 28 2 26 5 19
## 5 5 HANSHI ZUO 5.5 45 37 12 13 4 14
## 6 6 HANSEN SONG 5.0 34 29 11 35 10 27
## game7
## 1 4
## 2 7
## 3 12
## 4 1
## 5 17
## 6 21
# Preview the first six rows of the state / prescore table
head(t1, n = 6L)
## state prescore
## 1 ON 1794
## 2 MI 1553
## 3 MI 1384
## 4 MI 1716
## 5 MI 1655
## 6 OH 1686
Next, we convert the data to a numeric type so that we can perform calculations, and then go through each player to calculate the average pre-tournament rating of that player's opponents.
# Converting to numeric: the seven opponent columns and the pre-rating are
# parsed as text, and must be numbers to serve as row indices and be averaged.
# An empty opponent cell ("") becomes NA.
game_cols <- paste0("game", 1:7)
t[game_cols] <- lapply(t[game_cols], function(x) as.numeric(as.character(x)))
t1$prescore <- as.numeric(as.character(t1$prescore))

# Calculating the avg. for all players at once instead of looping over a
# hard-coded 1:64, so the code works for any number of parsed players.
# Each opponent number is used as a row position in t1 (same lookup as before),
# giving a players x 7 matrix of opponent pre-ratings; an NA opponent number
# gives an NA rating.
opp_rating <- matrix(
  t1$prescore[unlist(t[game_cols], use.names = FALSE)],
  nrow = nrow(t),
  ncol = length(game_cols)
)

# Mean over the non-NA opponents of each player, rounded to a whole number.
t$mean <- round(rowMeans(opp_rating, na.rm = TRUE), digits = 0)
Next, we create the final data frame, which holds all the required columns from the two data frames. We then save it to a file called tournament_scores.csv, read the file back in, and display it.
# Assemble the output table from both data frames, naming the columns as
# they are created
final <- data.frame(
  Player_Name = t$name,
  State = t1$state,
  Points = t$points,
  Pre_score = t1$prescore,
  AvgPnt_score = t$mean
)
# Strip the whitespace captured around the player names
final$Player_Name <- trimws(final$Player_Name)
# Save the table as CSV without row names, then load it back and preview it
write.csv(final, file = "tournament_scores.csv", row.names = FALSE)
readfile <- read.csv(file = "tournament_scores.csv")
head(readfile)
## Player_Name State Points Pre_score AvgPnt_score
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519