Project Statement

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.

First, load the libraries needed for the project

library(stringr)

Read the tournament results file from my computer

# Location of the raw tournament results file on the author's machine.
tournament_path <- "C:/Users/hangr/Documents/Acquisition and data management/tournamentinfo.txt"
# read.csv() with its defaults treats the first line of the file as a header,
# so only the remaining lines become rows of `source_data`.
source_data <- read.csv(file = tournament_path)
# Preview the first few rows.
head(source_data)

Cleaning the datasets

# Clean the raw lines and build one record per player ----
#
# Reads `source_data` (the object returned by read.csv()) and produces
# `player_rank`, a data frame with one row per player and the columns
# player_names, get_state, pts, pre_rating, post_rating and avg_rating.
#
# Assumed input layout (TODO confirm against tournamentinfo.txt): every player
# occupies two consecutive "|"-delimited lines,
#   <pair number> | <name> | <total points> | <one field per round> ...
#   <state>       | ... R: <pre-rating> -> <post-rating> | ...
# and the records are separated by lines made only of dashes.

# Return the k-th "|"-delimited field of every row (NA when a row is shorter).
field_at <- function(rows, k) {
  vapply(
    rows,
    function(fields) if (length(fields) >= k) fields[[k]] else NA_character_,
    character(1)
  )
}

# read.csv() hands back a data frame whose first column holds the text lines
# (a factor before R 4.0, character afterwards). Every string operation below
# must run line by line, so work on a plain character vector.
raw_lines <- if (is.data.frame(source_data)) {
  as.character(source_data[[1]])
} else {
  as.character(source_data)
}

# First remove separators
rem_sep <- is.na(raw_lines) | str_detect(raw_lines, "^\\s*-{5,}\\s*$")
all_rows <- str_split(raw_lines[!rem_sep], "\\|")

# Remove headers: a player's first line starts with the numeric pair number
# and the line right after it carries the state and ratings. Header lines
# start with text, so they are never selected.
first_field <- str_trim(field_at(all_rows, 1L))
player_idx <- which(str_detect(first_field, "^\\d+$"))
rating_idx <- player_idx + 1L
pair_num <- as.integer(first_field[player_idx])
player_rows <- all_rows[player_idx]
rating_rows <- all_rows[rating_idx]
src_data <- all_rows[sort(c(player_idx, rating_idx))]

# Extract names of all players (second field of the player line, padding
# removed).
player_names <- str_trim(field_at(player_rows, 2L))
head(player_names)

# Get state of origin (first field of the rating line).
get_state <- str_trim(field_at(rating_rows, 1L))
head(get_state)

# Get total points (third field of the player line).
pts <- str_trim(field_at(player_rows, 3L))
head(pts)

# Get pre_ratings and post_ratings. A provisional suffix such as "P17" after
# either rating is skipped; rows without a numeric rating yield NA.
rating_match <- str_match(
  field_at(rating_rows, 2L),
  "R:\\s*(\\d+)(?:P\\d+)?\\s*->\\s*(\\d+)(?:P\\d+)?"
)
pre_rating <- rating_match[, 2]
post_rating <- rating_match[, 3]
head(pre_rating)
head(post_rating)

# Get games played: one integer vector of opponent pair numbers per player, in
# the same order as `player_names`. Only the round fields of the player line
# are inspected, so digits from the rating line can no longer be picked up as
# opponents, and rounds without an opponent number (byes, unplayed games) are
# simply skipped.
game_data <- lapply(player_rows, function(fields) {
  rounds <- fields[-(1:3)]
  opponents <- str_match(rounds, "^\\s*[[:alpha:]]\\s+(\\d+)\\s*$")[, 2]
  as.integer(opponents[!is.na(opponents)])
})
head(game_data)

# Combine all the extracts into a data frame, with pts, pre_rating and
# post_rating converted to numeric.
player_rank <- data.frame(
  player_names,
  get_state,
  pts = as.numeric(pts),
  pre_rating = as.numeric(pre_rating),
  post_rating = as.numeric(post_rating),
  stringsAsFactors = FALSE
)
head(player_rank)

# Calculate the average pre-rating of each player's opponents. Opponents are
# looked up by pair number rather than by row position; players without any
# recorded opponent get NA, and opponents with an unknown rating are left out
# of the mean.
result <- vapply(
  game_data,
  function(opponents) {
    if (length(opponents) == 0L) {
      return(NA_real_)
    }
    mean(player_rank$pre_rating[match(opponents, pair_num)], na.rm = TRUE)
  },
  numeric(1)
)
player_rank$avg_rating <- result
head(player_rank)

Write the output to CSV file

# Save the per-player table as CSV in the working directory, leaving out the
# row-name column.
write.csv(x = player_rank, file = "chess_data.csv", row.names = FALSE)