Project 1

Gehad Gad

February 23, 2020

DATA 607 - Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by summing the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

#Import libraries and/or Packages

library(stringr)
library (ggplot2)

# The tournament text file is hosted on GitHub so it can be read directly from R.

file <- "https://github.com/GehadGad/DATA607-Project-1/raw/master/tournamentinfo.txt"

# Alternative way to look at the whole data set from the same link:
# Data <- read.table(url("https://github.com/GehadGad/DATA607-Project-1/raw/master/tournamentinfo.txt"), sep = ",")

# Read the file line by line (readLines warnings switched off) and show the
# first six lines.

data <- readLines(con = file, warn = FALSE)

head(data, n = 6L)

## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

# Data cleaning: keep only the lines that are not dashed separator rows
is_separator <- str_detect(data, "\\-----")
data <- data[!is_separator]

# Skip the two header lines, then split every remaining line on "|"
data <- str_split(data[seq(3, length(data))], "\\|")

# Extract names of all players
# First collect every run (two or more characters) made of letters, spaces,
# "-" or "?" from the split lines.
extracted <- str_extract_all(string = data, pattern = "[[:alpha:]-?[:alpha:]  ?]{2,}")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# A player name is any extracted token with at least three letters in a row.
names <- str_subset(unlist(extracted), "[[:alpha:]]{3,}")
head(names)

## [1] " GARY HUA                        " " DAKSHESH DARURI                 "
## [3] " ADITYA BAJAJ                    " " PATRICK H SCHILLING             "
## [5] " HANSHI ZUO                      " " HANSEN SONG                     "

# Extract state of origin
# A state token has two consecutive letters but never three in a row.
state_candidates <- unlist(extracted)
has_two_letters <- str_detect(state_candidates, "[[:alpha:]]{2}")
has_three_letters <- str_detect(state_candidates, "[[:alpha:]]{3,}")
state <- state_candidates[has_two_letters & !has_three_letters]
head(state)

## [1] "   ON " "   MI " "   MI " "   MI " "   MI " "   OH "

# Extract points
# Collect every numeric token (digits, an optional dot and one trailing character).
num_data <- str_extract_all(string = data, pattern = "\\d{1,}+\\.?.?")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Point totals are the tokens of the form digit-dot-digit.
pts <- str_subset(unlist(num_data), "\\d\\.\\d")
head(pts)

## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"

# Extract pre->post
rtg <- str_extract_all(string = data, pattern = "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Flatten the matches once, then classify every token.
rtg_tokens <- unlist(rtg)
looks_like_rating <- str_detect(rtg_tokens, "\\b\\d{3,4}P?\\b")
follows_arrow <- str_detect(rtg_tokens, "\\>.?\\b\\d{3,4}P?\\b")

# Tokens introduced by ">" are post-ratings; the other rating tokens are pre-ratings.
post_rating <- rtg_tokens[follows_arrow]
pre_rating <- rtg_tokens[looks_like_rating & !follows_arrow]

# Copies with the "P" / ">" markers stripped.
pre <- str_replace_all(pre_rating, "P", "")
post <- str_replace_all(post_rating, "([>P])", "")
head(pre_rating)

## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"

#Extract games played by player
# Three nested steps, innermost first:
#   1. pull every token made of a "W", "D" or "L", three arbitrary characters
#      and one or two digits (a played round and its opponent number);
#   2. from those tokens keep only the runs of one or two digits (optionally
#      preceded by a dot), i.e. the opponent numbers;
#   3. replace any stand-alone "0" with ".".
# Steps 2 and 3 (and step 1, since data is a list) are handed a list, so stringr
# coerces it to strings first (hence the warnings below); that is why each entry
# ends up as text such as c("39", "21", ...).
# NOTE(review): step 3 presumably flags lines that had no game result at all
# (an empty match whose coerced text contains a lone 0) - confirm.
games_data<-str_replace_all(str_extract_all(str_extract_all(data,"[WDL]...\\d{1,2}"),"\\.?\\d{1,2}"),"\\b[0]\\b",".")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing

# Drop every entry that contains a literal "." and keep the remaining ones.
games<-str_detect(games_data,fixed("."))
games_data<-games_data[!(games)]
head(games_data)

## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"  
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"  
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")" 
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"   
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")" 
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"

# Assemble one row per player. The id is derived from the number of extracted
# names instead of being hard-coded to 64.
id <- seq_along(names)
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")

# data.frame() keeps each column's own type and stops with an error if the
# extracted vectors have incompatible lengths, instead of silently recycling
# them the way cbind() does.
player_ranks <- data.frame(
  id = id,
  names = names,
  state = state,
  pts = as.numeric(pts),
  pre_rating = pre_rating,
  post_rating = post_rating,
  stringsAsFactors = FALSE
)

# Strip the leftover ">" and "P" characters from the rating columns ONLY, then
# convert them to numbers. Running the substitution over every column also
# removed the letter "P" from player names ("PATRICK" became "ATRICK").
rating_cols <- c("pre_rating", "post_rating")
player_ranks[rating_cols] <- lapply(
  player_ranks[rating_cols],
  function(rating) as.numeric(gsub("[>P]", "", rating))
)

# Calculate the mean of opponent rankings.
# For every player, pull the opponent numbers out of the matching games_data
# entry, use them as row indices into player_ranks, and average the pre-ratings
# found there.
results <- vapply(
  seq_len(nrow(player_ranks)),
  function(i) {
    opponents <- as.numeric(unlist(str_extract_all(games_data[i], "\\d{1,2}")))
    mean(player_ranks$pre_rating[opponents])
  },
  numeric(1)
)

player_ranks$avg_rating <- results
head(player_ranks)

##   id               names state pts pre_rating post_rating avg_rating
## 1  1            GARY HUA    ON 6.0       1794        1817   1605.286
## 2  2     DAKSHESH DARURI    MI 6.0       1553        1663   1469.286
## 3  3        ADITYA BAJAJ    MI 6.0       1384        1640   1563.571
## 4  4 PATRICK H SCHILLING    MI 5.5       1716        1744   1573.571
## 5  5          HANSHI ZUO    MI 5.5       1655        1690   1500.857
## 6  6         HANSEN SONG    OH 5.0       1686        1687   1518.714

# Export csv file.
# The file gets a ".csv" extension so it is recognised as CSV, and row names are
# not written: they would only add an unnamed first column repeating the id.

write.csv(player_ranks, file = "player_rank.csv", row.names = FALSE)