DATA 607 Project 1

Background

The assignment tasks students with creating an R Markdown file that reads a text file of chess tournament results and produces a .csv file for further analysis.

Import and Clean Data

# stringr supplies the str_* helpers used throughout this analysis
library(stringr)

# Raw tournament cross-table, pinned to a specific commit
chess_file <- "https://raw.githubusercontent.com/evanmclaughlin/ECM607/9e2ce75e6992b76c132019c38ce9c51eb0660570/tournamentinfo.txt"

chess_data <- readLines(chess_file, warn = FALSE)

# next, eliminate the dashed separator lines
chess_data <- chess_data[!str_detect(chess_data, "\\-----")]
# Skip the first two remaining lines (presumably the column headers) and split
# every other line into its "|"-delimited fields. Negative indexing stays
# correct even when fewer than three lines are left, unlike 3:length(x).
chess_data <- str_split(chess_data[-(1:2)], "\\|")

head(chess_data)

## [[1]]
##  [1] "    1 "                            " GARY HUA                        "
##  [3] "6.0  "                             "W  39"                            
##  [5] "W  21"                             "W  18"                            
##  [7] "W  14"                             "W   7"                            
##  [9] "D  12"                             "D   4"                            
## [11] ""                                 
## 
## [[2]]
##  [1] "   ON "                            " 15445895 / R: 1794   ->1817     "
##  [3] "N:2  "                             "W    "                            
##  [5] "B    "                             "W    "                            
##  [7] "B    "                             "W    "                            
##  [9] "B    "                             "W    "                            
## [11] ""                                 
## 
## [[3]]
##  [1] "    2 "                            " DAKSHESH DARURI                 "
##  [3] "6.0  "                             "W  63"                            
##  [5] "W  58"                             "L   4"                            
##  [7] "W  17"                             "W  16"                            
##  [9] "W  20"                             "W   7"                            
## [11] ""                                 
## 
## [[4]]
##  [1] "   MI "                            " 14598900 / R: 1553   ->1663     "
##  [3] "N:2  "                             "B    "                            
##  [5] "W    "                             "B    "                            
##  [7] "W    "                             "B    "                            
##  [9] "W    "                             "B    "                            
## [11] ""                                 
## 
## [[5]]
##  [1] "    3 "                            " ADITYA BAJAJ                    "
##  [3] "6.0  "                             "L   8"                            
##  [5] "W  61"                             "W  25"                            
##  [7] "W  21"                             "W  11"                            
##  [9] "W  13"                             "W  12"                            
## [11] ""                                 
## 
## [[6]]
##  [1] "   MI "                            " 14959604 / R: 1384   ->1640     "
##  [3] "N:2  "                             "W    "                            
##  [5] "B    "                             "W    "                            
##  [7] "B    "                             "W    "                            
##  [9] "B    "                             "W    "                            
## [11] ""

Extract appropriate data for subsetting

#start with names
# chess_data is a list (one character vector of "|"-split fields per line), so
# str_extract_all() has to coerce it before matching -- that is what the
# warning printed below is about.
# The pattern keeps runs of two or more characters drawn from the bracketed
# set (letters, "-", "?" and spaces); the runs are filtered into player names
# and states in the following steps.
chess_ext <- str_extract_all(chess_data,"[[:alpha:]-?[:alpha:]  ?]{2,}")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Flatten every extracted run into a single character vector
ext_flat <- unlist(chess_ext)
has_two_letters <- str_detect(ext_flat, "[[:alpha:]]{2}")
has_three_letters <- str_detect(ext_flat, "[[:alpha:]]{3,}")

# Player names: runs holding at least three consecutive letters
chess_names <- ext_flat[has_three_letters]
#head(chess_names)

#next, extract state: two consecutive letters, but never three
chess_state <- ext_flat[has_two_letters & !has_three_letters]
#head(chess_state)

#Total points next, but first extract all numerical data for subset
chess_num <- str_extract_all(string = chess_data, pattern = "\\d{1,}+\\.?.?")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Keep only the numeric tokens that contain a digit.digit sequence
num_flat <- unlist(chess_num)
points <- num_flat[str_detect(num_flat, "\\d\\.\\d")]

#head(points)

# now for pre-rating

rating <- str_extract_all(
  string = chess_data,
  pattern = "(( \\:)|(\\>))?.?\\d{1,}P*\\.?"
)

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Candidate rating tokens: 3-4 digit numbers, optionally suffixed with "P"
rating_flat <- unlist(rating)
is_rating <- str_detect(rating_flat, "\\b\\d{3,4}P?\\b")
# Tokens that follow a ">" (the value after the "->" arrow) are excluded so
# that only the rating listed before the arrow is kept
follows_arrow <- str_detect(rating_flat, "\\>.?\\b\\d{3,4}P?\\b")
pre_rating <- rating_flat[is_rating & !follows_arrow]
pre_rating <- str_replace_all(pre_rating, "P", "")
#head(pre_rating)

names <- str_trim(chess_names, "both")
state <- str_trim(chess_state, "both")
pre_rating <- str_trim(pre_rating, "both")
# Number the players from the data instead of assuming exactly 64 entries
id <- seq_along(names)
# data.frame() refuses columns whose lengths do not line up, whereas cbind()
# would silently recycle the shorter ones
rankings <- data.frame(
  id, names, state, points, pre_rating,
  stringsAsFactors = FALSE
)

head(rankings)

##   id               names state points pre_rating
## 1  1            GARY HUA    ON    6.0       1794
## 2  2     DAKSHESH DARURI    MI    6.0       1553
## 3  3        ADITYA BAJAJ    MI    6.0       1384
## 4  4 PATRICK H SCHILLING    MI    5.5       1716
## 5  5          HANSHI ZUO    MI    5.5       1655
## 6  6         HANSEN SONG    OH    5.0       1686

Now that we have our data extracted, we can extract the opponent data for the calculated field

#extract opponent data now, pulling in games played
# Three nested stringr calls, read from the inside out:
#   1. str_extract_all(chess_data, "[WDL]...\\d{1,2}") keeps tokens made of a
#      W, D or L, any three characters, then a 1-2 digit number.
#   2. str_extract_all(..., "\\.?\\d{1,2}") pulls the 1-2 digit numbers out of
#      those tokens.
#   3. str_replace_all(..., "\\b[0]\\b", ".") turns a standalone "0" into ".".
# Each stage is handed a list rather than a character vector, so stringr
# coerces it first -- hence the three warnings printed below.
# NOTE(review): presumably lines without any games end up as a lone "0" after
# coercion and are marked with "." here so the next step can drop them --
# TODO confirm.
games_info <- str_replace_all(str_extract_all(str_extract_all(chess_data,"[WDL]...\\d{1,2}"),"\\.?\\d{1,2}"),"\\b[0]\\b",".")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing

# Flag every entry that contains a literal "." ...
games <- str_detect(string = games_info, pattern = fixed("."))
# ... and keep only the entries that do not
games_info <- games_info[!games]

head(games_info, n = 6L)

## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"  
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"  
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")" 
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"   
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")" 
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"

Now that the data is cleaned, we can focus on calculating the average opponent rating

# Normalise column types first: every column becomes plain character text
rankings[] <- lapply(rankings, as.character)
# Strip leftover '>' / 'P' markers from the rating column ONLY. Running gsub()
# over every column also deleted the letter P from player names and states
# (e.g. "PATRICK" became "ATRICK").
rankings$pre_rating <- gsub("[>P]", "", rankings$pre_rating)

#make calculation fields numeric
rankings$points <- as.numeric(rankings$points)
rankings$pre_rating <- as.numeric(rankings$pre_rating)

# games_info is matched to rankings by position, so the two must line up
stopifnot(length(games_info) == nrow(rankings))

#run the calculation for each player using opponent data
matches <- vapply(
  seq_len(nrow(rankings)),
  function(i) {
    # the opponent numbers double as row indices into rankings
    opponents <- as.integer(
      unlist(str_extract_all(games_info[i], "\\d{1,2}"))
    )
    mean(rankings$pre_rating[opponents])
  },
  numeric(1)
)

Now let’s organize our data one more time before printing

# Attach the per-player average opponent rating as a new column
rankings[["avg_rating"]] <- matches
head(rankings, n = 6L)

##   id              names state points pre_rating avg_rating
## 1  1           GARY HUA    ON    6.0       1794   1605.286
## 2  2    DAKSHESH DARURI    MI    6.0       1553   1469.286
## 3  3       ADITYA BAJAJ    MI    6.0       1384   1563.571
## 4  4 ATRICK H SCHILLING    MI    5.5       1716   1573.571
## 5  5         HANSHI ZUO    MI    5.5       1655   1500.857
## 6  6        HANSEN SONG    OH    5.0       1686   1518.714

# Persist the final table (row names included) for downstream analysis
write.csv(
  x = rankings,
  file = "chess_final.csv",
  row.names = TRUE
)

DATA 607 Project 1

Evan McLaughlin

2/23/2021

Background

Import and Clean Data

Extract appropriate data for subsetting

Now that we have our data extracted, we can extract the opponent data for the calculated field

Now that the data is cleaned, we can focus on calculating the average opponent rating

Now let’s organize our data one more time before printing