DS607-Project1

Project Outline

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

-Player’s Name -Player’s State, -Total Number of Points, -Player’s Pre-Rating, -Average Pre Chess Rating of Opponent

Setting up supporting functions

To deal with the data’s ETL process, I will create several R functions to avoid code repetition. These functions are listed here.

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Splits a data frame into a list with one element per row.
#
# tab: a data frame.
# Returns a list of length nrow(tab); element i is row i of `tab` with its
# class attribute removed by unclass().
rows <- function(tab) {
  lapply(
    seq_len(nrow(tab)),
    # drop = FALSE keeps a row of a single-column frame as a one-column
    # structure instead of collapsing it to a bare value. It is spelled out
    # because `F` is an ordinary variable that can be reassigned.
    function(i) unclass(tab[i, , drop = FALSE])
  )
}

# Creates a data frame filled with NA whose columns have the requested types.
#
# num_rows: number of rows to allocate.
# num_cols: number of columns to allocate.
# type_vec: character vector with one type name per column. "numeric",
#           "character", "logical" and "factor" are converted; any other
#           value leaves the column as the logical NA column that matrix()
#           produced.
# verbose:  when TRUE (the default, which keeps the original output) each
#           column's type name is printed as it is processed.
# Returns the data frame, with the default column names assigned by
# data.frame().
create_empty_table <- function(num_rows, num_cols, type_vec, verbose = TRUE) {
  # Fail early with a clear message instead of an obscure
  # "missing value where TRUE/FALSE needed" inside the loop.
  stopifnot(
    length(type_vec) >= num_cols,
    !anyNA(type_vec[seq_len(num_cols)])
  )

  frame <- data.frame(matrix(NA, nrow = num_rows, ncol = num_cols))

  # seq_len() gives an empty sequence for zero columns, whereas 1:0 would
  # iterate over c(1, 0) and index columns that do not exist.
  for (i in seq_len(ncol(frame))) {
    if (verbose) {
      print(type_vec[i])
    }
    # switch() returns NULL when the type name is not one of the four
    # supported ones, in which case the column is left untouched.
    converter <- switch(
      as.character(type_vec[i]),
      numeric = as.numeric,
      character = as.character,
      logical = as.logical,
      factor = as.factor
    )
    if (!is.null(converter)) {
      frame[, i] <- converter(frame[, i])
    }
  }
  frame
}

# Strips every "|" column separator from `text`, then trims the leading and
# trailing whitespace with str_trim().
#
# text: character vector (anything gsub() can coerce to character).
# Returns the cleaned character vector.
clean_text <- function(text) {
  without_pipes <- gsub("\\|", "", text)
  str_trim(without_pipes)
}

# Looks up the pre-tournament rating of the opponent named in a round result.
#
# df:     player table with a `pre_rating` column and, normally, a
#         `player_num` column holding each player's pair number.
# result: round result text such as "W  39" (an outcome letter followed by
#         the opponent's pair number).
# Returns the opponent's pre_rating, or NA when `result` contains no number
# or the number matches no player.
get_opp_rating <- function(df, result) {
  # str_extract() returns the first run of digits as a plain character
  # vector (NA when there is none), so the value converts to numeric without
  # relying on list coercion as str_extract_all() did.
  opp_num <- as.numeric(str_extract(result, "\\d+"))

  if ("player_num" %in% names(df)) {
    # Match on the pair number itself so the lookup stays correct even when
    # the rows of df are not stored in pair-number order.
    opp_row <- match(opp_num, df$player_num, incomparables = NA)
  } else {
    # No key column available: treat the number as a row index.
    opp_row <- opp_num
  }

  df$pre_rating[opp_row]
}

# Averages the pre-tournament ratings of every opponent a player faced.
#
# df: player table with one result column per round named round1, round2,
#     ..., plus the columns get_opp_rating() reads.
# id: row index of the player in df.
# Returns the mean opponent rating; rounds for which get_opp_rating() yields
# NA are left out of the mean.
get_avg_opponent_rtg <- function(df, id) {
  # Discover the round columns by name instead of hard-coding seven of them,
  # so tables with a different number of rounds are handled as well.
  round_cols <- grep("^round[0-9]+$", names(df), value = TRUE)

  opp_ratings <- unlist(
    lapply(round_cols, function(col) get_opp_rating(df, df[[col]][id])),
    use.names = FALSE
  )
  mean(opp_ratings, na.rm = TRUE)
}

Extract the data

The data lives in my GitHub repository and follows this format:

-----------------------------------------------------------------------------------------
Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
-----------------------------------------------------------------------------------------

Getting the data

We will first read the data into a data frame called tournament, then remove the lines that contain only dashes.

# Location of the raw tournament results file.
theURL <- "https://raw.githubusercontent.com/georg4re/DS607/master/project1/tournamentinfo.txt"

# read.fwf() splits each fixed-width line into the pair/state column, the
# name/rating column, the points column and the seven round columns.
# The argument is spelled `widths` in full rather than relying on partial
# argument matching.
tournament <- read.fwf(theURL, widths = c(7, 34, 6, 6, 6, 6, 6, 6, 6, 6))

# Every third line, starting with the first, is a row of dashes separating
# the records. seq() stops at the last position that does not exceed `to`,
# so nEnd needs no adjustment before building the positions to drop.
nEnd <- nrow(tournament)
removeRows <- seq(from = 1, to = nEnd, by = 3)
tournament <- tournament %>% slice(-removeRows)

# The first two remaining lines are the column titles, not player data.
clean_tournament <- tournament %>% slice(-c(1, 2))
head(clean_tournament, 10)

##         V1                                 V2     V3     V4     V5     V6
## 1      1 |  GARY HUA                        | 6.0  | W  39| W  21| W  18|
## 2     ON |  15445895 / R: 1794   ->1817     | N:2  | W    | B    | W    |
## 3      2 |  DAKSHESH DARURI                 | 6.0  | W  63| W  58| L   4|
## 4     MI |  14598900 / R: 1553   ->1663     | N:2  | B    | W    | B    |
## 5      3 |  ADITYA BAJAJ                    | 6.0  | L   8| W  61| W  25|
## 6     MI |  14959604 / R: 1384   ->1640     | N:2  | W    | B    | W    |
## 7      4 |  PATRICK H SCHILLING             | 5.5  | W  23| D  28| W   2|
## 8     MI |  12616049 / R: 1716   ->1744     | N:2  | W    | B    | W    |
## 9      5 |  HANSHI ZUO                      | 5.5  | W  45| W  37| D  12|
## 10    MI |  14601533 / R: 1655   ->1690     | N:2  | B    | W    | B    |
##        V7     V8     V9    V10
## 1  W  14| W   7| D  12| D   4|
## 2  B    | W    | B    | W    |
## 3  W  17| W  16| W  20| W   7|
## 4  W    | B    | W    | B    |
## 5  W  21| W  11| W  13| W  12|
## 6  B    | W    | B    | W    |
## 7  W  26| D   5| W  19| D   1|
## 8  B    | W    | B    | B    |
## 9  D  13| D   4| W  14| W  17|
## 10 W    | B    | W    | B    |

Transform the data into a player related dataframe

With the pieces in place, we can now transform the data into a player-centric dataframe.

Create the empty receiving table: player_data

# Column types for player_data: a numeric first column followed by eleven
# character columns.
types <- c("numeric", rep("character", times = 11))

player_data <- create_empty_table(num_rows = 0, num_cols = 12, type_vec = types)

## [1] "numeric"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"
## [1] "character"

Let’s go through the original data to get our values into player_data

# Each player occupies two consecutive lines of clean_tournament: the first
# holds the pair number, name, total points and the seven round results; the
# second starts with the state and the USCF ID / rating text.
# Integer division means a trailing line without a partner is ignored.
n_players <- nrow(clean_tournament) %/% 2
line1 <- clean_tournament[seq(1, by = 2, length.out = n_players), ]
line2 <- clean_tournament[seq(2, by = 2, length.out = n_players), ]

# Build every row in one vectorised call instead of growing the data frame
# with rbind() inside a loop. This also keeps player_num numeric from the
# start, rather than letting c() coerce it to character and converting back.
player_data <- data.frame(
  player_num = as.numeric(clean_text(line1$V1)),
  name = clean_text(line1$V2),
  total_pts = clean_text(line1$V3),
  round1 = clean_text(line1$V4),
  round2 = clean_text(line1$V5),
  round3 = clean_text(line1$V6),
  round4 = clean_text(line1$V7),
  round5 = clean_text(line1$V8),
  round6 = clean_text(line1$V9),
  round7 = clean_text(line1$V10),
  state = clean_text(line2$V1),
  ID_Rtg = clean_text(line2$V2),
  stringsAsFactors = FALSE
)

# Kept as a separate variable so the column titles remain available by name.
titles <- colnames(player_data)
head(player_data, 5)

##   player_num                name total_pts round1 round2 round3 round4 round5
## 1          1            GARY HUA       6.0  W  39  W  21  W  18  W  14  W   7
## 2          2     DAKSHESH DARURI       6.0  W  63  W  58  L   4  W  17  W  16
## 3          3        ADITYA BAJAJ       6.0  L   8  W  61  W  25  W  21  W  11
## 4          4 PATRICK H SCHILLING       5.5  W  23  D  28  W   2  W  26  D   5
## 5          5          HANSHI ZUO       5.5  W  45  W  37  D  12  D  13  D   4
##   round6 round7 state                      ID_Rtg
## 1  D  12  D   4    ON 15445895 / R: 1794   ->1817
## 2  W  20  W   7    MI 14598900 / R: 1553   ->1663
## 3  W  13  W  12    MI 14959604 / R: 1384   ->1640
## 4  W  19  D   1    MI 12616049 / R: 1716   ->1744
## 5  W  14  W  17    MI 14601533 / R: 1655   ->1690

We now have the data somewhat in place. We can start cleaning it up by extracting the values we need.

Extract the ID, Pre Rating and Post Rating into their own columns

# Pull the USCF ID and both ratings out of the combined ID_Rtg text, keyed by
# player_num.
extracted_id_rtg <- player_data %>%
  transmute(
    player_num = player_num,
    # The USCF ID is the first run of digits in the text.
    uscf_id = str_extract(ID_Rtg, "\\d+"),
    # Pre-rating: what follows "R: ", i.e. one arbitrary character (a padding
    # space or the first digit) and then digits.
    pre_rating = as.numeric(str_extract(ID_Rtg, "(?<=R: ).\\d+(?-)")),
    # Post-rating: the "->" arrow is first rewritten as "P:", then the text
    # after that marker is captured the same way.
    post_rating = as.numeric(
      str_extract(str_replace_all(ID_Rtg, "->", "P:"), "(?<=P:).\\d+")
    )
  )

# Attach the new columns to the player table.
player_data <- merge(
  x = player_data,
  y = extracted_id_rtg,
  by = "player_num",
  all.x = TRUE
)
head(player_data, 5)

##   player_num                name total_pts round1 round2 round3 round4 round5
## 1          1            GARY HUA       6.0  W  39  W  21  W  18  W  14  W   7
## 2          2     DAKSHESH DARURI       6.0  W  63  W  58  L   4  W  17  W  16
## 3          3        ADITYA BAJAJ       6.0  L   8  W  61  W  25  W  21  W  11
## 4          4 PATRICK H SCHILLING       5.5  W  23  D  28  W   2  W  26  D   5
## 5          5          HANSHI ZUO       5.5  W  45  W  37  D  12  D  13  D   4
##   round6 round7 state                      ID_Rtg  uscf_id pre_rating
## 1  D  12  D   4    ON 15445895 / R: 1794   ->1817 15445895       1794
## 2  W  20  W   7    MI 14598900 / R: 1553   ->1663 14598900       1553
## 3  W  13  W  12    MI 14959604 / R: 1384   ->1640 14959604       1384
## 4  W  19  D   1    MI 12616049 / R: 1716   ->1744 12616049       1716
## 5  W  14  W  17    MI 14601533 / R: 1655   ->1690 14601533       1655
##   post_rating
## 1        1817
## 2        1663
## 3        1640
## 4        1744
## 5        1690

Calculate Opponents Pre rating

Now, we will calculate each player’s opponents’ average pre-rating and merge these averages into our player data table.

# Average opponent pre-rating for every player. get_avg_opponent_rtg()
# addresses players by row, and the row index is used as the player number
# for the merge key.
#
# vapply() over seq_len() handles tables with zero or one row, avoids growing
# the frame with rbind() in a loop, and keeps avgOppRating numeric instead of
# letting c() coerce it to character alongside the id.
avg_opp_rtg <- data.frame(
  player_num = seq_len(nrow(player_data)),
  avgOppRating = vapply(
    seq_len(nrow(player_data)),
    function(i) as.numeric(get_avg_opponent_rtg(player_data, i)),
    numeric(1)
  ),
  stringsAsFactors = FALSE
)

player_data <- merge(
  x = player_data,
  y = avg_opp_rtg,
  by = "player_num",
  all.x = TRUE
)
head(player_data, 5)

##   player_num                name total_pts round1 round2 round3 round4 round5
## 1          1            GARY HUA       6.0  W  39  W  21  W  18  W  14  W   7
## 2          2     DAKSHESH DARURI       6.0  W  63  W  58  L   4  W  17  W  16
## 3          3        ADITYA BAJAJ       6.0  L   8  W  61  W  25  W  21  W  11
## 4          4 PATRICK H SCHILLING       5.5  W  23  D  28  W   2  W  26  D   5
## 5          5          HANSHI ZUO       5.5  W  45  W  37  D  12  D  13  D   4
##   round6 round7 state                      ID_Rtg  uscf_id pre_rating
## 1  D  12  D   4    ON 15445895 / R: 1794   ->1817 15445895       1794
## 2  W  20  W   7    MI 14598900 / R: 1553   ->1663 14598900       1553
## 3  W  13  W  12    MI 14959604 / R: 1384   ->1640 14959604       1384
## 4  W  19  D   1    MI 12616049 / R: 1716   ->1744 12616049       1716
## 5  W  14  W  17    MI 14601533 / R: 1655   ->1690 14601533       1655
##   post_rating     avgOppRating
## 1        1817 1605.28571428571
## 2        1663 1469.28571428571
## 3        1640 1563.57142857143
## 4        1744 1573.57142857143
## 5        1690 1500.85714285714

Producing the resulting data frame

# Keep only the five fields required in the final output.
final_df <- with(
  player_data,
  data.frame(
    Name = name,
    State = state,
    total_pts = total_pts,
    pre_rating = pre_rating,
    opp_rating = as.numeric(as.character(avgOppRating))
  )
)
head(final_df, 5)

##                  Name State total_pts pre_rating opp_rating
## 1            GARY HUA    ON       6.0       1794   1605.286
## 2     DAKSHESH DARURI    MI       6.0       1553   1469.286
## 3        ADITYA BAJAJ    MI       6.0       1384   1563.571
## 4 PATRICK H SCHILLING    MI       5.5       1716   1573.571
## 5          HANSHI ZUO    MI       5.5       1655   1500.857

Produce Final CSV file

# Write the five result fields only. row.names = FALSE stops write.csv() from
# adding an extra, unnamed first column holding the row index.
write.csv(final_df, file = "ds607-gc-project1.csv", row.names = FALSE)

9/19/2020

DS607-Project1

George Cruz

9/17/2020

Project Outline

Setting up supporting functions

Extract the data

Getting the data