Project1

# Libraries
#library(tidyverse)
library(RCurl)
library(dplyr)
library(stringr)
library(tidyr)

Import Tournament Data

The tournament text file is saved in GitHub. The data is saved to the table t_df and the columns are renamed.

#get URL and import data from GitHub
site <- getURL("https://raw.githubusercontent.com/ltcancel/Project1_Data607_F20/master/tounamentinfo.txt")
df <- read.delim(text = site, header = FALSE,sep = "|")

#create a tibble with the imported data
t_df <- as_tibble(df)

#rename columns
cnames <- c('Pair','PlayerName','TotalPts','Round1','Round2','Round3','Round4','Round5','Round6','Round7','drop')
colnames(t_df) <- cnames

head(t_df)

## # A tibble: 6 x 11
##   Pair  PlayerName TotalPts Round1 Round2 Round3 Round4 Round5 Round6
##   <fct> <fct>      <fct>    <fct>  <fct>  <fct>  <fct>  <fct>  <fct> 
## 1 "---~ ""         ""       ""     ""     ""     ""     ""     ""    
## 2 " Pa~ " Player ~ "Total"  "Roun~ "Roun~ "Roun~ "Roun~ "Roun~ "Roun~
## 3 " Nu~ " USCF ID~ " Pts "  "  1 ~ "  2 ~ "  3 ~ "  4 ~ "  5 ~ "  6 ~
## 4 "---~ ""         ""       ""     ""     ""     ""     ""     ""    
## 5 "   ~ " GARY HU~ "6.0  "  "W  3~ "W  2~ "W  1~ "W  1~ "W   ~ "D  1~
## 6 "   ~ " 1544589~ "N:2  "  "W   ~ "B   ~ "W   ~ "B   ~ "W   ~ "B   ~
## # ... with 2 more variables: Round7 <fct>, drop <lgl>

Data Cleanup

After the data is saved to a table we need to remove all unnecessary information.

#get all rows with the pair number and player name
df_filter1 <- t_df %>%
  filter(str_detect(Pair, regex("\\d"))) %>%
  filter(!str_detect(str_to_lower(TotalPts), "n"))

#get all rows that include the other player information
df_filter2 <- t_df %>%
  filter(str_detect(str_to_lower(Pair), "o|m")) %>%
  filter(!str_detect(str_to_lower(TotalPts), "p"))

#combine both tables
df_combined <- cbind(df_filter1[,1:10], df_filter2[,1:2])

cnames <- c("Playernum","PlayerName","TotalPts",'Round1','Round2','Round3','Round4','Round5','Round6','Round7','State','ratings')

#rename columns again
colnames(df_combined) <- cnames

head(df_combined)

##   Playernum                        PlayerName TotalPts Round1 Round2
## 1        1   GARY HUA                            6.0    W  39  W  21
## 2        2   DAKSHESH DARURI                     6.0    W  63  W  58
## 3        3   ADITYA BAJAJ                        6.0    L   8  W  61
## 4        4   PATRICK H SCHILLING                 5.5    W  23  D  28
## 5        5   HANSHI ZUO                          5.5    W  45  W  37
## 6        6   HANSEN SONG                         5.0    W  34  D  29
##   Round3 Round4 Round5 Round6 Round7  State
## 1  W  18  W  14  W   7  D  12  D   4    ON 
## 2  L   4  W  17  W  16  W  20  W   7    MI 
## 3  W  25  W  21  W  11  W  13  W  12    MI 
## 4  W   2  W  26  D   5  W  19  D   1    MI 
## 5  D  12  D  13  D   4  W  14  W  17    MI 
## 6  L  11  W  35  D  10  W  27  W  21    OH 
##                             ratings
## 1  15445895 / R: 1794   ->1817     
## 2  14598900 / R: 1553   ->1663     
## 3  14959604 / R: 1384   ->1640     
## 4  12616049 / R: 1716   ->1744     
## 5  14601533 / R: 1655   ->1690     
## 6  15055204 / R: 1686   ->1687

Some columns contain multiple types of information, so those will need to be split into their own columns.

#Separate column that holds pre and post ratings
df_combined <- df_combined %>%
  separate(ratings, c("drop", "keep"), ": ")

df_combined <- df_combined %>%
  separate(keep, c("Pre-Rating", "Post-Rating"), "->")

df_combined <- df_combined %>%
  separate(`Pre-Rating`, c("Pre-Rating", "drop2"), "P")

#select only the columns needed after splitting the data
df_combined <- df_combined %>%
  select(1:11,13,15)

head(df_combined)

##   Playernum                        PlayerName TotalPts Round1 Round2
## 1        1   GARY HUA                            6.0    W  39  W  21
## 2        2   DAKSHESH DARURI                     6.0    W  63  W  58
## 3        3   ADITYA BAJAJ                        6.0    L   8  W  61
## 4        4   PATRICK H SCHILLING                 5.5    W  23  D  28
## 5        5   HANSHI ZUO                          5.5    W  45  W  37
## 6        6   HANSEN SONG                         5.0    W  34  D  29
##   Round3 Round4 Round5 Round6 Round7  State Pre-Rating Post-Rating
## 1  W  18  W  14  W   7  D  12  D   4    ON     1794      1817     
## 2  L   4  W  17  W  16  W  20  W   7    MI     1553      1663     
## 3  W  25  W  21  W  11  W  13  W  12    MI     1384      1640     
## 4  W   2  W  26  D   5  W  19  D   1    MI     1716      1744     
## 5  D  12  D  13  D   4  W  14  W  17    MI     1655      1690     
## 6  L  11  W  35  D  10  W  27  W  21    OH     1686      1687

Additional splitting is needed for the columns that hold the player number of each of their opponents.

#split round 1
df_combined <- df_combined %>%
  separate(Round1, c("R1WL","R1Player"))

#split round 2
df_combined <- df_combined %>%
  separate(Round2,c("R2WL","R2Player"))

#split round 3
df_combined <- df_combined %>%
  separate(Round3,c("R3WL","R3Player"))

#split round 4
df_combined <- df_combined %>%
  separate(Round4,c("R4WL","R4Player"))

#split round 5
df_combined <- df_combined %>%
  separate(Round5,c("R5WL","R5Player"))

#split round 6
df_combined <- df_combined %>%
  separate(Round6,c("R6WL","R6Player"))

#split round 7
df_combined <- df_combined %>%
  separate(Round7,c("R7WL","R7Player"))

#Select only the columns needed after spliting the data
df_combined <- df_combined %>%
  select(1:3,5,7,9,11,13,15,17:19)

head(df_combined)

##   Playernum                        PlayerName TotalPts R1Player R2Player
## 1        1   GARY HUA                            6.0         39       21
## 2        2   DAKSHESH DARURI                     6.0         63       58
## 3        3   ADITYA BAJAJ                        6.0          8       61
## 4        4   PATRICK H SCHILLING                 5.5         23       28
## 5        5   HANSHI ZUO                          5.5         45       37
## 6        6   HANSEN SONG                         5.0         34       29
##   R3Player R4Player R5Player R6Player R7Player  State Pre-Rating
## 1       18       14        7       12        4    ON     1794   
## 2        4       17       16       20        7    MI     1553   
## 3       25       21       11       13       12    MI     1384   
## 4        2       26        5       19        1    MI     1716   
## 5       12       13        4       14       17    MI     1655   
## 6       11       35       10       27       21    OH     1686

Computing Averages

Here we need to do some additional work to get the average of all opponent’s pre-rating.

#Create a new table that only holds the player number and their pre-ratings
ratings <- df_combined %>%
  select(Playernum, `Pre-Rating`)

head(ratings)

##   Playernum Pre-Rating
## 1        1     1794   
## 2        2     1553   
## 3        3     1384   
## 4        4     1716   
## 5        5     1655   
## 6        6     1686

When I tried to compute the average for each row I realized that it was not working because the columns needed for the equation were all stored as “chr” data types. Here I convert all columns needed for joining and computing to an integer.

#convert data to an integer so it can be matched
ratings$Playernum <- as.integer(ratings$Playernum)

#loop through the main table to convert all player number columns into an int
for(i in 4:10){
  df_combined[,i] <- as.integer(df_combined[,i])
}

#join the ratings table and the df_combined table so we can see the pre-rating of each opponent
df_combined <- df_combined %>%
  inner_join(ratings, by = c("R1Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R2Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R3Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R4Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R5Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R6Player" = "Playernum")) %>%
  inner_join(ratings, by = c("R7Player" = "Playernum"))

#convert new columns added from the join above to an integer so they can be averaged
for(i in 12:19){
  df_combined[,i] <- as.integer(df_combined[,i])
}


#compute the average of the pre-ratings for each row
df_combined <- mutate(df_combined, (rowSums(df_combined[,12:19], na.rm = TRUE))/7)

head(df_combined)

##   Playernum                        PlayerName TotalPts R1Player R2Player
## 1        1   GARY HUA                            6.0         39       21
## 2        2   DAKSHESH DARURI                     6.0         63       58
## 3        3   ADITYA BAJAJ                        6.0          8       61
## 4        5   HANSHI ZUO                          5.5         45       37
## 5        6   HANSEN SONG                         5.0         34       29
## 6        8   EZEKIEL HOUGHTON                    5.0          3       32
##   R3Player R4Player R5Player R6Player R7Player  State Pre-Rating.x
## 1       18       14        7       12        4    ON          1794
## 2        4       17       16       20        7    MI          1553
## 3       25       21       11       13       12    MI          1384
## 4       12       13        4       14       17    MI          1655
## 5       11       35       10       27       21    OH          1686
## 6       14        9       47       28       19    MI          1641
##   Pre-Rating.y Pre-Rating.x.x Pre-Rating.y.y Pre-Rating.x.x.x
## 1         1423           1595           1629             1666
## 2         1530           1092           1384             1604
## 3         1649            967           1229             1595
## 4         1199           1355           1712             1663
## 5         1449           1507           1365             1399
## 6         1553           1494           1666             1641
##   Pre-Rating.y.y.y Pre-Rating.x.x.x.x Pre-Rating.y.y.y.y
## 1             1686               1712               1384
## 2             1220               1564               1686
## 3             1365               1663               1712
## 4             1384               1666               1604
## 5             1411               1579               1595
## 6              377               1552               1600
##   (rowSums(df_combined[, 12:19], na.rm = TRUE))/7
## 1                                        1841.286
## 2                                        1661.857
## 3                                        1652.000
## 4                                        1748.286
## 5                                        1713.000
## 6                                        1646.286

Export to CSV

Now we create the final table and export it to a csv file.

#create the final tables only using the columns that need to be exported
final_table <- df_combined %>%
  select(2,3,11,12,20)

#remane final table
names <- c("PlayerName", "TotalPts", "State", "Pre-Rating", "OppAvgRating")
colnames(final_table) <- names

head(final_table)

##                          PlayerName TotalPts  State Pre-Rating
## 1  GARY HUA                            6.0      ON        1794
## 2  DAKSHESH DARURI                     6.0      MI        1553
## 3  ADITYA BAJAJ                        6.0      MI        1384
## 4  HANSHI ZUO                          5.5      MI        1655
## 5  HANSEN SONG                         5.0      OH        1686
## 6  EZEKIEL HOUGHTON                    5.0      MI        1641
##   OppAvgRating
## 1     1841.286
## 2     1661.857
## 3     1652.000
## 4     1748.286
## 5     1713.000
## 6     1646.286

#write to a csv file
write.csv(final_table,"FinalData.csv")

Project1

LeTicia Cancel

9/18/2020

Import Tournament Data

Data Cleanup

Computing Averages

Export to CSV