# Libraries
#library(tidyverse)
library(RCurl)
library(dplyr)
library(stringr)
library(tidyr)
The tournament text file is saved in GitHub. The data is saved to the table t_df and the columns are renamed.
#get URL and import data from GitHub
site <- getURL("https://raw.githubusercontent.com/ltcancel/Project1_Data607_F20/master/tounamentinfo.txt")
df <- read.delim(text = site, header = FALSE,sep = "|")
#create a tibble with the imported data
t_df <- as_tibble(df)
#rename columns
cnames <- c('Pair','PlayerName','TotalPts','Round1','Round2','Round3','Round4','Round5','Round6','Round7','drop')
colnames(t_df) <- cnames
head(t_df)
## # A tibble: 6 x 11
## Pair PlayerName TotalPts Round1 Round2 Round3 Round4 Round5 Round6
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 "---~ "" "" "" "" "" "" "" ""
## 2 " Pa~ " Player ~ "Total" "Roun~ "Roun~ "Roun~ "Roun~ "Roun~ "Roun~
## 3 " Nu~ " USCF ID~ " Pts " " 1 ~ " 2 ~ " 3 ~ " 4 ~ " 5 ~ " 6 ~
## 4 "---~ "" "" "" "" "" "" "" ""
## 5 " ~ " GARY HU~ "6.0 " "W 3~ "W 2~ "W 1~ "W 1~ "W ~ "D 1~
## 6 " ~ " 1544589~ "N:2 " "W ~ "B ~ "W ~ "B ~ "W ~ "B ~
## # ... with 2 more variables: Round7 <fct>, drop <lgl>
After the data is saved to a table we need to remove all unnecessary information.
#get all rows with the pair number and player name
df_filter1 <- t_df %>%
filter(str_detect(Pair, regex("\\d"))) %>%
filter(!str_detect(str_to_lower(TotalPts), "n"))
#get all rows that include the other player information
df_filter2 <- t_df %>%
filter(str_detect(str_to_lower(Pair), "o|m")) %>%
filter(!str_detect(str_to_lower(TotalPts), "p"))
#combine both tables
df_combined <- cbind(df_filter1[,1:10], df_filter2[,1:2])
cnames <- c("Playernum","PlayerName","TotalPts",'Round1','Round2','Round3','Round4','Round5','Round6','Round7','State','ratings')
#rename columns again
colnames(df_combined) <- cnames
head(df_combined)
## Playernum PlayerName TotalPts Round1 Round2
## 1 1 GARY HUA 6.0 W 39 W 21
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28
## 5 5 HANSHI ZUO 5.5 W 45 W 37
## 6 6 HANSEN SONG 5.0 W 34 D 29
## Round3 Round4 Round5 Round6 Round7 State
## 1 W 18 W 14 W 7 D 12 D 4 ON
## 2 L 4 W 17 W 16 W 20 W 7 MI
## 3 W 25 W 21 W 11 W 13 W 12 MI
## 4 W 2 W 26 D 5 W 19 D 1 MI
## 5 D 12 D 13 D 4 W 14 W 17 MI
## 6 L 11 W 35 D 10 W 27 W 21 OH
## ratings
## 1 15445895 / R: 1794 ->1817
## 2 14598900 / R: 1553 ->1663
## 3 14959604 / R: 1384 ->1640
## 4 12616049 / R: 1716 ->1744
## 5 14601533 / R: 1655 ->1690
## 6 15055204 / R: 1686 ->1687
Some columns contain multiple types of information, so those will need to be split into their own columns.
#Separate column that holds pre and post ratings
df_combined <- df_combined %>%
separate(ratings, c("drop", "keep"), ": ")
df_combined <- df_combined %>%
separate(keep, c("Pre-Rating", "Post-Rating"), "->")
df_combined <- df_combined %>%
separate(`Pre-Rating`, c("Pre-Rating", "drop2"), "P")
#select only the columns needed after splitting the data
df_combined <- df_combined %>%
select(1:11,13,15)
head(df_combined)
## Playernum PlayerName TotalPts Round1 Round2
## 1 1 GARY HUA 6.0 W 39 W 21
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28
## 5 5 HANSHI ZUO 5.5 W 45 W 37
## 6 6 HANSEN SONG 5.0 W 34 D 29
## Round3 Round4 Round5 Round6 Round7 State Pre-Rating Post-Rating
## 1 W 18 W 14 W 7 D 12 D 4 ON 1794 1817
## 2 L 4 W 17 W 16 W 20 W 7 MI 1553 1663
## 3 W 25 W 21 W 11 W 13 W 12 MI 1384 1640
## 4 W 2 W 26 D 5 W 19 D 1 MI 1716 1744
## 5 D 12 D 13 D 4 W 14 W 17 MI 1655 1690
## 6 L 11 W 35 D 10 W 27 W 21 OH 1686 1687
Additional splitting is needed for the columns that hold the player number of each of their opponents.
#split round 1
df_combined <- df_combined %>%
separate(Round1, c("R1WL","R1Player"))
#split round 2
df_combined <- df_combined %>%
separate(Round2,c("R2WL","R2Player"))
#split round 3
df_combined <- df_combined %>%
separate(Round3,c("R3WL","R3Player"))
#split round 4
df_combined <- df_combined %>%
separate(Round4,c("R4WL","R4Player"))
#split round 5
df_combined <- df_combined %>%
separate(Round5,c("R5WL","R5Player"))
#split round 6
df_combined <- df_combined %>%
separate(Round6,c("R6WL","R6Player"))
#split round 7
df_combined <- df_combined %>%
separate(Round7,c("R7WL","R7Player"))
#Select only the columns needed after spliting the data
df_combined <- df_combined %>%
select(1:3,5,7,9,11,13,15,17:19)
head(df_combined)
## Playernum PlayerName TotalPts R1Player R2Player
## 1 1 GARY HUA 6.0 39 21
## 2 2 DAKSHESH DARURI 6.0 63 58
## 3 3 ADITYA BAJAJ 6.0 8 61
## 4 4 PATRICK H SCHILLING 5.5 23 28
## 5 5 HANSHI ZUO 5.5 45 37
## 6 6 HANSEN SONG 5.0 34 29
## R3Player R4Player R5Player R6Player R7Player State Pre-Rating
## 1 18 14 7 12 4 ON 1794
## 2 4 17 16 20 7 MI 1553
## 3 25 21 11 13 12 MI 1384
## 4 2 26 5 19 1 MI 1716
## 5 12 13 4 14 17 MI 1655
## 6 11 35 10 27 21 OH 1686
Here we need to do some additional work to get the average of all opponent’s pre-rating.
#Create a new table that only holds the player number and their pre-ratings
ratings <- df_combined %>%
select(Playernum, `Pre-Rating`)
head(ratings)
## Playernum Pre-Rating
## 1 1 1794
## 2 2 1553
## 3 3 1384
## 4 4 1716
## 5 5 1655
## 6 6 1686
When I tried to compute the average for each row I realized that it was not working because the columns needed for the equation were all stored as “chr” data types. Here I convert all columns needed for joining and computing to an integer.
#convert data to an integer so it can be matched
ratings$Playernum <- as.integer(ratings$Playernum)
#loop through the main table to convert all player number columns into an int
for(i in 4:10){
df_combined[,i] <- as.integer(df_combined[,i])
}
#join the ratings table and the df_combined table so we can see the pre-rating of each opponent
df_combined <- df_combined %>%
inner_join(ratings, by = c("R1Player" = "Playernum")) %>%
inner_join(ratings, by = c("R2Player" = "Playernum")) %>%
inner_join(ratings, by = c("R3Player" = "Playernum")) %>%
inner_join(ratings, by = c("R4Player" = "Playernum")) %>%
inner_join(ratings, by = c("R5Player" = "Playernum")) %>%
inner_join(ratings, by = c("R6Player" = "Playernum")) %>%
inner_join(ratings, by = c("R7Player" = "Playernum"))
#convert new columns added from the join above to an integer so they can be averaged
for(i in 12:19){
df_combined[,i] <- as.integer(df_combined[,i])
}
#compute the average of the pre-ratings for each row
df_combined <- mutate(df_combined, (rowSums(df_combined[,12:19], na.rm = TRUE))/7)
head(df_combined)
## Playernum PlayerName TotalPts R1Player R2Player
## 1 1 GARY HUA 6.0 39 21
## 2 2 DAKSHESH DARURI 6.0 63 58
## 3 3 ADITYA BAJAJ 6.0 8 61
## 4 5 HANSHI ZUO 5.5 45 37
## 5 6 HANSEN SONG 5.0 34 29
## 6 8 EZEKIEL HOUGHTON 5.0 3 32
## R3Player R4Player R5Player R6Player R7Player State Pre-Rating.x
## 1 18 14 7 12 4 ON 1794
## 2 4 17 16 20 7 MI 1553
## 3 25 21 11 13 12 MI 1384
## 4 12 13 4 14 17 MI 1655
## 5 11 35 10 27 21 OH 1686
## 6 14 9 47 28 19 MI 1641
## Pre-Rating.y Pre-Rating.x.x Pre-Rating.y.y Pre-Rating.x.x.x
## 1 1423 1595 1629 1666
## 2 1530 1092 1384 1604
## 3 1649 967 1229 1595
## 4 1199 1355 1712 1663
## 5 1449 1507 1365 1399
## 6 1553 1494 1666 1641
## Pre-Rating.y.y.y Pre-Rating.x.x.x.x Pre-Rating.y.y.y.y
## 1 1686 1712 1384
## 2 1220 1564 1686
## 3 1365 1663 1712
## 4 1384 1666 1604
## 5 1411 1579 1595
## 6 377 1552 1600
## (rowSums(df_combined[, 12:19], na.rm = TRUE))/7
## 1 1841.286
## 2 1661.857
## 3 1652.000
## 4 1748.286
## 5 1713.000
## 6 1646.286
Now we create the final table and export it to a csv file.
#create the final tables only using the columns that need to be exported
final_table <- df_combined %>%
select(2,3,11,12,20)
#remane final table
names <- c("PlayerName", "TotalPts", "State", "Pre-Rating", "OppAvgRating")
colnames(final_table) <- names
head(final_table)
## PlayerName TotalPts State Pre-Rating
## 1 GARY HUA 6.0 ON 1794
## 2 DAKSHESH DARURI 6.0 MI 1553
## 3 ADITYA BAJAJ 6.0 MI 1384
## 4 HANSHI ZUO 5.5 MI 1655
## 5 HANSEN SONG 5.0 OH 1686
## 6 EZEKIEL HOUGHTON 5.0 MI 1641
## OppAvgRating
## 1 1841.286
## 2 1661.857
## 3 1652.000
## 4 1748.286
## 5 1713.000
## 6 1646.286
#write to a csv file
write.csv(final_table,"FinalData.csv")