Code that follows requires libraries stringr, dplyr, and tidyr.
library(stringr)
library(dplyr)
library(tidyr)
raw.data <- read.delim("https://raw.githubusercontent.com/aliceafriedman/DATA607_Proj1/master/tournamentinfo.txt", header = FALSE, sep = "|", dec = ".")
The data is spread into 2 rows. The following code splits the data into the first row and second row for variables pertaining to each player.
head(raw.data)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair
## 3 Num
## 4 -----------------------------------------------------------------------------------------
## 5 1
## 6 ON
## V2 V3 V4 V5 V6 V7 V8
## 1
## 2 Player Name Total Round Round Round Round Round
## 3 USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5
## 4
## 5 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 6 15445895 / R: 1794 ->1817 N:2 W B W B W
## V9 V10 V11
## 1 NA
## 2 Round Round NA
## 3 6 7 NA
## 4 NA
## 5 D 12 D 4 NA
## 6 B W NA
#code checks for the first row by looking for the player's ID, which is an integer
first_row_test <- str_detect(raw.data$V1, "[[:digit:]]{1,3}")
#code subsets rows meeting the first_row_test, starting with the first row of player data
first_rows <- raw.data[first_row_test,]
#code checks for the second row by matching a 2 letter code regex to find the rows with state info
second_row_test <- str_detect(raw.data$V1, "[[:upper:]]{2}")
#code subsets rows meeting the second_row_test, starting with the first row of player data
second_rows <- raw.data[second_row_test,]
head(first_rows)
## V1 V2 V3 V4 V5 V6 V7
## 5 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 8 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 11 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 14 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 17 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 20 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## V8 V9 V10 V11
## 5 W 7 D 12 D 4 NA
## 8 W 16 W 20 W 7 NA
## 11 W 11 W 13 W 12 NA
## 14 D 5 W 19 D 1 NA
## 17 D 4 W 14 W 17 NA
## 20 D 10 W 27 W 21 NA
tail(first_rows)
## V1 V2 V3 V4 V5 V6 V7
## 179 59 SEAN M MC CORMICK 2.0 L 41 B L 9 L 40
## 182 60 JULIA SHEN 1.5 L 33 L 34 D 45 D 42
## 185 61 JEZZEL FARKAS 1.5 L 32 L 3 W 54 L 47
## 188 62 ASHWIN BALAJI 1.0 W 55 U U U
## 191 63 THOMAS JOSEPH HOSMER 1.0 L 2 L 48 D 49 L 43
## 194 64 BEN LI 1.0 L 22 D 30 L 31 D 49
## V8 V9 V10 V11
## 179 L 43 W 54 L 44 NA
## 182 L 24 H U NA
## 185 D 42 L 30 L 37 NA
## 188 U U U NA
## 191 L 45 H U NA
## 194 L 46 L 42 L 54 NA
head(second_rows)
## V1 V2 V3 V4 V5 V6 V7
## 6 ON 15445895 / R: 1794 ->1817 N:2 W B W B
## 9 MI 14598900 / R: 1553 ->1663 N:2 B W B W
## 12 MI 14959604 / R: 1384 ->1640 N:2 W B W B
## 15 MI 12616049 / R: 1716 ->1744 N:2 W B W B
## 18 MI 14601533 / R: 1655 ->1690 N:2 B W B W
## 21 OH 15055204 / R: 1686 ->1687 N:3 W B W B
## V8 V9 V10 V11
## 6 W B W NA
## 9 B W B NA
## 12 W B W NA
## 15 W B B NA
## 18 B W B NA
## 21 B W B NA
tail(second_rows)
## V1 V2 V3 V4 V5 V6 V7
## 180 MI 12841036 / R: 853 -> 878 W B B
## 183 MI 14579262 / R: 967 -> 984 W B B W
## 186 ON 15771592 / R: 955P11-> 979P18 B W B W
## 189 MI 15219542 / R: 1530 ->1535 B
## 192 MI 15057092 / R: 1175 ->1125 W B W B
## 195 MI 15006561 / R: 1163 ->1112 B W W B
## V8 V9 V10 V11
## 180 W W B NA
## 183 B NA
## 186 B W B NA
## 189 NA
## 192 B NA
## 195 W B B NA
Then the data is recombined into a single dataframe.
#extract column names
colnames1 <- raw.data[2,]
colnames2 <- raw.data[3,]
newcolnames <- cbind(colnames1, colnames2)
#This removes extra spaces before and after the names
newcolnames <- lapply(newcolnames, trimws)
#create dataframe where each player's data in on one row
df <- cbind(first_rows, second_rows)
#convert column names to character
#note: this step is important or colnames function wil read the factor indices insted of the text
newcolnames <- lapply(newcolnames, as.character)
#apply original colnames--now on a single row--to dataframe
colnames(df) = newcolnames
#view results
head(df)
## Pair Player Name Total Round Round Round Round
## 5 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 8 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 11 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 14 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 17 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 20 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## Round Round Round NA Num USCF ID / Rtg (Pre->Post) Pts
## 5 W 7 D 12 D 4 NA ON 15445895 / R: 1794 ->1817 N:2
## 8 W 16 W 20 W 7 NA MI 14598900 / R: 1553 ->1663 N:2
## 11 W 11 W 13 W 12 NA MI 14959604 / R: 1384 ->1640 N:2
## 14 D 5 W 19 D 1 NA MI 12616049 / R: 1716 ->1744 N:2
## 17 D 4 W 14 W 17 NA MI 14601533 / R: 1655 ->1690 N:2
## 20 D 10 W 27 W 21 NA OH 15055204 / R: 1686 ->1687 N:3
## 1 2 3 4 5 6 7 NA
## 5 W B W B W B W NA
## 8 B W B W B W B NA
## 11 W B W B W B W NA
## 14 W B W B W B B NA
## 17 B W B W B W B NA
## 20 W B W B B W B NA
#This results in too many columns named "Round!" Need to rename
new_points_names <- paste0("Round ", colnames(df[,15:21]), " Pair")
colnames(df)[4:10] = new_points_names
new_round_names <- paste0("Round ", c(1:7))
colnames(df)[15:21] = new_round_names
#Drop empty columns
df <- df[c(1:10,12:21)]
#This produces a dataframe with approximately the original column names
head(df)
## Pair Player Name Total Round 1 Pair
## 5 1 GARY HUA 6.0 W 39
## 8 2 DAKSHESH DARURI 6.0 W 63
## 11 3 ADITYA BAJAJ 6.0 L 8
## 14 4 PATRICK H SCHILLING 5.5 W 23
## 17 5 HANSHI ZUO 5.5 W 45
## 20 6 HANSEN SONG 5.0 W 34
## Round 2 Pair Round 3 Pair Round 4 Pair Round 5 Pair Round 6 Pair
## 5 W 21 W 18 W 14 W 7 D 12
## 8 W 58 L 4 W 17 W 16 W 20
## 11 W 61 W 25 W 21 W 11 W 13
## 14 D 28 W 2 W 26 D 5 W 19
## 17 W 37 D 12 D 13 D 4 W 14
## 20 D 29 L 11 W 35 D 10 W 27
## Round 7 Pair Num USCF ID / Rtg (Pre->Post) Pts Round 1
## 5 D 4 ON 15445895 / R: 1794 ->1817 N:2 W
## 8 W 7 MI 14598900 / R: 1553 ->1663 N:2 B
## 11 W 12 MI 14959604 / R: 1384 ->1640 N:2 W
## 14 D 1 MI 12616049 / R: 1716 ->1744 N:2 W
## 17 W 17 MI 14601533 / R: 1655 ->1690 N:2 B
## 20 W 21 OH 15055204 / R: 1686 ->1687 N:3 W
## Round 2 Round 3 Round 4 Round 5 Round 6 Round 7
## 5 B W B W B W
## 8 W B W B W B
## 11 B W B W B W
## 14 B W B W B B
## 17 W B W B W B
## 20 B W B B W B
names(df)
## [1] "Pair" "Player Name"
## [3] "Total" "Round 1 Pair"
## [5] "Round 2 Pair" "Round 3 Pair"
## [7] "Round 4 Pair" "Round 5 Pair"
## [9] "Round 6 Pair" "Round 7 Pair"
## [11] "Num" "USCF ID / Rtg (Pre->Post)"
## [13] "Pts" "Round 1"
## [15] "Round 2" "Round 3"
## [17] "Round 4" "Round 5"
## [19] "Round 6" "Round 7"
#Create a subset of the data frame that contains all the info that can be extracted without a join
df_sub <- select(df, Pair, "Player Name", Num, Total, "USCF ID / Rtg (Pre->Post)") %>%
rename("PairID"=Pair, "Player Name"="Player Name", "State"=Num, "Total Points"=Total, "ID_Ratings"="USCF ID / Rtg (Pre->Post)") %>%
#Extract Pre- ratings from combined ID_Ratings column, based on #StartEnd <- str_locate(df_sub$ID_Ratings, "R:( )+?[[:digit:]]{3,4}"
mutate(Pre.Rating = str_sub(ID_Ratings, start=15)) %>%
mutate_all(str_trim) %>%
mutate(Pre.Rating = as.numeric(str_extract(Pre.Rating,"[[:digit:]]+"))) %>%
arrange(Pre.Rating) %>%
#Drop ID_Ratings
select(-ID_Ratings) %>%
glimpse()
## Observations: 64
## Variables: 5
## $ PairID <chr> "46", "59", "58", "52", "61", "60", "37", "51",...
## $ `Player Name` <chr> "JACOB ALEXANDER LAVALLEY", "SEAN M MC CORMICK"...
## $ State <chr> "MI", "MI", "MI", "MI", "ON", "MI", "MI", "MI",...
## $ `Total Points` <chr> "3.0", "2.0", "2.0", "2.5", "1.5", "1.5", "3.5"...
## $ Pre.Rating <dbl> 377, 853, 917, 935, 955, 967, 980, 1011, 1056, ...
#Reshape data into long format using tidyr
#Create new long-format table with pair information
df_pairs <- df %>% select(ends_with("Pair")) %>%
gather(key = "Round", value = "PairID", -Pair, na.rm = FALSE, convert = FALSE, factor_key = TRUE) %>%
arrange(Pair) %>%
mutate(PairID = str_extract(PairID, "[[:digit:]]+")) %>%
glimpse()
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 448
## Variables: 3
## $ Pair <fct> 1 , 1 , 1 , 1 , 1 , 1 , 1 ,...
## $ Round <fct> Round 1 Pair, Round 2 Pair, Round 3 Pair, Round 4 Pair,...
## $ PairID <chr> "39", "21", "18", "14", "7", "12", "4", "63", "58", "4"...
#Join with df_sub to get the Player.2's ratings
df_player2_avg_rating <- inner_join(df_pairs, df_sub) %>%
#Group by pair
group_by(Pair)
## Joining, by = "PairID"
#Summarize to obtain mean rating of opponents
Player.2.Mean <- df_player2_avg_rating %>% summarize("Average Opponent Rating" = as.integer(mean(Pre.Rating))) %>%
mutate (PairID = as.character(Pair)) %>%
mutate_all(str_trim) %>%
mutate_all(unlist) %>%
glimpse()
## Observations: 64
## Variables: 3
## $ Pair <chr> "1", "2", "3", "4", "5", "6", "7", "...
## $ `Average Opponent Rating` <chr> "1605", "1469", "1563", "1573", "150...
## $ PairID <chr> "1", "2", "3", "4", "5", "6", "7", "...
#Rejoin to df_sub for final answer!
Chess <- inner_join(df_sub, Player.2.Mean) %>%
select(-Pair) %>%
arrange(PairID) %>%
mutate(`Total Points`=as.numeric(`Total Points`),
Pre.Rating = as.numeric(Pre.Rating),
`Average Opponent Rating`=as.numeric(`Average Opponent Rating`)) %>%
glimpse()
## Joining, by = "PairID"
## Observations: 64
## Variables: 6
## $ PairID <chr> "1", "10", "11", "12", "13", "14", "...
## $ `Player Name` <chr> "GARY HUA", "ANVIT RAO", "CAMERON WI...
## $ State <chr> "ON", "MI", "MI", "MI", "MI", "MI", ...
## $ `Total Points` <dbl> 6.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4...
## $ Pre.Rating <dbl> 1794, 1365, 1712, 1663, 1666, 1610, ...
## $ `Average Opponent Rating` <dbl> 1605, 1554, 1467, 1506, 1497, 1515, ...
head(Chess)
## PairID Player Name State Total Points Pre.Rating
## 1 1 GARY HUA ON 6.0 1794
## 2 10 ANVIT RAO MI 5.0 1365
## 3 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 4 12 KENNETH J TACK MI 4.5 1663
## 5 13 TORRANCE HENRY JR MI 4.5 1666
## 6 14 BRADLEY SHAW MI 4.5 1610
## Average Opponent Rating
## 1 1605
## 2 1554
## 3 1467
## 4 1506
## 5 1497
## 6 1515
#Note file will write to location set up in your RStudio preferences.
write.csv(Chess, file="ChessScores.csv")
Requires ggplot2
library(ggplot2)
#What is the relationship between Total Points (e.g. how the player did at this tournament) and Pre-Tournament Rating?
Chess %>%
ggplot(aes(x=Pre.Rating, y=`Total Points`))+
geom_point()+
geom_smooth(method = "lm")+
labs(title="Pre-Tournament Ratings vs. Tournament Total Points",
x="Pre-Tournament Ratings",
y="Tournament Total Points")
#What is the relationship between Total Points (e.g. how the player did at this tournament) and Average Opponent Pre-Tournament Rating?
Chess %>%
ggplot(aes(x=`Average Opponent Rating`, y=`Total Points`))+
geom_point()+
geom_smooth(method = "lm")+
labs(title="Mean Opponent Ratings vs. Tournament Total Points",
x="Mean Oppoenent Pre-Tournament Ratings",
y="Tournament Total Points")
Not surpringly, better-rated players did better at this tournament. Somewhat surprisingly, players paired with better-rated opponents also did better at this tournament! This can likely be explained that as players advanced, they were more likely to be paired with their better-rated peers, who had also won the last round.