Step 1: Load required libraries and read text file into R

Code that follows requires libraries stringr, dplyr, and tidyr.

library(stringr)
library(dplyr)
library(tidyr)

raw.data <- read.delim("https://raw.githubusercontent.com/aliceafriedman/DATA607_Proj1/master/tournamentinfo.txt", header = FALSE, sep = "|", dec = ".")

Step 2: Read data from each series of rows into its own table

The data is spread into 2 rows. The following code splits the data into the first row and second row for variables pertaining to each player.

head(raw.data)

##                                                                                          V1
## 1 -----------------------------------------------------------------------------------------
## 2                                                                                     Pair 
## 3                                                                                     Num  
## 4 -----------------------------------------------------------------------------------------
## 5                                                                                        1 
## 6                                                                                       ON 
##                                  V2    V3    V4    V5    V6    V7    V8
## 1                                                                      
## 2  Player Name                      Total Round Round Round Round Round
## 3  USCF ID / Rtg (Pre->Post)         Pts    1     2     3     4     5  
## 4                                                                      
## 5  GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 6  15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
##      V9   V10 V11
## 1              NA
## 2 Round Round  NA
## 3   6     7    NA
## 4              NA
## 5 D  12 D   4  NA
## 6 B     W      NA

#code checks for the first row by looking for the player's ID, which is an integer
first_row_test <- str_detect(raw.data$V1, "[[:digit:]]{1,3}")
#code subsets rows meeting the first_row_test, starting with the first row of player data 
first_rows <- raw.data[first_row_test,]
#code checks for the second row by matching a 2 letter code regex to find the rows with state info
second_row_test <- str_detect(raw.data$V1, "[[:upper:]]{2}")
#code subsets rows meeting the second_row_test, starting with the first row of player data
second_rows <- raw.data[second_row_test,]

head(first_rows)

##        V1                                V2    V3    V4    V5    V6    V7
## 5      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14
## 8      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17
## 11     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21
## 14     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26
## 17     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13
## 20     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35
##       V8    V9   V10 V11
## 5  W   7 D  12 D   4  NA
## 8  W  16 W  20 W   7  NA
## 11 W  11 W  13 W  12  NA
## 14 D   5 W  19 D   1  NA
## 17 D   4 W  14 W  17  NA
## 20 D  10 W  27 W  21  NA

tail(first_rows)

##         V1                                V2    V3    V4    V5    V6    V7
## 179    59   SEAN M MC CORMICK                2.0   L  41 B     L   9 L  40
## 182    60   JULIA SHEN                       1.5   L  33 L  34 D  45 D  42
## 185    61   JEZZEL FARKAS                    1.5   L  32 L   3 W  54 L  47
## 188    62   ASHWIN BALAJI                    1.0   W  55 U     U     U    
## 191    63   THOMAS JOSEPH HOSMER             1.0   L   2 L  48 D  49 L  43
## 194    64   BEN LI                           1.0   L  22 D  30 L  31 D  49
##        V8    V9   V10 V11
## 179 L  43 W  54 L  44  NA
## 182 L  24 H     U      NA
## 185 D  42 L  30 L  37  NA
## 188 U     U     U      NA
## 191 L  45 H     U      NA
## 194 L  46 L  42 L  54  NA

head(second_rows)

##        V1                                V2    V3    V4    V5    V6    V7
## 6     ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B    
## 9     MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W    
## 12    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B    
## 15    MI   12616049 / R: 1716   ->1744      N:2   W     B     W     B    
## 18    MI   14601533 / R: 1655   ->1690      N:2   B     W     B     W    
## 21    OH   15055204 / R: 1686   ->1687      N:3   W     B     W     B    
##       V8    V9   V10 V11
## 6  W     B     W      NA
## 9  B     W     B      NA
## 12 W     B     W      NA
## 15 W     B     B      NA
## 18 B     W     B      NA
## 21 B     W     B      NA

tail(second_rows)

##         V1                                V2    V3    V4    V5    V6    V7
## 180    MI   12841036 / R:  853   -> 878            W           B     B    
## 183    MI   14579262 / R:  967   -> 984            W     B     B     W    
## 186    ON   15771592 / R:  955P11-> 979P18         B     W     B     W    
## 189    MI   15219542 / R: 1530   ->1535            B                      
## 192    MI   15057092 / R: 1175   ->1125            W     B     W     B    
## 195    MI   15006561 / R: 1163   ->1112            B     W     W     B    
##        V8    V9   V10 V11
## 180 W     W     B      NA
## 183 B                  NA
## 186 B     W     B      NA
## 189                    NA
## 192 B                  NA
## 195 W     B     B      NA

Step 3: Combine data into single dataframe and add coloumn names.

Then the data is recombined into a single dataframe.

#extract column names
colnames1 <- raw.data[2,]
colnames2 <- raw.data[3,]
newcolnames <- cbind(colnames1, colnames2)

#This removes extra spaces before and after the names
newcolnames <- lapply(newcolnames, trimws)

#create dataframe where each player's data in on one row
df <- cbind(first_rows, second_rows)

#convert column names to character
#note: this step is important or colnames function wil read the factor indices insted of the text
newcolnames <- lapply(newcolnames, as.character)

#apply original colnames--now on a single row--to dataframe
colnames(df) = newcolnames

#view results
head(df)

##      Pair                       Player Name Total Round Round Round Round
## 5      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14
## 8      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17
## 11     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21
## 14     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26
## 17     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13
## 20     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35
##    Round Round Round NA    Num         USCF ID / Rtg (Pre->Post)   Pts
## 5  W   7 D  12 D   4 NA    ON   15445895 / R: 1794   ->1817      N:2  
## 8  W  16 W  20 W   7 NA    MI   14598900 / R: 1553   ->1663      N:2  
## 11 W  11 W  13 W  12 NA    MI   14959604 / R: 1384   ->1640      N:2  
## 14 D   5 W  19 D   1 NA    MI   12616049 / R: 1716   ->1744      N:2  
## 17 D   4 W  14 W  17 NA    MI   14601533 / R: 1655   ->1690      N:2  
## 20 D  10 W  27 W  21 NA    OH   15055204 / R: 1686   ->1687      N:3  
##        1     2     3     4     5     6     7 NA
## 5  W     B     W     B     W     B     W     NA
## 8  B     W     B     W     B     W     B     NA
## 11 W     B     W     B     W     B     W     NA
## 14 W     B     W     B     W     B     B     NA
## 17 B     W     B     W     B     W     B     NA
## 20 W     B     W     B     B     W     B     NA

#This results in too many columns named "Round!" Need to rename
new_points_names <- paste0("Round ", colnames(df[,15:21]), " Pair")
colnames(df)[4:10] = new_points_names
new_round_names <- paste0("Round ", c(1:7))
colnames(df)[15:21] = new_round_names

#Drop empty columns
df <- df[c(1:10,12:21)]

#This produces a dataframe with approximately the original column names
head(df)

##      Pair                       Player Name Total Round 1 Pair
## 5      1   GARY HUA                         6.0          W  39
## 8      2   DAKSHESH DARURI                  6.0          W  63
## 11     3   ADITYA BAJAJ                     6.0          L   8
## 14     4   PATRICK H SCHILLING              5.5          W  23
## 17     5   HANSHI ZUO                       5.5          W  45
## 20     6   HANSEN SONG                      5.0          W  34
##    Round 2 Pair Round 3 Pair Round 4 Pair Round 5 Pair Round 6 Pair
## 5         W  21        W  18        W  14        W   7        D  12
## 8         W  58        L   4        W  17        W  16        W  20
## 11        W  61        W  25        W  21        W  11        W  13
## 14        D  28        W   2        W  26        D   5        W  19
## 17        W  37        D  12        D  13        D   4        W  14
## 20        D  29        L  11        W  35        D  10        W  27
##    Round 7 Pair    Num         USCF ID / Rtg (Pre->Post)   Pts Round 1
## 5         D   4    ON   15445895 / R: 1794   ->1817      N:2     W    
## 8         W   7    MI   14598900 / R: 1553   ->1663      N:2     B    
## 11        W  12    MI   14959604 / R: 1384   ->1640      N:2     W    
## 14        D   1    MI   12616049 / R: 1716   ->1744      N:2     W    
## 17        W  17    MI   14601533 / R: 1655   ->1690      N:2     B    
## 20        W  21    OH   15055204 / R: 1686   ->1687      N:3     W    
##    Round 2 Round 3 Round 4 Round 5 Round 6 Round 7
## 5    B       W       B       W       B       W    
## 8    W       B       W       B       W       B    
## 11   B       W       B       W       B       W    
## 14   B       W       B       W       B       B    
## 17   W       B       W       B       W       B    
## 20   B       W       B       B       W       B

names(df)

##  [1] "Pair"                      "Player Name"              
##  [3] "Total"                     "Round 1 Pair"             
##  [5] "Round 2 Pair"              "Round 3 Pair"             
##  [7] "Round 4 Pair"              "Round 5 Pair"             
##  [9] "Round 6 Pair"              "Round 7 Pair"             
## [11] "Num"                       "USCF ID / Rtg (Pre->Post)"
## [13] "Pts"                       "Round 1"                  
## [15] "Round 2"                   "Round 3"                  
## [17] "Round 4"                   "Round 5"                  
## [19] "Round 6"                   "Round 7"

Step 4: Calculate opponents average pre-tournament scores.

#Create a subset of the data frame that contains all the info that can be extracted without a join
df_sub <- select(df, Pair, "Player Name", Num, Total, "USCF ID / Rtg (Pre->Post)") %>%  
  rename("PairID"=Pair, "Player Name"="Player Name", "State"=Num, "Total Points"=Total, "ID_Ratings"="USCF ID / Rtg (Pre->Post)") %>%  
#Extract Pre- ratings from combined ID_Ratings column, based on #StartEnd <- str_locate(df_sub$ID_Ratings, "R:( )+?[[:digit:]]{3,4}"
  mutate(Pre.Rating = str_sub(ID_Ratings, start=15)) %>%
  mutate_all(str_trim) %>%
  mutate(Pre.Rating = as.numeric(str_extract(Pre.Rating,"[[:digit:]]+"))) %>%
  arrange(Pre.Rating) %>%
#Drop ID_Ratings
  select(-ID_Ratings) %>%
  glimpse()

## Observations: 64
## Variables: 5
## $ PairID         <chr> "46", "59", "58", "52", "61", "60", "37", "51",...
## $ `Player Name`  <chr> "JACOB ALEXANDER LAVALLEY", "SEAN M MC CORMICK"...
## $ State          <chr> "MI", "MI", "MI", "MI", "ON", "MI", "MI", "MI",...
## $ `Total Points` <chr> "3.0", "2.0", "2.0", "2.5", "1.5", "1.5", "3.5"...
## $ Pre.Rating     <dbl> 377, 853, 917, 935, 955, 967, 980, 1011, 1056, ...

#Reshape data into long format using tidyr
#Create new long-format table with pair information
df_pairs <- df %>% select(ends_with("Pair")) %>%
  gather(key = "Round", value = "PairID", -Pair, na.rm = FALSE, convert = FALSE, factor_key = TRUE) %>%
  arrange(Pair) %>%
  mutate(PairID = str_extract(PairID, "[[:digit:]]+")) %>%
  glimpse()

## Warning: attributes are not identical across measure variables;
## they will be dropped

## Observations: 448
## Variables: 3
## $ Pair   <fct>     1 ,     1 ,     1 ,     1 ,     1 ,     1 ,     1 ,...
## $ Round  <fct> Round 1 Pair, Round 2 Pair, Round 3 Pair, Round 4 Pair,...
## $ PairID <chr> "39", "21", "18", "14", "7", "12", "4", "63", "58", "4"...

#Join with df_sub to get the Player.2's ratings
df_player2_avg_rating <- inner_join(df_pairs, df_sub) %>%
#Group by pair
  group_by(Pair)

## Joining, by = "PairID"

#Summarize to  obtain mean rating of opponents
Player.2.Mean <-  df_player2_avg_rating %>% summarize("Average Opponent Rating" = as.integer(mean(Pre.Rating))) %>%
  mutate (PairID = as.character(Pair)) %>%
  mutate_all(str_trim) %>%
  mutate_all(unlist) %>%
  glimpse()

## Observations: 64
## Variables: 3
## $ Pair                      <chr> "1", "2", "3", "4", "5", "6", "7", "...
## $ `Average Opponent Rating` <chr> "1605", "1469", "1563", "1573", "150...
## $ PairID                    <chr> "1", "2", "3", "4", "5", "6", "7", "...

Final Step: Rejoin to dataframe for final answer, and write to CSV!

#Rejoin to df_sub for final answer!
Chess <- inner_join(df_sub, Player.2.Mean) %>%
  select(-Pair) %>%
  arrange(PairID) %>%
  mutate(`Total Points`=as.numeric(`Total Points`), 
         Pre.Rating = as.numeric(Pre.Rating),
         `Average Opponent Rating`=as.numeric(`Average Opponent Rating`)) %>%
  glimpse()

## Joining, by = "PairID"

## Observations: 64
## Variables: 6
## $ PairID                    <chr> "1", "10", "11", "12", "13", "14", "...
## $ `Player Name`             <chr> "GARY HUA", "ANVIT RAO", "CAMERON WI...
## $ State                     <chr> "ON", "MI", "MI", "MI", "MI", "MI", ...
## $ `Total Points`            <dbl> 6.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4...
## $ Pre.Rating                <dbl> 1794, 1365, 1712, 1663, 1666, 1610, ...
## $ `Average Opponent Rating` <dbl> 1605, 1554, 1467, 1506, 1497, 1515, ...

head(Chess)

##   PairID              Player Name State Total Points Pre.Rating
## 1      1                 GARY HUA    ON          6.0       1794
## 2     10                ANVIT RAO    MI          5.0       1365
## 3     11 CAMERON WILLIAM MC LEMAN    MI          4.5       1712
## 4     12           KENNETH J TACK    MI          4.5       1663
## 5     13        TORRANCE HENRY JR    MI          4.5       1666
## 6     14             BRADLEY SHAW    MI          4.5       1610
##   Average Opponent Rating
## 1                    1605
## 2                    1554
## 3                    1467
## 4                    1506
## 5                    1497
## 6                    1515

#Note file will write to location set up in your RStudio preferences.
write.csv(Chess, file="ChessScores.csv")

Data Visualization

Requires ggplot2

library(ggplot2)

#What is the relationship between Total Points (e.g. how the player did at this tournament) and Pre-Tournament Rating?
Chess %>% 
  ggplot(aes(x=Pre.Rating, y=`Total Points`))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(title="Pre-Tournament Ratings vs. Tournament Total Points",
       x="Pre-Tournament Ratings",
       y="Tournament Total Points")

#What is the relationship between Total Points (e.g. how the player did at this tournament) and Average Opponent Pre-Tournament Rating?
Chess %>% 
  ggplot(aes(x=`Average Opponent Rating`, y=`Total Points`))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(title="Mean Opponent Ratings vs. Tournament Total Points",
       x="Mean Oppoenent Pre-Tournament Ratings",
       y="Tournament Total Points")

Not surpringly, better-rated players did better at this tournament. Somewhat surprisingly, players paired with better-rated opponents also did better at this tournament! This can likely be explained that as players advanced, they were more likely to be paired with their better-rated peers, who had also won the last round.

Project 1