Title DATA 607 Project 1
Author Rose Koh
Date 2018/02/21
Description Process unstructured text data into a structured table using regular expressions.
Rpub Rpub Link
Github Github Link

Project intro

# Worked example of the "average pre chess rating of opponents" measure.
# Gary Hua faced the players with pair numbers 39, 21, 18, 14, 7, 12 and 4;
# the vector below holds the seven pre-ratings that are averaged for him.
gary_opponent_ratings <- c(1436, 1563, 1600, 1610, 1649, 1663, 1716)
mean(gary_opponent_ratings)
## [1] 1605.286
library(stringr)
library(DT)
library(ggplot2)

Load Data

# Path of the raw tournament text file, relative to the working directory.
con <- "./tournamentinfo.txt"
# Read the file into a character vector, one element per line.
tourinfo <- readLines(con = con)
# Show the first ten lines to inspect the layout.
head(x = tourinfo, n = 10)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

Preprocessing

Divide the structure
# The raw file starts with a 4-line header, followed by one 3-line record per
# player:
#   line 1 - pair number, player name, total points and per-round results
#   line 2 - state, USCF ID and ratings
#   line 3 - dashed separator
# Split the record lines into two parallel vectors so that each kind of line
# can be parsed on its own.

# Drop the 4 header lines.
info <- tourinfo[-seq_len(4)]

# Drop empty lines so they cannot shift the 3-line record stride.
# nzchar() is the type-stable, vectorised form of sapply(info, nchar) > 0.
info <- info[nzchar(info)]

# Position of every line inside its record: 1 = player line, 2 = detail line,
# 0 = separator. Unlike seq(1, length(info), 3), this also works when `info`
# is empty or holds a single line.
record_pos <- seq_along(info) %% 3

# The names "odd"/"even" are kept for the code that follows; they denote the
# first and second line of each 3-line record, not literal odd/even rows.
odd <- which(record_pos == 1)
odd_a <- info[odd]

even <- which(record_pos == 2)
even_a <- info[even]
Use regex
# Use regular expressions to pull out only the required fields.
# `odd_a` holds the player lines and `even_a` the detail lines of each record.

# name: the run of letters, spaces and hyphens that ends at a "|"
name <- str_extract(odd_a, "\\s+([[:alpha:]- ]+)\\b\\s*\\|")
name <- gsub(name, pattern = "|", replacement = "", fixed = TRUE)
# strip the padding left around the name
name <- trimws(name)

# state: the first two consecutive letters of the detail line
state <- str_extract(even_a, "[[:alpha:]]{2}")

# total_points: the first "digits.digit" token of the player line
# (str_extract() already returns character, so as.numeric() is enough)
total_points <- as.numeric(str_extract(odd_a, "[[:digit:]]+\\.[[:digit:]]"))

# pre_rating: the 3-4 digits that follow "R: " (one optional extra space);
# the capture group returns the digits directly, so no prefix has to be
# stripped afterwards
pre_rating <- str_match(even_a, "R: \\s?([[:digit:]]{3,4})")[, 2]
pre_rating <- as.numeric(pre_rating)

# opponent_number: per player, the pair numbers of the opponents faced.
# Opponent numbers are the 1-2 digit runs sitting directly before a "|".
# The look-ahead keeps the "|" out of the match, which avoids running a
# second str_extract_all() over a list: that relied on implicit
# list-to-string coercion and turned a player without any match into the
# text "character(0)", i.e. a spurious opponent number 0.
opponent_number <- str_extract_all(odd_a, "[[:digit:]]{1,2}(?=\\|)")
opponent_number <- lapply(opponent_number, as.numeric)
Use for loop to calculate the Average pre chess rating of opponents
# Average pre-rating of each player's opponents, rounded to 2 decimals.
# Opponent pair numbers are used as positions into `pre_rating`; this assumes
# the players are stored in pair-number order -- TODO confirm for other files.
# vapply() returns a plain numeric vector and copes with an empty
# `opponent_number`, where iterating over 1:length(x) would visit c(1, 0) and
# create a bogus entry.
opp_avg_rating <- vapply(
  opponent_number,
  function(opponents) round(mean(pre_rating[opponents]), 2),
  numeric(1)
)
# Keep the shape the following code expects: a one-column data frame.
opp_avg_rating <- data.frame(unlist(opp_avg_rating))

Final dataset

# Assemble the extracted fields column-wise into the final data frame;
# check.names = FALSE leaves the incoming column names untouched until they
# are replaced below.
df <- data.frame(
  name, state, total_points, pre_rating, opp_avg_rating,
  check.names = FALSE
)
# Give the columns their report names.
names(df) <- c(
  "Name",
  "State",
  "Total_points",
  "Pre_rating",
  "Avg_pre_chess_rating_of_opponents"
)
# Inspect the structure of the result.
str(df)
## 'data.frame':    64 obs. of  5 variables:
##  $ Name                             : Factor w/ 64 levels "ADITYA BAJAJ",..: 24 12 1 51 28 27 23 21 59 5 ...
##  $ State                            : Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ Total_points                     : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ Pre_rating                       : num  1794 1553 1384 1716 1655 ...
##  $ Avg_pre_chess_rating_of_opponents: num  1605 1469 1564 1574 1501 ...
# Render the data frame as an interactive table.
datatable(df)

Visualization

# Per-column summary of the final data frame: level counts for the factor
# columns and min / quartiles / mean / max for the numeric ones.
summary(df)
##                    Name    State    Total_points     Pre_rating  
##  ADITYA BAJAJ        : 1   MI:55   Min.   :1.000   Min.   : 377  
##  ALAN BUI            : 1   OH: 1   1st Qu.:2.500   1st Qu.:1227  
##  ALEX KONG           : 1   ON: 8   Median :3.500   Median :1407  
##  AMIYATOSH PWNANANDAM: 1           Mean   :3.438   Mean   :1378  
##  ANVIT RAO           : 1           3rd Qu.:4.000   3rd Qu.:1583  
##  ASHWIN BALAJI       : 1           Max.   :6.000   Max.   :1794  
##  (Other)             :58                                         
##  Avg_pre_chess_rating_of_opponents
##  Min.   :1107                     
##  1st Qu.:1310                     
##  Median :1382                     
##  Mean   :1379                     
##  3rd Qu.:1481                     
##  Max.   :1605                     
## 
library(ggthemes)
library(plotly)
# Interactive scatter plot of each player's pre-rating against the average
# pre-rating of their opponents; point size encodes total points, colour and
# shape encode the state.
# The aesthetics are mapped to columns of `df`. The previous names
# (`pre_rating`, `opp_avg_rating`) are not columns of `df`, so they were
# resolved from the global environment instead of from the plotted data.
# geom_abline() with its defaults draws the y = x reference line.
ggplotly(
  ggplot(
    df,
    aes(Pre_rating, Avg_pre_chess_rating_of_opponents,
        color = State, group = State)
  ) +
    geom_point(aes(size = Total_points, shape = State)) +
    geom_abline() +
    labs(
      title = "Relationship between players' pre-rating and their opponents' average pre-rating",
      x = "Player's pre-rating",
      y = "Opponents' average pre-rating"
    )
)
# Distribution of total points scored (min, quartiles, mean, max).
summary(df$Total_points)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.500   3.500   3.438   4.000   6.000
# Distribution of the players' own pre-ratings.
summary(df$Pre_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     377    1227    1407    1378    1583    1794
# Distribution of the opponents' average pre-rating per player.
summary(df$Avg_pre_chess_rating_of_opponents)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1107    1310    1382    1379    1481    1605

Export the csv file

# Persist the structured data frame as a CSV file in the working directory.
write.csv(x = df, file = "Chesstable.csv")