Title: DATA 607 Project 1 | Author: Rose Koh | Date: 2018/02/21 | Description: Process unstructured data into a structured form using regular expressions. | Rpub: Rpub Link | Github: Github Link
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information:
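For all of the players: the player's name, state, total number of points, pre-rating, and the average pre-tournament chess rating of their opponents.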
# Worked example of the "average pre chess rating of opponents" figure.
# Gary Hua's opponents have pair numbers (39, 21, 18, 14, 7, 12, 4); the values
# averaged below are presumably those opponents' pre-ratings.
mean(
  x = c(1436, 1563, 1600, 1610, 1649, 1663, 1716)
)
## [1] 1605.286
library(stringr)
library(DT)
library(ggplot2)
# Path to the raw tournament text file, relative to the working directory.
con <- "./tournamentinfo.txt"
# Load the file as a character vector, one element per line.
tourinfo <- readLines(con = con)
# Show the first ten lines to inspect the layout.
head(x = tourinfo, n = 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
# Layout of the file: after the header, each player is described by two
# consecutive lines followed by a dashed separator line, so the pattern repeats
# every three lines. Split the first and the second line of every player into
# two separate vectors so each can be parsed on its own.
# Drop the four header lines.
info <- tourinfo[-seq_len(4)]
# Drop empty (zero-character) lines, if any.
info <- info[nchar(info) > 0]
# First line of each player record: positions 1, 4, 7, ...
odd <- seq(from = 1, to = length(info), by = 3)
odd_a <- info[odd]
# Second line of each player record: positions 2, 5, 8, ...
even <- seq(from = 2, to = length(info), by = 3)
even_a <- info[even]
# Extract only the required fields with regular expressions.
# odd_a holds the first line of each player record (pair number, name, total
# points, per-round results); even_a holds the second line (state, USCF ID,
# ratings).

# name: the run of letters, hyphens and spaces that ends at a "|"
name <- str_extract(odd_a, "\\s+([[:alpha:]- ]+)\\b\\s*\\|")
name <- gsub(name, pattern = "|", replacement = "", fixed = TRUE)
# strip the surrounding padding
name <- trimws(name)

# state: the first two consecutive letters on the second line
state <- str_extract(even_a, "[[:alpha:]]{2}")

# total_points: the first "digits.digit" token on the first line.
# str_extract() already returns character, so convert straight to numeric.
total_points <- as.numeric(str_extract(odd_a, "[[:digit:]]+\\.[[:digit:]]"))

# pre_rating: the 3-4 digit number that follows "R: "
pre_rating <- str_extract(even_a, ".\\: \\s?[[:digit:]]{3,4}")
pre_rating <- gsub(pre_rating, pattern = "R: ", replacement = "", fixed = TRUE)
pre_rating <- as.numeric(pre_rating)

# opponent_number: pair numbers of the opponents each player faced.
# A round cell that was played ends in "<digits>|"; the lookahead keeps the
# digits and leaves the "|" out of the match, so no second extraction pass over
# a list (which would depend on implicit list-to-character coercion) is needed.
opponent_number <- str_extract_all(odd_a, "[[:digit:]]{1,2}(?=\\|)")
# one numeric vector of opponent pair numbers per player
opponent_number <- lapply(opponent_number, as.numeric)
# Average pre-tournament rating of each player's opponents, rounded to two
# decimals. Opponent pair numbers are used directly as positions in pre_rating,
# which assumes players appear in the file in pair-number order.
# vapply() returns one numeric per player and yields an empty result for empty
# input, unlike a 1:length() loop.
opp_avg_rating <- vapply(
  opponent_number,
  function(opponents) round(mean(pre_rating[opponents]), 2),
  numeric(1)
)
# Keep the result as a one-column data frame. unlist() is a no-op on the
# numeric vector but preserves the auto-generated column name.
opp_avg_rating <- data.frame(unlist(opp_avg_rating))
# Assemble the extracted fields column-wise and assign the final column names
# in a single step.
df <- setNames(
  cbind.data.frame(name, state, total_points, pre_rating, opp_avg_rating),
  c(
    "Name",
    "State",
    "Total_points",
    "Pre_rating",
    "Avg_pre_chess_rating_of_opponents"
  )
)
# Inspect the structure of the result.
str(object = df)
## 'data.frame': 64 obs. of 5 variables:
## $ Name : Factor w/ 64 levels "ADITYA BAJAJ",..: 24 12 1 51 28 27 23 21 59 5 ...
## $ State : Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ Total_points : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ Pre_rating : num 1794 1553 1384 1716 1655 ...
## $ Avg_pre_chess_rating_of_opponents: num 1605 1469 1564 1574 1501 ...
# Render the data frame as a DT datatable widget.
DT::datatable(data = df)
# Per-column summary of the data frame.
summary(object = df)
## Name State Total_points Pre_rating
## ADITYA BAJAJ : 1 MI:55 Min. :1.000 Min. : 377
## ALAN BUI : 1 OH: 1 1st Qu.:2.500 1st Qu.:1227
## ALEX KONG : 1 ON: 8 Median :3.500 Median :1407
## AMIYATOSH PWNANANDAM: 1 Mean :3.438 Mean :1378
## ANVIT RAO : 1 3rd Qu.:4.000 3rd Qu.:1583
## ASHWIN BALAJI : 1 Max. :6.000 Max. :1794
## (Other) :58
## Avg_pre_chess_rating_of_opponents
## Min. :1107
## 1st Qu.:1310
## Median :1382
## Mean :1379
## 3rd Qu.:1481
## Max. :1605
##
library(ggthemes)
library(plotly)
# Interactive scatter plot of each player's pre-rating against the average
# pre-rating of their opponents, coloured and shaped by state and sized by
# total points. The aesthetics map columns of df by their actual names, so the
# plot no longer depends on same-named objects in the global environment.
ggplotly(
  ggplot(
    df,
    aes(Pre_rating, Avg_pre_chess_rating_of_opponents,
        color = State, group = State)
  ) +
    geom_point(aes(size = Total_points, shape = State)) +
    # geom_abline() defaults to slope 1, intercept 0: the y = x reference line
    geom_abline() +
    labs(title = "Relationship between players' pre-rating and their opponents' average pre-rating",
         x = "Player's pre-rating",
         y = "Opponents' average pre-rating")
)
# Summary statistics for each numeric column, one column at a time.
summary(df[["Total_points"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.500 3.500 3.438 4.000 6.000
summary(df[["Pre_rating"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 377 1227 1407 1378 1583 1794
summary(df[["Avg_pre_chess_rating_of_opponents"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1107 1310 1382 1379 1481 1605
# Export the structured data set to a CSV file in the working directory.
# NOTE(review): write.csv() writes row names by default, so the file gets an
# extra unnamed leading column; confirm whether the consumer wants
# row.names = FALSE.
write.csv(x = df, file = "Chesstable.csv")