Team: Anthony, Christina, David
Our team decided to use the following libraries to clean and select the data:
stringr magrtitr dplyer DT
In this section. We used choose the built-in R csv loader using “|” as a seperator. This still left a lot of information to clean. Regular expressions were used to remove dashes and NA rows.
library(stringr)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(DT)
#chess_data3<-read.csv("/Users/davidapolinar/Dropbox/CUNYProjects/Srping2019/Data607/project1/tournamentinfo.txt", sep = "|", header = FALSE)
raw_chess_data <- read.csv("https://raw.githubusercontent.com/dapolloxp/data607/master/project1/tournamentinfo.txt", sep = "|", header = FALSE)
# Remove all dashes
cleaned <- raw_chess_data[!grepl("-", raw_chess_data[,1]),]
# Remove ending NAs
cleaned <- cleaned[,1:10]
# Remove headers and store into new DF
player.info <- cleaned[3:nrow(cleaned),]
# Extract first column with numbers
player.info.1 <- cleaned[grepl("\\d", cleaned[,1]),]
# Convert Factors to characters
for(i in 4:10)
{
player.info.1[,i]<- as.character((player.info.1[,i]))
}
# First two columns containing State and USCF ID
player.info.2 <- cleaned[!grepl("\\d", cleaned[,1]),1:2]
# trim first two rows
player.info.2 <- player.info.2[3:nrow(player.info.2),]
# merge tables - first remove factors for rating column to merge
player.info.2[,2] <- as.character(player.info.2[,2])
final.player.info <- cbind(player.info.1,player.info.2)
# Add names
colnames(final.player.info) <- c("Pair Num","Player Name","Total Pts","Round 1", "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7", "State", "USCF ID / Rtg (Pre -> Post)")
# convert data types
final.player.info[,1] <- as.numeric(as.character(final.player.info[,1]))
final.player.info[,3] <- as.numeric(as.character(final.player.info[,3]))
# extract pre-rating from string
final.player.info$prerating<- as.numeric(str_extract(str_extract(final.player.info[,12], "R:\\s*[\\d]{3,4}"), "([0-9]+)"))
#final.player.info
x<- final.player.info[,4:10]
#final.player.info$OpponentPreRating <- c(1:nrow(final.player.info))
get.rating.by.player.num <- function(z)
{
return(as.numeric(final.player.info %>% filter(`Pair Num` == z) %>% select(prerating)))
}
calculate.avg.pre.rating <- function(df.players)
{
df.avg <- data.frame()
for( i in 1:nrow(df.players))
{
o <- as.numeric(c(apply(df.players[i,1:length(df.players)], 1, function(x)str_extract(x, "[0-9]+"))))
f<- sapply(o, get.rating.by.player.num)
df.avg <- rbind(df.avg, mean(f, na.rm = TRUE))
}
return(round(df.avg))
}
prerating.avgs <- calculate.avg.pre.rating(x)
colnames(prerating.avgs) <- c("OpponentPreRating")
final.player.info <- cbind(final.player.info, prerating.avgs)
Because we chose to store the data in a final dataframe, this made it possible for us to make queries and leverage the power of dplyr.
# select top 10 rows
select(final.player.info, "Player Name", "State", "Total Pts", "OpponentPreRating") %>% top_n(10)
## Selecting by OpponentPreRating
## Player Name State Total Pts OpponentPreRating
## 1 GARY HUA ON 6.0 1605
## 2 ADITYA BAJAJ MI 6.0 1564
## 3 PATRICK H SCHILLING MI 5.5 1574
## 4 HANSEN SONG OH 5.0 1519
## 5 STEFANO LEE ON 5.0 1523
## 6 ANVIT RAO MI 5.0 1554
## 7 BRADLEY SHAW MI 4.5 1515
## 8 MAX ZHU ON 3.5 1507
## 9 SOFIA ADINA STANESCU-BELLU MI 3.5 1522
## 10 BRIAN LIU MI 3.0 1539
datatable(select(final.player.info, "Player Name", "State", "Total Pts", "OpponentPreRating"))
To analyze the data in this section, we start with a histogram to see the total points distribution. Looking at the point distribution, the data appears to be close to normal.
ggplot(data=final.player.info) + geom_histogram (aes(x=`Total Pts`),color="black", fill="lightblue") + labs(title="Points Frequency plot",x="Points", y = "Point Frequency ")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.