Overview

Team: Anthony, Christina, David

Our team decided to use the following libraries to clean and select the data:

stringr magrtitr dplyer DT

Data Cleansing

In this section. We used choose the built-in R csv loader using “|” as a seperator. This still left a lot of information to clean. Regular expressions were used to remove dashes and NA rows.

library(stringr)
library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(DT)

#chess_data3<-read.csv("/Users/davidapolinar/Dropbox/CUNYProjects/Srping2019/Data607/project1/tournamentinfo.txt", sep = "|", header = FALSE)
raw_chess_data <- read.csv("https://raw.githubusercontent.com/dapolloxp/data607/master/project1/tournamentinfo.txt", sep = "|", header = FALSE)

# Remove all dashes
cleaned <- raw_chess_data[!grepl("-", raw_chess_data[,1]),]
# Remove ending NAs
cleaned <- cleaned[,1:10]
# Remove headers and store into new DF
player.info <- cleaned[3:nrow(cleaned),]

# Extract first column with numbers
player.info.1 <- cleaned[grepl("\\d", cleaned[,1]),]
# Convert Factors to characters
for(i in 4:10)
{
  player.info.1[,i]<- as.character((player.info.1[,i]))
}
# First two columns containing State and USCF ID
player.info.2 <- cleaned[!grepl("\\d", cleaned[,1]),1:2]
# trim first two rows
player.info.2 <- player.info.2[3:nrow(player.info.2),]
# merge tables - first remove factors for rating column to merge
player.info.2[,2] <- as.character(player.info.2[,2])

final.player.info <- cbind(player.info.1,player.info.2)
# Add names
colnames(final.player.info) <- c("Pair Num","Player Name","Total Pts","Round 1", "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7", "State", "USCF ID / Rtg (Pre -> Post)")
# convert data types
final.player.info[,1] <- as.numeric(as.character(final.player.info[,1]))
final.player.info[,3] <- as.numeric(as.character(final.player.info[,3]))

# extract pre-rating from string
final.player.info$prerating<- as.numeric(str_extract(str_extract(final.player.info[,12], "R:\\s*[\\d]{3,4}"), "([0-9]+)"))
#final.player.info
x<- final.player.info[,4:10]

#final.player.info$OpponentPreRating <- c(1:nrow(final.player.info))
get.rating.by.player.num <- function(z)
{
  return(as.numeric(final.player.info %>% filter(`Pair Num` == z) %>% select(prerating)))
}

calculate.avg.pre.rating <- function(df.players)
{
  df.avg <- data.frame()
  
 for( i in 1:nrow(df.players))
  {
   o <- as.numeric(c(apply(df.players[i,1:length(df.players)], 1, function(x)str_extract(x, "[0-9]+"))))
   f<- sapply(o, get.rating.by.player.num)

  df.avg <- rbind(df.avg, mean(f, na.rm = TRUE))
  
 }
  return(round(df.avg))
}

prerating.avgs <- calculate.avg.pre.rating(x)
colnames(prerating.avgs) <- c("OpponentPreRating")
final.player.info <- cbind(final.player.info, prerating.avgs)

Data Selection and Visualization

Because we chose to store the data in a final dataframe, this made it possible for us to make queries and leverage the power of dplyr.

# select top 10 rows
select(final.player.info, "Player Name", "State", "Total Pts", "OpponentPreRating") %>% top_n(10)

## Selecting by OpponentPreRating

##                          Player Name  State Total Pts OpponentPreRating
## 1   GARY HUA                            ON        6.0              1605
## 2   ADITYA BAJAJ                        MI        6.0              1564
## 3   PATRICK H SCHILLING                 MI        5.5              1574
## 4   HANSEN SONG                         OH        5.0              1519
## 5   STEFANO LEE                         ON        5.0              1523
## 6   ANVIT RAO                           MI        5.0              1554
## 7   BRADLEY SHAW                        MI        4.5              1515
## 8   MAX ZHU                             ON        3.5              1507
## 9   SOFIA ADINA STANESCU-BELLU          MI        3.5              1522
## 10  BRIAN LIU                           MI        3.0              1539

datatable(select(final.player.info, "Player Name", "State", "Total Pts", "OpponentPreRating"))

To analyze the data in this section, we start with a histogram to see the total points distribution. Looking at the point distribution, the data appears to be close to normal.

ggplot(data=final.player.info) + geom_histogram (aes(x=`Total Pts`),color="black", fill="lightblue") + labs(title="Points Frequency plot",x="Points", y = "Point Frequency ")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data607 Assignment

David Apolinar, Anthony Muñoz, Christina

2/14/2019

Overview

Data Cleansing

Data Selection and Visualization