CUNY 607 Project 1

Goal: for each player, extract the Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

Obtaining the raw file from GitHub.

library(RCurl)

## Loading required package: bitops

# Download the tournament results from GitHub. The result is kept as one
# character value so the regex steps that follow can scan the whole file at once.
raw.file <- getURL("https://raw.githubusercontent.com/jcp9010/MSDA/master/tournamentinfo.txt")

Using RegEx to obtain the player names, states, total points, and pre-ratings, then computing each player's average opponent pre-rating.

library(stringr)
# Each pattern below is run against the whole file at once, so every vector
# ends up with one entry per match, in file order.

# Names: two to four upper-case words. When the optional third/fourth words are
# absent the pattern still consumes their separating spaces, so shorter names
# come back padded with trailing blanks -- trim them off.
player_names <- str_trim(unlist(str_extract_all(raw.file, "[A-Z]+ [A-Z]+ ([A-Z-]+)? ([A-Z]+)?")))

# States: exactly two upper-case letters directly before " |". Requiring two
# letters (rather than a letter followed by any character) avoids stray matches.
states <- unlist(str_extract_all(raw.file, "[[:upper:]]{2} \\|"))
states <- str_replace_all(states, " \\|", "")

# Total points: tokens of the form digit.digit (e.g. "6.0").
# NOTE(review): assumes no other digit.digit token occurs in the file.
points <- as.numeric(unlist(str_extract_all(raw.file, "(\\d)\\.(\\d)")))

# Pre-rating: the run of digits following "R:"; anything after the digits is
# not captured.
prerating <- unlist(str_extract_all(raw.file, "R:\\s+?(\\d)+"))
prerating <- as.numeric(str_replace_all(prerating, "R:\\s+", ""))

# The vectors are later combined column-wise; fail loudly here if any pattern
# over- or under-matched instead of letting data.frame() recycle or misalign.
stopifnot(
  length(player_names) == length(states),
  length(states) == length(points),
  length(points) == length(prerating)
)

# Split the file at every "| <state> " boundary (the pipe that ends one line
# followed by the state code that starts the next).
# NOTE(review): assumes the state code begins each player's second line, so that
#   splitted[i]     holds the W/D/L round results of player i, and
#   splitted[k + 1] starts with the "R: <pre-rating>" entry of player k.
# The state list is hard-coded to MI/ON/OH; other states would need adding here.
splitted <- unlist(str_split(raw.file, "\\|\\s+(MI\\s|ON\\s|OH\\s)"))

# Average pre-rating of the opponents listed in one piece of `splitted`,
# rounded to a whole number. Only rounds recorded as W, D or L followed by an
# opponent number are counted. Returns NA when the piece lists no such rounds.
avg_opponent_rating <- function(chunk) {
  games <- unlist(str_extract_all(chunk, "(W|D|L)\\s+(\\d)+"))
  if (length(games) == 0) {
    return(NA_real_)
  }
  opponent_ids <- as.numeric(str_replace_all(games, "(W|D|L)\\s+", ""))
  # Opponent k's rating is the first "R:" entry in piece k + 1 (see above).
  rating_text <- str_extract(splitted[opponent_ids + 1], "R:\\s+(\\d)+")
  opponent_ratings <- as.numeric(str_replace_all(rating_text, "R:\\s+", ""))
  round(sum(opponent_ratings) / length(opponent_ratings), 0)
}

# The last piece only carries the final player's second line, so it is dropped.
# seq_len() keeps this safe when the split produces a single piece.
player.opponent <- vapply(
  splitted[seq_len(length(splitted) - 1)],
  avg_opponent_rating,
  numeric(1),
  USE.NAMES = FALSE
)

Create a data frame with these data points extracted.

# Assemble one row per player from the extracted vectors.
# Note: the "Avg.Prerating" column holds each player's own pre-rating; the name
# is kept as-is because the analysis code further down refers to it.
Chess.Players <- data.frame(
  Player.Name = player_names,
  Player.State = states,
  Total.Points = points,
  Avg.Prerating = prerating,
  Avg.Opp.Prerating = player.opponent
)

# row.names = FALSE keeps the CSV to the five requested fields; the default
# would prepend an unnamed column of row numbers.
write.csv(Chess.Players, file = "ChessPlayers.csv", row.names = FALSE)
head(Chess.Players)

##            Player.Name Player.State Total.Points Avg.Prerating
## 1           GARY HUA             ON          6.0          1794
## 2    DAKSHESH DARURI             MI          6.0          1553
## 3       ADITYA BAJAJ             MI          6.0          1384
## 4 PATRICK H SCHILLING            MI          5.5          1716
## 5         HANSHI ZUO             MI          5.5          1655
## 6        HANSEN SONG             OH          5.0          1686
##   Avg.Opp.Prerating
## 1              1605
## 2              1469
## 3              1564
## 4              1574
## 5              1501
## 6              1519

# Sanity check: number of rows (players) and columns (fields) in the data frame.
dim(Chess.Players)

## [1] 64  5

Data Analysis:

# Per-column overview of the assembled data frame.
summary(Chess.Players)

##                  Player.Name Player.State  Total.Points   Avg.Prerating 
##  ADITYA BAJAJ          : 1   MI:55        Min.   :1.000   Min.   : 377  
##  ALAN BUI              : 1   OH: 1        1st Qu.:2.500   1st Qu.:1227  
##  ALEX KONG             : 1   ON: 8        Median :3.500   Median :1407  
##  AMIYATOSH PWNANANDAM  : 1                Mean   :3.438   Mean   :1378  
##  ANVIT RAO             : 1                3rd Qu.:4.000   3rd Qu.:1583  
##  ASHWIN BALAJI         : 1                Max.   :6.000   Max.   :1794  
##  (Other)               :58                                              
##  Avg.Opp.Prerating
##  Min.   :1107     
##  1st Qu.:1310     
##  Median :1382     
##  Mean   :1379     
##  3rd Qu.:1481     
##  Max.   :1605     
##

How many players from each state?

# Count of players per state code.
table(Chess.Players$Player.State)

## 
## MI OH ON 
## 55  1  8

How are the players' pre-ratings distributed? (Summary statistics and a boxplot.)

# Summary statistics of the players' own pre-ratings
# (stored in the Avg.Prerating column).
summary(Chess.Players$Avg.Prerating)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     377    1227    1407    1378    1583    1794

# Boxplot of the players' own pre-ratings (stored in the Avg.Prerating column).
boxplot(Chess.Players$Avg.Prerating, ylab = "Prerating Score", main = "Average Player's Prerating Score", col = "lightblue")

Player’s Total Number of Points

# Density-scaled histogram of total points (prob = TRUE) so that a normal
# density curve can be drawn on the same axis.
hist(
  Chess.Players$Total.Points,
  prob = TRUE,
  xlab = "Total Points",
  main = "Histogram for Total Points",
  col = "pink"
)

# Evaluate the normal density over the observed range of the data (instead of
# a hard-coded 1..6) using the sample mean and standard deviation.
points_range <- range(Chess.Players$Total.Points)
x <- seq(points_range[1], points_range[2], by = 0.1)
y <- dnorm(
  x,
  mean = mean(Chess.Players$Total.Points),
  sd = sd(Chess.Players$Total.Points)
)
lines(x, y, type = "l", lwd = 2, col = "blue")

# Normal Q-Q plot of total points, with the reference line added on top.
total_points <- Chess.Players$Total.Points
qqnorm(total_points)
qqline(total_points)

Taken from the CUNY 606 Homework Assignment 3: qqnormsim() function.

# Draw a 3x3 grid of normal Q-Q plots: the first panel shows `dat`, the other
# eight show samples drawn from a normal distribution with the same length,
# mean and standard deviation as `dat`, for visual comparison.
#
# dat: numeric vector to compare against the normal distribution.
# Returns NULL invisibly; called for its plotting side effect.
qqnormsim <- function(dat) {
  # par() returns the previous setting; restoring it on exit leaves the
  # caller's panel layout untouched, even if plotting stops with an error.
  old_par <- par(mfrow = c(3, 3))
  on.exit(par(old_par), add = TRUE)

  qqnorm(dat, main = "Normal QQ Plot (Data)")
  qqline(dat)

  # Loop-invariant sample statistics used for every simulated panel.
  n_obs <- length(dat)
  dat_mean <- mean(dat)
  dat_sd <- sd(dat)

  for (i in seq_len(8)) {
    simnorm <- rnorm(n = n_obs, mean = dat_mean, sd = dat_sd)
    qqnorm(simnorm, main = "Normal QQ Plot (Sim)")
    qqline(simnorm)
  }

  invisible(NULL)
}

# Compare the Q-Q plot of total points against eight simulated normal samples.
qqnormsim(Chess.Players$Total.Points)

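This simulation shows how well the data fit a normal distribution: the first panel is the Q-Q plot of the actual data, and the other eight are Q-Q plots of samples simulated from a normal distribution with the same mean and standard deviation.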

Scatterplot: Player’s Pre-Rating vs. Average Pre Chess Rating of Opponents

# Scatterplot of each player's pre-rating against the average pre-rating of
# their opponents.
plot(
  x = Chess.Players$Avg.Prerating,
  y = Chess.Players$Avg.Opp.Prerating,
  xlab = "Average Prerating of Chess Player",
  ylab = "Average Opponent Prerating",
  main = "Pre rating of Chess Player vs. Opponent's Pre rating",
  pch = 19
)

# Overlay the least-squares line of opponent pre-rating on player pre-rating.
rating_fit <- lm(Avg.Opp.Prerating ~ Avg.Prerating, data = Chess.Players)
abline(rating_fit, col = "red", lwd = 1)

The above draws the best-fit line on the scatterplot, though many of the data points appear to deviate from it.

Will use the ggplot2 package for different types of images:

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

Will use a scatterplot from the ggplot2 package.

# Same scatterplot built with ggplot2 in a single chain: semi-transparent blue
# points, red quantile lines, and matching title and axis labels.
pl4 <- ggplot(Chess.Players, aes(x = Avg.Prerating, y = Avg.Opp.Prerating)) +
  geom_point(alpha = 0.5, size = 2, color = "blue") +
  geom_quantile(color = "red") +
  labs(
    title = "Pre rating of Chess Player vs. Opponent's Pre rating",
    x = "Average Prerating of Chess Player",
    y = "Average Opponent Prerating"
  )
pl4

## Loading required package: SparseM

## Warning: package 'SparseM' was built under R version 3.3.2

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## Smoothing formula not specified. Using: y ~ x

The above overlays quantile regression lines (drawn by geom_quantile) on the scatterplot.

CUNY 607 Project 1

Joel Park

2/21/2017