Project 1

The project can be read from the PDF posted on github.com

Loading libraries

The necessary libraries.

library(stringr)
library(knitr)
library(dplyr)

First we have to load the data. I’m going to load it from github.com so that the project will run regardless of the computer's working directory.

Loading data

# Read the raw tournament report straight from GitHub, one file line per row,
# with no header row.
theUrl <- "https://raw.githubusercontent.com/kaiserxc/DATA_607/master/Project%201/tournamentinfo.txt"
txtFile <- read.csv(file = theUrl, header = FALSE)
# Preview the first ten rows by index. This command was used because head() did not behave.
txtFile[1:10, ]

##  [1] ----------------------------------------------------------------------------------------- 
##  [2]  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
##  [3]  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
##  [4] ----------------------------------------------------------------------------------------- 
##  [5]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| 
##  [6]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
##  [7] ----------------------------------------------------------------------------------------- 
##  [8]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7| 
##  [9]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    | 
## [10] ----------------------------------------------------------------------------------------- 
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...

tail(txtFile, n = 6L) # Looking at the tail function it appears to work in the MD

##                                                                                            V1
## 191    63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |
## 192    MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |
## 193 -----------------------------------------------------------------------------------------
## 194    64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|
## 195    MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |
## 196 -----------------------------------------------------------------------------------------

# file but not here. I'll leave both in to show different methods.

Cleaning data

The first 4 rows contain info on the tournament and are superfluous to our project.

# Drop the four header lines and keep the remaining report lines as a vector
# (the data frame has a single column, so this is that column minus rows 1-4).
txtFile1 <- txtFile[[1]][-seq_len(4)]
head(txtFile1)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...

# Inspect the class of the extracted lines before any conversion.
class(txtFile1)

## [1] "factor"

We need to change this to character

# Coerce the lines to a plain character vector so the string functions below
# operate on text, then confirm the new class.
txtFile1 <- as.character(txtFile1)
class(txtFile1)

## [1] "character"

Success!

We need the first and second rows out of every three (discard the third). We also need to separate them.

# Each player occupies three consecutive lines: a name line, a rating line and
# a dashed separator. Pick lines 1, 4, 7, ... and 2, 5, 8, ... respectively.
line_count <- length(txtFile1)
name_rows <- seq(from = 1, to = line_count, by = 3)
rating_rows <- seq(from = 2, to = line_count, by = 3)
name <- txtFile1[name_rows]
rating <- txtFile1[rating_rows]

Extracting relevant data

# Pair number: the first run of digits on the name line.
p_id <- as.integer(str_extract(name, "\\d+")) # could have just done c(1:64)

# Player name: the whole second pipe-delimited field, trimmed. Taking the full
# field (rather than "two or three words") keeps names with four or more words
# and hyphenated names intact.
p_name <- str_trim(str_extract(name, "(?<=\\|)[^|]+"))

# Total points: digits, a literal decimal point, one digit (e.g. "6.0").
# The dot is escaped so it cannot match an arbitrary character.
p_point <- as.numeric(str_extract(name, "\\d+\\.\\d"))

# Pre-tournament rating: the digits that follow "R:" on the rating line. Any
# provisional suffix such as "P17" is ignored because only digits are captured.
p_rate <- as.integer(str_match(rating, "R:\\s*(\\d+)")[, 2])

# State / province: the first two word characters on the rating line.
state <- str_extract(rating, "\\w\\w")

# Opponent pair numbers: every run of digits immediately followed by "|".
# One character vector per player; rounds without an opponent contribute nothing.
opp_id <- str_extract_all(name, "\\d+(?=\\|)")

Turning the list into a data frame

Thanks to this

# Number of opponents recorded for each player.
n.obs <- lengths(opp_id)
# Column positions 1..k, where k is the largest number of opponents any player has.
seq.max <- seq_len(max(n.obs))
# Pad every player's opponent vector to length k (indexing past the end yields
# NA) and stack the vectors as rows. rbind always returns a players-by-k
# matrix, even when k is 1, unlike t(sapply(...)).
mat <- do.call(rbind, lapply(opp_id, "[", seq.max))
df2 <- as.data.frame(mat)

Replacing the id’s with ratings

Special thanks to this Stack Overflow post for helping me.

# Lookup table from pair number to pre-tournament rating.
key <- data.frame(p_id = p_id, p_rate = p_rate)
# Position in the lookup table of every opponent id (NA where no opponent).
opp_rows <- match(unlist(df2), key$p_id)
# Same shape as df2, with each opponent id replaced by that opponent's rating.
df3 <- df2
df3[] <- key$p_rate[opp_rows]
kable(head(df3))

V1	V2	V3	V4	V5	V6	V7
1436	1563	1600	1610	1649	1663	1716
1175	917	1716	1629	1604	1595	1649
1641	955	1745	1563	1712	1666	1663
1363	1507	1553	1579	1655	1564	1794
1242	980	1663	1666	1716	1610	1629
1399	1602	1712	1438	1365	1552	1563

Calculating the average

# Average opponent rating per player, rounded to a whole number. Rounds with no
# opponent are NA and are skipped. rowMeans() works on the numeric columns
# directly instead of looping over rows with apply().
df3$oppAvg <- round(rowMeans(df3, na.rm = TRUE))
kable(head(df3))

V1	V2	V3	V4	V5	V6	V7	oppAvg
1436	1563	1600	1610	1649	1663	1716	1605
1175	917	1716	1629	1604	1595	1649	1469
1641	955	1745	1563	1712	1666	1663	1564
1363	1507	1553	1579	1655	1564	1794	1574
1242	980	1663	1666	1716	1610	1629	1501
1399	1602	1712	1438	1365	1552	1563	1519

# Assemble the final per-player table. tibble() replaces the deprecated
# data_frame(), and the columns are named at construction time.
df <- tibble(
  ID = p_id,
  Player_Name = p_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rate,
  Opponent_Average = df3$oppAvg
)
kable(head(df, 10))

ID	Player_Name	State	Point	Pre_Match_Rating	Opponent_Average
1	GARY HUA	ON	6.0	1794	1605
2	DAKSHESH DARURI	MI	6.0	1553	1469
3	ADITYA BAJAJ	MI	6.0	1384	1564
4	PATRICK H SCHILLING	MI	5.5	1716	1574
5	HANSHI ZUO	MI	5.5	1655	1501
6	HANSEN SONG	OH	5.0	1686	1519
7	GARY DEE SWATHELL	MI	5.0	1649	1372
8	EZEKIEL HOUGHTON	MI	5.0	1641	1468
9	STEFANO LEE	ON	5.0	1411	1523
10	ANVIT RAO	MI	5.0	1365	1554

Write the CSV

# Write the cleaned table relative to the current working directory so the
# script runs on any machine, not only where the author's home folder exists.
write.csv(df, file = "clean_tournamentinfo.txt")

Regressions

Does rank impact how weak your opponents are?

# Linear regression of the average opponent rating on the player's own
# pre-tournament rating, followed by the coefficient summary.
fit <- lm(formula = Opponent_Average ~ Pre_Match_Rating, data = df)
summary(fit)

## 
## Call:
## lm(formula = Opponent_Average ~ Pre_Match_Rating, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -259.34  -67.43   13.95   83.48  184.67 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.201e+03  7.745e+01  15.510   <2e-16 ***
## Pre_Match_Rating 1.287e-01  5.519e-02   2.332    0.023 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 116.3 on 62 degrees of freedom
## Multiple R-squared:  0.08062,    Adjusted R-squared:  0.06579 
## F-statistic: 5.437 on 1 and 62 DF,  p-value: 0.02298

# Correlation between a player's pre-tournament rating and the average rating
# of the opponents they faced.
with(df, cor(x = Pre_Match_Rating, y = Opponent_Average))

## [1] 0.2839375

Less so than I would have thought.

# Linear regression of total points on the pre-tournament rating, followed by
# the coefficient summary. Overwrites the previous `fit`.
fit <- lm(formula = Point ~ Pre_Match_Rating, data = df)
summary(fit)

## 
## Call:
## lm(formula = Point ~ Pre_Match_Rating, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86623 -0.56507 -0.06999  0.34002  2.54694 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.4635332  0.6562549  -0.706    0.483    
## Pre_Match_Rating  0.0028299  0.0004676   6.052 9.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9853 on 62 degrees of freedom
## Multiple R-squared:  0.3714, Adjusted R-squared:  0.3612 
## F-statistic: 36.63 on 1 and 62 DF,  p-value: 9.093e-08

No surprise here. Being a good player means you score more.

Graphs

library(ggplot2)
# Histogram of pre-tournament ratings. Uses ggplot() + geom_histogram() in
# place of the deprecated qplot() shortcut.
ggplot(df, aes(x = Pre_Match_Rating)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looks like a histogram isn’t the best.

# Scatter plot of pre-tournament rating against pair number. Uses ggplot() +
# geom_point() in place of the deprecated qplot() shortcut.
ggplot(df, aes(x = ID, y = Pre_Match_Rating)) + geom_point()

Interesting, there seems to be a downward trend.

# Games played per player = number of non-missing opponent ratings in the row.
# The derived oppAvg column is excluded so it does not add one to every count.
opp_cols <- setdiff(names(df3), "oppAvg")
games_played <- rowSums(!is.na(df3[, opp_cols, drop = FALSE]))
# The counts are whole numbers, so use one bin per count.
ggplot(NULL, aes(x = games_played)) + geom_histogram(binwidth = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The highest number of non participation is in the last round.

Project1

Kai Lukowiak

2017-09-20