Project 1

The project can be read from the PDF posted on github.com

Loading libraries

The necessary libraries.

library(stringr)
library(knitr)
library(dplyr)

First we have to load the data. I’m going to load it from github.com so that the project will run regardless of the computer directory.

Loading data

# Raw tournament cross-table hosted on GitHub, so no local copy is required.
theUrl <- "https://raw.githubusercontent.com/kaiserxc/DATA_607/master/Project%201/tournamentinfo.txt"
# The file has no header row; each text line lands in the single column V1.
txtFile <- read.csv(file = theUrl, header = FALSE)
# Preview the first ten lines by explicit row index, because head() did not
# behave here.
txtFile[1:10, ]
##  [1] ----------------------------------------------------------------------------------------- 
##  [2]  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
##  [3]  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
##  [4] ----------------------------------------------------------------------------------------- 
##  [5]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| 
##  [6]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | 
##  [7] ----------------------------------------------------------------------------------------- 
##  [8]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7| 
##  [9]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    | 
## [10] ----------------------------------------------------------------------------------------- 
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...
# Show the last six rows to check how the end of the file was read.
tail(txtFile) # Looking at the tail function it appears to work in the MD
##                                                                                            V1
## 191    63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |
## 192    MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |
## 193 -----------------------------------------------------------------------------------------
## 194    64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|
## 195    MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |
## 196 -----------------------------------------------------------------------------------------
# file but not here. I'll leave both in to show diff methods.

Cleaning data

The first 4 rows contain info on the tournament and are superfluous to our project.

# Drop the four banner/header lines. Selecting every column of a one-column
# data frame drops the frame and leaves a plain vector.
txtFile1 <- txtFile[-(1:4), ]
head(txtFile1)
## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...
# Inspect the type that remains after the data-frame dimension was dropped.
class(txtFile1)
## [1] "factor"

We need to change this to character

# Convert the lines to a plain character vector.
txtFile1 <- as.character(txtFile1)
# Confirm the conversion took effect.
class(txtFile1)
## [1] "character"

Success!

We need the first and second rows out of every three (discard the third). We also need to separate them.

# Each player occupies three consecutive lines: the name line, the rating
# line, and a dashed separator that is not needed.
n_lines <- length(txtFile1)
name <- txtFile1[seq(1, n_lines, by = 3)]
rating <- txtFile1[seq(2, n_lines, by = 3)]

Extracting relevant data

# Pair number: the first run of digits on each name line.
p_id <- as.integer(str_extract(name, "\\d+")) # could have just done c(1:64)
# Player name: the whole second pipe-delimited field, trimmed. Taking the
# entire field keeps names with more than three words or with a hyphen
# intact; a "two or three words" pattern would cut them short.
p_name <- str_trim(str_split_fixed(name, "\\|", 3)[, 2])
# Total points: digit, literal dot, digit. The dot is escaped so it cannot
# match an arbitrary character.
p_point <- as.numeric(str_extract(name, "\\d\\.\\d"))
# Pre-tournament rating: the first 3-4 digit run bounded by non-digits on the
# rating line (the longer ID number before it does not fit this pattern).
p_rate <- as.integer(str_extract(str_extract(rating, "\\D\\d{3,4}\\D"), "\\d+"))
# State/province code: the first two word characters on the rating line.
state <- str_extract(rating, "\\w\\w")
# Opponent pair numbers: digits immediately followed by a pipe. The lookahead
# returns the digits directly as one character vector per player, instead of
# running a second extraction over a list coerced to strings.
opp_id <- str_extract_all(name, "\\d+(?=\\|)")

Turning the list into a data frame

Thanks to this

# Number of recorded opponents for each player.
n.obs <- lengths(opp_id)
# Positions 1..k, where k is the largest number of games any player has.
seq.max <- seq_len(max(n.obs))
# One row per player, padded with NA where a player has fewer than k games.
# Row-binding the padded vectors always yields a players-by-k matrix, whereas
# t(sapply(...)) collapses to a single row when k is 1.
mat <- do.call(rbind, lapply(opp_id, "[", seq.max))
df2 <- as.data.frame(mat)

Replacing the id’s with ratings

Special thanks to this Stack Overflow answer for helping me.

# Lookup table from pair number to pre-tournament rating.
key <- data.frame(p_id, p_rate)
# Start from the opponent-ID table and swap each ID for that opponent's
# rating, one round column at a time; unmatched or missing IDs become NA.
df3 <- df2
df3[] <- lapply(df2, function(ids) key$p_rate[match(ids, key$p_id)])
kable(head(df3))
V1 V2 V3 V4 V5 V6 V7
1436 1563 1600 1610 1649 1663 1716
1175 917 1716 1629 1604 1595 1649
1641 955 1745 1563 1712 1666 1663
1363 1507 1553 1579 1655 1564 1794
1242 980 1663 1666 1716 1610 1629
1399 1602 1712 1438 1365 1552 1563

Calculating the average

# Mean opponent rating per player, ignoring rounds without an opponent,
# rounded to a whole number.
df3$oppAvg <- round(rowMeans(df3, na.rm = TRUE))
kable(head(df3))
V1 V2 V3 V4 V5 V6 V7 oppAvg
1436 1563 1600 1610 1649 1663 1716 1605
1175 917 1716 1629 1604 1595 1649 1469
1641 955 1745 1563 1712 1666 1663 1564
1363 1507 1553 1579 1655 1564 1794 1574
1242 980 1663 1666 1716 1610 1629 1501
1399 1602 1712 1438 1365 1552 1563 1519
# Assemble the final per-player table. tibble() replaces the deprecated
# data_frame(), and naming the columns in the call removes the separate
# colnames() assignment.
df <- tibble(
  ID = p_id,
  Player_Name = p_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rate,
  Opponent_Average = df3$oppAvg
)
kable(head(df, 10))
ID Player_Name State Point Pre_Match_Rating Opponent_Average
1 GARY HUA ON 6.0 1794 1605
2 DAKSHESH DARURI MI 6.0 1553 1469
3 ADITYA BAJAJ MI 6.0 1384 1564
4 PATRICK H SCHILLING MI 5.5 1716 1574
5 HANSHI ZUO MI 5.5 1655 1501
6 HANSEN SONG OH 5.0 1686 1519
7 GARY DEE SWATHELL MI 5.0 1649 1372
8 EZEKIEL HOUGHTON MI 5.0 1641 1468
9 STEFANO LEE ON 5.0 1411 1523
10 ANVIT RAO MI 5.0 1365 1554

Write the CSV

# Write the cleaned table relative to the working directory instead of a
# user-specific absolute path, so the script runs on any machine.
write.csv(df, file = "clean_tournamentinfo.txt")

Regressions

Does rank impact how weak your opponents are?

# Regress the opponents' average rating on the player's own pre-match rating.
fit <- lm(Opponent_Average ~ Pre_Match_Rating, data = df)
# Print the coefficient table and fit statistics.
summary(fit)
## 
## Call:
## lm(formula = Opponent_Average ~ Pre_Match_Rating, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -259.34  -67.43   13.95   83.48  184.67 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.201e+03  7.745e+01  15.510   <2e-16 ***
## Pre_Match_Rating 1.287e-01  5.519e-02   2.332    0.023 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 116.3 on 62 degrees of freedom
## Multiple R-squared:  0.08062,    Adjusted R-squared:  0.06579 
## F-statistic: 5.437 on 1 and 62 DF,  p-value: 0.02298
# Pearson correlation between a player's rating and the opponents' average.
with(df, cor(x = Pre_Match_Rating, y = Opponent_Average))
## [1] 0.2839375

Less so than I would have thought.

# Regress total points on pre-match rating; this overwrites the earlier `fit`.
fit <- lm(Point ~ Pre_Match_Rating, data = df)
# Print the coefficient table and fit statistics.
summary(fit)
## 
## Call:
## lm(formula = Point ~ Pre_Match_Rating, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86623 -0.56507 -0.06999  0.34002  2.54694 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.4635332  0.6562549  -0.706    0.483    
## Pre_Match_Rating  0.0028299  0.0004676   6.052 9.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9853 on 62 degrees of freedom
## Multiple R-squared:  0.3714, Adjusted R-squared:  0.3612 
## F-statistic: 36.63 on 1 and 62 DF,  p-value: 9.093e-08

No surprise here. Being a good player means you score more.

Graphs

library(ggplot2)
# Histogram of pre-match ratings. Uses ggplot() directly because qplot() is
# deprecated; the default of 30 bins is kept.
ggplot(df, aes(x = Pre_Match_Rating)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looks like a histogram isn’t the best.

# Scatter plot of pre-match rating against pair number. Uses ggplot()
# directly because qplot() is deprecated.
ggplot(df, aes(x = ID, y = Pre_Match_Rating)) +
  geom_point()

Interesting, there seems to be a downward trend.

# Count the rounds in which each player actually had an opponent. The oppAvg
# column is excluded: it is a derived average, not a round, and counting it
# would inflate every player's total by one.
round_cols <- setdiff(names(df3), "oppAvg")
games_played <- rowSums(!is.na(df3[round_cols]))
ggplot(data.frame(games_played), aes(x = games_played)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The highest number of non participation is in the last round.