The project can be read from the PDF posted on github.com
The necessary libraries.
library(stringr)
library(knitr)
library(dplyr)
First we have to load the data. I’m going to load it from github.com so that the project will run regardless of the computer's directory.
# Raw GitHub copy of the tournament cross table.
theUrl <- "https://raw.githubusercontent.com/kaiserxc/DATA_607/master/Project%201/tournamentinfo.txt"
# header = FALSE keeps the first physical line as data instead of using it
# for column names.
txtFile <- read.csv(file = theUrl, header = FALSE)
txtFile[c(1:10),] # This command was used because head() did not behave.## [1] -----------------------------------------------------------------------------------------
## [2] Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## [3] Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## [4] -----------------------------------------------------------------------------------------
## [5] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [6] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [7] -----------------------------------------------------------------------------------------
## [8] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [9] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [10] -----------------------------------------------------------------------------------------
## 131 Levels: 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ...
tail(txtFile) # Looking at the tail function it appears to work in the MD## V1
## 191 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |
## 192 MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |
## 193 -----------------------------------------------------------------------------------------
## 194 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|
## 195 MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |
## 196 -----------------------------------------------------------------------------------------
# file but not here. I'll leave both in to show different methods.
The first 4 rows contain info on the tournament and are superfluous to our project.
# Drop the four banner/header lines at the top of the cross table. txtFile has
# a single column, so this row subset is dropped to a plain vector.
txtFile1 <- txtFile[-seq_len(4), ]
txtFile1 %>% head() ## [1] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [2] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [3] -----------------------------------------------------------------------------------------
## [4] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [5] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ...
class(txtFile1)## [1] "factor"
We need to change this to character
# Convert the lines to a plain character vector so the stringr helpers used
# below operate on the text of each line.
txtFile1 <- as.character(txtFile1)
class(txtFile1)## [1] "character"
Success!
We need the first and second rows out of every three (discard the third). We also need to separate them.
# Each player occupies three consecutive lines: a "name" line (pair number,
# player name, total points, per-round results), a "rating" line (state,
# USCF ID, pre/post ratings) and a dashed separator that is discarded.
name <- txtFile1[seq(from = 1, to = length(txtFile1), by = 3)]
rating <- txtFile1[seq(from = 2, to = length(txtFile1), by = 3)]

# Pair number: the first run of digits on the name line.
p_id <- as.integer(str_extract(name, "\\d+")) # could have just done c(1:64)
# Player name: the whole second "|"-delimited field, so hyphenated names and
# names with any number of words are kept intact.
p_name <- str_trim(str_split_fixed(name, "\\|", 3)[, 2])
# Total points: the decimal point is escaped so only values such as "6.0"
# match, not an arbitrary digit-character-digit triple.
p_point <- as.numeric(str_extract(name, "\\d+\\.\\d"))
# Pre-tournament rating: the first 3-4 digit run bounded by non-digits on the
# rating line (the 8-digit USCF ID cannot match), stripped to its digits.
p_rate <- as.integer(str_extract(str_extract(rating, "\\D\\d{3,4}\\D"), "\\d+"))
# State/province code: the first two word characters on the rating line.
state <- str_extract(rating, "\\w\\w")
# Opponent pair numbers: digit runs immediately followed by "|". The lookahead
# extracts them in one pass over the character vector, so a player with no
# opponents yields character(0) rather than a spurious id.
opp_id <- str_extract_all(name, "\\d+(?=\\|)")
Thanks to this
# Turn the ragged list of opponent ids into a rectangular matrix with one row
# per player. Indexing a vector past its end yields NA, so players with fewer
# games are padded with NA on the right; column k is therefore the k-th
# opponent listed for that player, not necessarily round k.
n.obs <- lengths(opp_id)
seq.max <- seq_len(max(n.obs))
# rbind over lapply always produces a players-by-games matrix, whereas
# t(sapply(...)) collapses to a single row when the longest list has length 1.
mat <- do.call(rbind, lapply(opp_id, function(ids) ids[seq.max]))
df2 <- as.data.frame(mat)
Special thanks to this Stack Overflow answer for helping me.
# Lookup table from pair number to pre-tournament rating.
key <- data.frame(p_id = p_id, p_rate = p_rate)
# Replace every opponent id with that opponent's rating. Assigning through
# df3[] keeps the data-frame shape and column names of df2.
df3 <- df2
df3[] <- with(key, p_rate[match(unlist(df2), p_id)])
kable(head(df3))| V1 | V2 | V3 | V4 | V5 | V6 | V7 |
|---|---|---|---|---|---|---|
| 1436 | 1563 | 1600 | 1610 | 1649 | 1663 | 1716 |
| 1175 | 917 | 1716 | 1629 | 1604 | 1595 | 1649 |
| 1641 | 955 | 1745 | 1563 | 1712 | 1666 | 1663 |
| 1363 | 1507 | 1553 | 1579 | 1655 | 1564 | 1794 |
| 1242 | 980 | 1663 | 1666 | 1716 | 1610 | 1629 |
| 1399 | 1602 | 1712 | 1438 | 1365 | 1552 | 1563 |
# Average rating of the opponents each player actually faced (NA cells are
# ignored), rounded to a whole number.
df3$oppAvg <- round(rowMeans(df3, na.rm = TRUE))
kable(head(df3))| V1 | V2 | V3 | V4 | V5 | V6 | V7 | oppAvg |
|---|---|---|---|---|---|---|---|
| 1436 | 1563 | 1600 | 1610 | 1649 | 1663 | 1716 | 1605 |
| 1175 | 917 | 1716 | 1629 | 1604 | 1595 | 1649 | 1469 |
| 1641 | 955 | 1745 | 1563 | 1712 | 1666 | 1663 | 1564 |
| 1363 | 1507 | 1553 | 1579 | 1655 | 1564 | 1794 | 1574 |
| 1242 | 980 | 1663 | 1666 | 1716 | 1610 | 1629 | 1501 |
| 1399 | 1602 | 1712 | 1438 | 1365 | 1552 | 1563 | 1519 |
# Assemble the final per-player table. tibble() replaces the deprecated
# data_frame(), and the column names are set at construction instead of being
# patched afterwards with colnames<-.
df <- tibble(
  ID = p_id,
  Player_Name = p_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rate,
  Opponent_Average = df3$oppAvg
)
kable(head(df, 10))| ID | Player_Name | State | Point | Pre_Match_Rating | Opponent_Average |
|---|---|---|---|---|---|
| 1 | GARY HUA | ON | 6.0 | 1794 | 1605 |
| 2 | DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| 3 | ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| 4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| 5 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| 6 | HANSEN SONG | OH | 5.0 | 1686 | 1519 |
| 7 | GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372 |
| 8 | EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468 |
| 9 | STEFANO LEE | ON | 5.0 | 1411 | 1523 |
| 10 | ANVIT RAO | MI | 5.0 | 1365 | 1554 |
# Write the cleaned table next to the project (relative to the working
# directory) instead of to a user-specific absolute path, so the script runs
# on any machine.
write.csv(df, file = "clean_tournamentinfo.txt")
Does rank impact how weak your opponents are?
# Simple linear regression of the average opponent rating on the player's own
# pre-tournament rating.
fit <- lm(Opponent_Average ~ Pre_Match_Rating, data = df)
summary(fit)##
## Call:
## lm(formula = Opponent_Average ~ Pre_Match_Rating, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -259.34 -67.43 13.95 83.48 184.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.201e+03 7.745e+01 15.510 <2e-16 ***
## Pre_Match_Rating 1.287e-01 5.519e-02 2.332 0.023 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 116.3 on 62 degrees of freedom
## Multiple R-squared: 0.08062, Adjusted R-squared: 0.06579
## F-statistic: 5.437 on 1 and 62 DF, p-value: 0.02298
cor(x = df$Pre_Match_Rating, y = df$Opponent_Average)## [1] 0.2839375
Less so than I would have thought.
# Simple linear regression of total points on pre-tournament rating; this
# overwrites the previous model stored in `fit`.
fit <- lm(Point ~ Pre_Match_Rating, data = df)
summary(fit)##
## Call:
## lm(formula = Point ~ Pre_Match_Rating, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.86623 -0.56507 -0.06999 0.34002 2.54694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4635332 0.6562549 -0.706 0.483
## Pre_Match_Rating 0.0028299 0.0004676 6.052 9.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9853 on 62 degrees of freedom
## Multiple R-squared: 0.3714, Adjusted R-squared: 0.3612
## F-statistic: 36.63 on 1 and 62 DF, p-value: 9.093e-08
No surprise here. Being a good player means you score more.
library(ggplot2)
qplot(df$Pre_Match_Rating)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Looks like a histogram isn’t the best.
qplot(x = df$ID, y= df$Pre_Match_Rating, geom = "point")
Interesting, there seems to be a downward trend.
ggplot(NULL, aes(x=rowSums(!is.na(df3)))) + geom_histogram()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The highest number of non participation is in the last round.