library(stringr)
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Download the raw tournament cross table from GitHub; with header = FALSE
# the first line is treated as data and the single column is named V1
Raw_text <- read.csv(
  "https://raw.githubusercontent.com/crarnouts/CUNY-MSDS/master/tournamentinfo.txt",
  header = FALSE
)
# Preview the first few lines of the raw text
head(Raw_text)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 4 -----------------------------------------------------------------------------------------
## 5 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
# Inspect the last few lines of the raw text as well
tail(Raw_text)
## V1
## 191 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |
## 192 MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |
## 193 -----------------------------------------------------------------------------------------
## 194 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|
## 195 MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |
## 196 -----------------------------------------------------------------------------------------
# Raw_text1 drops the four header lines; selecting rows of the single-column
# data frame without drop = FALSE returns a plain vector, one element per line
Raw_text1 <- Raw_text[-(1:4), ]
head(Raw_text1)
## [1] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [2] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [3] -----------------------------------------------------------------------------------------
## [4] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [5] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...
# Same preview as above, written with the pipe operator
Raw_text1%>% head()
## [1] 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## [2] ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## [3] -----------------------------------------------------------------------------------------
## [4] 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## [5] MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...
# Both of the statements above do the same thing; the second just uses the pipe operator to pass the argument into the function
# Check which class the row subsetting produced
class(Raw_text1)
## [1] "factor"
# Each player record spans three consecutive lines, so every third element is
# picked starting at offsets 1, 2 and 3.
# Line 1 of each record: pair number, player name, total points, round results
name <- Raw_text1[seq(1, length(Raw_text1), 3)]
# Line 2 of each record: state, USCF ID and pre/post ratings
rating <- Raw_text1[seq(2, length(Raw_text1), 3)]
# Line 3 of each record: the dashed separator; a throwaway subset kept only to
# check that nothing but separators is being discarded
dashes <- Raw_text1[seq(3, length(Raw_text1), 3)]
## Extracting data for different criteria
# `name` holds the first line of each record
# ("<pair num> | <player name> | <points> | <round results ...>") and `rating`
# the second ("<state> | <USCF ID> / R: <pre> -> <post> | ...").

# Pair number: the first run of digits on the name line
player_id <- as.integer(str_extract(name, "\\d+"))

# Player name: the complete second pipe-delimited field, trimmed. Taking the
# whole field instead of matching two or three words keeps names with four or
# more words, hyphens or apostrophes intact.
player_name <- str_trim(str_extract(name, "(?<=\\|)[^|]+"))

# Total points: digits around a literal decimal point. The dot is escaped so
# it cannot match an arbitrary character (e.g. inside a three-digit pair
# number).
p_point <- as.numeric(str_extract(name, "\\d+\\.\\d+"))

# Pre-tournament rating: the digits directly after "R:"; any non-digit suffix
# attached to the rating is ignored
p_rating <- as.integer(str_extract(str_extract(rating, "R:\\s*\\d+"), "\\d+"))

# Post-tournament rating: the digits after "->", with optional whitespace
# after the arrow. Anchoring on the arrow itself avoids re-parsing the
# deparsed text of a list, which produced a bogus 0 whenever no ">digits"
# token was found.
post_rating <- as.integer(
  str_extract(str_extract(rating, "->\\s*\\d+"), "\\d+")
)

# State / province: the first two word characters on the rating line
state <- str_extract(rating, "\\w{2}")

# Opponent pair numbers: digits immediately followed by "|" in the round
# columns (the pair number and the points field have a space before the pipe)
opponent_id1 <- str_extract_all(name, "\\d+\\|")
# Strip the trailing "|" element by element so each player keeps a character
# vector of opponent ids (empty when no opponent was recorded)
opponent_id <- lapply(opponent_id1, str_extract, pattern = "\\d+")
## Turning the opponent lists into a data set
# Number of opponents recorded for each player
obs <- lengths(opponent_id)
# Round positions 1..k, where k is the largest number of opponents any player has
matches_played <- seq_len(max(obs))
# Index every player's opponents by the same round positions (indexing past
# the end pads with NA), then transpose so that each row is one player
mat <- t(sapply(opponent_id, function(ids) ids[matches_played]))
df2 <- as.data.frame(mat)

## Replacing ids with ratings
# Reference table mapping each player id to that player's pre-rating
key <- data.frame(player_id, p_rating)
# Copy the opponent table, then overwrite every cell with the rating of the
# opponent whose id it held; `df3[] <-` keeps the data-frame shape
df3 <- df2
df3[] <- key$p_rating[match(unlist(df2), key$player_id)]
df3 %>%
  head() %>%
  kable()
## |   V1|   V2|   V3|   V4|   V5|   V6|   V7|
## |----:|----:|----:|----:|----:|----:|----:|
## | 1436| 1563| 1600| 1610| 1649| 1663| 1716|
## | 1175|  917| 1716| 1629| 1604| 1595| 1649|
## | 1641|  955| 1745| 1563| 1712| 1666| 1663|
## | 1363| 1507| 1553| 1579| 1655| 1564| 1794|
## | 1242|  980| 1663| 1666| 1716| 1610| 1629|
## | 1399| 1602| 1712| 1438| 1365| 1552| 1563|
# Calculating the average opponent rating
# Average only the per-round opponent columns: leaving out an already existing
# oppAvg column keeps the result correct if this step is run again, and
# rowMeans() works on the numeric data frame directly instead of going through
# apply(), which first coerces the whole frame to a matrix.
df3$oppAvg <- round(rowMeans(df3[names(df3) != "oppAvg"], na.rm = TRUE))
kable(head(df3))
## |   V1|   V2|   V3|   V4|   V5|   V6|   V7| oppAvg|
## |----:|----:|----:|----:|----:|----:|----:|------:|
## | 1436| 1563| 1600| 1610| 1649| 1663| 1716|   1605|
## | 1175|  917| 1716| 1629| 1604| 1595| 1649|   1469|
## | 1641|  955| 1745| 1563| 1712| 1666| 1663|   1564|
## | 1363| 1507| 1553| 1579| 1655| 1564| 1794|   1574|
## | 1242|  980| 1663| 1666| 1716| 1610| 1629|   1501|
## | 1399| 1602| 1712| 1438| 1365| 1552| 1563|   1519|
# Assemble the final per-player table, naming each column as it is created
df <- data.frame(
  ID = player_id,
  name = player_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rating,
  Post_Rating = post_rating,
  Opponent_Average = df3$oppAvg
)
# Show the first ten players
kable(head(df, 10))
## | ID|name                |State | Point| Pre_Match_Rating| Post_Rating| Opponent_Average|
## |--:|:-------------------|:-----|-----:|----------------:|-----------:|----------------:|
## |  1|GARY HUA            |ON    |   6.0|             1794|        1817|             1605|
## |  2|DAKSHESH DARURI     |MI    |   6.0|             1553|        1663|             1469|
## |  3|ADITYA BAJAJ        |MI    |   6.0|             1384|        1640|             1564|
## |  4|PATRICK H SCHILLING |MI    |   5.5|             1716|        1744|             1574|
## |  5|HANSHI ZUO          |MI    |   5.5|             1655|        1690|             1501|
## |  6|HANSEN SONG         |OH    |   5.0|             1686|        1687|             1519|
## |  7|GARY DEE SWATHELL   |MI    |   5.0|             1649|        1673|             1372|
## |  8|EZEKIEL HOUGHTON    |MI    |   5.0|             1641|        1657|             1468|
## |  9|STEFANO LEE         |ON    |   5.0|             1411|        1564|             1523|
## | 10|ANVIT RAO           |MI    |   5.0|             1365|        1544|             1554|
## A bit of variable analysis
# Correlation between pre-match rating and average opponent rating
with(df, cor(Pre_Match_Rating, Opponent_Average))
## [1] 0.2839375
# Correlation between pre-match rating and total points scored
with(df, cor(Pre_Match_Rating, Point))
## [1] 0.6093942
# The correlation above suggests that a higher pre-match rating goes along with scoring more points
# Rating gained or lost over the event, and how it relates to points scored
df$Rating_Change <- with(df, Post_Rating - Pre_Match_Rating)
with(df, cor(Rating_Change, Point))
## [1] 0.416644
# Export the final table as CSV without a row-name column.
# NOTE(review): hard-coded absolute Windows path; confirm the folder exists
# on the machine running this script.
write.csv(x = df, file = "C:/Temp/Chess.csv", row.names = FALSE)