library(stringr)
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the tournament cross-table from GitHub. The file is read with no header
# row, so every line of text ends up as one row of the single column V1.
tournament_url <- "https://raw.githubusercontent.com/crarnouts/CUNY-MSDS/master/tournamentinfo.txt"
Raw_text <- read.csv(file = tournament_url, header = FALSE)
Raw_text %>% head()
##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
# Preview the last six rows to see how the file ends.
tail(Raw_text, n = 6L)
##                                                                                            V1
## 191    63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |
## 192    MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |
## 193 -----------------------------------------------------------------------------------------
## 194    64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|
## 195    MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |
## 196 -----------------------------------------------------------------------------------------
# Drop the four header lines (dashes, two column-title rows, dashes). Because
# Raw_text has a single column, the row subset collapses to a plain vector of
# text lines rather than a data frame.
Raw_text1 <- Raw_text[-seq_len(4), ]
head(Raw_text1, n = 6L)
## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...
# Same preview of the first six lines, written with the pipe operator.
Raw_text1 %>% head(n = 6L)
## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...
# Both of the statements above do the same thing; the second just uses the
# pipe operator to pass the argument into the function.

# Check what kind of object the header-stripped lines are stored as.
Raw_text1 %>% class()
## [1] "factor"
# After the header is removed the lines repeat in groups of three per player:
#   line 1: pair number | player name | total points | one cell per round
#   line 2: state | USCF ID / R: pre-rating -> post-rating | ...
#   line 3: a row of dashes separating one player from the next
# Every third line, starting at 1: the name/results line.
name <- Raw_text1[seq(from = 1, to = length(Raw_text1), by = 3)]
# Every third line, starting at 2: the state/rating line.
rating <- Raw_text1[seq(from = 2, to = length(Raw_text1), by = 3)]
# Every third line, starting at 3: the separator rows. This is a throw-away
# subset, kept only to check that the rows being left out are the dashes.
dashes <- Raw_text1[seq(from = 3, to = length(Raw_text1), by = 3)]

## Extracting data for the different fields

# The first run of digits on the name line is the pair number (player id).
player_id <- as.integer(str_extract(name, "\\d+"))

# The player name is the whole second pipe-delimited field. Taking the field
# (rather than counting 2-3 words) keeps names with four or more words and
# names containing hyphens intact.
player_name <- str_trim(str_match(name, "^[^|]*\\|([^|]*)\\|")[, 2])

# Total points, e.g. "6.0". The dot is escaped so that only a real decimal
# number can match, never three arbitrary characters such as a 3-digit id.
p_point <- as.numeric(str_extract(name, "\\d+\\.\\d"))

# Pre-tournament rating: the digits right after "R:". Any provisional suffix
# such as "P17" is ignored because only the leading digits are captured.
p_rating <- as.integer(str_match(rating, "R:\\s*(\\d+)")[, 2])

# Post-tournament rating: the digits right after "->", allowing for the
# padding space that appears in front of ratings below 1000.
post_rating <- as.integer(str_match(rating, "->\\s*(\\d+)")[, 2])

# The state is the first two word characters of the rating line.
state <- str_extract(rating, "\\w{2}")

# Round cells that contain an opponent look like "W  39|"; grab each
# "digits|" token, then strip the pipe. lapply() keeps one character vector
# per player instead of feeding a list back into a string function.
opponent_id1 <- str_extract_all(name, "\\d+\\|")
opponent_id <- lapply(opponent_id1, str_extract, pattern = "\\d+")


## Turning the list into a data set
# Number of opponent ids found for each player.
obs <- lengths(opponent_id)
# Positions 1..max so that every player is indexed with the same length.
matches_played <- seq_len(max(obs))
# Index each player's opponents with the full set of positions (positions past
# the end of a shorter vector come back as NA) and stack the results row-wise.
# rbind() always yields a players x rounds matrix, unlike sapply(), whose
# result collapses to a plain vector when there is only one position.
mat <- do.call(rbind, lapply(opponent_id, "[", matches_played))
# One row per player, one column (V1, V2, ...) per opponent slot.
df2 <- as.data.frame(mat, stringsAsFactors = FALSE)


## Replacing opponent IDs with ratings
# Reference table that pairs each player id with that player's pre-rating.
key <- data.frame(player_id, p_rating)
# Start from the opponent-id table and swap each id for the matching rating,
# one column at a time, so the shape of the table is preserved.
df3 <- df2
df3[] <- lapply(df2, function(ids) key$p_rating[match(ids, key$player_id)])
kable(head(df3))
V1 V2 V3 V4 V5 V6 V7
1436 1563 1600 1610 1649 1663 1716
1175 917 1716 1629 1604 1595 1649
1641 955 1745 1563 1712 1666 1663
1363 1507 1553 1579 1655 1564 1794
1242 980 1663 1666 1716 1610 1629
1399 1602 1712 1438 1365 1552 1563
# Average opponent rating per player (row), skipping empty opponent slots (NA)
# and rounding to a whole number.
df3[["oppAvg"]] <- round(rowMeans(df3, na.rm = TRUE))
df3 %>% head() %>% kable()
V1 V2 V3 V4 V5 V6 V7 oppAvg
1436 1563 1600 1610 1649 1663 1716 1605
1175 917 1716 1629 1604 1595 1649 1469
1641 955 1745 1563 1712 1666 1663 1564
1363 1507 1553 1579 1655 1564 1794 1574
1242 980 1663 1666 1716 1610 1629 1501
1399 1602 1712 1438 1365 1552 1563 1519
# Assemble the final per-player table, naming the columns as it is built.
df <- data.frame(
  ID = player_id,
  name = player_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rating,
  Post_Rating = post_rating,
  Opponent_Average = df3$oppAvg
)

kable(head(df, 10))
ID name State Point Pre_Match_Rating Post_Rating Opponent_Average
1 GARY HUA ON 6.0 1794 1817 1605
2 DAKSHESH DARURI MI 6.0 1553 1663 1469
3 ADITYA BAJAJ MI 6.0 1384 1640 1564
4 PATRICK H SCHILLING MI 5.5 1716 1744 1574
5 HANSHI ZUO MI 5.5 1655 1690 1501
6 HANSEN SONG OH 5.0 1686 1687 1519
7 GARY DEE SWATHELL MI 5.0 1649 1673 1372
8 EZEKIEL HOUGHTON MI 5.0 1641 1657 1468
9 STEFANO LEE ON 5.0 1411 1564 1523
10 ANVIT RAO MI 5.0 1365 1544 1554
## A bit of variable analysis
# Correlation between pre-match rating and the average rating of opponents.
with(df, cor(x = Pre_Match_Rating, y = Opponent_Average))
## [1] 0.2839375
# Correlation between pre-match rating and points scored.
with(df, cor(x = Pre_Match_Rating, y = Point))
## [1] 0.6093942
# A higher pre-match rating is associated with scoring more points.

# Rating change over the tournament, compared against points scored.
df[["Rating_Change"]] <- with(df, Post_Rating - Pre_Match_Rating)
with(df, cor(x = Rating_Change, y = Point))
## [1] 0.416644
# Write the final table to disk without row names. The target directory is
# created first (silently, if it already exists) so the write does not fail
# just because the folder is missing.
output_path <- "C:/Temp/Chess.csv"
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)
write.csv(df, file = output_path, row.names = FALSE)