DATA 607 Week 4 - Project 1

library(stringr)
library(knitr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Download the raw tournament cross-table from GitHub. The file is plain text
# rather than a true CSV; with header = FALSE every text line ends up as one
# row of the single column V1.
tournament_url <- "https://raw.githubusercontent.com/crarnouts/CUNY-MSDS/master/tournamentinfo.txt"
Raw_text <- read.csv(file = tournament_url, header = FALSE)
head(Raw_text)

##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |

# Look at the last six rows to see how the file ends.
tail(Raw_text, n = 6L)

##                                                                                            V1
## 191    63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |
## 192    MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |
## 193 -----------------------------------------------------------------------------------------
## 194    64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|
## 195    MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |
## 196 -----------------------------------------------------------------------------------------

# Drop the four header lines. Raw_text has a single column, so row-indexing it
# collapses the result to a plain vector with one element per remaining line.
header_rows <- seq_len(4)
Raw_text1 <- Raw_text[-header_rows, ]
head(Raw_text1)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...

# Preview the first six elements again, this time through the pipe operator.
Raw_text1 %>%
  head(n = 6L)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...

#both of the statements above do the same thing; the second one just uses the pipe operator to pass the argument into the function

# Check which type the row-subset above produced (the printed result below
# reports "factor" for this run).
class(Raw_text1)

## [1] "factor"

# After the header is removed, every player occupies three consecutive lines:
#   1. pair number, player name, total points and per-round results
#   2. state, USCF id and pre/post ratings
#   3. a dashed separator
# Split the vector into those three interleaved series.
lines_per_player <- 3
n_lines <- length(Raw_text1)

# First line of every record.
name <- Raw_text1[seq(1, n_lines, by = lines_per_player)]
# Second line of every record.
rating <- Raw_text1[seq(2, n_lines, by = lines_per_player)]
# Separator lines; kept only as a sanity check that nothing useful is dropped.
dashes <- Raw_text1[seq(3, n_lines, by = lines_per_player)]

## Extracting data for different criteria
# `name` holds the first line of each player record
# (pair number | player name | total points | round results) and `rating`
# holds the second line (state | USCF id / R: pre -> post | ...).

# Pair number: the first run of digits on the name line.
player_id <- as.integer(str_extract(name, "\\d+"))

# Player name: the complete field between the first two pipes, trimmed.
# Capturing the whole field (instead of "two or three words") keeps names
# with four or more words, or with hyphens, intact.
player_name <- str_trim(str_match(name, "\\|([^|]+)\\|")[, 2])

# Total points, e.g. "6.0". The dot is escaped so it only matches a literal
# decimal point rather than any character.
p_point <- as.numeric(str_extract(name, "\\d+\\.\\d+"))

# Pre-tournament rating: the digits that follow "R:" (padding allowed).
p_rating <- as.integer(str_match(rating, "R:\\s*(\\d+)")[, 2])

# Post-tournament rating: the digits that follow "->", tolerating spaces
# between the arrow and the number. Working on the character vector directly
# avoids feeding a list back into a string function, which silently turned an
# empty match into the text "character(0)" and hence into a rating of 0.
post_rating <- as.integer(str_match(rating, "->\\s*(\\d+)")[, 2])

# State / province code: the first two word characters of the rating line.
state <- str_extract(rating, "\\w{2}")

# Opponent pair numbers: every run of digits that sits directly before a pipe
# on the name line. The second step strips the pipe element by element, so a
# player without any opponents yields an empty vector instead of "0".
opponent_id1 <- str_extract_all(name, "\\d+\\|")
opponent_id <- lapply(opponent_id1, str_extract, pattern = "\\d+")


## Turning the opponent list into a data frame
# Number of opponents recorded for each player.
obs <- lengths(opponent_id)
# Positions 1..max so every player is indexed with the same set of slots.
matches_played <- seq_len(max(obs))
# Indexing past the end of a shorter vector pads it with NA, giving every
# player a vector of identical length.
padded_ids <- lapply(opponent_id, "[", matches_played)
# Stack the padded vectors as rows (one row per player). Unlike
# t(sapply(...)), rbind still returns a players-by-rounds matrix when the
# longest opponent list has length one.
mat <- do.call(rbind, padded_ids)
df2 <- as.data.frame(mat)


## Replacing opponent ids with their pre-tournament ratings
# Reference table that maps each player's pair number to their pre-rating.
key <- data.frame(player_id, p_rating)
# Start from a copy of the id table and translate it column by column;
# ids that have no entry in the key (including NA padding) become NA.
df3 <- df2
df3[] <- lapply(df2, function(opp_ids) {
  key$p_rating[match(opp_ids, key$player_id)]
})
kable(head(df3))

V1	V2	V3	V4	V5	V6	V7
1436	1563	1600	1610	1649	1663	1716
1175	917	1716	1629	1604	1595	1649
1641	955	1745	1563	1712	1666	1663
1363	1507	1553	1579	1655	1564	1794
1242	980	1663	1666	1716	1610	1629
1399	1602	1712	1438	1365	1552	1563

# Average pre-tournament rating of the opponents each player faced, rounded
# to a whole number. Missing opponents (NA) are skipped. Only the opponent
# columns are averaged, so re-running this step does not fold an existing
# oppAvg column back into the mean.
opp_cols <- setdiff(names(df3), "oppAvg")
df3$oppAvg <- round(rowMeans(df3[opp_cols], na.rm = TRUE))
kable(head(df3))

V1	V2	V3	V4	V5	V6	V7	oppAvg
1436	1563	1600	1610	1649	1663	1716	1605
1175	917	1716	1629	1604	1595	1649	1469
1641	955	1745	1563	1712	1666	1663	1564
1363	1507	1553	1579	1655	1564	1794	1574
1242	980	1663	1666	1716	1610	1629	1501
1399	1602	1712	1438	1365	1552	1563	1519

# Assemble the final per-player table, naming the columns as it is built.
df <- data.frame(
  ID = player_id,
  name = player_name,
  State = state,
  Point = p_point,
  Pre_Match_Rating = p_rating,
  Post_Rating = post_rating,
  Opponent_Average = df3$oppAvg
)

# Show the first ten players.
kable(head(df, n = 10))

ID	name	State	Point	Pre_Match_Rating	Post_Rating	Opponent_Average
1	GARY HUA	ON	6.0	1794	1817	1605
2	DAKSHESH DARURI	MI	6.0	1553	1663	1469
3	ADITYA BAJAJ	MI	6.0	1384	1640	1564
4	PATRICK H SCHILLING	MI	5.5	1716	1744	1574
5	HANSHI ZUO	MI	5.5	1655	1690	1501
6	HANSEN SONG	OH	5.0	1686	1687	1519
7	GARY DEE SWATHELL	MI	5.0	1649	1673	1372
8	EZEKIEL HOUGHTON	MI	5.0	1641	1657	1468
9	STEFANO LEE	ON	5.0	1411	1564	1523
10	ANVIT RAO	MI	5.0	1365	1544	1554

## A bit of variable analysis
# Correlation between a player's pre-rating and their opponents' average rating.
with(df, cor(Pre_Match_Rating, Opponent_Average))

## [1] 0.2839375

# Correlation between a player's pre-rating and the points they scored.
with(df, cor(Pre_Match_Rating, Point))

## [1] 0.6093942

#a higher Pre_Match rating is associated with scoring more points (moderate positive correlation of about 0.61); this is a tendency, not a guarantee

# Rating gained (or lost) over the tournament, and how it relates to points.
df[["Rating_Change"]] <- with(df, Post_Rating - Pre_Match_Rating)
with(df, cor(Rating_Change, Point))

## [1] 0.416644

# Export the final player table as CSV without a row-name column. The target
# folder is created first (no-op if it already exists) so the write does not
# fail on a machine where the folder is missing.
output_file <- "C:/Temp/Chess.csv"
dir.create(dirname(output_file), showWarnings = FALSE, recursive = TRUE)
write.csv(df, file = output_file, row.names = FALSE)

DATA 607 Week 4 - Project 1

Corey Arnouts

September 23, 2018