1 Project

1.1 Load Following Libraries

stringr
dplyr
reshape2
htmlTable

1.2 Download The Text File From Github

# Download File
#--------------
# https://stackoverflow.com/questions/23028760/download-a-file-from-https-using-download-file

url<-"https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/tournament.txt"

r_file<- "tournament.txt"

downloader::download(url, r_file)

# Read text File into vector
#---------------------------
#https://stackoverflow.com/questions/23001548/dealing-with-readlines-function-in-r


v_conn <-file(r_file,open="r")

tournament.raw <- readLines(v_conn, warn = FALSE)

close(v_conn)

1.3 Replace ‘->’ with ‘>>’ and Replace ‘-’ with “” and then cleanup empty lines and 2 heading lines

#use gsub() to replace all the text you don't want with an empty string.
#-----------------------------------------------------------------------
#https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r

tournament <- gsub("-{3,}", "",gsub("->", ">>",tournament.raw))

# Following code also will work the same way
#tournament <- str_replace_all(str_replace_all(tournament.raw, "->", ">>"), "-{3,}", "")

tournament <- tournament[tournament != ""] # Remove the emtpy lines
tournament <- tournament[-(1:2)]           # Remove the First 2 lines as those are headings

head(tournament)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   >>1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [4] "   MI | 14598900 / R: 1553   >>1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [5] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [6] "   MI | 14959604 / R: 1384   >>1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

1.4 Split the 2 lines of a player information into 2 vectors.

# =====================================
# Str_sub OR substr OR substring can be used to cut the specific character set from the strng 
# https://www.rdocumentation.org/packages/stringr/versions/1.1.0/topics/str_sub
# http://rfunction.com/archives/1692
# =====================================

# extract only the lines starts with a digit
tournament.fmt1<- tournament[str_detect(substr(tournament, 1, 6), "[0-9]")]   

# extract only the lines starts with an alphabet
tournament.fmt2 <- tournament[str_detect(str_sub(tournament, 1, 6), "[A-Z]{2,2}")]

1.5 Parse both vectors (lines) into two dataframes and combine those by columns using cbind.

# Extract column values from these 2 vectors.
fmt1.df <- data.frame(   Player_num   = as.numeric(str_sub(tournament.fmt1, 1,  6)),
                         Player_name  = str_trim(str_sub(tournament.fmt1, 8,  40), side="both"),
                         Total_pts    = as.numeric(str_sub(tournament.fmt1, 42, 46)),
                         Round1       = str_sub(tournament.fmt1, 48, 52),
                         Round2       = str_sub(tournament.fmt1, 54, 58),
                         Round3       = str_sub(tournament.fmt1, 60, 64),
                         Round4       = str_sub(tournament.fmt1, 66, 70),
                         Round5       = str_sub(tournament.fmt1, 72, 76),
                         Round6       = str_sub(tournament.fmt1, 78, 82),
                         Round7       = str_sub(tournament.fmt1, 84, 88), stringsAsFactors=FALSE)

head(fmt1.df)

fmt2.df <- data.frame(   Player_state  = str_trim(str_sub(tournament.fmt2, 1,  6), side="both"),
                 Uscf_id       = str_extract(str_sub(tournament.fmt2, 8,  40), "\\d+"),
                 Pre_rating    = as.numeric(str_extract(str_sub(tournament.fmt2, 8,  40), "(?<=R: ).\\d+(?=)")),
                 Post_rating   = as.numeric(str_extract(str_sub(tournament.fmt2, 8,  40), "(?<=>>).\\d+(?=)")),
                 stringsAsFactors=FALSE)
head(fmt2.df)

# Combine both data frames by columns
# https://stat.ethz.ch/R-manual/R-devel/library/methods/html/cbind2.html 
tournament.df <-cbind(fmt1.df, fmt2.df)

head(tournament.df)

1.6 Steps to arrive at the final result

# Select all columns except round columns
player_df <- select(tournament.df, Player_num:Total_pts, Player_state:Post_rating)  #Select requires the dplyr package

# Select only player number and all the rounds the player played
rounds <- tournament.df %>% select(Player_num, Round1:Round7)

head(rounds)

# Following link says how melt works 
# https://tgmstat.wordpress.com/2013/10/31/reshape-and-aggregate-data-with-the-r-package-reshape2/

melt_rounds <- rounds%>% melt(id.var=c("Player_num"), value.name="Result_opp")

head(melt_rounds)

mut_melt_rounds <- 
    melt_rounds%>% #Add the columns round, Result and opp_num (opponent Number)
    mutate(Round = as.numeric(gsub("Round", "",variable)), # Replace "round" with "" in the column 'variable'
    Result  = str_extract(Result_opp, "^\\w+"),            # Extract the begginning word
    Opp_num = as.numeric(str_extract(Result_opp, "\\d+$")) # Extract the ending Digits.
                          ) 
head(mut_melt_rounds)

#sel_mut_melt_rounds <- mut_melt_rounds %>% select(c(Player_num,Round,Result,Opp_num)) # 

sel_mut_melt_rounds <- mut_melt_rounds %>% select(-c(variable, Result_opp)) # Remove the columns 'variable' and 'Result_opp'

head(sel_mut_melt_rounds);head(player_df )

# Join sel_mut_melt_rounds and player_df on op_num and Payer_num 
# Following Link gives more information on Joins
# http://dplyr.tidyverse.org/reference/join.html 

join_sel_mut_melt_rds <- sel_mut_melt_rounds %>% 
                          inner_join(select(player_df, Player_num, Pre_rating,Post_rating), by = c("Opp_num" = "Player_num"))

head(join_sel_mut_melt_rds)

sel_p <- join_sel_mut_melt_rds %>% select(Player_num, Round, Result, Opp_num, Pre_rating)

head(sel_p)

sort_sel_p <- sel_p %>% arrange(Player_num, Round) ;head(sort_sel_p)

names(sort_sel_p)[names(sort_sel_p) == "Pre_rating"] <- "Opp_pre_rating" ;head(sort_sel_p)

1.7 Aggreate to Find the Average pre-rating of Opponents

player.opp_avg_rating <- sort_sel_p%>%group_by(Player_num) %>% summarise(Opp_avg_pre_rating = round(mean(Opp_pre_rating)))

player_df <- player.opp_avg_rating %>% inner_join(player_df, by="Player_num")

player_df_final <- player_df %>%  select(Player_name, Player_state, Total_pts,Pre_rating, Opp_avg_pre_rating)

2 Result : Chess Player Details

htmlTable(player_df_final)

	Player_name	Player_state	Total_pts	Pre_rating	Opp_avg_pre_rating
1	GARY HUA	ON	6	1794	1605
2	DAKSHESH DARURI	MI	6	1553	1469
3	ADITYA BAJAJ	MI	6	1384	1564
4	PATRICK H SCHILLING	MI	5.5	1716	1574
5	HANSHI ZUO	MI	5.5	1655	1501
6	HANSEN SONG	OH	5	1686	1519
7	GARY DEE SWATHELL	MI	5	1649	1372
8	EZEKIEL HOUGHTON	MI	5	1641	1468
9	STEFANO LEE	ON	5	1411	1523
10	ANVIT RAO	MI	5	1365	1554
11	CAMERON WILLIAM MC LEMAN	MI	4.5	1712	1468
12	KENNETH J TACK	MI	4.5	1663	1506
13	TORRANCE HENRY JR	MI	4.5	1666	1498
14	BRADLEY SHAW	MI	4.5	1610	1515
15	ZACHARY JAMES HOUGHTON	MI	4.5	1220	1484
16	MIKE NIKITIN	MI	4	1604	1386
17	RONALD GRZEGORCZYK	MI	4	1629	1499
18	DAVID SUNDEEN	MI	4	1600	1480
19	DIPANKAR ROY	MI	4	1564	1426
20	JASON ZHENG	MI	4	1595	1411
21	DINH DANG BUI	ON	4	1563	1470
22	EUGENE L MCCLURE	MI	4	1555	1300
23	ALAN BUI	ON	4	1363	1214
24	MICHAEL R ALDRICH	MI	4	1229	1357
25	LOREN SCHWIEBERT	MI	3.5	1745	1363
26	MAX ZHU	ON	3.5	1579	1507
27	GAURAV GIDWANI	MI	3.5	1552	1222
28	SOFIA ADINA STANESCU-BELLU	MI	3.5	1507	1522
29	CHIEDOZIE OKORIE	MI	3.5	1602	1314
30	GEORGE AVERY JONES	ON	3.5	1522	1144
31	RISHI SHETTY	MI	3.5	1494	1260
32	JOSHUA PHILIP MATHEWS	ON	3.5	1441	1379
33	JADE GE	MI	3.5	1449	1277
34	MICHAEL JEFFERY THOMAS	MI	3.5	1399	1375
35	JOSHUA DAVID LEE	MI	3.5	1438	1150
36	SIDDHARTH JHA	MI	3.5	1355	1388
37	AMIYATOSH PWNANANDAM	MI	3.5	980	1385
38	BRIAN LIU	MI	3	1423	1539
39	JOEL R HENDON	MI	3	1436	1430
40	FOREST ZHANG	MI	3	1348	1391
41	KYLE WILLIAM MURPHY	MI	3	1403	1248
42	JARED GE	MI	3	1332	1150
43	ROBERT GLEN VASEY	MI	3	1283	1107
44	JUSTIN D SCHILLING	MI	3	1199	1327
45	DEREK YAN	MI	3	1242	1152
46	JACOB ALEXANDER LAVALLEY	MI	3	377	1358
47	ERIC WRIGHT	MI	2.5	1362	1392
48	DANIEL KHAIN	MI	2.5	1382	1356
49	MICHAEL J MARTIN	MI	2.5	1291	1286
50	SHIVAM JHA	MI	2.5	1056	1296
51	TEJAS AYYAGARI	MI	2.5	1011	1356
52	ETHAN GUO	MI	2.5	935	1495
53	JOSE C YBARRA	MI	2	1393	1345
54	LARRY HODGE	MI	2	1270	1206
55	ALEX KONG	MI	2	1186	1406
56	MARISA RICCI	MI	2	1153	1414
57	MICHAEL LU	MI	2	1092	1363
58	VIRAJ MOHILE	MI	2	917	1391
59	SEAN M MC CORMICK	MI	2	853	1319
60	JULIA SHEN	MI	1.5	967	1330
61	JEZZEL FARKAS	ON	1.5	955	1327
62	ASHWIN BALAJI	MI	1	1530	1186
63	THOMAS JOSEPH HOSMER	MI	1	1175	1350
64	BEN LI	MI	1	1163	1263

The table is exported into a file ’CHESS PLAYERS.csv using following command into the local drive and then loaded into GitHub. The following link gives the output from GitHub.

write.csv(player_df_final, "CHESS PLAYERS.csv", row.names=FALSE)

DATA607 - Project 1

James Kuruvilla

2017-09-25

1 Project

1.1 Load Following Libraries

1.2 Download The Text File From Github

1.3 Replace ‘->’ with ‘>>’ and Replace ‘-’ with “” and then cleanup empty lines and 2 heading lines

1.4 Split the 2 lines of a player information into 2 vectors.

1.5 Parse both vectors (lines) into two dataframes and combine those by columns using cbind.

1.6 Steps to arrive at the final result

1.7 Aggreate to Find the Average pre-rating of Opponents

2 Result : Chess Player Details

2.1 Link for Chess Player Details