1 Project
1.1 Load Following Libraries
- stringr
- dplyr
- reshape2
- htmlTable
1.2 Download The Text File From Github
# Download File
#--------------
# https://stackoverflow.com/questions/23028760/download-a-file-from-https-using-download-file
url<-"https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/tournament.txt"
r_file<- "tournament.txt"
downloader::download(url, r_file)
# Read text File into vector
#---------------------------
#https://stackoverflow.com/questions/23001548/dealing-with-readlines-function-in-r
v_conn <-file(r_file,open="r")
tournament.raw <- readLines(v_conn, warn = FALSE)
close(v_conn)
1.3 Replace ‘->’ with ‘>>’ and Replace ‘-’ with “” and then cleanup empty lines and 2 heading lines
#use gsub() to replace all the text you don't want with an empty string.
#-----------------------------------------------------------------------
#https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
tournament <- gsub("-{3,}", "",gsub("->", ">>",tournament.raw))
# Following code also will work the same way
#tournament <- str_replace_all(str_replace_all(tournament.raw, "->", ">>"), "-{3,}", "")
tournament <- tournament[tournament != ""] # Remove the emtpy lines
tournament <- tournament[-(1:2)] # Remove the First 2 lines as those are headings
head(tournament)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 >>1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 >>1663 |N:2 |B |W |B |W |B |W |B |"
## [5] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [6] " MI | 14959604 / R: 1384 >>1640 |N:2 |W |B |W |B |W |B |W |"
1.4 Split the 2 lines of a player information into 2 vectors.
# =====================================
# Str_sub OR substr OR substring can be used to cut the specific character set from the strng
# https://www.rdocumentation.org/packages/stringr/versions/1.1.0/topics/str_sub
# http://rfunction.com/archives/1692
# =====================================
# extract only the lines starts with a digit
tournament.fmt1<- tournament[str_detect(substr(tournament, 1, 6), "[0-9]")]
# extract only the lines starts with an alphabet
tournament.fmt2 <- tournament[str_detect(str_sub(tournament, 1, 6), "[A-Z]{2,2}")]
1.5 Parse both vectors (lines) into two dataframes and combine those by columns using cbind.
# Extract column values from these 2 vectors.
fmt1.df <- data.frame( Player_num = as.numeric(str_sub(tournament.fmt1, 1, 6)),
Player_name = str_trim(str_sub(tournament.fmt1, 8, 40), side="both"),
Total_pts = as.numeric(str_sub(tournament.fmt1, 42, 46)),
Round1 = str_sub(tournament.fmt1, 48, 52),
Round2 = str_sub(tournament.fmt1, 54, 58),
Round3 = str_sub(tournament.fmt1, 60, 64),
Round4 = str_sub(tournament.fmt1, 66, 70),
Round5 = str_sub(tournament.fmt1, 72, 76),
Round6 = str_sub(tournament.fmt1, 78, 82),
Round7 = str_sub(tournament.fmt1, 84, 88), stringsAsFactors=FALSE)
head(fmt1.df)
fmt2.df <- data.frame( Player_state = str_trim(str_sub(tournament.fmt2, 1, 6), side="both"),
Uscf_id = str_extract(str_sub(tournament.fmt2, 8, 40), "\\d+"),
Pre_rating = as.numeric(str_extract(str_sub(tournament.fmt2, 8, 40), "(?<=R: ).\\d+(?=)")),
Post_rating = as.numeric(str_extract(str_sub(tournament.fmt2, 8, 40), "(?<=>>).\\d+(?=)")),
stringsAsFactors=FALSE)
head(fmt2.df)
# Combine both data frames by columns
# https://stat.ethz.ch/R-manual/R-devel/library/methods/html/cbind2.html
tournament.df <-cbind(fmt1.df, fmt2.df)
head(tournament.df)
1.6 Steps to arrive at the final result
# Select all columns except round columns
player_df <- select(tournament.df, Player_num:Total_pts, Player_state:Post_rating) #Select requires the dplyr package
# Select only player number and all the rounds the player played
rounds <- tournament.df %>% select(Player_num, Round1:Round7)
head(rounds)
# Following link says how melt works
# https://tgmstat.wordpress.com/2013/10/31/reshape-and-aggregate-data-with-the-r-package-reshape2/
melt_rounds <- rounds%>% melt(id.var=c("Player_num"), value.name="Result_opp")
head(melt_rounds)
mut_melt_rounds <-
melt_rounds%>% #Add the columns round, Result and opp_num (opponent Number)
mutate(Round = as.numeric(gsub("Round", "",variable)), # Replace "round" with "" in the column 'variable'
Result = str_extract(Result_opp, "^\\w+"), # Extract the begginning word
Opp_num = as.numeric(str_extract(Result_opp, "\\d+$")) # Extract the ending Digits.
)
head(mut_melt_rounds)
#sel_mut_melt_rounds <- mut_melt_rounds %>% select(c(Player_num,Round,Result,Opp_num)) #
sel_mut_melt_rounds <- mut_melt_rounds %>% select(-c(variable, Result_opp)) # Remove the columns 'variable' and 'Result_opp'
head(sel_mut_melt_rounds);head(player_df )
# Join sel_mut_melt_rounds and player_df on op_num and Payer_num
# Following Link gives more information on Joins
# http://dplyr.tidyverse.org/reference/join.html
join_sel_mut_melt_rds <- sel_mut_melt_rounds %>%
inner_join(select(player_df, Player_num, Pre_rating,Post_rating), by = c("Opp_num" = "Player_num"))
head(join_sel_mut_melt_rds)
sel_p <- join_sel_mut_melt_rds %>% select(Player_num, Round, Result, Opp_num, Pre_rating)
head(sel_p)
sort_sel_p <- sel_p %>% arrange(Player_num, Round) ;head(sort_sel_p)
names(sort_sel_p)[names(sort_sel_p) == "Pre_rating"] <- "Opp_pre_rating" ;head(sort_sel_p)
1.7 Aggreate to Find the Average pre-rating of Opponents
player.opp_avg_rating <- sort_sel_p%>%group_by(Player_num) %>% summarise(Opp_avg_pre_rating = round(mean(Opp_pre_rating)))
player_df <- player.opp_avg_rating %>% inner_join(player_df, by="Player_num")
player_df_final <- player_df %>% select(Player_name, Player_state, Total_pts,Pre_rating, Opp_avg_pre_rating)
2 Result : Chess Player Details
htmlTable(player_df_final)
Player_name | Player_state | Total_pts | Pre_rating | Opp_avg_pre_rating | |
---|---|---|---|---|---|
1 | GARY HUA | ON | 6 | 1794 | 1605 |
2 | DAKSHESH DARURI | MI | 6 | 1553 | 1469 |
3 | ADITYA BAJAJ | MI | 6 | 1384 | 1564 |
4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
5 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
6 | HANSEN SONG | OH | 5 | 1686 | 1519 |
7 | GARY DEE SWATHELL | MI | 5 | 1649 | 1372 |
8 | EZEKIEL HOUGHTON | MI | 5 | 1641 | 1468 |
9 | STEFANO LEE | ON | 5 | 1411 | 1523 |
10 | ANVIT RAO | MI | 5 | 1365 | 1554 |
11 | CAMERON WILLIAM MC LEMAN | MI | 4.5 | 1712 | 1468 |
12 | KENNETH J TACK | MI | 4.5 | 1663 | 1506 |
13 | TORRANCE HENRY JR | MI | 4.5 | 1666 | 1498 |
14 | BRADLEY SHAW | MI | 4.5 | 1610 | 1515 |
15 | ZACHARY JAMES HOUGHTON | MI | 4.5 | 1220 | 1484 |
16 | MIKE NIKITIN | MI | 4 | 1604 | 1386 |
17 | RONALD GRZEGORCZYK | MI | 4 | 1629 | 1499 |
18 | DAVID SUNDEEN | MI | 4 | 1600 | 1480 |
19 | DIPANKAR ROY | MI | 4 | 1564 | 1426 |
20 | JASON ZHENG | MI | 4 | 1595 | 1411 |
21 | DINH DANG BUI | ON | 4 | 1563 | 1470 |
22 | EUGENE L MCCLURE | MI | 4 | 1555 | 1300 |
23 | ALAN BUI | ON | 4 | 1363 | 1214 |
24 | MICHAEL R ALDRICH | MI | 4 | 1229 | 1357 |
25 | LOREN SCHWIEBERT | MI | 3.5 | 1745 | 1363 |
26 | MAX ZHU | ON | 3.5 | 1579 | 1507 |
27 | GAURAV GIDWANI | MI | 3.5 | 1552 | 1222 |
28 | SOFIA ADINA STANESCU-BELLU | MI | 3.5 | 1507 | 1522 |
29 | CHIEDOZIE OKORIE | MI | 3.5 | 1602 | 1314 |
30 | GEORGE AVERY JONES | ON | 3.5 | 1522 | 1144 |
31 | RISHI SHETTY | MI | 3.5 | 1494 | 1260 |
32 | JOSHUA PHILIP MATHEWS | ON | 3.5 | 1441 | 1379 |
33 | JADE GE | MI | 3.5 | 1449 | 1277 |
34 | MICHAEL JEFFERY THOMAS | MI | 3.5 | 1399 | 1375 |
35 | JOSHUA DAVID LEE | MI | 3.5 | 1438 | 1150 |
36 | SIDDHARTH JHA | MI | 3.5 | 1355 | 1388 |
37 | AMIYATOSH PWNANANDAM | MI | 3.5 | 980 | 1385 |
38 | BRIAN LIU | MI | 3 | 1423 | 1539 |
39 | JOEL R HENDON | MI | 3 | 1436 | 1430 |
40 | FOREST ZHANG | MI | 3 | 1348 | 1391 |
41 | KYLE WILLIAM MURPHY | MI | 3 | 1403 | 1248 |
42 | JARED GE | MI | 3 | 1332 | 1150 |
43 | ROBERT GLEN VASEY | MI | 3 | 1283 | 1107 |
44 | JUSTIN D SCHILLING | MI | 3 | 1199 | 1327 |
45 | DEREK YAN | MI | 3 | 1242 | 1152 |
46 | JACOB ALEXANDER LAVALLEY | MI | 3 | 377 | 1358 |
47 | ERIC WRIGHT | MI | 2.5 | 1362 | 1392 |
48 | DANIEL KHAIN | MI | 2.5 | 1382 | 1356 |
49 | MICHAEL J MARTIN | MI | 2.5 | 1291 | 1286 |
50 | SHIVAM JHA | MI | 2.5 | 1056 | 1296 |
51 | TEJAS AYYAGARI | MI | 2.5 | 1011 | 1356 |
52 | ETHAN GUO | MI | 2.5 | 935 | 1495 |
53 | JOSE C YBARRA | MI | 2 | 1393 | 1345 |
54 | LARRY HODGE | MI | 2 | 1270 | 1206 |
55 | ALEX KONG | MI | 2 | 1186 | 1406 |
56 | MARISA RICCI | MI | 2 | 1153 | 1414 |
57 | MICHAEL LU | MI | 2 | 1092 | 1363 |
58 | VIRAJ MOHILE | MI | 2 | 917 | 1391 |
59 | SEAN M MC CORMICK | MI | 2 | 853 | 1319 |
60 | JULIA SHEN | MI | 1.5 | 967 | 1330 |
61 | JEZZEL FARKAS | ON | 1.5 | 955 | 1327 |
62 | ASHWIN BALAJI | MI | 1 | 1530 | 1186 |
63 | THOMAS JOSEPH HOSMER | MI | 1 | 1175 | 1350 |
64 | BEN LI | MI | 1 | 1163 | 1263 |
The table is exported into a file ’CHESS PLAYERS.csv using following command into the local drive and then loaded into GitHub. The following link gives the output from GitHub.
write.csv(player_df_final, "CHESS PLAYERS.csv", row.names=FALSE)