## Warning in readLines("https://raw.githubusercontent.com/arunk13/MSDA-
## Assignments/master/IS607Fall2015/Project1/tournamentinfo.txt"): incomplete
## final line found on 'https://raw.githubusercontent.com/arunk13/MSDA-
## Assignments/master/IS607Fall2015/Project1/tournamentinfo.txt'
# Peek at the first ten raw lines of the downloaded file.
head(tournamentinfo_master, n = 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
#Reference: http://stackoverflow.com/questions/9068397/import-text-file-as-single-character-string
# Work on a copy so that the raw lines in tournamentinfo_master stay untouched.
tournamentinfo <- tournamentinfo_master
The dashed separator lines are not useful for my analysis, so I will remove them.
# Blank out every element that consists only of dashes ...
tournamentinfo <- str_replace_all(tournamentinfo, "^-+$", "")
# ... and then keep just the elements that still contain text.
tournamentinfo <- tournamentinfo[nchar(tournamentinfo) > 0]
I already know the column names, so the two header lines do not need to be kept.
# Drop the two header lines, which are now the first two elements.
tournamentinfo <- tournamentinfo[-seq_len(2)]
Let's see how the data looks now:
# Show the first four remaining lines (two players, two lines each).
head(tournamentinfo, n = 4)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
The data for a single player is spread across two vector elements. I will first restructure the data so that each vector element holds the complete details of a single player. To do this: 1. Mark the actual end of each player's record by appending an indicator (`*`) to every second line. 2. Collapse all lines into a single string and split it again at that indicator.
# Marks the end of each player record by appending " *" to every line that
# sits at an even position (the second of the two lines describing a player).
#
# x:         character vector of lines (a single line also works).
# row_index: position of each element of x within the two-lines-per-player
#            listing. The default looks x up in the global `tournamentinfo`,
#            which keeps the old one-argument call working; pass the positions
#            explicitly to stay correct when two lines have identical text.
# Returns x with " *" appended to the elements at even positions; elements
# whose position is unknown (NA) are returned unchanged.
append_separator <- function(x, row_index = match(x, tournamentinfo)) {
  is_record_end <- !is.na(row_index) & row_index %% 2 == 0
  x[is_record_end] <- paste(x[is_record_end], "*")
  x
}
# Positions are passed explicitly. Looking each line up by its text returns
# several positions when two lines are identical, which breaks the even/odd
# test (and is an error inside if() on current R versions).
tournamentinfo <- append_separator(tournamentinfo, seq_along(tournamentinfo))
# Glue everything into one string and cut it at the markers, so that each
# element now holds both lines of one player.
tournamentinfo <- paste(tournamentinfo, collapse = "")
tournamentinfo <- unlist(strsplit(tournamentinfo, split = "\\*"))
Let's see how the data looks now:
# Show the first four merged records (one element per player now).
head(tournamentinfo, n = 4)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W | "
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B | "
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W | "
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1| MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B | "
# Keep only the leading 12 pipe-terminated fields of every record; whatever
# follows the 12th "|" is cut off.
tournamentinfo <- str_extract_all(tournamentinfo, "^(.+?\\|){12}")
tournamentinfo <- unlist(tournamentinfo)
Let's see how the data looks now:
# Show the first truncated record.
head(tournamentinfo, n = 1)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 ->1817 |"
Now that the data is cleaned up, I can extract the individual data elements from each string using regular expressions.
# Regex fragments, one per field, listed in the order the fields occur in a
# record. Every pair of parentheses is a value that ends up in its own column.
# Pair number at the very start of the record.
id <- "^\\s+(\\d+)\\s+\\|"
name <- "\\s+([[:alnum:]- ]+)\\b\\s*\\|"
total_points <- "(\\d+\\.?\\d*)\\s*\\|"
# One round: result letter plus an optional opponent number, e.g. "|W 39|".
res_opp <- "(\\w)\\s*(\\d{1,})?\\|"
state <- "\\s*(\\w{2})\\s*\\|"
# Skips everything up to and including "R:"; nothing is captured here.
avoid <- "\\s*.+R:"
pre_rating <- "\\s*(\\d{1,}).*->"
post_rating <- "\\s*(\\d{1,}).*\\|"
# Assemble the full pattern; the round fragment is needed once per round,
# i.e. seven times in a row.
regex_pattern <- paste0(
  id, name, total_points,
  strrep(res_opp, 7),
  state, avoid, pre_rating, post_rating
)
# str_match returns the whole match in column 1 followed by the capture
# groups; column 1 is dropped. stringsAsFactors = FALSE keeps the values as
# plain character data, because opponent numbers stored as factors caused
# trouble in the calculations further down.
tournamentinfo_final <- as.data.frame(
  str_match(tournamentinfo, regex_pattern)[, -1],
  stringsAsFactors = FALSE
)
Let’s look at the neat data now.
# Show the parsed fields of the first two players.
head(x = tournamentinfo_final, n = 2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20 W 7
## V18 V19 V20
## 1 ON 1794 1817
## 2 MI 1553 1663
I am not renaming the columns yet, as there are many columns in the data set that I don't need.
The average pre-tournament rating of a player's opponents is calculated by adding up the pre-tournament ratings of all of that player's opponents and dividing by the number of games played.
# Average pre-tournament rating of one player's opponents.
#
# x: a one-row data frame for a single player; columns 5, 7, ..., 17 hold the
#    opponent numbers of the seven rounds (NA where no opponent was recorded).
# Reads the global `tournamentinfo_final`: every opponent number is used as a
# row name there, and column 19 of that row is taken as the opponent's
# pre-tournament rating.
# Returns the mean of those ratings rounded to a whole number, or NA when the
# player has no recorded opponent (instead of the NaN that 0 / 0 would give).
calc_avg_pre_opprating <- function(x) {
  opp_columns <- c(5, 7, 9, 11, 13, 15, 17)
  opp_ids <- as.character(unlist(x[1, opp_columns], use.names = FALSE))
  opp_ids <- opp_ids[!is.na(opp_ids)] # rounds without an opponent do not count
  if (length(opp_ids) == 0) {
    return(NA_real_)
  }
  # Character indices select rows by row name, exactly like the former
  # one-at-a-time lookup did.
  opp_ratings <- as.numeric(tournamentinfo_final[opp_ids, 19])
  round(sum(opp_ratings) / length(opp_ratings), 0)
}
# Run calc_avg_pre_opprating once per row (.margins = 1) and attach its result
# to the data frame as an additional column.
# NOTE(review): the report printed further down has the player name in
# column 1 and the average in column 20, so the pair-number column (V1) is no
# longer present after this call -- presumably because adply names the unnamed
# result "V1" and drops the input column with the same name. Confirm against
# the plyr documentation before changing this call or the column selection
# that follows it.
tournamentinfo_final <- adply(tournamentinfo_final,.margins = 1, .fun = calc_avg_pre_opprating);
I will subset the data frame to include only the columns required.
# Keep only the report columns, selected by name instead of by position.
# At this point the pair-number column is gone and the computed average is the
# last column, so positions no longer agree with the V-numbers printed earlier:
#   V2 = player name, V18 = state, V3 = total points, V19 = pre-rating,
#   V1 = average opponent pre-rating (the column added by the adply step).
# An unexpected column layout now fails loudly ("undefined columns selected")
# instead of silently picking the wrong fields.
tournamentinfo_final <- tournamentinfo_final[c("V2", "V18", "V3", "V19", "V1")]
# Note: the column labelled "Rating" holds the player's total points.
names(tournamentinfo_final) <- c(
  "Player Name", "State", "Rating", "Pre Rating", "Avg Opp Pre-Rating"
)
head(tournamentinfo_final, n = 6)
## Player Name State Rating Pre Rating Avg Opp Pre-Rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
Write the data to a CSV file.
# Export the final report. row.names = FALSE keeps R's automatic row numbers
# out of the file, so the CSV contains exactly the five report columns rather
# than an extra unnamed leading column.
write.csv(tournamentinfo_final, file = "ChessReport.csv", row.names = FALSE)