Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be:
Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments. You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script.
# Load stringr first so every extraction step below can rely on it.
library(stringr)
# Location of the raw tournament cross-table (plain text, one record line per element).
tournamentinfo <- "https://raw.githubusercontent.com/spitakiss/Data607_Project1/master/tournamentinfo.txt"
# Read the file as a character vector of lines; warn = FALSE silences the
# "incomplete final line" warning.
results <- readLines(con = tournamentinfo, warn = FALSE)
# Extract each player's full name.
# The previous word-count pattern cut long names short (the recorded output
# shows "CAMERON WILLIAM MC" and "SEAN M MC"). Instead, capture the whole
# name field and trim its padding.
# NOTE(review): assumes player rows look like "<pair number> | <name> | ..."
# -- confirm against the input file.
name_fields <- str_match(results, "^\\s*\\d+\\s*\\|([^|]+)\\|")[, 2]
# Lines that are not player rows yield NA and are dropped.
Player_Name <- matrix(str_trim(name_fields[!is.na(name_fields)]))
# NOTE(review): any recorded output following this chunk was captured with the
# old pattern; re-knit to refresh it.
Player_Name
##       [,1]                      
##  [1,] "GARY HUA"                
##  [2,] "DAKSHESH DARURI"         
##  [3,] "ADITYA BAJAJ"            
##  [4,] "PATRICK H SCHILLING"     
##  [5,] "HANSHI ZUO"              
##  [6,] "HANSEN SONG"             
##  [7,] "GARY DEE SWATHELL"       
##  [8,] "EZEKIEL HOUGHTON"        
##  [9,] "STEFANO LEE"             
## [10,] "ANVIT RAO"               
## [11,] "CAMERON WILLIAM MC"      
## [12,] "KENNETH J TACK"          
## [13,] "TORRANCE HENRY JR"       
## [14,] "BRADLEY SHAW"            
## [15,] "ZACHARY JAMES HOUGHTON"  
## [16,] "MIKE NIKITIN"            
## [17,] "RONALD GRZEGORCZYK"      
## [18,] "DAVID SUNDEEN"           
## [19,] "DIPANKAR ROY"            
## [20,] "JASON ZHENG"             
## [21,] "DINH DANG BUI"           
## [22,] "EUGENE L MCCLURE"        
## [23,] "ALAN BUI"                
## [24,] "MICHAEL R ALDRICH"       
## [25,] "LOREN SCHWIEBERT"        
## [26,] "MAX ZHU"                 
## [27,] "GAURAV GIDWANI"          
## [28,] "SOFIA ADINA STANESCU"    
## [29,] "CHIEDOZIE OKORIE"        
## [30,] "GEORGE AVERY JONES"      
## [31,] "RISHI SHETTY"            
## [32,] "JOSHUA PHILIP MATHEWS"   
## [33,] "JADE GE"                 
## [34,] "MICHAEL JEFFERY THOMAS"  
## [35,] "JOSHUA DAVID LEE"        
## [36,] "SIDDHARTH JHA"           
## [37,] "AMIYATOSH PWNANANDAM"    
## [38,] "BRIAN LIU"               
## [39,] "JOEL R HENDON"           
## [40,] "FOREST ZHANG"            
## [41,] "KYLE WILLIAM MURPHY"     
## [42,] "JARED GE"                
## [43,] "ROBERT GLEN VASEY"       
## [44,] "JUSTIN D SCHILLING"      
## [45,] "DEREK YAN"               
## [46,] "JACOB ALEXANDER LAVALLEY"
## [47,] "ERIC WRIGHT"             
## [48,] "DANIEL KHAIN"            
## [49,] "MICHAEL J MARTIN"        
## [50,] "SHIVAM JHA"              
## [51,] "TEJAS AYYAGARI"          
## [52,] "ETHAN GUO"               
## [53,] "JOSE C YBARRA"           
## [54,] "LARRY HODGE"             
## [55,] "ALEX KONG"               
## [56,] "MARISA RICCI"            
## [57,] "MICHAEL LU"              
## [58,] "VIRAJ MOHILE"            
## [59,] "SEAN M MC"               
## [60,] "JULIA SHEN"              
## [61,] "JEZZEL FARKAS"           
## [62,] "ASHWIN BALAJI"           
## [63,] "THOMAS JOSEPH HOSMER"    
## [64,] "BEN LI"
# Extract each player's state/province code.
# The previous pattern hard-coded ON/MI/OH, grouped its alternation so the
# anchors applied only to the first and last alternative, and returned the
# codes with surrounding blanks (" ON "). Capture any two-letter upper-case
# code that opens a line instead, without the padding.
# NOTE(review): assumes the row holding the code looks like "<CODE> | ..."
# -- confirm against the input file.
state_codes <- str_match(results, "^\\s*([A-Z]{2})\\s*\\|")[, 2]
# Lines without a leading code yield NA and are dropped.
Player_state <- matrix(state_codes[!is.na(state_codes)])
# NOTE(review): any recorded output following this chunk was captured with the
# old pattern (padded values); re-knit to refresh it.
Player_state
##       [,1]  
##  [1,] " ON "
##  [2,] " MI "
##  [3,] " MI "
##  [4,] " MI "
##  [5,] " MI "
##  [6,] " OH "
##  [7,] " MI "
##  [8,] " MI "
##  [9,] " ON "
## [10,] " MI "
## [11,] " MI "
## [12,] " MI "
## [13,] " MI "
## [14,] " MI "
## [15,] " MI "
## [16,] " MI "
## [17,] " MI "
## [18,] " MI "
## [19,] " MI "
## [20,] " MI "
## [21,] " ON "
## [22,] " MI "
## [23,] " ON "
## [24,] " MI "
## [25,] " MI "
## [26,] " ON "
## [27,] " MI "
## [28,] " MI "
## [29,] " MI "
## [30,] " ON "
## [31,] " MI "
## [32,] " ON "
## [33,] " MI "
## [34,] " MI "
## [35,] " MI "
## [36,] " MI "
## [37,] " MI "
## [38,] " MI "
## [39,] " MI "
## [40,] " MI "
## [41,] " MI "
## [42,] " MI "
## [43,] " MI "
## [44,] " MI "
## [45,] " MI "
## [46,] " MI "
## [47,] " MI "
## [48,] " MI "
## [49,] " MI "
## [50,] " MI "
## [51,] " MI "
## [52,] " MI "
## [53,] " MI "
## [54,] " MI "
## [55,] " MI "
## [56,] " MI "
## [57,] " MI "
## [58,] " MI "
## [59,] " MI "
## [60,] " MI "
## [61,] " ON "
## [62,] " MI "
## [63,] " MI "
## [64,] " MI "
# Total points: every "<digit>.<digit>" token found in the file, in line order.
points_per_line <- str_extract_all(string = results, pattern = "\\d\\.\\d")
# Flatten the per-line matches into a single-column character matrix.
Tot_Points <- matrix(unlist(points_per_line))
Tot_Points
##       [,1] 
##  [1,] "6.0"
##  [2,] "6.0"
##  [3,] "6.0"
##  [4,] "5.5"
##  [5,] "5.5"
##  [6,] "5.0"
##  [7,] "5.0"
##  [8,] "5.0"
##  [9,] "5.0"
## [10,] "5.0"
## [11,] "4.5"
## [12,] "4.5"
## [13,] "4.5"
## [14,] "4.5"
## [15,] "4.5"
## [16,] "4.0"
## [17,] "4.0"
## [18,] "4.0"
## [19,] "4.0"
## [20,] "4.0"
## [21,] "4.0"
## [22,] "4.0"
## [23,] "4.0"
## [24,] "4.0"
## [25,] "3.5"
## [26,] "3.5"
## [27,] "3.5"
## [28,] "3.5"
## [29,] "3.5"
## [30,] "3.5"
## [31,] "3.5"
## [32,] "3.5"
## [33,] "3.5"
## [34,] "3.5"
## [35,] "3.5"
## [36,] "3.5"
## [37,] "3.5"
## [38,] "3.0"
## [39,] "3.0"
## [40,] "3.0"
## [41,] "3.0"
## [42,] "3.0"
## [43,] "3.0"
## [44,] "3.0"
## [45,] "3.0"
## [46,] "3.0"
## [47,] "2.5"
## [48,] "2.5"
## [49,] "2.5"
## [50,] "2.5"
## [51,] "2.5"
## [52,] "2.5"
## [53,] "2.0"
## [54,] "2.0"
## [55,] "2.0"
## [56,] "2.0"
## [57,] "2.0"
## [58,] "2.0"
## [59,] "2.0"
## [60,] "1.5"
## [61,] "1.5"
## [62,] "1.0"
## [63,] "1.0"
## [64,] "1.0"
# Pre-tournament rating, found in two passes.
# Pass 1: grab the "R:" marker together with the digits that follow it.
Pre_Rate1 <- unlist(str_extract_all(string = results, pattern = "(R:\\s*)(\\d+)"))
# Pass 2: keep only the digits of each token, then convert them to numbers
# (the conversion yields a plain numeric vector).
rating_digits <- unlist(str_extract_all(string = Pre_Rate1, pattern = "(\\d+)"))
Player_Pre_Rating <- as.numeric(matrix(rating_digits))
Player_Pre_Rating
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163
# Step 1: from every line that contains "|" immediately followed by a digit,
# keep the text from that point to the end of the line.
avg_opponent_step1 <- unlist(
  str_extract_all(string = results, pattern = "\\|[0-9].*")
)

avg_opponent_step1[16]
## [1] "|4.0  |D  10|W  15|H    |W  39|L   2|W  36|U    |"
# Step 2: turn the one or two blanks directly before a "|" into "00", so a
# round with no opponent number reads as 00.
avg_opponent_step2 <- str_replace_all(
  string = avg_opponent_step1,
  pattern = "\\s{1,2}\\|",
  replacement = "00|"
)

avg_opponent_step2[16]
## [1] "|4.000|D  10|W  15|H  00|W  39|L   2|W  36|U  00|"
# Step 3: pull out each opponent number (a blank followed by one or two
# digits); the result is a list with one character vector per player.
avg_opponent_step3 <- str_extract_all(
  string = avg_opponent_step2,
  pattern = "\\s\\d{1,2}"
)

avg_opponent_step3[16]
## [[1]]
## [1] " 10" " 15" " 00" " 39" " 2"  " 36" " 00"
# Calculation of the Average Pre Chess Rating of Opponents

# Every player must contribute the same number of round entries; otherwise
# matrix() would silently recycle values and misalign the rows.
stopifnot(length(unique(lengths(avg_opponent_step3))) <= 1L)

# One row per player, one column per round (still character at this point).
matrix_opponent <- matrix(
  unlist(avg_opponent_step3),
  byrow = TRUE,
  nrow = length(avg_opponent_step3)
)
dim(matrix_opponent)
## [1] 64  7
# Convert the opponent numbers to numeric while keeping the matrix shape
# (as.numeric() tolerates the leading blank in each token).
new_matrix <- matrix(as.numeric(matrix_opponent), nrow = nrow(matrix_opponent))
dim(new_matrix)
## [1] 64  7
# A 0 marks a round without an opponent: flag it as NA so it is left out of
# the mean.
new_matrix[new_matrix == 0] <- NA
# Swap every opponent number for that opponent's pre-rating in one vectorised
# lookup; NA indices stay NA. `[] <-` keeps the matrix dimensions.
new_matrix[] <- Player_Pre_Rating[as.vector(new_matrix)]
# Row means over the rounds actually played give the average opponent rating,
# rounded to a whole number.
avg_opponents <- round(rowMeans(new_matrix, na.rm = TRUE))
avg_opponents
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
# Putting all the information together to create a new data frame!
# Name and state are trimmed so padding picked up by the extraction patterns
# (e.g. " ON ") does not end up in the CSV; the one-column matrices are
# flattened to plain vectors.
csv_columns <- list(
  as.vector(trimws(Player_Name)),
  as.vector(trimws(Player_state)),
  as.vector(Tot_Points),
  Player_Pre_Rating,
  avg_opponents
)
# data.frame() recycles shorter columns when lengths divide evenly, which
# would misalign players without an error -- insist on equal lengths instead.
stopifnot(length(unique(lengths(csv_columns))) == 1L)

df <- data.frame(
  csv_columns[[1]],
  csv_columns[[2]],
  csv_columns[[3]],
  csv_columns[[4]],
  csv_columns[[5]]
)
colnames(df) <- c("Players Name","Players State", "Total Points", "Players Pre-Rating", "Opponents Avg-Ratings")

# Exporting the result into a CSV file.
write.csv(df, "Chess_tournamentinfo.csv", row.names = FALSE)