week4/Project1/607

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

Initializing packages and reading the text file from local host.

library(knitr)
library(stringr)
library(plyr)
setwd("~/Desktop/MSDA/proj1")
textdata <- readLines("tournamentinfo1.txt")
textdata <- textdata[c(-1:-3)] # Remove first three lines as they read headers
head(textdata)

## [1] "-----------------------------------------------------------------------------------------"
## [2] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [3] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [6] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

Extracting Player’s Name using REGEX.

Note: Always check for dimentions or row names as they should be matched throughout project

Player_name <- unlist(str_extract_all(textdata,"\\w+[[:alpha:]] ?\\w+ \\w+ "))
Player_name

##  [1] "GARY HUA "                 "DAKSHESH DARURI "         
##  [3] "ADITYA BAJAJ "             "PATRICK H SCHILLING "     
##  [5] "HANSHI ZUO "               "HANSEN SONG "             
##  [7] "GARY DEE SWATHELL "        "EZEKIEL HOUGHTON "        
##  [9] "STEFANO LEE "              "ANVIT RAO "               
## [11] "CAMERON WILLIAM MC "       "KENNETH J TACK "          
## [13] "TORRANCE HENRY JR "        "BRADLEY SHAW "            
## [15] "ZACHARY JAMES HOUGHTON "   "MIKE NIKITIN "            
## [17] "RONALD GRZEGORCZYK "       "DAVID SUNDEEN "           
## [19] "DIPANKAR ROY "             "JASON ZHENG "             
## [21] "DINH DANG BUI "            "EUGENE L MCCLURE "        
## [23] "ALAN BUI "                 "MICHAEL R ALDRICH "       
## [25] "LOREN SCHWIEBERT "         "MAX ZHU "                 
## [27] "GAURAV GIDWANI "           "SOFIA ADINA "             
## [29] "CHIEDOZIE OKORIE "         "GEORGE AVERY JONES "      
## [31] "RISHI SHETTY "             "JOSHUA PHILIP MATHEWS "   
## [33] "JADE GE "                  "MICHAEL JEFFERY THOMAS "  
## [35] "JOSHUA DAVID LEE "         "SIDDHARTH JHA "           
## [37] "AMIYATOSH PWNANANDAM "     "BRIAN LIU "               
## [39] "JOEL R HENDON "            "FOREST ZHANG "            
## [41] "KYLE WILLIAM MURPHY "      "JARED GE "                
## [43] "ROBERT GLEN VASEY "        "JUSTIN D SCHILLING "      
## [45] "DEREK YAN "                "JACOB ALEXANDER LAVALLEY "
## [47] "ERIC WRIGHT "              "DANIEL KHAIN "            
## [49] "MICHAEL J MARTIN "         "SHIVAM JHA "              
## [51] "TEJAS AYYAGARI "           "ETHAN GUO "               
## [53] "JOSE C YBARRA "            "LARRY HODGE "             
## [55] "ALEX KONG "                "MARISA RICCI "            
## [57] "MICHAEL LU "               "VIRAJ MOHILE "            
## [59] "SEAN M MC "                "JULIA SHEN "              
## [61] "JEZZEL FARKAS "            "ASHWIN BALAJI "           
## [63] "THOMAS JOSEPH HOSMER "     "BEN LI "

Extracting Player’s State

Player_state <- unlist(str_extract_all(textdata,"(?:^|\\W)ON | MI | OH "))
Player_state

##  [1] " ON " " MI " " MI " " MI " " MI " " OH " " MI " " MI " " ON " " MI "
## [11] " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI "
## [21] " ON " " MI " " ON " " MI " " MI " " ON " " MI " " MI " " MI " " ON "
## [31] " MI " " ON " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI "
## [41] " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI "
## [51] " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI " " MI "
## [61] " ON " " MI " " MI " " MI "

Total Number of Points

Total_number_points <- unlist(str_extract_all(textdata,"\\d\\.\\d"))
Total_number_points

##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"

Extracting pre-rating points

Pre_rating_points  <- unlist(str_extract_all(textdata,"[[O-R]][[:punct:]]\\s* (\\d+)"))
Pre_rating_points1 <- unlist(str_replace(Pre_rating_points,"R: "," "))
Pre_rating_points1 <- as.numeric(Pre_rating_points1)
Pre_rating_points1

##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163

Extracting the player number played for each of seven rounds against a particular player and Relpacing blank places with Zeros.

output3<- unlist(str_extract_all(textdata,"\\|[[:number:]].*"))
output2 <- unlist(str_replace_all(output3,"\\s{1,2}\\|","00"))
output1 <- (str_extract_all(output2,"\\s\\d{1,2}"))
head(output1)

## [[1]]
## [1] " 39" " 21" " 18" " 14" " 7"  " 12" " 4" 
## 
## [[2]]
## [1] " 63" " 58" " 4"  " 17" " 16" " 20" " 7" 
## 
## [[3]]
## [1] " 8"  " 61" " 25" " 21" " 11" " 13" " 12"
## 
## [[4]]
## [1] " 23" " 28" " 2"  " 26" " 5"  " 19" " 1" 
## 
## [[5]]
## [1] " 45" " 37" " 12" " 13" " 4"  " 14" " 17"
## 
## [[6]]
## [1] " 34" " 29" " 11" " 35" " 10" " 27" " 21"

Formation of a dataframe with help of matrix above

matrix1 <- matrix(unlist(output1),byrow = TRUE,nrow = length(output1)) 
matrix2 <-t(apply(matrix1,1,as.numeric)) #converting character into numeric dataframe.
matrix2 <- data.frame(matrix2)
matrix2[matrix2 == 0] <- NA
sub_dataset <- data.frame(cbind(Pre_rating_points1,matrix2))
head(sub_dataset)

##   Pre_rating_points1 X1 X2 X3 X4 X5 X6 X7
## 1               1794 39 21 18 14  7 12  4
## 2               1553 63 58  4 17 16 20  7
## 3               1384  8 61 25 21 11 13 12
## 4               1716 23 28  2 26  5 19  1
## 5               1655 45 37 12 13  4 14 17
## 6               1686 34 29 11 35 10 27 21

Replace values with the corresponding pre rating points for each of seven columns.

 I tried with if-else loop but it got stucked and output was Null."For" loop worked.

for (i in 1:64) {
  sub_dataset$X1[i] <- (sub_dataset$Pre_rating[sub_dataset$X1[i]])
  sub_dataset$X2[i] <- (sub_dataset$Pre_rating[sub_dataset$X2[i]])
  sub_dataset$X3[i] <- (sub_dataset$Pre_rating[sub_dataset$X3[i]])
  sub_dataset$X4[i] <- (sub_dataset$Pre_rating[sub_dataset$X4[i]])
  sub_dataset$X5[i] <- (sub_dataset$Pre_rating[sub_dataset$X5[i]])
  sub_dataset$X6[i] <- (sub_dataset$Pre_rating[sub_dataset$X6[i]])
  sub_dataset$X7[i] <- (sub_dataset$Pre_rating[sub_dataset$X7[i]])
}

head(sub_dataset)

##   Pre_rating_points1   X1   X2   X3   X4   X5   X6   X7
## 1               1794 1436 1563 1600 1610 1649 1663 1716
## 2               1553 1175  917 1716 1629 1604 1595 1649
## 3               1384 1641  955 1745 1563 1712 1666 1663
## 4               1716 1363 1507 1553 1579 1655 1564 1794
## 5               1655 1242  980 1663 1666 1716 1610 1629
## 6               1686 1399 1602 1712 1438 1365 1552 1563

Taking Averages of all seven columns.

for (i in 1:64) {
sub_dataset$Average[i] <- rowMeans(sub_dataset[i,2:8],na.rm = TRUE)
}
head(sub_dataset)

##   Pre_rating_points1   X1   X2   X3   X4   X5   X6   X7  Average
## 1               1794 1436 1563 1600 1610 1649 1663 1716 1605.286
## 2               1553 1175  917 1716 1629 1604 1595 1649 1469.286
## 3               1384 1641  955 1745 1563 1712 1666 1663 1563.571
## 4               1716 1363 1507 1553 1579 1655 1564 1794 1573.571
## 5               1655 1242  980 1663 1666 1716 1610 1629 1500.857
## 6               1686 1399 1602 1712 1438 1365 1552 1563 1518.714

Required Dataset

dataset <- data.frame(Player_name,Player_state,Total_number_points,sub_dataset$Pre_rating,sub_dataset$Average)
head(dataset)

##            Player_name Player_state Total_number_points
## 1            GARY HUA           ON                  6.0
## 2     DAKSHESH DARURI           MI                  6.0
## 3        ADITYA BAJAJ           MI                  6.0
## 4 PATRICK H SCHILLING           MI                  5.5
## 5          HANSHI ZUO           MI                  5.5
## 6         HANSEN SONG           OH                  5.0
##   sub_dataset.Pre_rating sub_dataset.Average
## 1                   1794            1605.286
## 2                   1553            1469.286
## 3                   1384            1563.571
## 4                   1716            1573.571
## 5                   1655            1500.857
## 6                   1686            1518.714

write.csv(dataset,"Tournament.csv")