Load the tidyverse package

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the tibble package

library(tibble)

Import the tournament text file

Create a vector entitled names which will assign column names to the chess tibble.

#create the names vector
names <- c("Number","PrePost","Pts","R1","R2","R3","R4","R5","R6","R7","Blank")

#use the read_delim function to import data into the chess tibble
chess <- read_delim(file="C:\\Users\\lizza\\Documents\\CUNY - Data Analytics\\DATA 607_Revised\\Week 4\\Project\\tournamentinfo.txt", col_names = names, delim="|", skip=3)
## Parsed with column specification:
## cols(
##   Number = col_character(),
##   PrePost = col_character(),
##   Pts = col_character(),
##   R1 = col_character(),
##   R2 = col_character(),
##   R3 = col_character(),
##   R4 = col_character(),
##   R5 = col_character(),
##   R6 = col_character(),
##   R7 = col_character(),
##   Blank = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 65 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual    file                                    expected   <int> <chr> <chr>      <chr>     <chr>                                   actual 1     1 <NA>  11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ file 2     4 <NA>  11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ row 3     7 <NA>  11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ col 4    10 <NA>  11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ expected 5    13 <NA>  11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
#take a look at the tibble using the glimpse option
glimpse(chess)
## Observations: 193
## Variables: 11
## $ Number  <chr> "-----------------------------------------------------...
## $ PrePost <chr> NA, " GARY HUA                        ", " 15445895 / ...
## $ Pts     <chr> NA, "6.0  ", "N:2  ", NA, "6.0  ", "N:2  ", NA, "6.0  ...
## $ R1      <chr> NA, "W  39", "W    ", NA, "W  63", "B    ", NA, "L   8...
## $ R2      <chr> NA, "W  21", "B    ", NA, "W  58", "W    ", NA, "W  61...
## $ R3      <chr> NA, "W  18", "W    ", NA, "L   4", "B    ", NA, "W  25...
## $ R4      <chr> NA, "W  14", "B    ", NA, "W  17", "W    ", NA, "W  21...
## $ R5      <chr> NA, "W   7", "W    ", NA, "W  16", "B    ", NA, "W  11...
## $ R6      <chr> NA, "D  12", "B    ", NA, "W  20", "W    ", NA, "W  13...
## $ R7      <chr> NA, "D   4", "W    ", NA, "W   7", "B    ", NA, "W  12...
## $ Blank   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

After importing the tibble, we see that every 3rd row doesn’t have any usable data. To clean the data we create a vector entitled nodata which uses the seq feature starting at the 1st row ending at the 193rd by 3. Then we create chess_clean which excludes the findings of nodata

nodata <- seq(1,193,3)
chess_clean <- chess[-c(nodata),]
dim(chess_clean)
## [1] 128  11

chess_clean has a more manageable 128 observations and 11 variables.

Extract the Player’s Name from the chess_clean tibble

Use str_extract_all along with regular expressions to pull the players names from the PrePost variable.

Player_Name <- unlist(str_extract_all(chess_clean$PrePost, "\\w+[A-Z] ?\\w+ \\w+"))

#check classification & view the Player_Name variable
class(Player_Name)
## [1] "character"
Player_Name
##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC"       "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA STANESCU"    
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC"                "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"

Extract the Player’s State from the chess_clean tibble

Use str_extract_all along with regular expressions to pull the players state from the Number variable.

Player_State <- unlist(str_extract_all(chess_clean$Number,"[A-Z]{1,2}"))

#check classification & print Player_State
class(Player_State)
## [1] "character"
Player_State
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"

Extract the total number of points from the chess_clean tibble

Tot_Points <- (unlist(str_extract_all(chess_clean$Pts, "[0-9]\\.[0-9]")))

#convert Tot_Points to a numeric variable
Tot_Points <- as.numeric(Tot_Points)

#check classification & print Tot_Points
class(Tot_Points)
## [1] "numeric"
Tot_Points
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0

Extract the Player’s Pre-Rating from the chess_clean tibble

#use str_extract_all & regular expressions to pull PrePost data
Player_Pre <- unlist(str_extract_all(chess_clean$PrePost,":[:blank:]+[0-9]{1,4}"))

#view the first 5 data elements
head(Player_Pre,n=5)
## [1] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655"

As you can see the colon precedes the PrePost number, next we use the str_sub feature to remove the colon and the blank spaces. Next we trim the data using str_trim since several entries have 3 digits instead of 4.

Player_Pre <- str_sub(Player_Pre, start = 3, end=6)
Player_Pre
##  [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" " 980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" " 377" "1362" "1382" "1291" "1056"
## [51] "1011" " 935" "1393" "1270" "1186" "1153" "1092" " 917" " 853" " 967"
## [61] " 955" "1530" "1175" "1163"
Player_Pre <- str_trim(Player_Pre)

#convert Player_Pre to a numeric variable
Player_Pre <- as.numeric(Player_Pre)

#check classification & print Player_Pre
class(Player_Pre)
## [1] "numeric"
Player_Pre
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163

Extract opposing players by round

In order to extract the opponents data in a cleaner environment a vector entitled even was created that would begin at the number 2 to 128 counting by 2 to mirror chess_clean which has 128 variables; our output is added to chess_opp

#create the even vector along with the chess_opp data frame
even <- seq(2,128,2)
chess_opp <- chess_clean[-c(even),]

Extract the opposing players’ number

R1 Extraction Use str_extract_all & regular expressions to pull the opposing players’ number.

R1_O <- unlist(str_extract_all(chess_opp$R1, "[[:alnum:] ]{1,}"))
R1_O
##  [1] "W  39" "W  63" "L   8" "W  23" "W  45" "W  34" "W  57" "W   3"
##  [9] "W  25" "D  16" "D  38" "W  42" "W  36" "W  54" "D  19" "D  10"
## [17] "W  48" "W  47" "D  15" "L  40" "W  43" "W  64" "L   4" "L  28"
## [25] "L   9" "W  49" "W  51" "W  24" "W  50" "L  52" "L  58" "W  61"
## [33] "W  60" "L   6" "L  46" "L  13" "B    " "D  11" "L   1" "W  20"
## [41] "W  59" "L  12" "L  21" "B    " "L   5" "W  35" "L  18" "L  17"
## [49] "L  26" "L  29" "L  27" "W  30" "H    " "L  14" "L  62" "H    "
## [57] "L   7" "W  31" "L  41" "L  33" "L  32" "W  55" "L   2" "L  22"

In certain cases we have a character followed by blanks which will generate an NA value but will fail to line up properly. In order to keep the values consistent we had a 0 to missing values using the str_replace_all feature

R1_O <- unlist(str_replace_all(R1_O, "\\w\\s..\\D","0"))
R1_O
##  [1] "W  39" "W  63" "L   8" "W  23" "W  45" "W  34" "W  57" "W   3"
##  [9] "W  25" "D  16" "D  38" "W  42" "W  36" "W  54" "D  19" "D  10"
## [17] "W  48" "W  47" "D  15" "L  40" "W  43" "W  64" "L   4" "L  28"
## [25] "L   9" "W  49" "W  51" "W  24" "W  50" "L  52" "L  58" "W  61"
## [33] "W  60" "L   6" "L  46" "L  13" "0"     "D  11" "L   1" "W  20"
## [41] "W  59" "L  12" "L  21" "0"     "L   5" "W  35" "L  18" "L  17"
## [49] "L  26" "L  29" "L  27" "W  30" "0"     "L  14" "L  62" "0"    
## [57] "L   7" "W  31" "L  41" "L  33" "L  32" "W  55" "L   2" "L  22"

Next use str_replace_all again to remove all data elements with the characters WLD

R1_O <- str_replace_all(R1_O, "[WLD]","")
R1_O
##  [1] "  39" "  63" "   8" "  23" "  45" "  34" "  57" "   3" "  25" "  16"
## [11] "  38" "  42" "  36" "  54" "  19" "  10" "  48" "  47" "  15" "  40"
## [21] "  43" "  64" "   4" "  28" "   9" "  49" "  51" "  24" "  50" "  52"
## [31] "  58" "  61" "  60" "   6" "  46" "  13" "0"    "  11" "   1" "  20"
## [41] "  59" "  12" "  21" "0"    "   5" "  35" "  18" "  17" "  26" "  29"
## [51] "  27" "  30" "0"    "  14" "  62" "0"    "   7" "  31" "  41" "  33"
## [61] "  32" "  55" "   2" "  22"

Next we use the str_trim feature in order to remove the extra padding

R1_O <- str_trim(R1_O)
R1_O
##  [1] "39" "63" "8"  "23" "45" "34" "57" "3"  "25" "16" "38" "42" "36" "54"
## [15] "19" "10" "48" "47" "15" "40" "43" "64" "4"  "28" "9"  "49" "51" "24"
## [29] "50" "52" "58" "61" "60" "6"  "46" "13" "0"  "11" "1"  "20" "59" "12"
## [43] "21" "0"  "5"  "35" "18" "17" "26" "29" "27" "30" "0"  "14" "62" "0" 
## [57] "7"  "31" "41" "33" "32" "55" "2"  "22"

Next we convert the R1_O variable from character to numeric and check the classification and print the output.

R1_O <- as.numeric(R1_O)
class(R1_O)
## [1] "numeric"
R1_O
##  [1] 39 63  8 23 45 34 57  3 25 16 38 42 36 54 19 10 48 47 15 40 43 64  4
## [24] 28  9 49 51 24 50 52 58 61 60  6 46 13  0 11  1 20 59 12 21  0  5 35
## [47] 18 17 26 29 27 30  0 14 62  0  7 31 41 33 32 55  2 22

Conduct the same conversions for the remaining data elements

#R2 Extraction
R2_O <- unlist(str_extract_all(chess_opp$R2, "[[:alnum:] ]{1,}"))
R2_O <- unlist(str_replace_all(R2_O, "\\w\\s..\\D","0"))
R2_O <- str_replace_all(R2_O, "[WLD]","")
R2_O <- str_trim(R2_O)
R2_O <- as.numeric(R2_O)
#R3 Extraction
R3_O <- unlist(str_extract_all(chess_opp$R3, "[[:alnum:] ]{1,}"))
R3_O <- unlist(str_replace_all(R3_O, "\\w\\s..\\D","0"))
R3_O <- str_replace_all(R3_O, "[WLD]","")
R3_O <- str_trim(R3_O)
R3_O <- as.numeric(R3_O)
#R4 Extraction
R4_O <- unlist(str_extract_all(chess_opp$R4, "[[:alnum:] ]{1,}"))
R4_O <- unlist(str_replace_all(R4_O, "\\w\\s..\\D","0"))
R4_O <- str_replace_all(R4_O, "[WLD]","")
R4_O <- str_trim(R4_O)
R4_O <- as.numeric(R4_O)
#R5 Extraction
R5_O <- unlist(str_extract_all(chess_opp$R5, "[[:alnum:] ]{1,}"))
R5_O <- unlist(str_replace_all(R5_O, "\\w\\s..\\D","0"))
R5_O <- str_replace_all(R5_O, "[WLD]","")
R5_O <- str_trim(R5_O)
R5_O <- as.numeric(R5_O)
#R6 Extraction
R6_O <- unlist(str_extract_all(chess_opp$R6, "[[:alnum:] ]{1,}"))
R6_O <- unlist(str_replace_all(R6_O, "\\w\\s..\\D","0"))
R6_O <- str_replace_all(R6_O, "[WLD]","")
R6_O <- str_trim(R6_O)
R6_O <- as.numeric(R6_O)
#R7 Extraction
R7_O <- unlist(str_extract_all(chess_opp$R7, "[[:alnum:] ]{1,}"))
R7_O <- unlist(str_replace_all(R7_O, "\\w\\s..\\D","0"))
R7_O <- str_replace_all(R7_O, "[WLD]","")
R7_O <- str_trim(R7_O)
R7_O <- as.numeric(R7_O)

Create a tibble entitled Chess_combined with all of the values

chess_combined <- tibble(Player_Name,Player_State,Tot_Points,
                         Player_Pre,R1_O,R2_O,R3_O,R4_O,R5_O,
                         R6_O,R7_O)
glimpse(chess_combined)
## Observations: 64
## Variables: 11
## $ Player_Name  <chr> "GARY HUA", "DAKSHESH DARURI", "ADITYA BAJAJ", "P...
## $ Player_State <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI", "...
## $ Tot_Points   <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, 5.0,...
## $ Player_Pre   <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 1641, 1...
## $ R1_O         <dbl> 39, 63, 8, 23, 45, 34, 57, 3, 25, 16, 38, 42, 36,...
## $ R2_O         <dbl> 21, 58, 61, 28, 37, 29, 46, 32, 18, 19, 56, 33, 2...
## $ R3_O         <dbl> 18, 4, 25, 2, 12, 11, 13, 14, 59, 55, 6, 5, 7, 8,...
## $ R4_O         <dbl> 14, 17, 21, 26, 13, 35, 11, 9, 8, 31, 7, 38, 5, 1...
## $ R5_O         <dbl> 7, 16, 11, 5, 4, 10, 1, 47, 26, 6, 3, 0, 33, 27, ...
## $ R6_O         <dbl> 12, 20, 13, 19, 14, 27, 9, 28, 7, 25, 34, 1, 3, 5...
## $ R7_O         <dbl> 4, 7, 12, 1, 17, 21, 2, 19, 20, 18, 26, 3, 32, 31...

Calculate the average

Create empty placeholders for PreRatingAverage, Average, TotCount & Denominator. Next, create a loop that will run the total length of the Player_Pre data element from the chess_combined tibble. The competitors data element pulls data from **R1_0:R7:_0** according to the loop.

PreRatingAverage <- vector() 
Average <- numeric(0) 
TotCount <- numeric(0) 
Denominator <- numeric(0) 

for (i in 1:length(chess_combined$Player_Pre))
{
    competitors <- as.numeric(as.vector(chess_combined[i,5:11]))
    #add competitors 
    TotCount <- sum(chess_combined[competitors,"Player_Pre"])
    #exclude any number equal to 0
    Denominator <- sum(chess_combined[i,c(5:11)]!=0)
    Average <- TotCount/Denominator
    PreRatingAverage <- append(PreRatingAverage,Average)
}

PreRatingAverage <- round(PreRatingAverage, digits = 0)
PreRatingAverage
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263

Create final tibble

chess_final <- tibble(Player_Name,Player_State,Tot_Points,
                      Player_Pre,PreRatingAverage)
glimpse(chess_final)
## Observations: 64
## Variables: 5
## $ Player_Name      <chr> "GARY HUA", "DAKSHESH DARURI", "ADITYA BAJAJ"...
## $ Player_State     <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI...
## $ Tot_Points       <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, ...
## $ Player_Pre       <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 164...
## $ PreRatingAverage <dbl> 1605, 1469, 1564, 1574, 1501, 1519, 1372, 146...

View the output

chess_final
## # A tibble: 64 x 5
##    Player_Name         Player_State Tot_Points Player_Pre PreRatingAverage
##    <chr>               <chr>             <dbl>      <dbl>            <dbl>
##  1 GARY HUA            ON                 6.00       1794             1605
##  2 DAKSHESH DARURI     MI                 6.00       1553             1469
##  3 ADITYA BAJAJ        MI                 6.00       1384             1564
##  4 PATRICK H SCHILLING MI                 5.50       1716             1574
##  5 HANSHI ZUO          MI                 5.50       1655             1501
##  6 HANSEN SONG         OH                 5.00       1686             1519
##  7 GARY DEE SWATHELL   MI                 5.00       1649             1372
##  8 EZEKIEL HOUGHTON    MI                 5.00       1641             1468
##  9 STEFANO LEE         ON                 5.00       1411             1523
## 10 ANVIT RAO           MI                 5.00       1365             1554
## # ... with 54 more rows

Write CSV file

write.csv(chess_final, file = "ChessTournament.csv")