Introduction

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

Data Trimming

The tournament data is tab “|” deli metered data so it requires a little bit of manipulation. There are various ways we can approach this: 1 - the most advanced techniques involve regular expression to skip specific tab between the data set 2 - load the data using tab deli metered approach with specific parameters 3 - other ways requires a bit of programming and loop iteration

We are going to use 2nd method.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# load teh data
website <- 'https://raw.githubusercontent.com/joewarner89/CUNY-607/main/Project%201%20Data%20Analysis/data/tournament_file.txt'
# Creating the raw data 

rawdata <- read.delim(file = website,
                 header = T,
                 sep = '|'
                ,skip = 1
                ,check.names = T,
                
                strip.white = FALSE
                ,
                
                )
head(rawdata)
##                                                                                        Pair
## 1                                                                                     Num  
## 2 -----------------------------------------------------------------------------------------
## 3                                                                                        1 
## 4                                                                                       ON 
## 5 -----------------------------------------------------------------------------------------
## 6                                                                                        2 
##                         Player.Name Total Round Round.1 Round.2 Round.3 Round.4
## 1  USCF ID / Rtg (Pre->Post)         Pts    1       2       3       4       5  
## 2                                                                              
## 3  GARY HUA                         6.0   W  39   W  21   W  18   W  14   W   7
## 4  15445895 / R: 1794   ->1817      N:2   W       B       W       B       W    
## 5                                                                              
## 6  DAKSHESH DARURI                  6.0   W  63   W  58   L   4   W  17   W  16
##   Round.5 Round.6  X
## 1     6       7   NA
## 2                 NA
## 3   D  12   D   4 NA
## 4   B       W     NA
## 5                 NA
## 6   W  20   W   7 NA
# delete the row on based on empyrow row and lengthy unecessary strings
data <- rawdata[-which(rawdata$Pair == '-----------------------------------------------------------------------------------------'),]

# reset the indexing row so data 
row.names(data) <- NULL
# Remove the first row  and the last column x 
data_f <- data[-1,]
data <-  subset(data_f, select = -X)
data <- data %>% rename(Pair_Number = Pair,
                        Player_Name = Player.Name,
                        Total_Pts = Total,
                        Round_1 = Round,
                        Round_2 = Round.1,
                        Round_3 = Round.2,
                        Round_4 = Round.3,
                        Round_5 = Round.4,
                        Round_6 = Round.5,
                        Round_7 = Round.6)
head(data,10)
##    Pair_Number                       Player_Name Total_Pts Round_1 Round_2
## 2           1   GARY HUA                             6.0     W  39   W  21
## 3          ON   15445895 / R: 1794   ->1817          N:2     W       B    
## 4           2   DAKSHESH DARURI                      6.0     W  63   W  58
## 5          MI   14598900 / R: 1553   ->1663          N:2     B       W    
## 6           3   ADITYA BAJAJ                         6.0     L   8   W  61
## 7          MI   14959604 / R: 1384   ->1640          N:2     W       B    
## 8           4   PATRICK H SCHILLING                  5.5     W  23   D  28
## 9          MI   12616049 / R: 1716   ->1744          N:2     W       B    
## 10          5   HANSHI ZUO                           5.5     W  45   W  37
## 11         MI   14601533 / R: 1655   ->1690          N:2     B       W    
##    Round_3 Round_4 Round_5 Round_6 Round_7
## 2    W  18   W  14   W   7   D  12   D   4
## 3    W       B       W       B       W    
## 4    L   4   W  17   W  16   W  20   W   7
## 5    B       W       B       W       B    
## 6    W  25   W  21   W  11   W  13   W  12
## 7    W       B       W       B       W    
## 8    W   2   W  26   D   5   W  19   D   1
## 9    W       B       W       B       B    
## 10   D  12   D  13   D   4   W  14   W  17
## 11   B       W       B       W       B

String Manipulations

#Recreating the variable Pair Number and Sates
Pair_Num <- na.omit(str_extract(data$Pair_Number, '\\d+'))
Pair_Num <- as.numeric(Pair_Num)
Pair_Num
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64
State <- na.omit(str_extract(data$Pair_Number, '[A-Z]+'))
State <- as.character(State)
State
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"
Player_Name <- na.omit(str_extract(data$Player_Name, '[A-Z]+\\s[A-Z]+'))
Player_Name <- as.character(Player_Name)
Player_Name
##  [1] "GARY HUA"             "DAKSHESH DARURI"      "ADITYA BAJAJ"        
##  [4] "PATRICK H"            "HANSHI ZUO"           "HANSEN SONG"         
##  [7] "GARY DEE"             "EZEKIEL HOUGHTON"     "STEFANO LEE"         
## [10] "ANVIT RAO"            "CAMERON WILLIAM"      "KENNETH J"           
## [13] "TORRANCE HENRY"       "BRADLEY SHAW"         "ZACHARY JAMES"       
## [16] "MIKE NIKITIN"         "RONALD GRZEGORCZYK"   "DAVID SUNDEEN"       
## [19] "DIPANKAR ROY"         "JASON ZHENG"          "DINH DANG"           
## [22] "EUGENE L"             "ALAN BUI"             "MICHAEL R"           
## [25] "LOREN SCHWIEBERT"     "MAX ZHU"              "GAURAV GIDWANI"      
## [28] "SOFIA ADINA"          "CHIEDOZIE OKORIE"     "GEORGE AVERY"        
## [31] "RISHI SHETTY"         "JOSHUA PHILIP"        "JADE GE"             
## [34] "MICHAEL JEFFERY"      "JOSHUA DAVID"         "SIDDHARTH JHA"       
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"            "JOEL R"              
## [40] "FOREST ZHANG"         "KYLE WILLIAM"         "JARED GE"            
## [43] "ROBERT GLEN"          "JUSTIN D"             "DEREK YAN"           
## [46] "JACOB ALEXANDER"      "ERIC WRIGHT"          "DANIEL KHAIN"        
## [49] "MICHAEL J"            "SHIVAM JHA"           "TEJAS AYYAGARI"      
## [52] "ETHAN GUO"            "JOSE C"               "LARRY HODGE"         
## [55] "ALEX KONG"            "MARISA RICCI"         "MICHAEL LU"          
## [58] "VIRAJ MOHILE"         "SEAN M"               "JULIA SHEN"          
## [61] "JEZZEL FARKAS"        "ASHWIN BALAJI"        "THOMAS JOSEPH"       
## [64] "BEN LI"
USCF_ID <- na.omit(str_extract(data$Player_Name, "(?<=\\d{0,14})\\d+"))
USCF_ID <- as.character(USCF_ID)
USCF_ID
##  [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"
##  [7] "11146376" "15142253" "14954524" "14150362" "12581589" "12681257"
## [13] "15082995" "10131499" "15619130" "10295068" "10297702" "11342094"
## [19] "14862333" "14529060" "15495066" "12405534" "15030142" "13469010"
## [25] "12486656" "15131520" "14476567" "14882954" "15323285" "12577178"
## [31] "15131618" "14073750" "14691842" "15051807" "14601397" "14773163"
## [37] "15489571" "15108523" "12923035" "14892710" "15761443" "14462326"
## [43] "14101068" "15323504" "15372807" "15490981" "12533115" "14369165"
## [49] "12531685" "14773178" "15205474" "14918803" "12578849" "12836773"
## [55] "15412571" "14679887" "15113330" "14700365" "12841036" "14579262"
## [61] "15771592" "15219542" "15057092" "15006561"
Pre_Ratings <- na.omit(str_extract(data$Player_Name, "(?<=R:\\s{1,2})(\\w+|\\d{1,4}+(?=\\s))"))
Pre_Ratings <- na.omit(str_extract(Pre_Ratings, "[[:digit:]]+"))
Pre_Ratings <- as.numeric(Pre_Ratings)
Pre_Ratings
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332 1283 1199 1242
## [46]  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153 1092  917  853  967
## [61]  955 1530 1175 1163
Total_points <- na.omit(str_extract(data$Total_Pts, "^-?\\d{1,3}+.\\d{1,4}"))
Total_points <- as.numeric(Total_points)
Total_points
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0

We create a function to get all of the odd number of rows from column Round_1 to Round_7 based on the position of the index. We select odd Row data and even data because of the way the data aligned with their specific column The function will select only the first group class variable assigned to each round. See code below. It is very important to understand the dimension of the text file we are working with.

# functions to select odd number of rows in a vector 

odd_row <- function(strings){
  strings = as.data.frame(strings)
  
  rows <- nrow(strings)
  
  odd_rows <- seq_len(rows) %% 2
  data_mod <- strings[odd_rows == 1, ]
  print(data_mod)
  
}

# round 1 to 7 
Round_1 <-  str_extract(data$Round_1, "[^\\r\\n]*[A-Z]+")
Round_1[is.na(Round_1)] <- ""
Round_1 <- as.character(Round_1)
Round_1  <- odd_row(Round_1)
##  [1] "W" "W" "L" "W" "W" "W" "W" "W" "W" "D" "D" "W" "W" "W" "D" "D" "W" "W" "D"
## [20] "L" "W" "W" "L" "L" "L" "W" "W" "W" "W" "L" "L" "W" "W" "L" "L" "L" "B" "D"
## [39] "L" "W" "W" "L" "L" "B" "L" "W" "L" "L" "L" "L" "L" "W" "H" "L" "L" "H" "L"
## [58] "W" "L" "L" "L" "W" "L" "L"
Round_1
##  [1] "W" "W" "L" "W" "W" "W" "W" "W" "W" "D" "D" "W" "W" "W" "D" "D" "W" "W" "D"
## [20] "L" "W" "W" "L" "L" "L" "W" "W" "W" "W" "L" "L" "W" "W" "L" "L" "L" "B" "D"
## [39] "L" "W" "W" "L" "L" "B" "L" "W" "L" "L" "L" "L" "L" "W" "H" "L" "L" "H" "L"
## [58] "W" "L" "L" "L" "W" "L" "L"
Round_2 <-  str_extract(data$Round_2, "[^\\r\\n]*[A-Z]+")
Round_2[is.na(Round_2)] <- ""
Round_2 <- as.character(Round_2)
Round_2  <- odd_row(Round_2)
##  [1] "W" "W" "W" "D" "W" "D" "W" "W" "L" "L" "W" "W" "W" "W" "L" "W" "W" "W" "W"
## [20] "W" "L" "D" "W" "L" "W" "W" "L" "D" "D" "D" "D" "L" "L" "W" "L" "W" "L" "W"
## [39] "W" "L" "L" "L" "L" "L" "L" "L" "W" "W" "L" "W" "W" "D" "L" "L" "D" "L" "L"
## [58] "L" "B" "L" "L" "U" "L" "D"
Round_2
##  [1] "W" "W" "W" "D" "W" "D" "W" "W" "L" "L" "W" "W" "W" "W" "L" "W" "W" "W" "W"
## [20] "W" "L" "D" "W" "L" "W" "W" "L" "D" "D" "D" "D" "L" "L" "W" "L" "W" "L" "W"
## [39] "W" "L" "L" "L" "L" "L" "L" "L" "W" "W" "L" "W" "W" "D" "L" "L" "D" "L" "L"
## [58] "L" "B" "L" "L" "U" "L" "D"
Round_3 <-  str_extract(data$Round_3, "[^\\r\\n]*[A-Z]+")
Round_3[is.na(Round_3)] <- ""
Round_3 <- as.character(Round_3)
Round_3  <- odd_row(Round_3)
##  [1] "W" "L" "W" "W" "D" "L" "W" "L" "W" "W" "W" "D" "L" "W" "W" "H" "L" "L" "W"
## [20] "W" "W" "L" "L" "W" "L" "W" "W" "W" "L" "L" "W" "W" "W" "L" "W" "W" "W" "W"
## [39] "W" "L" "W" "L" "L" "L" "D" "L" "L" "H" "D" "L" "L" "L" "H" "L" "L" "L" "W"
## [58] "L" "L" "D" "W" "U" "D" "L"
Round_3
##  [1] "W" "L" "W" "W" "D" "L" "W" "L" "W" "W" "W" "D" "L" "W" "W" "H" "L" "L" "W"
## [20] "W" "W" "L" "L" "W" "L" "W" "W" "W" "L" "L" "W" "W" "W" "L" "W" "W" "W" "W"
## [39] "W" "L" "W" "L" "L" "L" "D" "L" "L" "H" "D" "L" "L" "L" "H" "L" "L" "L" "W"
## [58] "L" "L" "D" "W" "U" "D" "L"
Round_4 <-  str_extract(data$Round_4, "[^\\r\\n]*[A-Z]+")
Round_4[is.na(Round_1)] <- ""
Round_4 <- as.character(Round_4)
Round_4  <- odd_row(Round_4)
##  [1] "W" "W" "W" "W" "D" "W" "W" "L" "W" "W" "L" "W" "D" "L" "L" "W" "L" "W" "D"
## [20] "W" "L" "W" "W" "L" "W" "L" "W" "D" "L" "W" "L" "L" "D" "W" "L" "D" "L" "L"
## [39] "L" "W" "L" "D" "W" "W" "L" "L" "W" "D" "D" "W" "W" "D" "L" "B" "L" "W" "L"
## [58] "L" "L" "D" "L" "U" "L" "D"
Round_4
##  [1] "W" "W" "W" "W" "D" "W" "W" "L" "W" "W" "L" "W" "D" "L" "L" "W" "L" "W" "D"
## [20] "W" "L" "W" "W" "L" "W" "L" "W" "D" "L" "W" "L" "L" "D" "W" "L" "D" "L" "L"
## [39] "L" "W" "L" "D" "W" "W" "L" "L" "W" "D" "D" "W" "W" "D" "L" "B" "L" "W" "L"
## [58] "L" "L" "D" "L" "U" "L" "D"
Round_5 <-  str_extract(data$Round_5, "[^\\r\\n]*[A-Z]+")
Round_5[is.na(Round_5)] <- ""
Round_5 <- as.character(Round_5)
Round_5  <- odd_row(Round_5)
##  [1] "W" "W" "W" "D" "D" "D" "L" "W" "W" "D" "L" "H" "W" "D" "W" "L" "W" "L" "W"
## [20] "W" "W" "H" "L" "W" "D" "L" "D" "L" "W" "L" "W" "W" "L" "D" "W" "H" "H" "H"
## [39] "W" "L" "X" "D" "W" "L" "W" "W" "L" "H" "W" "H" "L" "L" "U" "L" "B" "H" "L"
## [58] "L" "L" "L" "D" "U" "L" "L"
Round_5
##  [1] "W" "W" "W" "D" "D" "D" "L" "W" "W" "D" "L" "H" "W" "D" "W" "L" "W" "L" "W"
## [20] "W" "W" "H" "L" "W" "D" "L" "D" "L" "W" "L" "W" "W" "L" "D" "W" "H" "H" "H"
## [39] "W" "L" "X" "D" "W" "L" "W" "W" "L" "H" "W" "H" "L" "L" "U" "L" "B" "H" "L"
## [58] "L" "L" "L" "D" "U" "L" "L"
Round_6 <-  str_extract(data$Round_6, "[^\\r\\n]*[A-Z]+")
Round_6[is.na(Round_6)] <- ""
Round_6 <- as.character(Round_6)
Round_6  <- odd_row(Round_6)
##  [1] "D" "W" "W" "W" "W" "W" "W" "W" "L" "W" "W" "D" "L" "L" "W" "W" "W" "W" "L"
## [20] "L" "W" "L" "W" "W" "L" "D" "L" "L" "W" "W" "W" "D" "L" "L" "D" "L" "L" "L"
## [39] "L" "W" "U" "W" "L" "L" "D" "W" "D" "L" "H" "L" "D" "D" "W" "L" "D" "L" "L"
## [58] "B" "W" "H" "L" "U" "H" "L"
Round_6
##  [1] "D" "W" "W" "W" "W" "W" "W" "W" "L" "W" "W" "D" "L" "L" "W" "W" "W" "W" "L"
## [20] "L" "W" "L" "W" "W" "L" "D" "L" "L" "W" "W" "W" "D" "L" "L" "D" "L" "L" "L"
## [39] "L" "W" "U" "W" "L" "L" "D" "W" "D" "L" "H" "L" "D" "D" "W" "L" "D" "L" "L"
## [58] "B" "W" "H" "L" "U" "H" "L"
Round_7 <-  str_extract(data$Round_7, "[^\\r\\n]*[A-Z]+")
Round_7[is.na(Round_7)] <- ""
Round_7 <- as.character(Round_7)
Round_7  <- odd_row(Round_7)
##  [1] "D" "W" "W" "D" "W" "W" "L" "W" "W" "W" "W" "L" "W" "W" "W" "U" "L" "L" "L"
## [20] "L" "L" "W" "W" "W" "W" "L" "U" "D" "U" "W" "L" "L" "W" "W" "W" "D" "W" "L"
## [39] "L" "L" "U" "W" "W" "W" "W" "L" "L" "L" "U" "L" "L" "L" "U" "W" "L" "L" "B"
## [58] "L" "L" "U" "L" "U" "U" "L"
even_row <- function(strings){
  strings = as.data.frame(strings)
  
  rows <- nrow(strings)
  
  odd_rows <- seq_len(rows) %% 2
  data_mod <- strings[odd_rows == 0, ]
  print(data_mod)
  
}

Number_Plays <- str_extract(data$Total_Pts, "(?<=:\\s{1,2})\\d\\w+|\\d{1,4}+")
Number_Plays[is.na(Number_Plays)] <- "0"
Number_Plays <- even_row(Number_Plays)
##  [1] "2" "2" "2" "2" "2" "3" "3" "3" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [20] "4" "3" "4" "0" "4" "4" "4" "4" "3" "4" "0" "0" "4" "0" "0" "0" "4" "0" "4"
## [39] "4" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "4" "0" "0" "0" "0" "0"
## [58] "0" "0" "0" "0" "0" "0" "0"
Number_Plays
##  [1] "2" "2" "2" "2" "2" "3" "3" "3" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [20] "4" "3" "4" "0" "4" "4" "4" "4" "3" "4" "0" "0" "4" "0" "0" "0" "4" "0" "4"
## [39] "4" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "4" "0" "0" "0" "0" "0"
## [58] "0" "0" "0" "0" "0" "0" "0"
Player_Games1 <- str_extract(data$Round_1, "\\d+")
Player_Games1[is.na(Player_Games1)] <- "0"
Player_Games1 <- odd_row(Player_Games1)
##  [1] "39" "63" "8"  "23" "45" "34" "57" "3"  "25" "16" "38" "42" "36" "54" "19"
## [16] "10" "48" "47" "15" "40" "43" "64" "4"  "28" "9"  "49" "51" "24" "50" "52"
## [31] "58" "61" "60" "6"  "46" "13" "0"  "11" "1"  "20" "59" "12" "21" "0"  "5" 
## [46] "35" "18" "17" "26" "29" "27" "30" "0"  "14" "62" "0"  "7"  "31" "41" "33"
## [61] "32" "55" "2"  "22"
Player_Games2 <- str_extract(data$Round_2, "\\d+")
Player_Games2[is.na(Player_Games2)] <- "0"
Player_Games2 <- odd_row(Player_Games2)
##  [1] "21" "58" "61" "28" "37" "29" "46" "32" "18" "19" "56" "33" "27" "44" "16"
## [16] "15" "41" "9"  "10" "49" "1"  "52" "43" "47" "53" "40" "13" "4"  "6"  "64"
## [31] "55" "8"  "12" "60" "38" "57" "5"  "35" "54" "26" "17" "50" "23" "14" "51"
## [46] "7"  "24" "63" "20" "42" "45" "22" "25" "39" "31" "11" "36" "2"  "0"  "34"
## [61] "3"  "0"  "48" "30"
Player_Games3 <- str_extract(data$Round_3, "\\d+")
Player_Games3[is.na(Player_Games3)] <- "0"
Player_Games3 <- odd_row(Player_Games3)
##  [1] "18" "4"  "25" "2"  "12" "11" "13" "14" "59" "55" "6"  "5"  "7"  "8"  "30"
## [16] "0"  "26" "1"  "52" "23" "47" "28" "20" "43" "3"  "17" "46" "22" "38" "15"
## [31] "64" "44" "50" "37" "56" "51" "34" "29" "40" "39" "58" "57" "24" "32" "60"
## [46] "27" "21" "0"  "63" "33" "36" "19" "0"  "61" "10" "35" "42" "41" "9"  "45"
## [61] "54" "0"  "49" "31"
Player_Games4 <- str_extract(data$Round_4, "\\d+")
Player_Games4[is.na(Player_Games4)] <- "0"
Player_Games4 <- odd_row(Player_Games4)
##  [1] "14" "17" "21" "26" "13" "35" "11" "9"  "8"  "31" "7"  "38" "5"  "1"  "22"
## [16] "39" "2"  "32" "28" "41" "3"  "15" "58" "25" "24" "4"  "37" "19" "34" "55"
## [31] "10" "18" "36" "29" "6"  "33" "27" "12" "16" "59" "20" "60" "63" "53" "56"
## [46] "50" "61" "52" "64" "46" "57" "48" "44" "0"  "30" "45" "51" "23" "40" "42"
## [61] "47" "0"  "43" "49"
Player_Games5 <- str_extract(data$Round_5, "\\d+")
Player_Games5[is.na(Player_Games5)] <- "0"
Player_Games5 <- odd_row(Player_Games5)
##  [1] "7"  "16" "11" "5"  "4"  "10" "1"  "47" "26" "6"  "3"  "0"  "33" "27" "54"
## [16] "2"  "23" "19" "18" "28" "40" "0"  "17" "60" "34" "9"  "14" "20" "52" "31"
## [31] "30" "51" "13" "25" "57" "0"  "0"  "0"  "44" "21" "0"  "61" "59" "39" "63"
## [46] "64" "8"  "0"  "58" "0"  "32" "29" "0"  "15" "0"  "0"  "35" "49" "43" "24"
## [61] "42" "0"  "45" "46"
Player_Games6 <- str_extract(data$Round_6, "\\d+")
Player_Games6[is.na(Player_Games6)] <- "0"
Player_Games6 <- odd_row(Player_Games6)
##  [1] "12" "20" "13" "19" "14" "27" "9"  "28" "7"  "25" "34" "1"  "3"  "5"  "33"
## [16] "36" "22" "38" "4"  "2"  "39" "17" "37" "44" "10" "32" "6"  "8"  "48" "61"
## [31] "50" "26" "15" "11" "52" "16" "23" "18" "21" "56" "0"  "64" "46" "24" "55"
## [46] "43" "51" "29" "0"  "31" "47" "35" "57" "59" "45" "40" "53" "0"  "54" "0" 
## [61] "30" "0"  "0"  "42"
Player_Games7 <- str_extract(data$Round_7, "\\d+")
Player_Games7[is.na(Player_Games7)] <- "0"
Player_Games7 <- odd_row(Player_Games7)
##  [1] "4"  "7"  "12" "1"  "17" "21" "2"  "19" "20" "18" "26" "3"  "32" "31" "38"
## [16] "0"  "5"  "10" "8"  "9"  "6"  "40" "46" "39" "47" "11" "0"  "36" "0"  "50"
## [31] "14" "13" "51" "52" "48" "28" "61" "15" "24" "22" "0"  "56" "55" "59" "58"
## [46] "23" "25" "35" "0"  "30" "33" "34" "0"  "64" "43" "42" "0"  "45" "44" "0" 
## [61] "37" "0"  "0"  "54"
temp <- str_extract_all(data$Player_Name,"\\b\\d{1,}")
temp <- data.frame(as.character(temp))
temp <- separate(data = temp, col = as.character.temp., into = c("col1","col2","col3"), sep = ",")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 64 rows [1, 3, 5, 7, 9,
## 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, ...].
Post_Ratings <- even_row(temp$col3)
##  [1] " \"1817\")" " \"1663\")" " \"1640\")" " \"1744\")" " \"1690\")"
##  [6] " \"1687\")" " \"1673\")" " \"1657\")" " \"1564\")" " \"1544\")"
## [11] " \"1696\")" " \"1670\")" " \"1662\")" " \"1618\")" " \"1416\")"
## [16] " \"1613\")" " \"1610\")" " \"1600\")" " \"1570\")" " \"1569\")"
## [21] " \"1562\")" " \"1529\")" " \"1371\")" " \"1300\")" " \"1681\")"
## [26] " \"1564\")" " \"1539\")" " \"1513\")" " \"1508\")" " \"1444\")"
## [31] " \"1444\")" " \"1433\")" " \"1421\")" " \"1400\")" " \"1392\")"
## [36] " \"1367\")" " \"1077\")" " \"1439\")" " \"1413\")" " \"1346\")"
## [41] " \"1341\")" " \"1256\")" " \"1244\")" " \"1199\")" " \"1191\")"
## [46] " \"1076\")" " \"1341\")" " \"1335\")" " \"1259\")" " \"1111\")"
## [51] " \"1097\")" " \"1092\")" " \"1359\")" " \"1200\")" " \"1163\")"
## [56] " \"1140\")" " \"1079\")" " \"941\")"  " \"878\")"  " \"984\")" 
## [61] " \"979\")"  " \"1535\")" " \"1125\")" " \"1112\")"
Post_Ratings[is.na(Post_Ratings)] <- "0"
Post_Ratings  <- str_extract(Post_Ratings,"\\d+")
Post_Ratings
##  [1] "1817" "1663" "1640" "1744" "1690" "1687" "1673" "1657" "1564" "1544"
## [11] "1696" "1670" "1662" "1618" "1416" "1613" "1610" "1600" "1570" "1569"
## [21] "1562" "1529" "1371" "1300" "1681" "1564" "1539" "1513" "1508" "1444"
## [31] "1444" "1433" "1421" "1400" "1392" "1367" "1077" "1439" "1413" "1346"
## [41] "1341" "1256" "1244" "1199" "1191" "1076" "1341" "1335" "1259" "1111"
## [51] "1097" "1092" "1359" "1200" "1163" "1140" "1079" "941"  "878"  "984" 
## [61] "979"  "1535" "1125" "1112"

Final Data Set

the final data set has all the rows that has been cleaned and processed to look like data frame.

final <- data.frame(Pair_Num = Pair_Num,
           State = State,
           Player_Name = Player_Name,
           Pre_Ratings = Pre_Ratings,
           Post_Ratings = as.numeric(Post_Ratings),
           Number_Plays = Number_Plays,
           Total_points = Total_points,
           Player_ID = USCF_ID,
           Player_Games1 = Player_Games1,
           Player_Games2 = Player_Games2,
           Player_Games3 = Player_Games3,
           Player_Games4 = Player_Games4,
           Player_Games5 = Player_Games5,
           Player_Games6 = Player_Games6,
           Player_Games7 = Player_Games7,
           
           Round_1 = Round_1,
           Round_2 = Round_2,
           Round_3= Round_3,
           Round_4 = Round_4,
           Round_5 = Round_5,
           Round_6 = Round_6,
           Round_7 = Round_7
           )
final[is.na(final)] <- as.numeric(0)
final$Number_Plays <- as.numeric(final$Number_Plays) 
final$Player_Games1 <- as.numeric(final$Player_Games1)
final$Player_Games2 <- as.numeric(final$Player_Games2)
final$Player_Games3 <- as.numeric(final$Player_Games3)
final$Player_Games4 <- as.numeric(final$Player_Games4)
final$Player_Games5 <- as.numeric(final$Player_Games5)
final$Player_Games6 <- as.numeric(final$Player_Games6)
final$Player_Games7 <- as.numeric(final$Player_Games7)
head(final, 10)
##    Pair_Num State      Player_Name Pre_Ratings Post_Ratings Number_Plays
## 1         1    ON         GARY HUA        1794         1817            2
## 2         2    MI  DAKSHESH DARURI        1553         1663            2
## 3         3    MI     ADITYA BAJAJ        1384         1640            2
## 4         4    MI        PATRICK H        1716         1744            2
## 5         5    MI       HANSHI ZUO        1655         1690            2
## 6         6    OH      HANSEN SONG        1686         1687            3
## 7         7    MI         GARY DEE        1649         1673            3
## 8         8    MI EZEKIEL HOUGHTON        1641         1657            3
## 9         9    ON      STEFANO LEE        1411         1564            2
## 10       10    MI        ANVIT RAO        1365         1544            3
##    Total_points Player_ID Player_Games1 Player_Games2 Player_Games3
## 1           6.0  15445895            39            21            18
## 2           6.0  14598900            63            58             4
## 3           6.0  14959604             8            61            25
## 4           5.5  12616049            23            28             2
## 5           5.5  14601533            45            37            12
## 6           5.0  15055204            34            29            11
## 7           5.0  11146376            57            46            13
## 8           5.0  15142253             3            32            14
## 9           5.0  14954524            25            18            59
## 10          5.0  14150362            16            19            55
##    Player_Games4 Player_Games5 Player_Games6 Player_Games7 Round_1 Round_2
## 1             14             7            12             4       W       W
## 2             17            16            20             7       W       W
## 3             21            11            13            12       L       W
## 4             26             5            19             1       W       D
## 5             13             4            14            17       W       W
## 6             35            10            27            21       W       D
## 7             11             1             9             2       W       W
## 8              9            47            28            19       W       W
## 9              8            26             7            20       W       L
## 10            31             6            25            18       D       L
##    Round_3 Round_4 Round_5 Round_6 Round_7
## 1        W       W       W       D       D
## 2        L       W       W       W       W
## 3        W       W       W       W       W
## 4        W       W       D       W       D
## 5        D       D       D       W       W
## 6        L       W       D       W       W
## 7        W       W       L       W       L
## 8        L       L       W       W       W
## 9        W       W       W       L       W
## 10       W       W       D       W       W
str(final)
## 'data.frame':    64 obs. of  22 variables:
##  $ Pair_Num     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ State        : chr  "ON" "MI" "MI" "MI" ...
##  $ Player_Name  : chr  "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H" ...
##  $ Pre_Ratings  : num  1794 1553 1384 1716 1655 ...
##  $ Post_Ratings : num  1817 1663 1640 1744 1690 ...
##  $ Number_Plays : num  2 2 2 2 2 3 3 3 2 3 ...
##  $ Total_points : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ Player_ID    : chr  "15445895" "14598900" "14959604" "12616049" ...
##  $ Player_Games1: num  39 63 8 23 45 34 57 3 25 16 ...
##  $ Player_Games2: num  21 58 61 28 37 29 46 32 18 19 ...
##  $ Player_Games3: num  18 4 25 2 12 11 13 14 59 55 ...
##  $ Player_Games4: num  14 17 21 26 13 35 11 9 8 31 ...
##  $ Player_Games5: num  7 16 11 5 4 10 1 47 26 6 ...
##  $ Player_Games6: num  12 20 13 19 14 27 9 28 7 25 ...
##  $ Player_Games7: num  4 7 12 1 17 21 2 19 20 18 ...
##  $ Round_1      : chr  "W" "W" "L" "W" ...
##  $ Round_2      : chr  "W" "W" "W" "D" ...
##  $ Round_3      : chr  "W" "L" "W" "W" ...
##  $ Round_4      : chr  "W" "W" "W" "W" ...
##  $ Round_5      : chr  "W" "W" "W" "D" ...
##  $ Round_6      : chr  "D" "W" "W" "W" ...
##  $ Round_7      : chr  "D" "W" "W" "D" ...

We are going to create a the average opponent variable using the mean() and grep()

#create an Empty variable
final$avg_opponent_rate <- NA

for (i in 1:nrow(final)) {
  final$avg_opponent_rate[i] <- round(rowMeans(final[i,grep('Pre_Ratings',
                                                       colnames(final)):grep("Post_Ratings",colnames(final))], na.rm=TRUE))
}
head(final)
##   Pair_Num State     Player_Name Pre_Ratings Post_Ratings Number_Plays
## 1        1    ON        GARY HUA        1794         1817            2
## 2        2    MI DAKSHESH DARURI        1553         1663            2
## 3        3    MI    ADITYA BAJAJ        1384         1640            2
## 4        4    MI       PATRICK H        1716         1744            2
## 5        5    MI      HANSHI ZUO        1655         1690            2
## 6        6    OH     HANSEN SONG        1686         1687            3
##   Total_points Player_ID Player_Games1 Player_Games2 Player_Games3
## 1          6.0  15445895            39            21            18
## 2          6.0  14598900            63            58             4
## 3          6.0  14959604             8            61            25
## 4          5.5  12616049            23            28             2
## 5          5.5  14601533            45            37            12
## 6          5.0  15055204            34            29            11
##   Player_Games4 Player_Games5 Player_Games6 Player_Games7 Round_1 Round_2
## 1            14             7            12             4       W       W
## 2            17            16            20             7       W       W
## 3            21            11            13            12       L       W
## 4            26             5            19             1       W       D
## 5            13             4            14            17       W       W
## 6            35            10            27            21       W       D
##   Round_3 Round_4 Round_5 Round_6 Round_7 avg_opponent_rate
## 1       W       W       W       D       D              1806
## 2       L       W       W       W       W              1608
## 3       W       W       W       W       W              1512
## 4       W       W       D       W       D              1730
## 5       D       D       D       W       W              1672
## 6       L       W       D       W       W              1686
#write.csv(csv.table, file = "Villalobos-tournamentInfo.csv")
write.table(final, file = "warner-tournamentInfo.csv",row.names=FALSE, na="",col.names=TRUE, sep=",")

Conclusion

This assignment is like a work assignment. I spent a lot time doing research and learning about different technique to clean the data. I read the data set in R using read.delim() function which give a better layout to clean this data set. This assignment would be possible without the use of Regular expression.