PROJECT 01: Chess Player Rating Ingestion Tool

INDEX (Step by Step)

STEP 1. Load Libraries
STEP 2. Load the file
STEP 3. Data Extraction:Regex the file data
STEP 4. Data Munging: Create measurement of opponents score
STEP 5. Data Munging: Create a final dataframe as suggested in the problem
STEP 6. Create interactive Data Table from the dataframe
STEP 7. Dump the data into CSV file
STEP 8. Use the SQL script to create new chess table and load newly created csv file into the chess table
STEP 9. Query the chess table from this RMD and check the results
STEP 10. success!!

IMPORTANT PROJECT ASSUMPTIONS

Assumption 1. Pre Tournament rating was considered as player’s rating for final table
Assumption 2. For Opponent avg. score calculation:
a. All opponents numbers were matched with their numbers in the file
b. The codes like U and H where no value for opponent was given in the file were ignored from the calculation in order to make sure data is valid
Assumption 3. Rating (example 1220P13) was treated as rating of 1220 as meaning of P13 was not clear.

STEP 1 : Load your libraries

# Load the libraries
library(stringr)    #For Regex
library(RCurl)      #For File Operations
## Loading required package: bitops
library(dplyr)      #For Manipulating the data frames
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)         #For Data table package
library(RMySQL)     #For SQL Operations
## Loading required package: DBI
library(ggplot2)    #For Visualizations

STEP 2 : Load the File

# Good Practise: Basic house keeping: cleanup the env before you start new work
rm(list=ls())

# Good Practise: Set up the Working Directory when working with a file system
setwd("C:\\CUNY\\607Data\\Assignments\\project01")

# Read the File
raw.file.data <- readLines("tournamentinfo.txt")

# Make sure file loaded correct
head(raw.file.data)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

STEP 3 : Data Extraction:Regex the file data

Regex to find Player Names

##
# Now the cleanup fun begins
##

##  
#   I created my regex and tested these using website 
#   http://rubular.com/
##


##  
#   Regex name of the players
##
namenum        <- unlist(str_extract_all(raw.file.data, 
                                      "\\d{1,2}[ ]\\|[ ]([A-Z]+[ ]){2,3}"))
name        <- unlist(str_extract_all(namenum, "([A-Z]+[ ]){2,3}"))
name        <- trimws(name, which = c("both"))
name
##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC"       "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA"             
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC"                "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"

Regex to find Player Numbers

##  
#   Regex number of the players
##
pnum   <- as.numeric(unlist(str_extract_all(namenum, "\\d{1,2}")))
pnum
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64

Regex to find Player Pre Tournament Rating

##  
#   Regex players rating
##
rating      <- unlist(str_extract_all(raw.file.data, "R:[ ]+(\\d{3,4})"))
rating      <- as.numeric(unlist(str_extract_all(rating, "\\d{3,4}")))
rating
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163

Regex to find Player State

##  
#   Regex players state
##
state     <- unlist(str_extract_all(raw.file.data, "\\b([A-Z]{2})[ ]\\|"))
state     <- unlist(str_extract_all(state, "[A-Z]{2}"))
state
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"

Regex to find Player Points

##  
#   Regex the points
##
points      <- as.numeric(unlist(str_extract_all(raw.file.data, "\\d\\.\\d")))
points
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0

Create a Dataframe with the data

pointsdf <- data.frame(pnum=pnum, name=name, state=state, points=points, rating=rating)
head(pointsdf)
##   pnum                name state points rating
## 1    1            GARY HUA    ON    6.0   1794
## 2    2     DAKSHESH DARURI    MI    6.0   1553
## 3    3        ADITYA BAJAJ    MI    6.0   1384
## 4    4 PATRICK H SCHILLING    MI    5.5   1716
## 5    5          HANSHI ZUO    MI    5.5   1655
## 6    6         HANSEN SONG    OH    5.0   1686

STEP 4: Create measurement of opponents scores

Cleanup the file further to find list of opponents for each player

## Cleanup lines to derive the average of opponents

# Remove the lines that just contain ------- 
raw.data <- str_extract_all(raw.file.data, "^(?!.*--.*).+$")

# Create a list of opponents each player played
opponentslist <-  str_extract_all(raw.data,"[WLDHU][ ]+\\d{1,2}")

# remove any empty rows from the list
opponentslist <- opponentslist[lapply(opponentslist,length)>0]

#check the list before processing futher
opponentslist
## [[1]]
## [1] "W  39" "W  21" "W  18" "W  14" "W   7" "D  12" "D   4"
## 
## [[2]]
## [1] "W  63" "W  58" "L   4" "W  17" "W  16" "W  20" "W   7"
## 
## [[3]]
## [1] "L   8" "W  61" "W  25" "W  21" "W  11" "W  13" "W  12"
## 
## [[4]]
## [1] "W  23" "D  28" "W   2" "W  26" "D   5" "W  19" "D   1"
## 
## [[5]]
## [1] "W  45" "W  37" "D  12" "D  13" "D   4" "W  14" "W  17"
## 
## [[6]]
## [1] "W  34" "D  29" "L  11" "W  35" "D  10" "W  27" "W  21"
## 
## [[7]]
## [1] "W  57" "W  46" "W  13" "W  11" "L   1" "W   9" "L   2"
## 
## [[8]]
## [1] "W   3" "W  32" "L  14" "L   9" "W  47" "W  28" "W  19"
## 
## [[9]]
## [1] "W  25" "L  18" "W  59" "W   8" "W  26" "L   7" "W  20"
## 
## [[10]]
## [1] "D  16" "L  19" "W  55" "W  31" "D   6" "W  25" "W  18"
## 
## [[11]]
## [1] "D  38" "W  56" "W   6" "L   7" "L   3" "W  34" "W  26"
## 
## [[12]]
## [1] "W  42" "W  33" "D   5" "W  38" "D   1" "L   3"
## 
## [[13]]
## [1] "W  36" "W  27" "L   7" "D   5" "W  33" "L   3" "W  32"
## 
## [[14]]
## [1] "W  54" "W  44" "W   8" "L   1" "D  27" "L   5" "W  31"
## 
## [[15]]
## [1] "D  19" "L  16" "W  30" "L  22" "W  54" "W  33" "W  38"
## 
## [[16]]
## [1] "D  10" "W  15" "W  39" "L   2" "W  36"
## 
## [[17]]
## [1] "W  48" "W  41" "L  26" "L   2" "W  23" "W  22" "L   5"
## 
## [[18]]
## [1] "W  47" "W   9" "L   1" "W  32" "L  19" "W  38" "L  10"
## 
## [[19]]
## [1] "D  15" "W  10" "W  52" "D  28" "W  18" "L   4" "L   8"
## 
## [[20]]
## [1] "L  40" "W  49" "W  23" "W  41" "W  28" "L   2" "L   9"
## 
## [[21]]
## [1] "W  43" "L   1" "W  47" "L   3" "W  40" "W  39" "L   6"
## 
## [[22]]
## [1] "W  64" "D  52" "L  28" "W  15" "L  17" "W  40"
## 
## [[23]]
## [1] "L   4" "W  43" "L  20" "W  58" "L  17" "W  37" "W  46"
## 
## [[24]]
## [1] "L  28" "L  47" "W  43" "L  25" "W  60" "W  44" "W  39"
## 
## [[25]]
## [1] "L   9" "W  53" "L   3" "W  24" "D  34" "L  10" "W  47"
## 
## [[26]]
## [1] "W  49" "W  40" "W  17" "L   4" "L   9" "D  32" "L  11"
## 
## [[27]]
## [1] "W  51" "L  13" "W  46" "W  37" "D  14" "L   6"
## 
## [[28]]
## [1] "W  24" "D   4" "W  22" "D  19" "L  20" "L   8" "D  36"
## 
## [[29]]
## [1] "W  50" "D   6" "L  38" "L  34" "W  52" "W  48"
## 
## [[30]]
## [1] "L  52" "D  64" "L  15" "W  55" "L  31" "W  61" "W  50"
## 
## [[31]]
## [1] "L  58" "D  55" "W  64" "L  10" "W  30" "W  50" "L  14"
## 
## [[32]]
## [1] "W  61" "L   8" "W  44" "L  18" "W  51" "D  26" "L  13"
## 
## [[33]]
## [1] "W  60" "L  12" "W  50" "D  36" "L  13" "L  15" "W  51"
## 
## [[34]]
## [1] "L   6" "W  60" "L  37" "W  29" "D  25" "L  11" "W  52"
## 
## [[35]]
## [1] "L  46" "L  38" "W  56" "L   6" "W  57" "D  52" "W  48"
## 
## [[36]]
## [1] "L  13" "W  57" "W  51" "D  33" "L  16" "D  28"
## 
## [[37]]
## [1] "L   5" "W  34" "L  27" "L  23" "W  61"
## 
## [[38]]
## [1] "D  11" "W  35" "W  29" "L  12" "L  18" "L  15"
## 
## [[39]]
## [1] "L   1" "W  54" "W  40" "L  16" "W  44" "L  21" "L  24"
## 
## [[40]]
## [1] "W  20" "L  26" "L  39" "W  59" "L  21" "W  56" "L  22"
## 
## [[41]]
## [1] "W  59" "L  17" "W  58" "L  20"
## 
## [[42]]
## [1] "L  12" "L  50" "L  57" "D  60" "D  61" "W  64" "W  56"
## 
## [[43]]
## [1] "L  21" "L  23" "L  24" "W  63" "W  59" "L  46" "W  55"
## 
## [[44]]
## [1] "L  14" "L  32" "W  53" "L  39" "L  24" "W  59"
## 
## [[45]]
## [1] "L   5" "L  51" "D  60" "L  56" "W  63" "D  55" "W  58"
## 
## [[46]]
## [1] "W  35" "L   7" "L  27" "L  50" "W  64" "W  43" "L  23"
## 
## [[47]]
## [1] "L  18" "W  24" "L  21" "W  61" "L   8" "D  51" "L  25"
## 
## [[48]]
## [1] "L  17" "W  63" "D  52" "L  29" "L  35"
## 
## [[49]]
## [1] "L  26" "L  20" "D  63" "D  64" "W  58"
## 
## [[50]]
## [1] "L  29" "W  42" "L  33" "W  46" "L  31" "L  30"
## 
## [[51]]
## [1] "L  27" "W  45" "L  36" "W  57" "L  32" "D  47" "L  33"
## 
## [[52]]
## [1] "W  30" "D  22" "L  19" "D  48" "L  29" "D  35" "L  34"
## 
## [[53]]
## [1] "L  25" "L  44" "W  57"
## 
## [[54]]
## [1] "L  14" "L  39" "L  61" "L  15" "L  59" "W  64"
## 
## [[55]]
## [1] "L  62" "D  31" "L  10" "L  30" "D  45" "L  43"
## 
## [[56]]
## [1] "L  11" "L  35" "W  45" "L  40" "L  42"
## 
## [[57]]
## [1] "L   7" "L  36" "W  42" "L  51" "L  35" "L  53"
## 
## [[58]]
## [1] "W  31" "L   2" "L  41" "L  23" "L  49" "L  45"
## 
## [[59]]
## [1] "L  41" "L   9" "L  40" "L  43" "W  54" "L  44"
## 
## [[60]]
## [1] "L  33" "L  34" "D  45" "D  42" "L  24"
## 
## [[61]]
## [1] "L  32" "L   3" "W  54" "L  47" "D  42" "L  30" "L  37"
## 
## [[62]]
## [1] "W  55"
## 
## [[63]]
## [1] "L   2" "L  48" "D  49" "L  43" "L  45"
## 
## [[64]]
## [1] "L  22" "D  30" "L  31" "D  49" "L  46" "L  42" "L  54"

For loop to compute opponent averages for all the players

# Compute the average value for all the opponents

avg_list <- c()
for (i in 1:64){
        sum <- 0
        for (j in 1:length(opponentslist[[i]])){
            opponent <- as.numeric(str_extract_all(
                opponentslist[[i]][j], "\\d{1,2}"))            
                sum <- sum + as.numeric(as.character(
                    pointsdf$rating[which(pointsdf$pnum==opponent)][1]))
        }
        #print(sum)
        avg_list[i] <- round(sum/length(opponentslist[[i]]))
}
#all opponent averages
avg_list
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263

Add the opponent averages to the existing data frame

# Add the opponext average to the existing data frame
pointsdf["opponentAvg"] <- avg_list

STEP 5: Create a final dataframe as suggested in the problem

##
# Cleanup and format the dataframe 
##

pointsdf <- subset(pointsdf, select = c(name, state, points, rating, opponentAvg))

format(pointsdf$points, digits=1, nsmall=1)
##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
head(pointsdf)
##                  name state points rating opponentAvg
## 1            GARY HUA    ON    6.0   1794        1605
## 2     DAKSHESH DARURI    MI    6.0   1553        1469
## 3        ADITYA BAJAJ    MI    6.0   1384        1564
## 4 PATRICK H SCHILLING    MI    5.5   1716        1574
## 5          HANSHI ZUO    MI    5.5   1655        1501
## 6         HANSEN SONG    OH    5.0   1686        1519

STEP 6: Create a Beautiful Data Table for the data frame

datatable(pointsdf)

STEP 7: Write the dataframe to CSV file

## Write the file in CSV format to the disk
write.table(pointsdf, "chessdata.csv", sep=",", row.names=FALSE, col.names=FALSE)

STEP 8: SQL SCript to create chess table

# Create a database connection
mydb = dbConnect(MySQL(), user='root', password='mysql57', host='localhost', dbname="tb")

Exhibits for my SQL Scripts to create tables

/*
 The below are just exhibits from the SQL Script which I ran using mysql databases
 This Script create the table in the database
 The actual script is attached with this project RMD code 
 
 A. Create Table
 B. Insert the rows using the CSV created by the above write.csv command
 C. Check the results
*/

/*
DROP TABLE IF EXISTS chess;

# Create new table
CREATE TABLE chess          
(
  name varchar(100) NOT NULL,  
  state varchar(10) NOT NULL,  
  points float4 NOT NULL,  
  rating int NOT NULL,  
  opponentavg int NOT NULL   
);

LOAD DATA INFILE 'C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\chessdata.csv'  
INTO TABLE chess 
FIELDS TERMINATED BY ','  
lines terminated BY "\n";

*/

STEP 9: Database connection to check SQL operations

##
# Create a Database connection
##

mysqloutput <- fetch(dbSendQuery(mydb, "select * from chess;"))

head(mysqloutput)
##                    name state points rating opponentavg
## 1            "GARY HUA"  "ON"    6.0   1794        1605
## 2     "DAKSHESH DARURI"  "MI"    6.0   1553        1469
## 3        "ADITYA BAJAJ"  "MI"    6.0   1384        1564
## 4 "PATRICK H SCHILLING"  "MI"    5.5   1716        1574
## 5          "HANSHI ZUO"  "MI"    5.5   1655        1501
## 6         "HANSEN SONG"  "OH"    5.0   1686        1519

STEP 10: Success!!

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.