Load the tidyverse package
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tibble)
Create a vector entitled names which will assign column names to the chess tibble.
#create the names vector
names <- c("Number","PrePost","Pts","R1","R2","R3","R4","R5","R6","R7","Blank")
#use the read_delim function to import data into the chess tibble
chess <- read_delim(file="C:\\Users\\lizza\\Documents\\CUNY - Data Analytics\\DATA 607_Revised\\Week 4\\Project\\tournamentinfo.txt", col_names = names, delim="|", skip=3)
## Parsed with column specification:
## cols(
## Number = col_character(),
## PrePost = col_character(),
## Pts = col_character(),
## R1 = col_character(),
## R2 = col_character(),
## R3 = col_character(),
## R4 = col_character(),
## R5 = col_character(),
## R6 = col_character(),
## R7 = col_character(),
## Blank = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 65 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1 <NA> 11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ file 2 4 <NA> 11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ row 3 7 <NA> 11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ col 4 10 <NA> 11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~ expected 5 13 <NA> 11 columns 1 columns "'C:\\Users\\lizza\\Documents\\CUNY - ~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
#take a look at the tibble using the glimpse option
glimpse(chess)
## Observations: 193
## Variables: 11
## $ Number <chr> "-----------------------------------------------------...
## $ PrePost <chr> NA, " GARY HUA ", " 15445895 / ...
## $ Pts <chr> NA, "6.0 ", "N:2 ", NA, "6.0 ", "N:2 ", NA, "6.0 ...
## $ R1 <chr> NA, "W 39", "W ", NA, "W 63", "B ", NA, "L 8...
## $ R2 <chr> NA, "W 21", "B ", NA, "W 58", "W ", NA, "W 61...
## $ R3 <chr> NA, "W 18", "W ", NA, "L 4", "B ", NA, "W 25...
## $ R4 <chr> NA, "W 14", "B ", NA, "W 17", "W ", NA, "W 21...
## $ R5 <chr> NA, "W 7", "W ", NA, "W 16", "B ", NA, "W 11...
## $ R6 <chr> NA, "D 12", "B ", NA, "W 20", "W ", NA, "W 13...
## $ R7 <chr> NA, "D 4", "W ", NA, "W 7", "B ", NA, "W 12...
## $ Blank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
After importing the tibble, we see that every 3rd row doesn’t have any usable data. To clean the data we create a vector entitled nodata which uses the seq feature starting at the 1st row ending at the 193rd by 3. Then we create chess_clean which excludes the findings of nodata
nodata <- seq(1,193,3)
chess_clean <- chess[-c(nodata),]
dim(chess_clean)
## [1] 128 11
chess_clean has a more manageable 128 observations and 11 variables.
Use str_extract_all along with regular expressions to pull the players names from the PrePost variable.
Player_Name <- unlist(str_extract_all(chess_clean$PrePost, "\\w+[A-Z] ?\\w+ \\w+"))
#check classification & view the Player_Name variable
class(Player_Name)
## [1] "character"
Player_Name
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA STANESCU"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
Use str_extract_all along with regular expressions to pull the players state from the Number variable.
Player_State <- unlist(str_extract_all(chess_clean$Number,"[A-Z]{1,2}"))
#check classification & print Player_State
class(Player_State)
## [1] "character"
Player_State
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
Tot_Points <- (unlist(str_extract_all(chess_clean$Pts, "[0-9]\\.[0-9]")))
#convert Tot_Points to a numeric variable
Tot_Points <- as.numeric(Tot_Points)
#check classification & print Tot_Points
class(Tot_Points)
## [1] "numeric"
Tot_Points
## [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0
#use str_extract_all & regular expressions to pull PrePost data
Player_Pre <- unlist(str_extract_all(chess_clean$PrePost,":[:blank:]+[0-9]{1,4}"))
#view the first 5 data elements
head(Player_Pre,n=5)
## [1] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655"
As you can see the colon precedes the PrePost number, next we use the str_sub feature to remove the colon and the blank spaces. Next we trim the data using str_trim since several entries have 3 digits instead of 4.
Player_Pre <- str_sub(Player_Pre, start = 3, end=6)
Player_Pre
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" " 980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" " 377" "1362" "1382" "1291" "1056"
## [51] "1011" " 935" "1393" "1270" "1186" "1153" "1092" " 917" " 853" " 967"
## [61] " 955" "1530" "1175" "1163"
Player_Pre <- str_trim(Player_Pre)
#convert Player_Pre to a numeric variable
Player_Pre <- as.numeric(Player_Pre)
#check classification & print Player_Pre
class(Player_Pre)
## [1] "numeric"
Player_Pre
## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153
## [57] 1092 917 853 967 955 1530 1175 1163
In order to extract the opponents data in a cleaner environment a vector entitled even was created that would begin at the number 2 to 128 counting by 2 to mirror chess_clean which has 128 variables; our output is added to chess_opp
#create the even vector along with the chess_opp data frame
even <- seq(2,128,2)
chess_opp <- chess_clean[-c(even),]
R1 Extraction Use str_extract_all & regular expressions to pull the opposing players’ number.
R1_O <- unlist(str_extract_all(chess_opp$R1, "[[:alnum:] ]{1,}"))
R1_O
## [1] "W 39" "W 63" "L 8" "W 23" "W 45" "W 34" "W 57" "W 3"
## [9] "W 25" "D 16" "D 38" "W 42" "W 36" "W 54" "D 19" "D 10"
## [17] "W 48" "W 47" "D 15" "L 40" "W 43" "W 64" "L 4" "L 28"
## [25] "L 9" "W 49" "W 51" "W 24" "W 50" "L 52" "L 58" "W 61"
## [33] "W 60" "L 6" "L 46" "L 13" "B " "D 11" "L 1" "W 20"
## [41] "W 59" "L 12" "L 21" "B " "L 5" "W 35" "L 18" "L 17"
## [49] "L 26" "L 29" "L 27" "W 30" "H " "L 14" "L 62" "H "
## [57] "L 7" "W 31" "L 41" "L 33" "L 32" "W 55" "L 2" "L 22"
In certain cases we have a character followed by blanks which will generate an NA value but will fail to line up properly. In order to keep the values consistent we had a 0 to missing values using the str_replace_all feature
R1_O <- unlist(str_replace_all(R1_O, "\\w\\s..\\D","0"))
R1_O
## [1] "W 39" "W 63" "L 8" "W 23" "W 45" "W 34" "W 57" "W 3"
## [9] "W 25" "D 16" "D 38" "W 42" "W 36" "W 54" "D 19" "D 10"
## [17] "W 48" "W 47" "D 15" "L 40" "W 43" "W 64" "L 4" "L 28"
## [25] "L 9" "W 49" "W 51" "W 24" "W 50" "L 52" "L 58" "W 61"
## [33] "W 60" "L 6" "L 46" "L 13" "0" "D 11" "L 1" "W 20"
## [41] "W 59" "L 12" "L 21" "0" "L 5" "W 35" "L 18" "L 17"
## [49] "L 26" "L 29" "L 27" "W 30" "0" "L 14" "L 62" "0"
## [57] "L 7" "W 31" "L 41" "L 33" "L 32" "W 55" "L 2" "L 22"
Next use str_replace_all again to remove all data elements with the characters WLD
R1_O <- str_replace_all(R1_O, "[WLD]","")
R1_O
## [1] " 39" " 63" " 8" " 23" " 45" " 34" " 57" " 3" " 25" " 16"
## [11] " 38" " 42" " 36" " 54" " 19" " 10" " 48" " 47" " 15" " 40"
## [21] " 43" " 64" " 4" " 28" " 9" " 49" " 51" " 24" " 50" " 52"
## [31] " 58" " 61" " 60" " 6" " 46" " 13" "0" " 11" " 1" " 20"
## [41] " 59" " 12" " 21" "0" " 5" " 35" " 18" " 17" " 26" " 29"
## [51] " 27" " 30" "0" " 14" " 62" "0" " 7" " 31" " 41" " 33"
## [61] " 32" " 55" " 2" " 22"
Next we use the str_trim feature in order to remove the extra padding
R1_O <- str_trim(R1_O)
R1_O
## [1] "39" "63" "8" "23" "45" "34" "57" "3" "25" "16" "38" "42" "36" "54"
## [15] "19" "10" "48" "47" "15" "40" "43" "64" "4" "28" "9" "49" "51" "24"
## [29] "50" "52" "58" "61" "60" "6" "46" "13" "0" "11" "1" "20" "59" "12"
## [43] "21" "0" "5" "35" "18" "17" "26" "29" "27" "30" "0" "14" "62" "0"
## [57] "7" "31" "41" "33" "32" "55" "2" "22"
Next we convert the R1_O variable from character to numeric and check the classification and print the output.
R1_O <- as.numeric(R1_O)
class(R1_O)
## [1] "numeric"
R1_O
## [1] 39 63 8 23 45 34 57 3 25 16 38 42 36 54 19 10 48 47 15 40 43 64 4
## [24] 28 9 49 51 24 50 52 58 61 60 6 46 13 0 11 1 20 59 12 21 0 5 35
## [47] 18 17 26 29 27 30 0 14 62 0 7 31 41 33 32 55 2 22
Conduct the same conversions for the remaining data elements
#R2 Extraction
R2_O <- unlist(str_extract_all(chess_opp$R2, "[[:alnum:] ]{1,}"))
R2_O <- unlist(str_replace_all(R2_O, "\\w\\s..\\D","0"))
R2_O <- str_replace_all(R2_O, "[WLD]","")
R2_O <- str_trim(R2_O)
R2_O <- as.numeric(R2_O)
#R3 Extraction
R3_O <- unlist(str_extract_all(chess_opp$R3, "[[:alnum:] ]{1,}"))
R3_O <- unlist(str_replace_all(R3_O, "\\w\\s..\\D","0"))
R3_O <- str_replace_all(R3_O, "[WLD]","")
R3_O <- str_trim(R3_O)
R3_O <- as.numeric(R3_O)
#R4 Extraction
R4_O <- unlist(str_extract_all(chess_opp$R4, "[[:alnum:] ]{1,}"))
R4_O <- unlist(str_replace_all(R4_O, "\\w\\s..\\D","0"))
R4_O <- str_replace_all(R4_O, "[WLD]","")
R4_O <- str_trim(R4_O)
R4_O <- as.numeric(R4_O)
#R5 Extraction
R5_O <- unlist(str_extract_all(chess_opp$R5, "[[:alnum:] ]{1,}"))
R5_O <- unlist(str_replace_all(R5_O, "\\w\\s..\\D","0"))
R5_O <- str_replace_all(R5_O, "[WLD]","")
R5_O <- str_trim(R5_O)
R5_O <- as.numeric(R5_O)
#R6 Extraction
R6_O <- unlist(str_extract_all(chess_opp$R6, "[[:alnum:] ]{1,}"))
R6_O <- unlist(str_replace_all(R6_O, "\\w\\s..\\D","0"))
R6_O <- str_replace_all(R6_O, "[WLD]","")
R6_O <- str_trim(R6_O)
R6_O <- as.numeric(R6_O)
#R7 Extraction
R7_O <- unlist(str_extract_all(chess_opp$R7, "[[:alnum:] ]{1,}"))
R7_O <- unlist(str_replace_all(R7_O, "\\w\\s..\\D","0"))
R7_O <- str_replace_all(R7_O, "[WLD]","")
R7_O <- str_trim(R7_O)
R7_O <- as.numeric(R7_O)
chess_combined <- tibble(Player_Name,Player_State,Tot_Points,
Player_Pre,R1_O,R2_O,R3_O,R4_O,R5_O,
R6_O,R7_O)
glimpse(chess_combined)
## Observations: 64
## Variables: 11
## $ Player_Name <chr> "GARY HUA", "DAKSHESH DARURI", "ADITYA BAJAJ", "P...
## $ Player_State <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI", "...
## $ Tot_Points <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, 5.0,...
## $ Player_Pre <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 1641, 1...
## $ R1_O <dbl> 39, 63, 8, 23, 45, 34, 57, 3, 25, 16, 38, 42, 36,...
## $ R2_O <dbl> 21, 58, 61, 28, 37, 29, 46, 32, 18, 19, 56, 33, 2...
## $ R3_O <dbl> 18, 4, 25, 2, 12, 11, 13, 14, 59, 55, 6, 5, 7, 8,...
## $ R4_O <dbl> 14, 17, 21, 26, 13, 35, 11, 9, 8, 31, 7, 38, 5, 1...
## $ R5_O <dbl> 7, 16, 11, 5, 4, 10, 1, 47, 26, 6, 3, 0, 33, 27, ...
## $ R6_O <dbl> 12, 20, 13, 19, 14, 27, 9, 28, 7, 25, 34, 1, 3, 5...
## $ R7_O <dbl> 4, 7, 12, 1, 17, 21, 2, 19, 20, 18, 26, 3, 32, 31...
Create empty placeholders for PreRatingAverage, Average, TotCount & Denominator. Next, create a loop that will run the total length of the Player_Pre data element from the chess_combined tibble. The competitors data element pulls data from **R1_0:R7:_0** according to the loop.
PreRatingAverage <- vector()
Average <- numeric(0)
TotCount <- numeric(0)
Denominator <- numeric(0)
for (i in 1:length(chess_combined$Player_Pre))
{
competitors <- as.numeric(as.vector(chess_combined[i,5:11]))
#add competitors
TotCount <- sum(chess_combined[competitors,"Player_Pre"])
#exclude any number equal to 0
Denominator <- sum(chess_combined[i,c(5:11)]!=0)
Average <- TotCount/Denominator
PreRatingAverage <- append(PreRatingAverage,Average)
}
PreRatingAverage <- round(PreRatingAverage, digits = 0)
PreRatingAverage
## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
chess_final <- tibble(Player_Name,Player_State,Tot_Points,
Player_Pre,PreRatingAverage)
glimpse(chess_final)
## Observations: 64
## Variables: 5
## $ Player_Name <chr> "GARY HUA", "DAKSHESH DARURI", "ADITYA BAJAJ"...
## $ Player_State <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI...
## $ Tot_Points <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, ...
## $ Player_Pre <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 164...
## $ PreRatingAverage <dbl> 1605, 1469, 1564, 1574, 1501, 1519, 1372, 146...
chess_final
## # A tibble: 64 x 5
## Player_Name Player_State Tot_Points Player_Pre PreRatingAverage
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 GARY HUA ON 6.00 1794 1605
## 2 DAKSHESH DARURI MI 6.00 1553 1469
## 3 ADITYA BAJAJ MI 6.00 1384 1564
## 4 PATRICK H SCHILLING MI 5.50 1716 1574
## 5 HANSHI ZUO MI 5.50 1655 1501
## 6 HANSEN SONG OH 5.00 1686 1519
## 7 GARY DEE SWATHELL MI 5.00 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.00 1641 1468
## 9 STEFANO LEE ON 5.00 1411 1523
## 10 ANVIT RAO MI 5.00 1365 1554
## # ... with 54 more rows
write.csv(chess_final, file = "ChessTournament.csv")