read csv and create a character vector
## Warning: package 'readr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: Missing column names filled in: 'X11' [11], 'X12' [12],
## 'X13' [13], 'X14' [14]
## Parsed with column specification:
## cols(
## `Num/State` = col_character(),
## `Name/Rating` = col_character(),
## Pts = col_double(),
## `Round 1` = col_character(),
## `Round 2` = col_character(),
## `Round 3` = col_character(),
## `Round 4` = col_character(),
## `Round 5` = col_character(),
## `Round 6` = col_character(),
## `Round 7` = col_character(),
## X11 = col_logical(),
## X12 = col_logical(),
## X13 = col_logical(),
## X14 = col_character()
## )
display structure
str(tournament_info_csv)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 128 obs. of 14 variables:
## $ Num/State : chr "1" "ON" "2" "MI" ...
## $ Name/Rating: chr "GARY HUA" "15445895 / R: 1794 ->1817" "DAKSHESH DARURI" "14598900 / R: 1553 ->1663" ...
## $ Pts : num 6 NA 6 NA 6 NA 5.5 NA 5.5 NA ...
## $ Round 1 : chr "W 39" NA "W 63" NA ...
## $ Round 2 : chr "W 21" NA "W 58" NA ...
## $ Round 3 : chr "W 18" NA "L 4" NA ...
## $ Round 4 : chr "W 14" NA "W 17" NA ...
## $ Round 5 : chr "W 7" NA "W 16" NA ...
## $ Round 6 : chr "D 12" NA "W 20" NA ...
## $ Round 7 : chr "D 4" NA "W 7" NA ...
## $ X11 : logi NA NA NA NA NA NA ...
## $ X12 : logi NA NA NA NA NA NA ...
## $ X13 : logi NA NA NA NA NA NA ...
## $ X14 : chr NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. `Num/State` = col_character(),
## .. `Name/Rating` = col_character(),
## .. Pts = col_double(),
## .. `Round 1` = col_character(),
## .. `Round 2` = col_character(),
## .. `Round 3` = col_character(),
## .. `Round 4` = col_character(),
## .. `Round 5` = col_character(),
## .. `Round 6` = col_character(),
## .. `Round 7` = col_character(),
## .. X11 = col_logical(),
## .. X12 = col_logical(),
## .. X13 = col_logical(),
## .. X14 = col_character()
## .. )
display first ten elements of num/state column
head(tournament_info_csv$`Num/State`)
## [1] "1" "ON" "2" "MI" "3" "MI"
create a vector of player number
player_id <- grep(pattern = '[0-9]+',
tournament_info_csv$`Num/State`, value = TRUE)
create a vector of player states
player_state <- grep(pattern = '[A-Z]+',
tournament_info_csv$`Num/State`, value = TRUE)
display first ten rows of each
head(player_id)
## [1] "1" "2" "3" "4" "5" "6"
head(player_state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
create a dataframe of player id & his state
converted the player id to numeric and allowed the state
to be factors since they are categorical
player_df <- data.frame(player_id, player_state)
display details on df
str(player_df)
## 'data.frame': 64 obs. of 2 variables:
## $ player_id : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
## $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
table(player_df)
## player_state
## player_id MI OH ON
## 1 0 0 1
## 10 1 0 0
## 11 1 0 0
## 12 1 0 0
## 13 1 0 0
## 14 1 0 0
## 15 1 0 0
## 16 1 0 0
## 17 1 0 0
## 18 1 0 0
## 19 1 0 0
## 2 1 0 0
## 20 1 0 0
## 21 0 0 1
## 22 1 0 0
## 23 0 0 1
## 24 1 0 0
## 25 1 0 0
## 26 0 0 1
## 27 1 0 0
## 28 1 0 0
## 29 1 0 0
## 3 1 0 0
## 30 0 0 1
## 31 1 0 0
## 32 0 0 1
## 33 1 0 0
## 34 1 0 0
## 35 1 0 0
## 36 1 0 0
## 37 1 0 0
## 38 1 0 0
## 39 1 0 0
## 4 1 0 0
## 40 1 0 0
## 41 1 0 0
## 42 1 0 0
## 43 1 0 0
## 44 1 0 0
## 45 1 0 0
## 46 1 0 0
## 47 1 0 0
## 48 1 0 0
## 49 1 0 0
## 5 1 0 0
## 50 1 0 0
## 51 1 0 0
## 52 1 0 0
## 53 1 0 0
## 54 1 0 0
## 55 1 0 0
## 56 1 0 0
## 57 1 0 0
## 58 1 0 0
## 59 1 0 0
## 6 0 1 0
## 60 1 0 0
## 61 0 0 1
## 62 1 0 0
## 63 1 0 0
## 64 1 0 0
## 7 1 0 0
## 8 1 0 0
## 9 0 0 1
head(player_df)
## player_id player_state
## 1 1 ON
## 2 2 MI
## 3 3 MI
## 4 4 MI
## 5 5 MI
## 6 6 OH
create a new vector of player points and add it as a column to
the existing player df
player_pts <- grep(pattern = '^[0-9]',
tournament_info_csv$Pts, value = TRUE)
head(player_pts)
## [1] "6" "6" "6" "5.5" "5.5" "5"
str(player_pts)
## chr [1:64] "6" "6" "6" "5.5" "5.5" "5" "5" "5" "5" "5" "4.5" "4.5" ...
player_df$player_pts <- as.numeric(player_pts)
colnames(player_df)
## [1] "player_id" "player_state" "player_pts"
str(player_df)
## 'data.frame': 64 obs. of 3 variables:
## $ player_id : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
## $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ player_pts : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
create a new vector of player names and add it as a column to
the existing player df
player_name <- grep(pattern = '^[A-Z]+',
tournament_info_csv$`Name/Rating`, value=TRUE)
player_df$player_name <- player_name
player_df <- player_df[, c(1,4,2,3)]
str(player_df)
## 'data.frame': 64 obs. of 4 variables:
## $ player_id : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
## $ player_name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ player_pts : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
create a new vector of player ratings and add it as a column to
the existing player df
player_rating <- str_extract(tournament_info_csv$`Name/Rating`,
"(:\\s{1,2}[0-9]{1,4})")
remove NAs
str(player_rating)
## chr [1:128] NA ": 1794" NA ": 1553" NA ": 1384" NA ": 1716" NA ...
player_rating <- player_rating[!is.na(player_rating)]
str(player_rating)
## chr [1:64] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655" ": 1686" ...
remove ‘:’
player_rating <- str_replace(player_rating, pattern = ": ", "")
str(player_rating)
## chr [1:64] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" ...
player_df$player_rating <- as.numeric(player_rating)
create a new vector of player round1 opponents and add it as a column to
the existing player df
player_opp1 <- str_extract(tournament_info_csv$`Round 1`,
"([B]|[H]|[A-Z]\\s+[0-9]{1,2})")
player_opp1 <- player_opp1[!is.na(player_opp1)]
player_opp1 <- str_replace(player_opp1, pattern = "[A-Z]\\s{2,3}", "")
player_df$round1 <- player_opp1
create a new vector of player round2 opponents and add it as a column to
the existing player df
player_opp2 <- str_extract(tournament_info_csv$`Round 2`,
"([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp2 <- player_opp2[!is.na(player_opp2)]
player_opp2 <- str_replace(player_opp2, pattern = "[A-Z]\\s{2,3}", "")
player_df$round2 <- player_opp2
create a new vector of player round3 opponents and add it as a column to
the existing player df
player_opp3 <- str_extract(tournament_info_csv$`Round 3`,
"([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp3 <- player_opp3[!is.na(player_opp3)]
player_opp3 <- str_replace(player_opp3, pattern = "[A-Z]\\s{2,3}", "")
player_df$round3 <- player_opp3
create a new vector of player round4 opponents and add it as a column to
the existing player df
player_opp4 <- str_extract(tournament_info_csv$`Round 4`,
"([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp4 <- player_opp4[!is.na(player_opp4)]
player_opp4 <- str_replace(player_opp4, pattern = "[A-Z]\\s{2}", "")
player_df$round4 <- player_opp4
create a new vector of player round5 opponents and add it as a column to
the existing player df
player_opp5 <- str_extract(tournament_info_csv$`Round 5`,
"([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp5 <- player_opp5[!is.na(player_opp5)]
player_opp5 <- str_replace(player_opp5, pattern = "[A-Z]\\s{1,3}", "")
player_df$round5 <- player_opp5
create a new vector of player round6 opponents and add it as a column to
the existing player df
player_opp6 <- str_extract(tournament_info_csv$`Round 6`,
"([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp6 <- player_opp6[!is.na(player_opp6)]
player_opp6 <- str_replace(player_opp6, pattern = "[A-Z]\\s{1,3}", "")
player_df$round6 <- player_opp6
create a new vector of player round7 opponents and add it as a column to
the existing player df
player_opp7 <- str_extract(tournament_info_csv$`Round 7`,
"([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp7 <- player_opp7[!is.na(player_opp7)]
player_opp7 <- str_replace(player_opp7, pattern = "[A-Z]\\s{1,3}", "")
player_df$round7 <- player_opp7
display complete data frame
head(player_df)
## player_id player_name player_state player_pts player_rating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
## round1 round2 round3 round4 round5 round6 round7
## 1 39 21 18 14 7 12 4
## 2 63 58 4 17 16 20 7
## 3 8 61 25 21 11 13 12
## 4 23 28 2 26 5 19 1
## 5 45 37 12 13 4 14 17
## 6 34 29 11 35 10 27 21
tail(player_df)
## player_id player_name player_state player_pts player_rating
## 59 59 SEAN M MC CORMICK MI 2.0 853
## 60 60 JULIA SHEN MI 1.5 967
## 61 61 JEZZEL FARKAS ON 1.5 955
## 62 62 ASHWIN BALAJI MI 1.0 1530
## 63 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 64 BEN LI MI 1.0 1163
## round1 round2 round3 round4 round5 round6 round7
## 59 41 B 9 40 43 54 44
## 60 33 34 45 42 24 H U
## 61 32 3 54 47 42 30 37
## 62 55 U U U U U U
## 63 2 48 49 43 45 H U
## 64 22 30 31 49 46 42 54
create a tibble to replace oppet numbers with their rating
player_tb <- as_tibble(player_df)
for (row in 1:nrow(player_df)) {
if (player_df$round1[row] < 'A') {
opp_tb <- filter(player_tb, player_id==player_df$round1[row])
player_df$round1[row] <- opp_tb$player_rating
}
}
player_df$round1
## [1] "1436" "1175" "1641" "1363" "1242" "1399" "1092" "1384" "1745" "1604"
## [11] "1423" "1332" "1355" "1270" "1564" "1365" "1382" "1362" "1220" "1348"
## [21] "1283" "1163" "1716" "1507" "1411" "1291" "1011" "1229" "1056" "935"
## [31] "917" "955" "967" "1686" "377" "1666" "B" "1712" "1794" "1595"
## [41] "853" "1663" "1563" "B" "1655" "1438" "1600" "1629" "1579" "1602"
## [51] "1552" "1522" "H" "1610" "1530" "H" "1649" "1494" "1403" "1449"
## [61] "1441" "1186" "1553" "1555"
player_tb <- as_tibble(player_df)
player_tb
## # A tibble: 64 x 12
## player_id player_name player_state player_pts player_rating round1
## <fct> <chr> <fct> <dbl> <dbl> <chr>
## 1 1 GARY HUA ON 6 1794 1436
## 2 2 DAKSHESH D~ MI 6 1553 1175
## 3 3 ADITYA BAJ~ MI 6 1384 1641
## 4 4 PATRICK H ~ MI 5.5 1716 1363
## 5 5 HANSHI ZUO MI 5.5 1655 1242
## 6 6 HANSEN SONG OH 5 1686 1399
## 7 7 GARY DEE S~ MI 5 1649 1092
## 8 8 EZEKIEL HO~ MI 5 1641 1384
## 9 9 STEFANO LEE ON 5 1411 1745
## 10 10 ANVIT RAO MI 5 1365 1604
## # ... with 54 more rows, and 6 more variables: round2 <chr>, round3 <chr>,
## # round4 <chr>, round5 <chr>, round6 <chr>, round7 <chr>