Data607 - Project1

read csv and create a character vector

## Warning: package 'readr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: Missing column names filled in: 'X11' [11], 'X12' [12],
## 'X13' [13], 'X14' [14]

## Parsed with column specification:
## cols(
##   `Num/State` = col_character(),
##   `Name/Rating` = col_character(),
##   Pts = col_double(),
##   `Round 1` = col_character(),
##   `Round 2` = col_character(),
##   `Round 3` = col_character(),
##   `Round 4` = col_character(),
##   `Round 5` = col_character(),
##   `Round 6` = col_character(),
##   `Round 7` = col_character(),
##   X11 = col_logical(),
##   X12 = col_logical(),
##   X13 = col_logical(),
##   X14 = col_character()
## )

display structure

str(tournament_info_csv)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 128 obs. of  14 variables:
##  $ Num/State  : chr  "1" "ON" "2" "MI" ...
##  $ Name/Rating: chr  "GARY HUA" "15445895 / R: 1794   ->1817" "DAKSHESH DARURI" "14598900 / R: 1553   ->1663" ...
##  $ Pts        : num  6 NA 6 NA 6 NA 5.5 NA 5.5 NA ...
##  $ Round 1    : chr  "W  39" NA "W  63" NA ...
##  $ Round 2    : chr  "W  21" NA "W  58" NA ...
##  $ Round 3    : chr  "W  18" NA "L   4" NA ...
##  $ Round 4    : chr  "W  14" NA "W  17" NA ...
##  $ Round 5    : chr  "W   7" NA "W  16" NA ...
##  $ Round 6    : chr  "D  12" NA "W  20" NA ...
##  $ Round 7    : chr  "D   4" NA "W   7" NA ...
##  $ X11        : logi  NA NA NA NA NA NA ...
##  $ X12        : logi  NA NA NA NA NA NA ...
##  $ X13        : logi  NA NA NA NA NA NA ...
##  $ X14        : chr  NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Num/State` = col_character(),
##   ..   `Name/Rating` = col_character(),
##   ..   Pts = col_double(),
##   ..   `Round 1` = col_character(),
##   ..   `Round 2` = col_character(),
##   ..   `Round 3` = col_character(),
##   ..   `Round 4` = col_character(),
##   ..   `Round 5` = col_character(),
##   ..   `Round 6` = col_character(),
##   ..   `Round 7` = col_character(),
##   ..   X11 = col_logical(),
##   ..   X12 = col_logical(),
##   ..   X13 = col_logical(),
##   ..   X14 = col_character()
##   .. )

display first ten elements of num/state column

head(tournament_info_csv$`Num/State`)

## [1] "1"  "ON" "2"  "MI" "3"  "MI"

create a vector of player number

player_id <- grep(pattern = '[0-9]+', 
             tournament_info_csv$`Num/State`, value = TRUE)

create a vector of player states

player_state <- grep(pattern = '[A-Z]+', 
             tournament_info_csv$`Num/State`, value = TRUE)

display first ten rows of each

head(player_id)

## [1] "1" "2" "3" "4" "5" "6"

head(player_state)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

create a dataframe of player id & his state

converted the player id to numeric and allowed the state

to be factors since they are categorical

player_df <- data.frame(player_id, player_state)

display details on df

str(player_df)

## 'data.frame':    64 obs. of  2 variables:
##  $ player_id   : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
##  $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...

table(player_df)

##          player_state
## player_id MI OH ON
##        1   0  0  1
##        10  1  0  0
##        11  1  0  0
##        12  1  0  0
##        13  1  0  0
##        14  1  0  0
##        15  1  0  0
##        16  1  0  0
##        17  1  0  0
##        18  1  0  0
##        19  1  0  0
##        2   1  0  0
##        20  1  0  0
##        21  0  0  1
##        22  1  0  0
##        23  0  0  1
##        24  1  0  0
##        25  1  0  0
##        26  0  0  1
##        27  1  0  0
##        28  1  0  0
##        29  1  0  0
##        3   1  0  0
##        30  0  0  1
##        31  1  0  0
##        32  0  0  1
##        33  1  0  0
##        34  1  0  0
##        35  1  0  0
##        36  1  0  0
##        37  1  0  0
##        38  1  0  0
##        39  1  0  0
##        4   1  0  0
##        40  1  0  0
##        41  1  0  0
##        42  1  0  0
##        43  1  0  0
##        44  1  0  0
##        45  1  0  0
##        46  1  0  0
##        47  1  0  0
##        48  1  0  0
##        49  1  0  0
##        5   1  0  0
##        50  1  0  0
##        51  1  0  0
##        52  1  0  0
##        53  1  0  0
##        54  1  0  0
##        55  1  0  0
##        56  1  0  0
##        57  1  0  0
##        58  1  0  0
##        59  1  0  0
##        6   0  1  0
##        60  1  0  0
##        61  0  0  1
##        62  1  0  0
##        63  1  0  0
##        64  1  0  0
##        7   1  0  0
##        8   1  0  0
##        9   0  0  1

head(player_df)

##   player_id player_state
## 1         1           ON
## 2         2           MI
## 3         3           MI
## 4         4           MI
## 5         5           MI
## 6         6           OH

create a new vector of player points and add it as a column to

the existing player df

player_pts <- grep(pattern = '^[0-9]', 
                   tournament_info_csv$Pts, value = TRUE)
head(player_pts)

## [1] "6"   "6"   "6"   "5.5" "5.5" "5"

str(player_pts)

##  chr [1:64] "6" "6" "6" "5.5" "5.5" "5" "5" "5" "5" "5" "4.5" "4.5" ...

player_df$player_pts <- as.numeric(player_pts)
colnames(player_df)

## [1] "player_id"    "player_state" "player_pts"

str(player_df)

## 'data.frame':    64 obs. of  3 variables:
##  $ player_id   : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
##  $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ player_pts  : num  6 6 6 5.5 5.5 5 5 5 5 5 ...

create a new vector of player names and add it as a column to

the existing player df

player_name <- grep(pattern = '^[A-Z]+', 
                    tournament_info_csv$`Name/Rating`, value=TRUE)
player_df$player_name <- player_name
player_df <- player_df[, c(1,4,2,3)]
str(player_df)

## 'data.frame':    64 obs. of  4 variables:
##  $ player_id   : Factor w/ 64 levels "1","10","11",..: 1 12 23 34 45 56 62 63 64 2 ...
##  $ player_name : chr  "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
##  $ player_state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ player_pts  : num  6 6 6 5.5 5.5 5 5 5 5 5 ...

create a new vector of player ratings and add it as a column to

the existing player df

player_rating <-  str_extract(tournament_info_csv$`Name/Rating`,
                      "(:\\s{1,2}[0-9]{1,4})")

remove NAs

str(player_rating)

##  chr [1:128] NA ": 1794" NA ": 1553" NA ": 1384" NA ": 1716" NA ...

player_rating <- player_rating[!is.na(player_rating)]
str(player_rating)

##  chr [1:64] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655" ": 1686" ...

remove ‘:’

player_rating <- str_replace(player_rating, pattern = ": ", "")
str(player_rating)

##  chr [1:64] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" ...

player_df$player_rating <- as.numeric(player_rating)

create a new vector of player round1 opponents and add it as a column to

the existing player df

player_opp1 <-  str_extract(tournament_info_csv$`Round 1`,
                "([B]|[H]|[A-Z]\\s+[0-9]{1,2})")
player_opp1 <- player_opp1[!is.na(player_opp1)]
player_opp1 <- str_replace(player_opp1, pattern = "[A-Z]\\s{2,3}", "")
player_df$round1 <- player_opp1

create a new vector of player round2 opponents and add it as a column to

the existing player df

player_opp2 <-  str_extract(tournament_info_csv$`Round 2`,
                         "([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp2 <- player_opp2[!is.na(player_opp2)]
player_opp2 <- str_replace(player_opp2, pattern = "[A-Z]\\s{2,3}", "")
player_df$round2 <- player_opp2

create a new vector of player round3 opponents and add it as a column to

the existing player df

player_opp3 <-  str_extract(tournament_info_csv$`Round 3`,
                            "([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp3 <- player_opp3[!is.na(player_opp3)]
player_opp3 <- str_replace(player_opp3, pattern = "[A-Z]\\s{2,3}", "")
player_df$round3 <- player_opp3

create a new vector of player round4 opponents and add it as a column to

the existing player df

player_opp4 <-  str_extract(tournament_info_csv$`Round 4`,
                            "([B]|[H]|[U]|[A-Z]\\s+[0-9]{1,2})")
player_opp4 <- player_opp4[!is.na(player_opp4)]
player_opp4 <- str_replace(player_opp4, pattern = "[A-Z]\\s{2}", "")
player_df$round4 <- player_opp4

create a new vector of player round5 opponents and add it as a column to

the existing player df

player_opp5 <-  str_extract(tournament_info_csv$`Round 5`,
                     "([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp5 <- player_opp5[!is.na(player_opp5)]
player_opp5 <- str_replace(player_opp5, pattern = "[A-Z]\\s{1,3}", "")
player_df$round5 <- player_opp5

create a new vector of player round6 opponents and add it as a column to

the existing player df

player_opp6 <-  str_extract(tournament_info_csv$`Round 6`,
                            "([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp6 <- player_opp6[!is.na(player_opp6)]
player_opp6 <- str_replace(player_opp6, pattern = "[A-Z]\\s{1,3}", "")
player_df$round6 <- player_opp6

create a new vector of player round7 opponents and add it as a column to

the existing player df

player_opp7 <-  str_extract(tournament_info_csv$`Round 7`,
                            "([B]|[H]|[U]|[X]|[A-Z]\\s+[0-9]{1,2})")
player_opp7 <- player_opp7[!is.na(player_opp7)]
player_opp7 <- str_replace(player_opp7, pattern = "[A-Z]\\s{1,3}", "")
player_df$round7 <- player_opp7

display complete data frame

head(player_df)

##   player_id         player_name player_state player_pts player_rating
## 1         1            GARY HUA           ON        6.0          1794
## 2         2     DAKSHESH DARURI           MI        6.0          1553
## 3         3        ADITYA BAJAJ           MI        6.0          1384
## 4         4 PATRICK H SCHILLING           MI        5.5          1716
## 5         5          HANSHI ZUO           MI        5.5          1655
## 6         6         HANSEN SONG           OH        5.0          1686
##   round1 round2 round3 round4 round5 round6 round7
## 1     39     21     18     14      7     12      4
## 2     63     58      4     17     16     20      7
## 3      8     61     25     21     11     13     12
## 4     23     28      2     26      5     19      1
## 5     45     37     12     13      4     14     17
## 6     34     29     11     35     10     27     21

tail(player_df)

##    player_id          player_name player_state player_pts player_rating
## 59        59    SEAN M MC CORMICK           MI        2.0           853
## 60        60           JULIA SHEN           MI        1.5           967
## 61        61        JEZZEL FARKAS           ON        1.5           955
## 62        62        ASHWIN BALAJI           MI        1.0          1530
## 63        63 THOMAS JOSEPH HOSMER           MI        1.0          1175
## 64        64               BEN LI           MI        1.0          1163
##    round1 round2 round3 round4 round5 round6 round7
## 59     41      B      9     40     43     54     44
## 60     33     34     45     42     24      H      U
## 61     32      3     54     47     42     30     37
## 62     55      U      U      U      U      U      U
## 63      2     48     49     43     45      H      U
## 64     22     30     31     49     46     42     54

create a tibble to replace oppet numbers with their rating

player_tb <- as_tibble(player_df)
for (row in 1:nrow(player_df)) {
  if (player_df$round1[row] < 'A') {
    opp_tb <- filter(player_tb, player_id==player_df$round1[row])
    player_df$round1[row] <- opp_tb$player_rating
  }
}
player_df$round1

##  [1] "1436" "1175" "1641" "1363" "1242" "1399" "1092" "1384" "1745" "1604"
## [11] "1423" "1332" "1355" "1270" "1564" "1365" "1382" "1362" "1220" "1348"
## [21] "1283" "1163" "1716" "1507" "1411" "1291" "1011" "1229" "1056" "935" 
## [31] "917"  "955"  "967"  "1686" "377"  "1666" "B"    "1712" "1794" "1595"
## [41] "853"  "1663" "1563" "B"    "1655" "1438" "1600" "1629" "1579" "1602"
## [51] "1552" "1522" "H"    "1610" "1530" "H"    "1649" "1494" "1403" "1449"
## [61] "1441" "1186" "1553" "1555"

player_tb <- as_tibble(player_df)
player_tb

## # A tibble: 64 x 12
##    player_id player_name player_state player_pts player_rating round1
##    <fct>     <chr>       <fct>             <dbl>         <dbl> <chr> 
##  1 1         GARY HUA    ON                  6            1794 1436  
##  2 2         DAKSHESH D~ MI                  6            1553 1175  
##  3 3         ADITYA BAJ~ MI                  6            1384 1641  
##  4 4         PATRICK H ~ MI                  5.5          1716 1363  
##  5 5         HANSHI ZUO  MI                  5.5          1655 1242  
##  6 6         HANSEN SONG OH                  5            1686 1399  
##  7 7         GARY DEE S~ MI                  5            1649 1092  
##  8 8         EZEKIEL HO~ MI                  5            1641 1384  
##  9 9         STEFANO LEE ON                  5            1411 1745  
## 10 10        ANVIT RAO   MI                  5            1365 1604  
## # ... with 54 more rows, and 6 more variables: round2 <chr>, round3 <chr>,
## #   round4 <chr>, round5 <chr>, round6 <chr>, round7 <chr>

Data607 - Project1

Vanita Hackworth

February , 2019

read csv and create a character vector

display structure

display first ten elements of num/state column

create a vector of player number

create a vector of player states

display first ten rows of each

create a dataframe of player id & his state

converted the player id to numeric and allowed the state

to be factors since they are categorical

display details on df

create a new vector of player points and add it as a column to

the existing player df

create a new vector of player names and add it as a column to

the existing player df

create a new vector of player ratings and add it as a column to

the existing player df

remove NAs

remove ‘:’

create a new vector of player round1 opponents and add it as a column to

the existing player df

create a new vector of player round2 opponents and add it as a column to

the existing player df

create a new vector of player round3 opponents and add it as a column to

the existing player df

create a new vector of player round4 opponents and add it as a column to

the existing player df

create a new vector of player round5 opponents and add it as a column to

the existing player df

create a new vector of player round6 opponents and add it as a column to

the existing player df

create a new vector of player round7 opponents and add it as a column to

the existing player df

display complete data frame

create a tibble to replace oppet numbers with their rating