library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(stringr)

Overview: Project 1 Data 607

In this project, we take a text file

chess <- read_delim("https://raw.githubusercontent.com/evelynbartley/Data-607/main/Chess%20File", delim = "|", col_names = c("state", "name", "number", "pointTotal", "prerating", "avgOpp"), skip = 4, show_col_types = FALSE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Convert the file to a data frame
chess <- data.frame(chess)
# Remove the NA rows
chess <- subset(chess, chess$state != chess$state[3])

Here we use an iteration to create a data frame from the odd rows and a separate data frame for the even rows, and then horizontally merge those two data frames to create a data frame with 22 variables and 64 observations.

name_rows <- data.frame()
state_rows <- data.frame()
# Use recursion to create two data frames
for (i in 1:nrow(chess)) {
  this_row <- chess[i,]
  if (i %% 2 == 1) {
    name_rows <- rbind(name_rows, this_row)
  } 
  else {
    state_rows <- rbind(state_rows, this_row)
  }
}
#Combine the two data frames
chess1 = data.frame(name_rows, state_rows, check.names=T)

Delete the columns with “NA” and other irrelevant columns.

chess2 <- select(chess1, -c("X11","X11.1", "number.1", "pointTotal.1", "prerating.1", "avgOpp.1", "X7.1", "X8.1", "X9.1", "X10.1"))

Rename the columns.

chess3 <- 
rename(chess2,
    "PlayerID" = "state",
    "PlayerName" = "name",
    "TotalPoints" = "number",
    "Round1" = "pointTotal",
    "Round2" = "prerating",
    "Round3" = "avgOpp",
    "Round4" = "X7",
    "Round5" = "X8",
    "Round6" = "X9",
    "Round7" = "X10",
    "State" = "state.1",
    "Prerating" = "name.1"
)

Let’s make a separate column to isolate the prerating for each player:

chess4 <- separate_wider_delim(chess3, cols = "Prerating", delim = ":", names = "NewCol", too_many = "debug")
## Warning: Debug mode activated: adding variables `Prerating_ok`, `Prerating_pieces`, and
## `Prerating_remainder`.
chess4
## # A tibble: 64 × 16
##    PlayerID PlayerName     TotalPoints Round1 Round2 Round3 Round4 Round5 Round6
##    <chr>    <chr>          <chr>       <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
##  1 "    1 " " GARY HUA   … "6.0  "     W  39  W  21  W  18  W  14  W   7  D  12 
##  2 "    2 " " DAKSHESH DA… "6.0  "     W  63  W  58  L   4  W  17  W  16  W  20 
##  3 "    3 " " ADITYA BAJA… "6.0  "     L   8  W  61  W  25  W  21  W  11  W  13 
##  4 "    4 " " PATRICK H S… "5.5  "     W  23  D  28  W   2  W  26  D   5  W  19 
##  5 "    5 " " HANSHI ZUO … "5.5  "     W  45  W  37  D  12  D  13  D   4  W  14 
##  6 "    6 " " HANSEN SONG… "5.0  "     W  34  D  29  L  11  W  35  D  10  W  27 
##  7 "    7 " " GARY DEE SW… "5.0  "     W  57  W  46  W  13  W  11  L   1  W   9 
##  8 "    8 " " EZEKIEL HOU… "5.0  "     W   3  W  32  L  14  L   9  W  47  W  28 
##  9 "    9 " " STEFANO LEE… "5.0  "     W  25  L  18  W  59  W   8  W  26  L   7 
## 10 "   10 " " ANVIT RAO  … "5.0  "     D  16  L  19  W  55  W  31  D   6  W  25 
## # ℹ 54 more rows
## # ℹ 7 more variables: Round7 <chr>, State <chr>, NewCol <chr>, Prerating <chr>,
## #   Prerating_ok <lgl>, Prerating_pieces <int>, Prerating_remainder <chr>
chess5 <- separate_wider_delim(chess4, cols = "Prerating_remainder", delim = "->", names = "NewCol2", too_many = "debug")
## Warning: Debug mode activated: adding variables `Prerating_remainder_ok`,
## `Prerating_remainder_pieces`, and `Prerating_remainder_remainder`.

Here we get rid of the irrelevant columns.

chess6 <- select(chess5, -c("NewCol", "Prerating","Prerating_ok", "Prerating_pieces", "Prerating_remainder", "Prerating_remainder_ok", "Prerating_remainder_pieces", "Prerating_remainder_remainder"))

We rename the column that contains the preratings.

chess7 <- rename(chess6,
                 "PlayerPreRating" = "NewCol2")

We get rid of all the “:”’s that precede our preratings. And expand the players preratings that end with a “P” and a number.

chess7$PlayerPreRating <- gsub(":","",as.character(chess7$PlayerPreRating))

chess8 <- separate_wider_delim(chess7, cols = "PlayerPreRating", delim = "P", names = "PreRating", too_many = "debug")
## Warning: Debug mode activated: adding variables `PlayerPreRating_ok`,
## `PlayerPreRating_pieces`, and `PlayerPreRating_remainder`.

Clean it up.

chessF <- select(chess8, -c("PlayerPreRating", "PlayerPreRating_ok", "PlayerPreRating_pieces", "PlayerPreRating_remainder"))

Here we isolate the player ID’s for each round by removing the letter that represents the result of the round.

chessF$Round1 <- gsub('[A-Z]', "", chessF$Round1)
chessF$Round2 <- gsub('[A-Z]', "", chessF$Round2)
chessF$Round3 <- gsub('[A-Z]', "", chessF$Round3)
chessF$Round4 <- gsub('[A-Z]', "", chessF$Round4)
chessF$Round5 <- gsub('[A-Z]', "", chessF$Round5)
chessF$Round6 <- gsub('[A-Z]', "", chessF$Round6)
chessF$Round7 <- gsub('[A-Z]', "", chessF$Round7)

Now we want to replace each player id that shows in each round by the corresponding player id’s prerating.

I want to change every row with numbers to a numeric vector.

chessF$PlayerID = as.numeric(sub("[A-Z]", "", chessF$PlayerID))
chessF$Round1 = as.numeric(sub("[A-Z]", "", chessF$Round1))
chessF$Round2 = as.numeric(sub("[A-Z]", "", chessF$Round2))
chessF$Round3 = as.numeric(sub("[A-Z]", "", chessF$Round3))
chessF$Round4 = as.numeric(sub("[A-Z]", "", chessF$Round4))
chessF$Round5 = as.numeric(sub("[A-Z]", "", chessF$Round5))
chessF$Round6 = as.numeric(sub("[A-Z]", "", chessF$Round6))
chessF$Round7 = as.numeric(sub("[A-Z]", "", chessF$Round7))
chessF$PreRating = as.numeric(sub("[A-Z]", "", chessF$PreRating))

This for loop replaces the opponent’s ID with their prerating in Round 1.

for (i in chessF$Round1)
  {
  chessF$Round1[i] <- chessF$PreRating[chessF$Round1[i]]
}
for (i in chessF$Round2)
  {
  chessF$Round2[i] <- chessF$PreRating[chessF$Round2[i]]
}
for (i in chessF$Round3)
  {
  chessF$Round3[i] <- chessF$PreRating[chessF$Round3[i]]
}
for (i in chessF$Round4)
  {
  chessF$Round4[i] <- chessF$PreRating[chessF$Round4[i]]
}
for (i in chessF$Round5)
  {
  chessF$Round5[i] <- chessF$PreRating[chessF$Round5[i]]
}
for (i in chessF$Round6)
  {
  chessF$Round6[i] <- chessF$PreRating[chessF$Round6[i]]
}
for (i in chessF$Round7)
  {
  chessF$Round7[i] <- chessF$PreRating[chessF$Round7[i]]
}
head(chessF)
## # A tibble: 6 × 12
##   PlayerID PlayerName      TotalPoints Round1 Round2 Round3 Round4 Round5 Round6
##      <dbl> <chr>           <chr>        <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1        1 " GARY HUA    … "6.0  "       1436   1563   1600   1610   1649   1663
## 2        2 " DAKSHESH DAR… "6.0  "       1175    917   1716   1629   1604   1595
## 3        3 " ADITYA BAJAJ… "6.0  "       1641    955   1745   1563   1712   1666
## 4        4 " PATRICK H SC… "5.5  "       1363   1507   1553   1579   1655   1564
## 5        5 " HANSHI ZUO  … "5.5  "       1242    980   1663   1666   1716   1610
## 6        6 " HANSEN SONG … "5.0  "       1399   1602   1712   1438   1365   1552
## # ℹ 3 more variables: Round7 <dbl>, State <chr>, PreRating <dbl>

This line of code takes the seven opponents prerating numbers and take the average of each row and store them in a new column. (Since I was unable to replace the IDs with their corresponding prerating, my code just averages their ID numbers.)

chessF <- chessF |>
  rowwise() |>
  mutate('OppAvgPrerating' = mean(c(Round1, Round2, Round3, Round4, Round5, Round6, Round7), na.rm = TRUE))

chessF <- chessF |>
  rowwise() |>
  mutate('OppAvgPrerating' = round(OppAvgPrerating, digits = 0))

##Conclusion: I am left with a dataframe with information on Player’s Name, Total Points, State, PreRating, and Average Pre Chess Rating of Opponents for each player.

chessF <- select(chessF, -c("PlayerID", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7"))

head(chessF)
## # A tibble: 6 × 5
## # Rowwise: 
##   PlayerName                         TotalPoints State PreRating OppAvgPrerating
##   <chr>                              <chr>       <chr>     <dbl>           <dbl>
## 1 " GARY HUA                       … "6.0  "     "   …      1794            1605
## 2 " DAKSHESH DARURI                … "6.0  "     "   …      1553            1469
## 3 " ADITYA BAJAJ                   … "6.0  "     "   …      1384            1564
## 4 " PATRICK H SCHILLING            … "5.5  "     "   …      1716            1574
## 5 " HANSHI ZUO                     … "5.5  "     "   …      1655            1501
## 6 " HANSEN SONG                    … "5.0  "     "   …      1686            1519

Here is the .csv file:

write_csv(chessF, 'chessFinal.csv')