I used the same chess text data file suggested in the project 1 page on blackboard (so that the data is available to whoever going to run the code again).
I’m first going to try to clean the data and turn into into a structured form.
# structuring the data
library(stringr)
# indicating the separator lines
separator_indices <- grep("^-+$", lines)
player_data_indices <- c()
for (i in seq_along(separator_indices)[-length(separator_indices)]) {
player_data_indices <- c(player_data_indices, (separator_indices[i] + 1), (separator_indices[i] + 2))
}
# extracting data lines
player_data_lines <- lines[player_data_indices]
# separating the lines into two columns
player_data <- data.frame(
`Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|` = player_data_lines[c(TRUE, FALSE)],
`Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |` = player_data_lines[c(FALSE, TRUE)],
stringsAsFactors = FALSE
)
# splitting a column into multiple columns
split_into_columns <- function(column, n) {
do.call(rbind, strsplit(trimws(column), "\\s*\\|\\s*", perl = TRUE))
}
player_data_first_half <- split_into_columns(player_data$`Pair...Player.Name..Total.Round.Round.Round.Round.Round.Round.Round.`, 10)
player_data_second_half <- split_into_columns(player_data$`Num...USCF.ID...Rtg..Pre..Post....Pts...1...2...3...4...5...6...7..`, 10)
# combining into a single df
combined_player_data <- cbind(
as.data.frame(player_data_first_half, stringsAsFactors = FALSE),
as.data.frame(player_data_second_half, stringsAsFactors = FALSE)
)
# adding column names
colnames(combined_player_data) <- c(
"PairNum", "PlayerName", "TotalPoints", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7",
"State", "USCF_ID, PreRating, PostRating", "Pts", "Game1", "Game2", "Game3", "Game4", "Game5", "Game6", "Game7"
)
# Removing the first row that has the labels already used
combined_player_data <- combined_player_data[-1, ]
# separating the column "USCF_ID_PreRating_PostRating" into the respective new columns:
library(tidyr)
combined_player_data <- separate(combined_player_data,
col = `USCF_ID, PreRating, PostRating`,
into = c("USCF_ID", "PreRating", "PostRating"),
sep = " / | ?->",
remove = TRUE,
convert = TRUE)
# fixing some issues with the PreRating column
# removing the 'R: ' from the PreRating column
combined_player_data$PreRating <- gsub("R: ", "", combined_player_data$PreRating)
########### added in revised submission: there is a P letter in some of the PreRating as well as PostRating cells that is causing some of the cells to not be taken into account when using this variable. So I will remove this P letter and the two digits that follow it.
combined_player_data$PreRating <- as.numeric(sub("P\\d{1,2}", "", combined_player_data$PreRating))
combined_player_data$PostRating <- as.numeric(sub("P\\d{1,2}", "", combined_player_data$PostRating))
### NO NEED FOR THIS PART ANYMORE
# extract the missing values into the PostRating column
#combined_player_data$PostRating <- ifelse(is.na(combined_player_data$PostRating),
# as.character(sapply(combined_player_data$PreRating, function(x) {
# matches <- regmatches(x, regexec("->(\\d+P\\d+)", x))
# if (length(matches[[1]]) > 1) {
# return(matches[[1]][2])
# } else {
# return(NA)
# }
# })),
# combined_player_data$PostRating)
# removing '->' from PreRating column if it exists
#combined_player_data$PreRating <- gsub("->.*", "", combined_player_data$PreRating)
# making sure the PreRating and PostRating columns are numeric
#combined_player_data$PreRating <- as.numeric(combined_player_data$PreRating)
#combined_player_data$PostRating <- as.numeric(gsub("P\\d+", "", combined_player_data$PostRating))
head(combined_player_data)
## PairNum PlayerName TotalPoints Round1 Round2 Round3 Round4 Round5
## 2 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 4 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 5 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 6 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## 7 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10
## Round6 Round7 State USCF_ID PreRating PostRating Pts Game1 Game2 Game3 Game4
## 2 D 12 D 4 ON 15445895 1794 1817 N:2 W B W B
## 3 W 20 W 7 MI 14598900 1553 1663 N:2 B W B W
## 4 W 13 W 12 MI 14959604 1384 1640 N:2 W B W B
## 5 W 19 D 1 MI 12616049 1716 1744 N:2 W B W B
## 6 W 14 W 17 MI 14601533 1655 1690 N:2 B W B W
## 7 W 27 W 21 OH 15055204 1686 1687 N:3 W B W B
## Game5 Game6 Game7
## 2 W B W
## 3 B W B
## 4 W B W
## 5 W B B
## 6 B W B
## 7 B W B
Now I can see that I have 64 players’ data, each player’s data is in a separate row.
Let’s explore the data a bit more, before thinking about further steps to clean it further (as it clearly still needs some cleaning, especially for the )
# checking if there are any duplicates in the PlayerNames column
duplicates <- combined_player_data[duplicated(combined_player_data$PlayerName) | duplicated(combined_player_data$PlayerName, fromLast = TRUE), ]
print(paste("Number of duplicates in PlayerName:", nrow(duplicates)))
## [1] "Number of duplicates in PlayerName: 0"
# I found 0 duplicates
# summary statistics of the TotalPoints, PreRating, and PostRating variables
summary_statistics <- summary(combined_player_data[, c("TotalPoints", "PreRating", "PostRating")])
print(summary_statistics)
## TotalPoints PreRating PostRating
## Length:64 Min. : 377 Min. : 878
## Class :character 1st Qu.:1227 1st Qu.:1200
## Mode :character Median :1407 Median :1418
## Mean :1378 Mean :1398
## 3rd Qu.:1583 3rd Qu.:1578
## Max. :1794 Max. :1817
Let’s see if there is a correlation between the PreRating and the PostRating scores for a player:
cor_test_result <- cor.test(combined_player_data$PreRating, combined_player_data$PostRating,
method = "pearson", use = "complete.obs")
p_value <- cor_test_result$p.value
conf_int <- cor_test_result$conf.int
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: combined_player_data$PreRating and combined_player_data$PostRating
## t = 17.604, df = 62, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8600145 0.9463167
## sample estimates:
## cor
## 0.9128481
# and a graph to visualize that
library(ggplot2)
plot <- ggplot(combined_player_data, aes(x = PreRating, y = PostRating)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Correlation between PreRating and PostRating",
x = "PreRating", y = "PostRating") +
annotate("text", x = Inf, y = Inf, hjust = 1, vjust = 3,
label = paste("Pearson r:", round(cor_test_result$estimate, 3),
"\np-value:", signif(p_value, digits = 3),
"\n95% CI:", paste("(", round(conf_int[1], 3), ",", round(conf_int[2], 3), ")", sep = "")),
size = 3.5) +
theme_minimal()
print(plot)
Looks like there is a VERY STRONG correlation between the two scores, which makes sense.
Now: There are 7 columns named Round1 all the way to Round7. Each of these variables has a latter (W for a win, L for a loss, and D for a draw), along with a number. The number is key because it refers to the opponent who played against each person (the same number refers to the PairNum variable). So for example, for Round1 of Gary Hua, he played against Joel R Hendon in PairNum 39, etc.
I will first separate each of the Round1-Round7 variables into “Round1_result” and Round1_Opponent” which will take the letter and the number, respectively. Then I will use the Round1_Opponent so I can Create a new variable labelled Round1_Opponent_PreRating” which will be the PreRating variable value for the Opponent, but this new variable will be for each row for each Round.
# separating the Round columns into result and opponent's PairNum
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
for(i in 1:7) {
round_col <- paste0("Round", i)
combined_player_data <- combined_player_data %>%
separate(col = round_col,
into = c(paste0(round_col, "_result"), paste0(round_col, "_Opponent")),
sep = "\\s(?=\\d)",
remove = TRUE,
convert = FALSE,
extra = "merge",
fill = "right")
}
# creating a lookup table for PairNum to PreRating
lookup_table <- combined_player_data %>% select(PairNum, PreRating)
# a function to lookup PreRating based on PairNum
get_opponent_prerating <- function(pair_num) {
opponent_prerating <- lookup_table %>%
filter(PairNum == pair_num) %>%
pull(PreRating)
if(length(opponent_prerating) == 0) return(NA)
return(opponent_prerating)
}
# finally, to create the RoundX_Opponent_PreRating variables
for(i in 1:7) {
combined_player_data <- combined_player_data %>%
mutate(!!paste0("Round", i, "_Opponent_PreRating") :=
mapply(get_opponent_prerating, .data[[paste0("Round", i, "_Opponent")]]))
}
Now calculating the opponents preratings for each player:
opponent_preRating_columns <- combined_player_data %>%
select(matches("Round\\d+_Opponent_PreRating"))
opponent_preRating_columns[] <- lapply(opponent_preRating_columns, function(x) as.numeric(as.character(x)))
combined_player_data$Average_Opponent_PreRating <- rowMeans(opponent_preRating_columns, na.rm = TRUE)
combined_player_data$Average_Opponent_PreRating <- round(combined_player_data$Average_Opponent_PreRating, 0)
# Show
average_pre_ratings <- combined_player_data$Average_Opponent_PreRating
average_pre_ratings
## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515 1484
## [16] 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522 1314 1144
## [31] 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150 1107 1327 1152
## [46] 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414 1363 1391 1319 1330
## [61] 1327 1186 1350 1263