My initial focus is on importing the text file and storing the relevant data in a data frame that can be used for additional pre-processing.
# Import the raw tournament report and reshape it into one row per player.
#
# Layout the parser relies on: every player occupies two consecutive
# pipe-delimited lines. The first line starts with the numeric pair number and
# carries the name, total points and one cell per round; the line right after
# it carries the state and the rating string. Header and separator lines never
# start with a number followed by "|", so they are skipped automatically.
data_string <- readLines(file_name, warn = FALSE)

# Locate players from the content itself instead of assuming 64 entries at
# fixed offsets, so reports with a different number of players still parse.
first_line_index <- grep("^[[:space:]]*[0-9]+[[:space:]]*[|]", data_string)
row1_fields <- str_split(data_string[first_line_index], "[|]")
row2_fields <- str_split(data_string[first_line_index + 1L], "[|]")

# Pull the k-th pipe-delimited field of every player (NA when it is absent).
nth_field <- function(fields, k) {
  vapply(fields, function(parts) parts[k], character(1))
}

# All parsed values are kept as raw, untrimmed strings at this stage.
# avg_opponent_rating is only a placeholder here; it is filled in later.
player_df <- tibble(
  pair_num = nth_field(row1_fields, 1L),
  player_name = nth_field(row1_fields, 2L),
  player_points = nth_field(row1_fields, 3L),
  player_state = nth_field(row2_fields, 1L),
  pre_rating = nth_field(row2_fields, 2L),
  round1_opponent = nth_field(row1_fields, 4L),
  round2_opponent = nth_field(row1_fields, 5L),
  round3_opponent = nth_field(row1_fields, 6L),
  round4_opponent = nth_field(row1_fields, 7L),
  round5_opponent = nth_field(row1_fields, 8L),
  round6_opponent = nth_field(row1_fields, 9L),
  round7_opponent = nth_field(row1_fields, 10L),
  avg_opponent_rating = rep(NA_real_, length(first_line_index))
)
In this data-cleaning chunk, I apply the pre-processing steps described in the comments below to the relevant columns:
# Tidy the whitespace in the five identifying columns: str_squish() trims both
# ends and collapses internal runs of whitespace to a single space.
player_df <- player_df %>%
  mutate(across(
    c(pair_num, player_name, player_points, player_state, pre_rating),
    str_squish
  ))
# Capture the player's pre-rating: first isolate the "R: <digits>" fragment,
# then keep only its digits. Both steps use str_extract(), which returns a
# plain character vector with NA where nothing matches. (The list returned by
# str_extract_all() would be coerced to text, turning a missing rating into
# the string "character(0)" and therefore into a bogus rating of "0".)
player_df <- player_df %>%
  mutate(
    pre_rating = str_extract(str_extract(pre_rating, "R:\\s+[\\d]+"), "[\\d]+")
  )
# Keep only the first run of digits in each of the seven round columns, i.e.
# the opponent's number; str_extract() yields NA for a cell without digits.
player_df <- player_df %>%
  mutate(across(
    all_of(paste0("round", 1:7, "_opponent")),
    function(cell) str_extract(cell, "[\\d]+")
  ))
For each player, this block of code calculates the average pre-tournament rating of that player's opponents.
# Average pre-rating of the opponents faced by one player.
#
# Arguments:
#   df_row  - a single row (data frame / tibble row or named list) holding the
#             player's `round<N>_opponent` columns, each an opponent pair
#             number or NA when there is no opponent for that round.
#   players - lookup table with `pair_num` and `pre_rating` columns. Defaults
#             to the global `player_df`, resolved when the function is called,
#             so existing one-argument calls keep working.
#
# Returns a single number: the mean of the matched opponents' pre-ratings.
# Rounds without an opponent, and opponent numbers that are not present in
# `players`, are ignored. The result is NaN when no opponent is matched.
calc_average <- function(df_row, players = player_df) {
  # Discover the round columns by name so any number of rounds is supported.
  round_cols <- grep("^round[0-9]+_opponent$", names(df_row), value = TRUE)
  opponent_ids <- unlist(df_row[round_cols], use.names = FALSE)

  # match() keeps round order; unmatched or NA ids give NA and are dropped,
  # mirroring a filter that simply returns no rows for them.
  opponent_rows <- match(opponent_ids, players$pair_num)
  opponent_rows <- opponent_rows[!is.na(opponent_rows)]

  mean(as.numeric(players$pre_rating[opponent_rows]))
}
# Fill avg_opponent_rating for every player with the rounded mean pre-rating
# of that player's opponents. seq_len() keeps this a no-op for an empty table
# (1:nrow() would iterate over c(1, 0)), and vapply() guarantees exactly one
# numeric value per row before the column is written in a single assignment.
player_df$avg_opponent_rating <- vapply(
  seq_len(nrow(player_df)),
  function(i) round(calc_average(player_df[i, ]), 0),
  numeric(1)
)
This block of code creates a simplified dataframe with only the requested fields and then exports it to a .csv file.
# Keep only the requested output columns, in report order, then preview the
# first rows as a formatted table.
simple_df <- select(
  player_df,
  player_name, player_state, player_points, pre_rating, avg_opponent_rating
)
knitr::kable(head(simple_df))
| player_name | player_state | player_points | pre_rating | avg_opponent_rating |
|---|---|---|---|---|
| GARY HUA | ON | 6.0 | 1794 | 1605 |
| DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| HANSEN SONG | OH | 5.0 | 1686 | 1519 |
# Export the simplified player table as CSV; the path is relative, so the file
# is written to the current working directory.
output_path <- 'KMartin_project1_output.csv'
write_csv(simple_df, output_path)