n this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played. If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth… The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments. You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script.
# Reading in the text file provided for assignment
path = path.expand("~/OneDrive/Desktop/CUNY_SPS_Materials/DATA607 - DATA ACQ/Project1/tornamentinfo.txt")
print(path)
## [1] "C:/Users/johnf/OneDrive/Desktop/CUNY_SPS_Materials/DATA607 - DATA ACQ/Project1/tornamentinfo.txt"
### alternative is to read from git
# alt_text <-read.table("https://raw.githubusercontent.com/jhnboyy/CUNYSPS_DATA607/refs/heads/main/Project1/tornamentinfo.txt",header = FALSE, sep = "\n", dec = ".")
# alt_text
## CITATION for read.table:http://www.sthda.com/english/wiki/reading-data-from-txt-csv-files-r-base-functions#google_vignette
raw_txt <- read.table(path, header = FALSE, sep = "\n", dec = ".")#, ...)
#Ctation for typeof: https://swcarpentry.github.io/r-novice-inflammation/13-supp-data-structures.html
typeof(raw_txt)
## [1] "list"
#Getting the total number of values so i can extract the needed info
max_index <- length(raw_txt[,1])
print(max_index)
## [1] 196
#Generating the index numbers that i need to parae
## Citation: Found the seq() command:https://r02pro.github.io/vector-patterns.html
dash_entries = seq(from = 1, to = max_index, by = 3)
# print(dash_entries)
no_dash <- raw_txt[-c(dash_entries),]
# print(no_dash)
no_dash_len <- length(no_dash)
# print(no_dash_len)
#Skipping the First two rows that have original raw headers
raw_data <- no_dash[3:no_dash_len]
# print(raw_data)
#Hardcording the original Structure of the file into an empty df
data_frame = data.frame(
PairNum_State = character(),
PlayerName_USCFID_RtgPrePost = character(),
TtlPts = character(),
Round1 = character(),
Round2 = character(),
Round3 = character(),
Round4 = character(),
Round5 = character(),
Round6 = character(),
Round7 = character())
# Using a for loop to go item by item to parse, strip and clean.
## For loop Citation : https://epirhandbook.com/new_pages/iteration.html
for (raw_row in raw_data) {
# Removing last pip before splitting
raw_row <-str_sub(raw_row, end= -1)
split <- strsplit(raw_row, "\\|")
temp_vec <-c()
for (s in split){
trimmed_s <- str_trim(s, side="left")
trimmed_s <- str_trim(trimmed_s, side="right")
temp_vec = c(temp_vec, trimmed_s)
}
data_frame[nrow(data_frame)+1,] <- temp_vec
}
#Checking the dataframe results
head(data_frame)
## PairNum_State PlayerName_USCFID_RtgPrePost TtlPts Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 4 MI 14598900 / R: 1553 ->1663 N:2 B W B W
## 5 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 6 MI 14959604 / R: 1384 ->1640 N:2 W B W B
## Round5 Round6 Round7
## 1 W 7 D 12 D 4
## 2 W B W
## 3 W 16 W 20 W 7
## 4 B W B
## 5 W 11 W 13 W 12
## 6 W B W
# Taking the rows that are the main ans secondary rows for each entry by index Number
regular_index = seq(from = 1, to = length(data_frame[,1]), by = 2)
#Split them into 2 separate dataframes and rename the columns in both to be proper
#Dealing with the "regular" or primary rows
regular_vals<-data_frame[c(regular_index),]
#Rename CItation: https://www.datanovia.com/en/lessons/rename-data-frame-columns-in-r/
regular_vals<-regular_vals %>%
rename(
PairNum = PairNum_State,
PlayerName=PlayerName_USCFID_RtgPrePost,
TtlPts_1=TtlPts,
Round1_1=Round1,
Round2_1=Round2,
Round3_1=Round3,
Round4_1=Round4,
Round5_1=Round5,
Round6_1=Round6,
Round7_1=Round7
)
#Using the Regular index as join column
regular_vals$join_col<-regular_index
#checking results
head(regular_vals)
## PairNum PlayerName TtlPts_1 Round1_1 Round2_1 Round3_1 Round4_1
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 5 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 7 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 9 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 11 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## Round5_1 Round6_1 Round7_1 join_col
## 1 W 7 D 12 D 4 1
## 3 W 16 W 20 W 7 3
## 5 W 11 W 13 W 12 5
## 7 D 5 W 19 D 1 7
## 9 D 4 W 14 W 17 9
## 11 D 10 W 27 W 21 11
## Secondary Lines that start with state vals
state_index = seq(from = 2, to = length(data_frame[,1]), by = 2)
state_vals <- data_frame[c(state_index),]
state_vals<-state_vals %>%
rename(
State=PairNum_State,
USCFID_RtgPrePost=PlayerName_USCFID_RtgPrePost,
TtlPts2=TtlPts,
sec_Round1=Round1,
Round2_2=Round2,
Round3_2=Round3,
Round4_2=Round4,
Round5_2=Round5,
Round6_2=Round6,
Round7_2=Round7
)
state_vals$join_col <- regular_index
head(state_vals)
## State USCFID_RtgPrePost TtlPts2 sec_Round1 Round2_2 Round3_2
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W
## 4 MI 14598900 / R: 1553 ->1663 N:2 B W B
## 6 MI 14959604 / R: 1384 ->1640 N:2 W B W
## 8 MI 12616049 / R: 1716 ->1744 N:2 W B W
## 10 MI 14601533 / R: 1655 ->1690 N:2 B W B
## 12 OH 15055204 / R: 1686 ->1687 N:3 W B W
## Round4_2 Round5_2 Round6_2 Round7_2 join_col
## 2 B W B W 1
## 4 W B W B 3
## 6 B W B W 5
## 8 B W B B 7
## 10 W B W B 9
## 12 B B W B 11
# Joining the two dfs
#Citation for merge: https://www.datanovia.com/en/lessons/rename-data-frame-columns-in-r/
all_data = merge(x = regular_vals, y = state_vals, by = "join_col")
head(all_data)
## join_col PairNum PlayerName TtlPts_1 Round1_1 Round2_1 Round3_1
## 1 1 1 GARY HUA 6.0 W 39 W 21 W 18
## 2 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 3 5 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 4 7 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 5 9 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 6 11 6 HANSEN SONG 5.0 W 34 D 29 L 11
## Round4_1 Round5_1 Round6_1 Round7_1 State USCFID_RtgPrePost TtlPts2
## 1 W 14 W 7 D 12 D 4 ON 15445895 / R: 1794 ->1817 N:2
## 2 W 17 W 16 W 20 W 7 MI 14598900 / R: 1553 ->1663 N:2
## 3 W 21 W 11 W 13 W 12 MI 14959604 / R: 1384 ->1640 N:2
## 4 W 26 D 5 W 19 D 1 MI 12616049 / R: 1716 ->1744 N:2
## 5 D 13 D 4 W 14 W 17 MI 14601533 / R: 1655 ->1690 N:2
## 6 W 35 D 10 W 27 W 21 OH 15055204 / R: 1686 ->1687 N:3
## sec_Round1 Round2_2 Round3_2 Round4_2 Round5_2 Round6_2 Round7_2
## 1 W B W B W B W
## 2 B W B W B W B
## 3 W B W B W B W
## 4 W B W B W B B
## 5 B W B W B W B
## 6 W B W B B W B
#Cleaning up the column with the prerating
all_data$pre_rating <- str_extract(all_data$USCFID_RtgPrePost, "R:\\s*(\\d+)(P\\d+)?\\s*->")
all_data$pre_rating <-str_replace_all(all_data$pre_rating ,"R:", "")
all_data$pre_rating <-str_replace_all(all_data$pre_rating ,"(\\s?->)|(P(\\d+))", "")
all_data <- all_data %>%
separate(Round1_1, c('Round1_Status', 'Round1_Oppnt')) %>%
separate(Round2_1, c('Round2_Status', 'Round2_Oppnt')) %>%
separate(Round3_1, c('Round3_Status', 'Round3_Oppnt')) %>%
separate(Round4_1, c('Round4_Status', 'Round4_Oppnt')) %>%
separate(Round5_1, c('Round5_Status', 'Round5_Oppnt')) %>%
separate(Round6_1, c('Round6_Status', 'Round6_Oppnt')) %>%
separate(Round7_1, c('Round7_Status', 'Round7_Oppnt'))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [37, 44,
## 53, 56].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [59, 62].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [16, 48,
## 53, 62].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [54, 62].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 12 rows [12, 22, 36, 37,
## 38, 41, 48, 50, 53, 55, 56, 62].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 6 rows [41, 49, 58, 60,
## 62, 63].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 10 rows [16, 27, 29, 41,
## 49, 53, 57, 60, 62, 63].
head(all_data)
## join_col PairNum PlayerName TtlPts_1 Round1_Status Round1_Oppnt
## 1 1 1 GARY HUA 6.0 W 39
## 2 3 2 DAKSHESH DARURI 6.0 W 63
## 3 5 3 ADITYA BAJAJ 6.0 L 8
## 4 7 4 PATRICK H SCHILLING 5.5 W 23
## 5 9 5 HANSHI ZUO 5.5 W 45
## 6 11 6 HANSEN SONG 5.0 W 34
## Round2_Status Round2_Oppnt Round3_Status Round3_Oppnt Round4_Status
## 1 W 21 W 18 W
## 2 W 58 L 4 W
## 3 W 61 W 25 W
## 4 D 28 W 2 W
## 5 W 37 D 12 D
## 6 D 29 L 11 W
## Round4_Oppnt Round5_Status Round5_Oppnt Round6_Status Round6_Oppnt
## 1 14 W 7 D 12
## 2 17 W 16 W 20
## 3 21 W 11 W 13
## 4 26 D 5 W 19
## 5 13 D 4 W 14
## 6 35 D 10 W 27
## Round7_Status Round7_Oppnt State USCFID_RtgPrePost TtlPts2
## 1 D 4 ON 15445895 / R: 1794 ->1817 N:2
## 2 W 7 MI 14598900 / R: 1553 ->1663 N:2
## 3 W 12 MI 14959604 / R: 1384 ->1640 N:2
## 4 D 1 MI 12616049 / R: 1716 ->1744 N:2
## 5 W 17 MI 14601533 / R: 1655 ->1690 N:2
## 6 W 21 OH 15055204 / R: 1686 ->1687 N:3
## sec_Round1 Round2_2 Round3_2 Round4_2 Round5_2 Round6_2 Round7_2 pre_rating
## 1 W B W B W B W 1794
## 2 B W B W B W B 1553
## 3 W B W B W B W 1384
## 4 W B W B W B B 1716
## 5 B W B W B W B 1655
## 6 W B W B B W B 1686
#Limiting to the columns that we need for the project mandate Plyer Data
player_data <- all_data[c("PairNum","PlayerName","State", "TtlPts_1","pre_rating")]
head(player_data)
## PairNum PlayerName State TtlPts_1 pre_rating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
# Limiting to the columns that need parsing for the scores
scoring_data <- all_data[c("PairNum","Round1_Oppnt","Round2_Oppnt","Round3_Oppnt",
"Round4_Oppnt","Round5_Oppnt","Round6_Oppnt","Round7_Oppnt")]
#### ---- START : DIDNT END UP NEEDING THIS SECTION BECAUSE OF THE na.rm feature. ----
# scoring_data_transposed <- t(scoring_data)
# head(scoring_data_transposed)
## Null counts to subrtract from 7 rounds for each play in ordinal fashion
##Citation: https://www.spsanderson.com/steveondata/posts/2024-05-07/#:~:text=We'll%20leverage%20the%20colSums,each%20column%20of%20a%20dataframe.&text=In%20this%20code%20snippet%2C%20is,count%20of%20NAs%20per%20column.
# na_counts <- colSums(is.na(scoring_data_transposed))
# print(na_counts)
# rounds_played <- 7 - na_counts
# head(rounds_played)
#Adding to df
# player_data$rounds_played <-rounds_played
# head(player_data)
#### ---- END: DIDNT END UP NEEDING THIS SECTION BECAUSE OF THE na.rm feature. ----
#New columns for the for loop to place values in
scoring_data$opp1_prerate = numeric(nrow(scoring_data))
scoring_data$opp2_prerate = numeric(nrow(scoring_data))
scoring_data$opp3_prerate = numeric(nrow(scoring_data))
scoring_data$opp4_prerate = numeric(nrow(scoring_data))
scoring_data$opp5_prerate = numeric(nrow(scoring_data))
scoring_data$opp6_prerate = numeric(nrow(scoring_data))
scoring_data$opp7_prerate = numeric(nrow(scoring_data))
## if clause citation: https://www.dataquest.io/blog/control-structures-in-r-using-loops-and-if-else-statements/
for (i in 1:nrow(scoring_data)) {
row <- scoring_data[i,]
opp1_number = row$Round1_Oppnt
opp2_number = row$Round2_Oppnt
opp3_number = row$Round3_Oppnt
opp4_number = row$Round4_Oppnt
opp5_number = row$Round5_Oppnt
opp6_number = row$Round6_Oppnt
opp7_number = row$Round7_Oppnt
if (!is.na(opp1_number)){
temp_df1<- player_data %>% filter(PairNum ==opp1_number)
scoring_data$opp1_prerate[i] <- as.numeric(temp_df1$pre_rating)
}
if (!is.na(opp2_number)){
temp_df2<- player_data %>% filter(PairNum ==opp2_number)
scoring_data$opp2_prerate[i] <- as.numeric(temp_df2$pre_rating)
}
if (!is.na(opp3_number)){
temp_df3<- player_data %>% filter(PairNum ==opp3_number)
scoring_data$opp3_prerate[i] <- as.numeric(temp_df3$pre_rating)
}
if (!is.na(opp4_number)){
temp_df4<- player_data %>% filter(PairNum ==opp4_number)
scoring_data$opp4_prerate[i] <- as.numeric(temp_df4$pre_rating)
}
if (!is.na(opp5_number)){
temp_df5<- player_data %>% filter(PairNum ==opp5_number)
scoring_data$opp5_prerate[i] <- as.numeric(temp_df5$pre_rating)
}
if (!is.na(opp6_number)){
temp_df6<- player_data %>% filter(PairNum ==opp6_number)
scoring_data$opp6_prerate[i] <- as.numeric(temp_df6$pre_rating)
}
if (!is.na(opp7_number)){
temp_df7<- player_data %>% filter(PairNum ==opp7_number)
scoring_data$opp7_prerate[i] <- as.numeric(temp_df7$pre_rating)
}
}
## Replacing the Zeros fro mhow i made the co;umns with Nulls
scoring_data$opp1_prerate[scoring_data$opp1_prerate == 0] <- NA
scoring_data$opp2_prerate[scoring_data$opp2_prerate == 0] <- NA
scoring_data$opp3_prerate[scoring_data$opp3_prerate == 0] <- NA
scoring_data$opp4_prerate[scoring_data$opp4_prerate == 0] <- NA
scoring_data$opp5_prerate[scoring_data$opp5_prerate == 0] <- NA
scoring_data$opp6_prerate[scoring_data$opp6_prerate == 0] <- NA
scoring_data$opp7_prerate[scoring_data$opp7_prerate == 0] <- NA
### Citation: https://stataiml.com/posts/calculate_mean_sel_columns_r/
scoring_data$opponentavg <- rowMeans(subset(scoring_data,
select =c(opp1_prerate,opp2_prerate,opp3_prerate,opp4_prerate,
opp5_prerate,opp6_prerate,opp7_prerate
)),
na.rm = TRUE)
scoring_data_lim <- select(scoring_data,PairNum,opponentavg)
head(scoring_data_lim)
## PairNum opponentavg
## 1 1 1605.286
## 2 2 1469.286
## 3 3 1563.571
## 4 4 1573.571
## 5 5 1500.857
## 6 6 1518.714
#merging together for semifinal df
semifinal = merge(x = player_data, y = scoring_data_lim, by = "PairNum")
## limiting to only the columns i need
final<- select(semifinal,PlayerName,State,TtlPts_1,pre_rating,opponentavg)
head(final)
## PlayerName State TtlPts_1 pre_rating opponentavg
## 1 GARY HUA ON 6.0 1794 1605.286
## 2 ANVIT RAO MI 5.0 1365 1554.143
## 3 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1467.571
## 4 KENNETH J TACK MI 4.5 1663 1506.167
## 5 TORRANCE HENRY JR MI 4.5 1666 1497.857
## 6 BRADLEY SHAW MI 4.5 1610 1515.000
## Placing into a local CSV
write.csv(final, "ChessPlayerData.csv")