Project 1 Scope

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

library(data.table)
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
original_file <- data.frame(read.table('C:/Users/burke/OneDrive/Desktop/Data 607/tournamentinfo_project1.txt', 
                                       sep = ",", fill = TRUE ))

reformatted_file <- data.frame(original_file[grep("-----------------------------------------------------------------------------------------", original_file$V1, invert = TRUE) , ])
#split into oddd and even rows to group the values appropriately
odd_rows <- data.frame(reformatted_file[ !c(TRUE,FALSE), ])  # rows


even_rows <-data.frame(reformatted_file[ !c(FALSE,TRUE), ])  # rows
#reformat the evens data 
split_even <- strsplit(as.character(even_rows$reformatted_file..c.FALSE..TRUE....), "|", fixed = TRUE)

Pair <- sapply(split_even , "[", 1)
Player_Name<- sapply(split_even , "[", 2)
Total<- sapply(split_even , "[", 3)
Total <- as.numeric(as.character(Total))
## Warning: NAs introduced by coercion
Round_1<- sapply(split_even , "[", 4)
Round_2<- sapply(split_even , "[", 5)
Round_3<- sapply(split_even , "[", 6)
Round_4<- sapply(split_even , "[", 7)
Round_5<- sapply(split_even , "[", 8)
Round_6<- sapply(split_even , "[", 9)
Round_7<- sapply(split_even , "[", 10)



evens_data  <- data.frame (Pair, Player_Name, Total, Round_1, Round_2, Round_3, Round_4,
                           Round_5, Round_6, Round_7)
evens_data <- evens_data[-1 ,]

knitr::kable(head(evens_data,10),  caption = "Initial Even Table")
Initial Even Table
Pair Player_Name Total Round_1 Round_2 Round_3 Round_4 Round_5 Round_6 Round_7
2 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4
3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20 W 7
4 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11 W 13 W 12
5 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5 W 19 D 1
6 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4 W 14 W 17
7 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10 W 27 W 21
8 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13 W 11 L 1 W 9 L 2
9 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14 L 9 W 47 W 28 W 19
10 9 STEFANO LEE 5.0 W 25 L 18 W 59 W 8 W 26 L 7 W 20
11 10 ANVIT RAO 5.0 D 16 L 19 W 55 W 31 D 6 W 25 W 18
sub_evens  <- evens_data[,c("Round_1","Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7")]
expression_9 = "[A-Z]"
opp_1  <- str_replace_all(sub_evens$Round_1, expression_9, "")
opp_2  <- str_replace_all(sub_evens$Round_2, expression_9, "")
opp_3  <- str_replace_all(sub_evens$Round_3, expression_9, "")
opp_4  <- str_replace_all(sub_evens$Round_4, expression_9, "")
opp_5  <- str_replace_all(sub_evens$Round_5, expression_9, "")
opp_6  <- str_replace_all(sub_evens$Round_6, expression_9, "")
opp_7  <- str_replace_all(sub_evens$Round_7, expression_9, "")
opp_df <-data.frame (opp_1, opp_2, opp_3, opp_4, opp_5, opp_6, opp_7)

knitr::kable(head(opp_df,10),  caption = " Opponents/Round")
Opponents/Round
opp_1 opp_2 opp_3 opp_4 opp_5 opp_6 opp_7
39 21 18 14 7 12 4
63 58 4 17 16 20 7
8 61 25 21 11 13 12
23 28 2 26 5 19 1
45 37 12 13 4 14 17
34 29 11 35 10 27 21
57 46 13 11 1 9 2
3 32 14 9 47 28 19
25 18 59 8 26 7 20
16 19 55 31 6 25 18
#win/loss/draw metadata
sub_evens  <- evens_data[,c("Round_1","Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7")]
expression_8 = "[:digit:]"
winlossdraw_1  <- str_replace_all(sub_evens$Round_1, expression_8, "")
winlossdraw_2 <- str_replace_all(sub_evens$Round_2, expression_8, "")
winlossdraw_3<- str_replace_all(sub_evens$Round_3, expression_8, "")
winlossdraw_4<- str_replace_all(sub_evens$Round_4, expression_8, "")
winlossdraw_5<- str_replace_all(sub_evens$Round_5, expression_8, "")
winlossdraw_6<- str_replace_all(sub_evens$Round_6, expression_8, "")
winlossdraw_7<- str_replace_all(sub_evens$Round_7, expression_8, "")
winlossdraw_df <-data.frame (winlossdraw_1, winlossdraw_2, winlossdraw_3, winlossdraw_4, winlossdraw_5, winlossdraw_6, winlossdraw_7)

knitr::kable(head(winlossdraw_df,10),  caption = "Win/Loss/Draw")
Win/Loss/Draw
winlossdraw_1 winlossdraw_2 winlossdraw_3 winlossdraw_4 winlossdraw_5 winlossdraw_6 winlossdraw_7
W W W W W D D
W W L W W W W
L W W W W W W
W D W W D W D
W W D D D W W
W D L W D W W
W W W W L W L
W W L L W W W
W L W W W L W
D L W W D W W
#filter out the important information from the evens dataframe 
reformatted_evens_df <- data.frame(evens_data[,c("Pair", "Player_Name", "Total")],
                                   opp_df,
                                   winlossdraw_df)

knitr::kable(head(reformatted_evens_df,10),  caption = "Even Rows- Data Table")
Even Rows- Data Table
Pair Player_Name Total opp_1 opp_2 opp_3 opp_4 opp_5 opp_6 opp_7 winlossdraw_1 winlossdraw_2 winlossdraw_3 winlossdraw_4 winlossdraw_5 winlossdraw_6 winlossdraw_7
2 1 GARY HUA 6.0 39 21 18 14 7 12 4 W W W W W D D
3 2 DAKSHESH DARURI 6.0 63 58 4 17 16 20 7 W W L W W W W
4 3 ADITYA BAJAJ 6.0 8 61 25 21 11 13 12 L W W W W W W
5 4 PATRICK H SCHILLING 5.5 23 28 2 26 5 19 1 W D W W D W D
6 5 HANSHI ZUO 5.5 45 37 12 13 4 14 17 W W D D D W W
7 6 HANSEN SONG 5.0 34 29 11 35 10 27 21 W D L W D W W
8 7 GARY DEE SWATHELL 5.0 57 46 13 11 1 9 2 W W W W L W L
9 8 EZEKIEL HOUGHTON 5.0 3 32 14 9 47 28 19 W W L L W W W
10 9 STEFANO LEE 5.0 25 18 59 8 26 7 20 W L W W W L W
11 10 ANVIT RAO 5.0 16 19 55 31 6 25 18 D L W W D W W
#reformatting of the odd data 

split_odd <-strsplit(as.character(odd_rows$reformatted_file..c.TRUE..FALSE...), "|", fixed = TRUE)

Num <- sapply(split_odd , "[", 1)
USCF_ID_Rtg_Pre_Post <- sapply(split_odd , "[", 2)
Pts<- sapply(split_odd , "[", 3)
Pts <- str_replace(Pts , "N:", "")
Pts <- as.numeric(as.character(Pts))
## Warning: NAs introduced by coercion
color_round_1<- sapply(split_odd , "[", 4)
color_round_2<- sapply(split_odd , "[", 5)
color_round_3<- sapply(split_odd , "[", 6)
color_round_4<- sapply(split_odd, "[", 7)
color_round_5<- sapply(split_odd , "[", 8)
color_round_6<- sapply(split_odd , "[", 9)
color_round_7<- sapply(split_odd , "[", 10)

odds_data <-data.frame (Num, USCF_ID_Rtg_Pre_Post, Pts, color_round_1, color_round_2, color_round_3, 
                        color_round_4, color_round_5, color_round_6, color_round_7)
odds_data <- odds_data[-1 ,]


knitr::kable(head(odds_data,10),  caption = "Odd Rows- Data Table")
Odd Rows- Data Table
Num USCF_ID_Rtg_Pre_Post Pts color_round_1 color_round_2 color_round_3 color_round_4 color_round_5 color_round_6 color_round_7
2 ON 15445895 / R: 1794 ->1817 2 W B W B W B W
3 MI 14598900 / R: 1553 ->1663 2 B W B W B W B
4 MI 14959604 / R: 1384 ->1640 2 W B W B W B W
5 MI 12616049 / R: 1716 ->1744 2 W B W B W B B
6 MI 14601533 / R: 1655 ->1690 2 B W B W B W B
7 OH 15055204 / R: 1686 ->1687 3 W B W B B W B
8 MI 11146376 / R: 1649 ->1673 3 W B W B B W W
9 MI 15142253 / R: 1641P17->1657P24 3 B W B W B W W
10 ON 14954524 / R: 1411 ->1564 2 W B W B W B B
11 MI 14150362 / R: 1365 ->1544 3 W W B B W B W
#isolate the USCF ID And the pre and post ratings 

test <- data.frame(USCF_ID_Rtg_Pre_Post)
id_split <-strsplit(as.character(test$USCF_ID_Rtg_Pre_Post), "/", fixed = TRUE)
USCF_ID <- sapply(id_split , "[", 1)
USCF_ID <- str_trim(USCF_ID)
head(USCF_ID,10)
##  [1] "USCF ID"  "15445895" "14598900" "14959604" "12616049" "14601533"
##  [7] "15055204" "11146376" "15142253" "14954524"
Rtg_Pre_Post <- sapply(id_split , "[", 2)
Rtg_Pre_Post <- data.frame(Rtg_Pre_Post)
names(Rtg_Pre_Post)
## [1] "Rtg_Pre_Post"
ratings_split <-strsplit(as.character(Rtg_Pre_Post$Rtg_Pre_Post), "->", fixed = TRUE)

#Pre Ratings
Pre_ratings <-sapply(ratings_split , "[", 1)
Pre_ratings <- str_replace(Pre_ratings , " R: ", "")
expression_10 = "[A-Z]+\\d{2}"
expression_11 = "[A-Z]+\\d{1}"
Pre_ratings  <- str_replace_all(Pre_ratings, expression_10, "")
Pre_ratings  <- str_replace_all(Pre_ratings, expression_11, "")
Pre_ratings <- str_trim(Pre_ratings)
head(Pre_ratings,10)
##  [1] "Rtg (Pre" "1794"     "1553"     "1384"     "1716"     "1655"    
##  [7] "1686"     "1649"     "1641"     "1411"
#Post Ratings 

Post_ratings <-sapply(ratings_split , "[", 2)
Post_ratings <- str_replace(Post_ratings , " R: ", "")
expression_10 = "[A-Z]+\\d{2}"
expression_11 = "[A-Z]+\\d{1}"
Post_ratings  <- str_replace_all(Post_ratings, expression_10, "")
Post_ratings  <- str_replace_all(Post_ratings, expression_11, "")
Post_ratings <- str_trim(Post_ratings)
head(Post_ratings,10)
##  [1] "Post)" "1817"  "1663"  "1640"  "1744"  "1690"  "1687"  "1673" 
##  [9] "1657"  "1564"
#convert all appliable data to numerics 
meta_data <-data.frame (USCF_ID ,Pre_ratings, Post_ratings)
meta_data <- meta_data[-1 ,]
meta_data$USCF_ID <-as.numeric(as.character(meta_data$USCF_ID))
meta_data$Pre_ratings <- as.numeric(as.character(meta_data$Pre_ratings))
meta_data$Post_ratings <- as.numeric(as.character(meta_data$Post_ratings))

knitr::kable(head(meta_data,10),  caption = "Meta Data Table")
Meta Data Table
USCF_ID Pre_ratings Post_ratings
2 15445895 1794 1817
3 14598900 1553 1663
4 14959604 1384 1640
5 12616049 1716 1744
6 14601533 1655 1690
7 15055204 1686 1687
8 11146376 1649 1673
9 15142253 1641 1657
10 14954524 1411 1564
11 14150362 1365 1544
#combined actionable elements from the odds and evens dataframe  
consolidated_df <- data.frame(reformatted_evens_df,
                              odds_data[,c("Num")], 
                              meta_data)

knitr::kable(head(consolidated_df,10),  caption = "Consolidated Table")
Consolidated Table
Pair Player_Name Total opp_1 opp_2 opp_3 opp_4 opp_5 opp_6 opp_7 winlossdraw_1 winlossdraw_2 winlossdraw_3 winlossdraw_4 winlossdraw_5 winlossdraw_6 winlossdraw_7 odds_data…c..Num… USCF_ID Pre_ratings Post_ratings
2 1 GARY HUA 6.0 39 21 18 14 7 12 4 W W W W W D D ON 15445895 1794 1817
3 2 DAKSHESH DARURI 6.0 63 58 4 17 16 20 7 W W L W W W W MI 14598900 1553 1663
4 3 ADITYA BAJAJ 6.0 8 61 25 21 11 13 12 L W W W W W W MI 14959604 1384 1640
5 4 PATRICK H SCHILLING 5.5 23 28 2 26 5 19 1 W D W W D W D MI 12616049 1716 1744
6 5 HANSHI ZUO 5.5 45 37 12 13 4 14 17 W W D D D W W MI 14601533 1655 1690
7 6 HANSEN SONG 5.0 34 29 11 35 10 27 21 W D L W D W W OH 15055204 1686 1687
8 7 GARY DEE SWATHELL 5.0 57 46 13 11 1 9 2 W W W W L W L MI 11146376 1649 1673
9 8 EZEKIEL HOUGHTON 5.0 3 32 14 9 47 28 19 W W L L W W W MI 15142253 1641 1657
10 9 STEFANO LEE 5.0 25 18 59 8 26 7 20 W L W W W L W ON 14954524 1411 1564
11 10 ANVIT RAO 5.0 16 19 55 31 6 25 18 D L W W D W W MI 14150362 1365 1544
#create a reference table to join on the columns
ref_table <-  data.table(consolidated_df[,c("Pair", "Pre_ratings")]) 
colnames(ref_table)[colnames(ref_table)=="Pair"] <- "opp_1" 
ref_table$opp_1 <- as.numeric(as.character(ref_table$opp_1))

knitr::kable(head(ref_table,10),  caption = "Mapping Table")
Mapping Table
opp_1 Pre_ratings
1 1794
2 1553
3 1384
4 1716
5 1655
6 1686
7 1649
8 1641
9 1411
10 1365
#formatting the columns to numerics 
consolidated_df$opp_1 <- as.numeric(as.character(consolidated_df$opp_1))
consolidated_df$opp_2 <- as.numeric(as.character(consolidated_df$opp_2))
consolidated_df$opp_3 <- as.numeric(as.character(consolidated_df$opp_3))
consolidated_df$opp_4 <- as.numeric(as.character(consolidated_df$opp_4))
consolidated_df$opp_5 <- as.numeric(as.character(consolidated_df$opp_5))
consolidated_df$opp_6 <- as.numeric(as.character(consolidated_df$opp_6))
consolidated_df$opp_7 <- as.numeric(as.character(consolidated_df$opp_7))

consolidated_df$opp_1[is.na(consolidated_df$opp_1)] <- 0
consolidated_df$opp_2[is.na(consolidated_df$opp_2)] <- 0
consolidated_df$opp_3[is.na(consolidated_df$opp_3)] <- 0
consolidated_df$opp_4[is.na(consolidated_df$opp_4)] <- 0
consolidated_df$opp_5[is.na(consolidated_df$opp_5)] <- 0
consolidated_df$opp_6[is.na(consolidated_df$opp_6)] <- 0
consolidated_df$opp_7[is.na(consolidated_df$opp_7)] <- 0
#get all of the pre ratings from the reference table, merge onto the original
merged_df <- merge(x = consolidated_df, y = ref_table, by = "opp_1", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings.y"] <- "opp_1_rating"
colnames(ref_table)[colnames(ref_table)=="opp_1"] <- "opp_2" 
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_2", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_2_rating"
colnames(ref_table)[colnames(ref_table)=="opp_2"] <- "opp_3" 
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_3", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_3_rating"
colnames(ref_table)[colnames(ref_table)=="opp_3"] <- "opp_4" 
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_4", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_4_rating"
colnames(ref_table)[colnames(ref_table)=="opp_4"] <- "opp_5"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_5", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_5_rating"
colnames(ref_table)[colnames(ref_table)=="opp_5"] <- "opp_6"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_6", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_6_rating"
colnames(ref_table)[colnames(ref_table)=="opp_6"] <- "opp_7"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_7", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_7_rating"
colnames(ref_table)[colnames(ref_table)=="opp_6"] <- "opp_7"
#fill in the nulls
merged_df$opp_1_rating[is.na(merged_df$opp_1_rating)] <- 0
merged_df$opp_2_rating[is.na(merged_df$opp_2_rating)] <- 0
merged_df$opp_3_rating[is.na(merged_df$opp_3_rating)] <- 0
merged_df$opp_4_rating[is.na(merged_df$opp_4_rating)] <- 0
merged_df$opp_5_rating[is.na(merged_df$opp_5_rating)] <- 0
merged_df$opp_6_rating[is.na(merged_df$opp_6_rating)] <- 0
merged_df$opp_7_rating[is.na(merged_df$opp_7_rating)] <- 0
merged_df$sum_toal_pre_ratings <- (merged_df$opp_1_rating+ merged_df$opp_2_rating+
                        merged_df$opp_3_rating+ merged_df$opp_4_rating+
                                     merged_df$opp_5_rating+ merged_df$opp_6_rating+
                                     merged_df$opp_7_rating)
merged_df$div<- 7 -(rowSums(merged_df[,23:29] == 0))
merged_df$average_opp_pre_ratings <- merged_df$sum_toal_pre_ratings/merged_df$div

Final Table

Format: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

#final Dataframe
final_df <- merged_df[,c("Pair","Player_Name","odds_data...c..Num...", "Total","Pre_ratings.x", "average_opp_pre_ratings" )]
final_df<- final_df[order(final_df$Pair),] 
#renaming some columns
setnames(final_df, old = c('Pair', 'odds_data...c..Num...','Total','Pre_ratings.x', 'average_opp_pre_ratings' ), 
         new = c('Ranking','Player State','Total Number of Points', 'Player Pre-Ranking','Average Opponent Pre-Ranking'))

knitr::kable(head(final_df ,10),  caption = "Final-Cleaned DataFrame")
Final-Cleaned DataFrame
Ranking Player_Name Player State Total Number of Points Player Pre-Ranking Average Opponent Pre-Ranking
14 1 GARY HUA ON 6.0 1794 1605.286
17 2 DAKSHESH DARURI MI 6.0 1553 1469.286
22 3 ADITYA BAJAJ MI 6.0 1384 1563.571
11 4 PATRICK H SCHILLING MI 5.5 1716 1573.571
26 5 HANSHI ZUO MI 5.5 1655 1500.857
30 6 HANSEN SONG OH 5.0 1686 1518.714
12 7 GARY DEE SWATHELL MI 5.0 1649 1372.143
28 8 EZEKIEL HOUGHTON MI 5.0 1641 1468.429
29 9 STEFANO LEE ON 5.0 1411 1523.143
27 10 ANVIT RAO MI 5.0 1365 1554.143