Project 1 Scope
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
library(data.table)
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
original_file <- data.frame(read.table('C:/Users/burke/OneDrive/Desktop/Data 607/tournamentinfo_project1.txt',
sep = ",", fill = TRUE ))
reformatted_file <- data.frame(original_file[grep("-----------------------------------------------------------------------------------------", original_file$V1, invert = TRUE) , ])
#split into oddd and even rows to group the values appropriately
odd_rows <- data.frame(reformatted_file[ !c(TRUE,FALSE), ]) # rows
even_rows <-data.frame(reformatted_file[ !c(FALSE,TRUE), ]) # rows
#reformat the evens data
split_even <- strsplit(as.character(even_rows$reformatted_file..c.FALSE..TRUE....), "|", fixed = TRUE)
Pair <- sapply(split_even , "[", 1)
Player_Name<- sapply(split_even , "[", 2)
Total<- sapply(split_even , "[", 3)
Total <- as.numeric(as.character(Total))
## Warning: NAs introduced by coercion
Round_1<- sapply(split_even , "[", 4)
Round_2<- sapply(split_even , "[", 5)
Round_3<- sapply(split_even , "[", 6)
Round_4<- sapply(split_even , "[", 7)
Round_5<- sapply(split_even , "[", 8)
Round_6<- sapply(split_even , "[", 9)
Round_7<- sapply(split_even , "[", 10)
evens_data <- data.frame (Pair, Player_Name, Total, Round_1, Round_2, Round_3, Round_4,
Round_5, Round_6, Round_7)
evens_data <- evens_data[-1 ,]
knitr::kable(head(evens_data,10), caption = "Initial Even Table")
Initial Even Table
| 2 |
1 |
GARY HUA |
6.0 |
W 39 |
W 21 |
W 18 |
W 14 |
W 7 |
D 12 |
D 4 |
| 3 |
2 |
DAKSHESH DARURI |
6.0 |
W 63 |
W 58 |
L 4 |
W 17 |
W 16 |
W 20 |
W 7 |
| 4 |
3 |
ADITYA BAJAJ |
6.0 |
L 8 |
W 61 |
W 25 |
W 21 |
W 11 |
W 13 |
W 12 |
| 5 |
4 |
PATRICK H SCHILLING |
5.5 |
W 23 |
D 28 |
W 2 |
W 26 |
D 5 |
W 19 |
D 1 |
| 6 |
5 |
HANSHI ZUO |
5.5 |
W 45 |
W 37 |
D 12 |
D 13 |
D 4 |
W 14 |
W 17 |
| 7 |
6 |
HANSEN SONG |
5.0 |
W 34 |
D 29 |
L 11 |
W 35 |
D 10 |
W 27 |
W 21 |
| 8 |
7 |
GARY DEE SWATHELL |
5.0 |
W 57 |
W 46 |
W 13 |
W 11 |
L 1 |
W 9 |
L 2 |
| 9 |
8 |
EZEKIEL HOUGHTON |
5.0 |
W 3 |
W 32 |
L 14 |
L 9 |
W 47 |
W 28 |
W 19 |
| 10 |
9 |
STEFANO LEE |
5.0 |
W 25 |
L 18 |
W 59 |
W 8 |
W 26 |
L 7 |
W 20 |
| 11 |
10 |
ANVIT RAO |
5.0 |
D 16 |
L 19 |
W 55 |
W 31 |
D 6 |
W 25 |
W 18 |
sub_evens <- evens_data[,c("Round_1","Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7")]
expression_9 = "[A-Z]"
opp_1 <- str_replace_all(sub_evens$Round_1, expression_9, "")
opp_2 <- str_replace_all(sub_evens$Round_2, expression_9, "")
opp_3 <- str_replace_all(sub_evens$Round_3, expression_9, "")
opp_4 <- str_replace_all(sub_evens$Round_4, expression_9, "")
opp_5 <- str_replace_all(sub_evens$Round_5, expression_9, "")
opp_6 <- str_replace_all(sub_evens$Round_6, expression_9, "")
opp_7 <- str_replace_all(sub_evens$Round_7, expression_9, "")
opp_df <-data.frame (opp_1, opp_2, opp_3, opp_4, opp_5, opp_6, opp_7)
knitr::kable(head(opp_df,10), caption = " Opponents/Round")
Opponents/Round
| 39 |
21 |
18 |
14 |
7 |
12 |
4 |
| 63 |
58 |
4 |
17 |
16 |
20 |
7 |
| 8 |
61 |
25 |
21 |
11 |
13 |
12 |
| 23 |
28 |
2 |
26 |
5 |
19 |
1 |
| 45 |
37 |
12 |
13 |
4 |
14 |
17 |
| 34 |
29 |
11 |
35 |
10 |
27 |
21 |
| 57 |
46 |
13 |
11 |
1 |
9 |
2 |
| 3 |
32 |
14 |
9 |
47 |
28 |
19 |
| 25 |
18 |
59 |
8 |
26 |
7 |
20 |
| 16 |
19 |
55 |
31 |
6 |
25 |
18 |
#win/loss/draw metadata
sub_evens <- evens_data[,c("Round_1","Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7")]
expression_8 = "[:digit:]"
winlossdraw_1 <- str_replace_all(sub_evens$Round_1, expression_8, "")
winlossdraw_2 <- str_replace_all(sub_evens$Round_2, expression_8, "")
winlossdraw_3<- str_replace_all(sub_evens$Round_3, expression_8, "")
winlossdraw_4<- str_replace_all(sub_evens$Round_4, expression_8, "")
winlossdraw_5<- str_replace_all(sub_evens$Round_5, expression_8, "")
winlossdraw_6<- str_replace_all(sub_evens$Round_6, expression_8, "")
winlossdraw_7<- str_replace_all(sub_evens$Round_7, expression_8, "")
winlossdraw_df <-data.frame (winlossdraw_1, winlossdraw_2, winlossdraw_3, winlossdraw_4, winlossdraw_5, winlossdraw_6, winlossdraw_7)
knitr::kable(head(winlossdraw_df,10), caption = "Win/Loss/Draw")
Win/Loss/Draw
| W |
W |
W |
W |
W |
D |
D |
| W |
W |
L |
W |
W |
W |
W |
| L |
W |
W |
W |
W |
W |
W |
| W |
D |
W |
W |
D |
W |
D |
| W |
W |
D |
D |
D |
W |
W |
| W |
D |
L |
W |
D |
W |
W |
| W |
W |
W |
W |
L |
W |
L |
| W |
W |
L |
L |
W |
W |
W |
| W |
L |
W |
W |
W |
L |
W |
| D |
L |
W |
W |
D |
W |
W |
#filter out the important information from the evens dataframe
reformatted_evens_df <- data.frame(evens_data[,c("Pair", "Player_Name", "Total")],
opp_df,
winlossdraw_df)
knitr::kable(head(reformatted_evens_df,10), caption = "Even Rows- Data Table")
Even Rows- Data Table
| 2 |
1 |
GARY HUA |
6.0 |
39 |
21 |
18 |
14 |
7 |
12 |
4 |
W |
W |
W |
W |
W |
D |
D |
| 3 |
2 |
DAKSHESH DARURI |
6.0 |
63 |
58 |
4 |
17 |
16 |
20 |
7 |
W |
W |
L |
W |
W |
W |
W |
| 4 |
3 |
ADITYA BAJAJ |
6.0 |
8 |
61 |
25 |
21 |
11 |
13 |
12 |
L |
W |
W |
W |
W |
W |
W |
| 5 |
4 |
PATRICK H SCHILLING |
5.5 |
23 |
28 |
2 |
26 |
5 |
19 |
1 |
W |
D |
W |
W |
D |
W |
D |
| 6 |
5 |
HANSHI ZUO |
5.5 |
45 |
37 |
12 |
13 |
4 |
14 |
17 |
W |
W |
D |
D |
D |
W |
W |
| 7 |
6 |
HANSEN SONG |
5.0 |
34 |
29 |
11 |
35 |
10 |
27 |
21 |
W |
D |
L |
W |
D |
W |
W |
| 8 |
7 |
GARY DEE SWATHELL |
5.0 |
57 |
46 |
13 |
11 |
1 |
9 |
2 |
W |
W |
W |
W |
L |
W |
L |
| 9 |
8 |
EZEKIEL HOUGHTON |
5.0 |
3 |
32 |
14 |
9 |
47 |
28 |
19 |
W |
W |
L |
L |
W |
W |
W |
| 10 |
9 |
STEFANO LEE |
5.0 |
25 |
18 |
59 |
8 |
26 |
7 |
20 |
W |
L |
W |
W |
W |
L |
W |
| 11 |
10 |
ANVIT RAO |
5.0 |
16 |
19 |
55 |
31 |
6 |
25 |
18 |
D |
L |
W |
W |
D |
W |
W |
#reformatting of the odd data
split_odd <-strsplit(as.character(odd_rows$reformatted_file..c.TRUE..FALSE...), "|", fixed = TRUE)
Num <- sapply(split_odd , "[", 1)
USCF_ID_Rtg_Pre_Post <- sapply(split_odd , "[", 2)
Pts<- sapply(split_odd , "[", 3)
Pts <- str_replace(Pts , "N:", "")
Pts <- as.numeric(as.character(Pts))
## Warning: NAs introduced by coercion
color_round_1<- sapply(split_odd , "[", 4)
color_round_2<- sapply(split_odd , "[", 5)
color_round_3<- sapply(split_odd , "[", 6)
color_round_4<- sapply(split_odd, "[", 7)
color_round_5<- sapply(split_odd , "[", 8)
color_round_6<- sapply(split_odd , "[", 9)
color_round_7<- sapply(split_odd , "[", 10)
odds_data <-data.frame (Num, USCF_ID_Rtg_Pre_Post, Pts, color_round_1, color_round_2, color_round_3,
color_round_4, color_round_5, color_round_6, color_round_7)
odds_data <- odds_data[-1 ,]
knitr::kable(head(odds_data,10), caption = "Odd Rows- Data Table")
Odd Rows- Data Table
| 2 |
ON |
15445895 / R: 1794 ->1817 |
2 |
W |
B |
W |
B |
W |
B |
W |
| 3 |
MI |
14598900 / R: 1553 ->1663 |
2 |
B |
W |
B |
W |
B |
W |
B |
| 4 |
MI |
14959604 / R: 1384 ->1640 |
2 |
W |
B |
W |
B |
W |
B |
W |
| 5 |
MI |
12616049 / R: 1716 ->1744 |
2 |
W |
B |
W |
B |
W |
B |
B |
| 6 |
MI |
14601533 / R: 1655 ->1690 |
2 |
B |
W |
B |
W |
B |
W |
B |
| 7 |
OH |
15055204 / R: 1686 ->1687 |
3 |
W |
B |
W |
B |
B |
W |
B |
| 8 |
MI |
11146376 / R: 1649 ->1673 |
3 |
W |
B |
W |
B |
B |
W |
W |
| 9 |
MI |
15142253 / R: 1641P17->1657P24 |
3 |
B |
W |
B |
W |
B |
W |
W |
| 10 |
ON |
14954524 / R: 1411 ->1564 |
2 |
W |
B |
W |
B |
W |
B |
B |
| 11 |
MI |
14150362 / R: 1365 ->1544 |
3 |
W |
W |
B |
B |
W |
B |
W |
#isolate the USCF ID And the pre and post ratings
test <- data.frame(USCF_ID_Rtg_Pre_Post)
id_split <-strsplit(as.character(test$USCF_ID_Rtg_Pre_Post), "/", fixed = TRUE)
USCF_ID <- sapply(id_split , "[", 1)
USCF_ID <- str_trim(USCF_ID)
head(USCF_ID,10)
## [1] "USCF ID" "15445895" "14598900" "14959604" "12616049" "14601533"
## [7] "15055204" "11146376" "15142253" "14954524"
Rtg_Pre_Post <- sapply(id_split , "[", 2)
Rtg_Pre_Post <- data.frame(Rtg_Pre_Post)
names(Rtg_Pre_Post)
## [1] "Rtg_Pre_Post"
ratings_split <-strsplit(as.character(Rtg_Pre_Post$Rtg_Pre_Post), "->", fixed = TRUE)
#Pre Ratings
Pre_ratings <-sapply(ratings_split , "[", 1)
Pre_ratings <- str_replace(Pre_ratings , " R: ", "")
expression_10 = "[A-Z]+\\d{2}"
expression_11 = "[A-Z]+\\d{1}"
Pre_ratings <- str_replace_all(Pre_ratings, expression_10, "")
Pre_ratings <- str_replace_all(Pre_ratings, expression_11, "")
Pre_ratings <- str_trim(Pre_ratings)
head(Pre_ratings,10)
## [1] "Rtg (Pre" "1794" "1553" "1384" "1716" "1655"
## [7] "1686" "1649" "1641" "1411"
#Post Ratings
Post_ratings <-sapply(ratings_split , "[", 2)
Post_ratings <- str_replace(Post_ratings , " R: ", "")
expression_10 = "[A-Z]+\\d{2}"
expression_11 = "[A-Z]+\\d{1}"
Post_ratings <- str_replace_all(Post_ratings, expression_10, "")
Post_ratings <- str_replace_all(Post_ratings, expression_11, "")
Post_ratings <- str_trim(Post_ratings)
head(Post_ratings,10)
## [1] "Post)" "1817" "1663" "1640" "1744" "1690" "1687" "1673"
## [9] "1657" "1564"
#convert all appliable data to numerics
meta_data <-data.frame (USCF_ID ,Pre_ratings, Post_ratings)
meta_data <- meta_data[-1 ,]
meta_data$USCF_ID <-as.numeric(as.character(meta_data$USCF_ID))
meta_data$Pre_ratings <- as.numeric(as.character(meta_data$Pre_ratings))
meta_data$Post_ratings <- as.numeric(as.character(meta_data$Post_ratings))
knitr::kable(head(meta_data,10), caption = "Meta Data Table")
Meta Data Table
| 2 |
15445895 |
1794 |
1817 |
| 3 |
14598900 |
1553 |
1663 |
| 4 |
14959604 |
1384 |
1640 |
| 5 |
12616049 |
1716 |
1744 |
| 6 |
14601533 |
1655 |
1690 |
| 7 |
15055204 |
1686 |
1687 |
| 8 |
11146376 |
1649 |
1673 |
| 9 |
15142253 |
1641 |
1657 |
| 10 |
14954524 |
1411 |
1564 |
| 11 |
14150362 |
1365 |
1544 |
#combined actionable elements from the odds and evens dataframe
consolidated_df <- data.frame(reformatted_evens_df,
odds_data[,c("Num")],
meta_data)
knitr::kable(head(consolidated_df,10), caption = "Consolidated Table")
Consolidated Table
| 2 |
1 |
GARY HUA |
6.0 |
39 |
21 |
18 |
14 |
7 |
12 |
4 |
W |
W |
W |
W |
W |
D |
D |
ON |
15445895 |
1794 |
1817 |
| 3 |
2 |
DAKSHESH DARURI |
6.0 |
63 |
58 |
4 |
17 |
16 |
20 |
7 |
W |
W |
L |
W |
W |
W |
W |
MI |
14598900 |
1553 |
1663 |
| 4 |
3 |
ADITYA BAJAJ |
6.0 |
8 |
61 |
25 |
21 |
11 |
13 |
12 |
L |
W |
W |
W |
W |
W |
W |
MI |
14959604 |
1384 |
1640 |
| 5 |
4 |
PATRICK H SCHILLING |
5.5 |
23 |
28 |
2 |
26 |
5 |
19 |
1 |
W |
D |
W |
W |
D |
W |
D |
MI |
12616049 |
1716 |
1744 |
| 6 |
5 |
HANSHI ZUO |
5.5 |
45 |
37 |
12 |
13 |
4 |
14 |
17 |
W |
W |
D |
D |
D |
W |
W |
MI |
14601533 |
1655 |
1690 |
| 7 |
6 |
HANSEN SONG |
5.0 |
34 |
29 |
11 |
35 |
10 |
27 |
21 |
W |
D |
L |
W |
D |
W |
W |
OH |
15055204 |
1686 |
1687 |
| 8 |
7 |
GARY DEE SWATHELL |
5.0 |
57 |
46 |
13 |
11 |
1 |
9 |
2 |
W |
W |
W |
W |
L |
W |
L |
MI |
11146376 |
1649 |
1673 |
| 9 |
8 |
EZEKIEL HOUGHTON |
5.0 |
3 |
32 |
14 |
9 |
47 |
28 |
19 |
W |
W |
L |
L |
W |
W |
W |
MI |
15142253 |
1641 |
1657 |
| 10 |
9 |
STEFANO LEE |
5.0 |
25 |
18 |
59 |
8 |
26 |
7 |
20 |
W |
L |
W |
W |
W |
L |
W |
ON |
14954524 |
1411 |
1564 |
| 11 |
10 |
ANVIT RAO |
5.0 |
16 |
19 |
55 |
31 |
6 |
25 |
18 |
D |
L |
W |
W |
D |
W |
W |
MI |
14150362 |
1365 |
1544 |
#create a reference table to join on the columns
ref_table <- data.table(consolidated_df[,c("Pair", "Pre_ratings")])
colnames(ref_table)[colnames(ref_table)=="Pair"] <- "opp_1"
ref_table$opp_1 <- as.numeric(as.character(ref_table$opp_1))
knitr::kable(head(ref_table,10), caption = "Mapping Table")
Mapping Table
| 1 |
1794 |
| 2 |
1553 |
| 3 |
1384 |
| 4 |
1716 |
| 5 |
1655 |
| 6 |
1686 |
| 7 |
1649 |
| 8 |
1641 |
| 9 |
1411 |
| 10 |
1365 |
#formatting the columns to numerics
consolidated_df$opp_1 <- as.numeric(as.character(consolidated_df$opp_1))
consolidated_df$opp_2 <- as.numeric(as.character(consolidated_df$opp_2))
consolidated_df$opp_3 <- as.numeric(as.character(consolidated_df$opp_3))
consolidated_df$opp_4 <- as.numeric(as.character(consolidated_df$opp_4))
consolidated_df$opp_5 <- as.numeric(as.character(consolidated_df$opp_5))
consolidated_df$opp_6 <- as.numeric(as.character(consolidated_df$opp_6))
consolidated_df$opp_7 <- as.numeric(as.character(consolidated_df$opp_7))
consolidated_df$opp_1[is.na(consolidated_df$opp_1)] <- 0
consolidated_df$opp_2[is.na(consolidated_df$opp_2)] <- 0
consolidated_df$opp_3[is.na(consolidated_df$opp_3)] <- 0
consolidated_df$opp_4[is.na(consolidated_df$opp_4)] <- 0
consolidated_df$opp_5[is.na(consolidated_df$opp_5)] <- 0
consolidated_df$opp_6[is.na(consolidated_df$opp_6)] <- 0
consolidated_df$opp_7[is.na(consolidated_df$opp_7)] <- 0
#get all of the pre ratings from the reference table, merge onto the original
merged_df <- merge(x = consolidated_df, y = ref_table, by = "opp_1", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings.y"] <- "opp_1_rating"
colnames(ref_table)[colnames(ref_table)=="opp_1"] <- "opp_2"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_2", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_2_rating"
colnames(ref_table)[colnames(ref_table)=="opp_2"] <- "opp_3"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_3", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_3_rating"
colnames(ref_table)[colnames(ref_table)=="opp_3"] <- "opp_4"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_4", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_4_rating"
colnames(ref_table)[colnames(ref_table)=="opp_4"] <- "opp_5"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_5", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_5_rating"
colnames(ref_table)[colnames(ref_table)=="opp_5"] <- "opp_6"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_6", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_6_rating"
colnames(ref_table)[colnames(ref_table)=="opp_6"] <- "opp_7"
merged_df <- merge(x = merged_df, y = ref_table, by = "opp_7", all.x = TRUE)
colnames(merged_df)[colnames(merged_df)=="Pre_ratings"] <- "opp_7_rating"
colnames(ref_table)[colnames(ref_table)=="opp_6"] <- "opp_7"
#fill in the nulls
merged_df$opp_1_rating[is.na(merged_df$opp_1_rating)] <- 0
merged_df$opp_2_rating[is.na(merged_df$opp_2_rating)] <- 0
merged_df$opp_3_rating[is.na(merged_df$opp_3_rating)] <- 0
merged_df$opp_4_rating[is.na(merged_df$opp_4_rating)] <- 0
merged_df$opp_5_rating[is.na(merged_df$opp_5_rating)] <- 0
merged_df$opp_6_rating[is.na(merged_df$opp_6_rating)] <- 0
merged_df$opp_7_rating[is.na(merged_df$opp_7_rating)] <- 0
merged_df$sum_toal_pre_ratings <- (merged_df$opp_1_rating+ merged_df$opp_2_rating+
merged_df$opp_3_rating+ merged_df$opp_4_rating+
merged_df$opp_5_rating+ merged_df$opp_6_rating+
merged_df$opp_7_rating)
merged_df$div<- 7 -(rowSums(merged_df[,23:29] == 0))
merged_df$average_opp_pre_ratings <- merged_df$sum_toal_pre_ratings/merged_df$div
Final Table
Format: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
#final Dataframe
final_df <- merged_df[,c("Pair","Player_Name","odds_data...c..Num...", "Total","Pre_ratings.x", "average_opp_pre_ratings" )]
final_df<- final_df[order(final_df$Pair),]
#renaming some columns
setnames(final_df, old = c('Pair', 'odds_data...c..Num...','Total','Pre_ratings.x', 'average_opp_pre_ratings' ),
new = c('Ranking','Player State','Total Number of Points', 'Player Pre-Ranking','Average Opponent Pre-Ranking'))
knitr::kable(head(final_df ,10), caption = "Final-Cleaned DataFrame")
Final-Cleaned DataFrame
| 14 |
1 |
GARY HUA |
ON |
6.0 |
1794 |
1605.286 |
| 17 |
2 |
DAKSHESH DARURI |
MI |
6.0 |
1553 |
1469.286 |
| 22 |
3 |
ADITYA BAJAJ |
MI |
6.0 |
1384 |
1563.571 |
| 11 |
4 |
PATRICK H SCHILLING |
MI |
5.5 |
1716 |
1573.571 |
| 26 |
5 |
HANSHI ZUO |
MI |
5.5 |
1655 |
1500.857 |
| 30 |
6 |
HANSEN SONG |
OH |
5.0 |
1686 |
1518.714 |
| 12 |
7 |
GARY DEE SWATHELL |
MI |
5.0 |
1649 |
1372.143 |
| 28 |
8 |
EZEKIEL HOUGHTON |
MI |
5.0 |
1641 |
1468.429 |
| 29 |
9 |
STEFANO LEE |
ON |
5.0 |
1411 |
1523.143 |
| 27 |
10 |
ANVIT RAO |
MI |
5.0 |
1365 |
1554.143 |