## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Source files: https://github.com/djlofland/DATA607_F2019/tree/master/Assignment3
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be:
Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script.
# Load the chess ELO document into R, skip over the 1st 4 header row, we won't need those
elo_data <- read_lines('tournamentinfo.txt', skip=4)
# quick sanity check to see if the data loaded
head(elo_data, n=9)## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
## [7] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [8] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [9] "-----------------------------------------------------------------------------------------"
# Let see how many rows so we can verify later munging steps - should be 3 rows per player
print(paste('Rows of Raw Data:', length(elo_data)))## [1] "Rows of Raw Data: 192"
## [1] "Players found: 64"
# Remove every 3rd line with dashes
elo_data <- elo_data[-seq(3, length(elo_data), by=3)]
# for each player, combine their 2 rows of data into 1 wide row
elo_data <- paste(elo_data[seq(1, length(elo_data), by=2)], elo_data[seq(2, length(elo_data), by=2)], sep="")
# How many player rows are still here - should match count above
print(paste('Players found:', length(elo_data)))## [1] "Players found: 64"
elo_list <- str_split(elo_data, "\\|")
elo_df <- NULL
for (r in 1:length(elo_list)) {
trim_row <- unlist(lapply(elo_list[[r]], str_trim))
elo_df <- rbind(elo_df, trim_row)
}
elo_df <- data.frame(elo_df)
elo_df <- as_tibble(elo_df)
elo_df## # A tibble: 64 x 21
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1 GARY … 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4 ON 1544…
## 2 2 DAKSH… 6.0 W 63 W 58 L 4 W 17 W 16 W 20 W 7 MI 1459…
## 3 3 ADITY… 6.0 L 8 W 61 W 25 W 21 W 11 W 13 W 12 MI 1495…
## 4 4 PATRI… 5.5 W 23 D 28 W 2 W 26 D 5 W 19 D 1 MI 1261…
## 5 5 HANSH… 5.5 W 45 W 37 D 12 D 13 D 4 W 14 W 17 MI 1460…
## 6 6 HANSE… 5.0 W 34 D 29 L 11 W 35 D 10 W 27 W 21 OH 1505…
## 7 7 GARY … 5.0 W 57 W 46 W 13 W 11 L 1 W 9 L 2 MI 1114…
## 8 8 EZEKI… 5.0 W 3 W 32 L 14 L 9 W 47 W 28 W 19 MI 1514…
## 9 9 STEFA… 5.0 W 25 L 18 W 59 W 8 W 26 L 7 W 20 ON 1495…
## 10 10 ANVIT… 5.0 D 16 L 19 W 55 W 31 D 6 W 25 W 18 MI 1415…
## # … with 54 more rows, and 9 more variables: X13 <fct>, X14 <fct>,
## # X15 <fct>, X16 <fct>, X17 <fct>, X18 <fct>, X19 <fct>, X20 <fct>,
## # X21 <fct>
Lets calculate the number of opponent matches were played. While our dataset had a fixed 7 opponents. maybe in the future, a contest fewer or more match columns provided.
# Figure out how many opponent matches occurred
total_cols <- ncol(elo_df) # parsed columns found
fixed_cols <- 7 # "id", "name", "points", "state", "score", "num", "tail"
opponent_cols <- (total_cols - fixed_cols) / 2 # there are 2 columns per opponent
# Setup list of column names we'll appy to our DF
opp_ids <- list() # we'll also use this list later when cleaning up columns
opp_colors <- list() # we'll be deleting these columns late
# Build lists of opponent related columns
for (i in 1:opponent_cols) {
opp_ids <- c(opp_ids, paste("oppID_", i, sep=""))
opp_colors <- c(opp_colors, paste("oppColor_", i, sep=""))
}
# Now set column names on our DF
colnames(elo_df) <- c("id", "name", "points", opp_ids, "state", "score", "num", opp_colors, "trail")
names(elo_df)## [1] "id" "name" "points" "oppID_1" "oppID_2"
## [6] "oppID_3" "oppID_4" "oppID_5" "oppID_6" "oppID_7"
## [11] "state" "score" "num" "oppColor_1" "oppColor_2"
## [16] "oppColor_3" "oppColor_4" "oppColor_5" "oppColor_6" "oppColor_7"
## [21] "trail"
# drop unnecessary columns
keep <- c("id", "name", "points", "state", "score", unlist(opp_ids))
elo_df <- elo_df[keep]
names(elo_df)## [1] "id" "name" "points" "state" "score" "oppID_1" "oppID_2"
## [8] "oppID_3" "oppID_4" "oppID_5" "oppID_6" "oppID_7"
# Lets clean up our columns (fix datatypes and extract parts where necessary)
elo_df$id <- as.integer(lapply(elo_df$id, as.character))
elo_df$name <- unlist(lapply(elo_df$name, as.character))
elo_df$points <- as.numeric(elo_df$points)
# The players pre-competition score is buried in the $score string, use regex to pull out
elo_df$score <- unlist(lapply(elo_df$score, as.character))
elo_df$score <- as.integer(str_match(as.character(elo_df$score), ".*R: *(\\d+).*")[,2])
# Loop thru the opponent columns and extract the opponent ID's. We'll need those to look up and average opponent scores later
for (i in 1: length(opp_ids)) {
elo_df[[opp_ids[[i]]]] <- unlist(lapply(elo_df[[opp_ids[[i]]]], as.character))
elo_df[[opp_ids[[i]]]] <- as.integer(str_match(elo_df[[opp_ids[[i]]]], ".+ (\\d+)$")[,2])
}
str(elo_df)## Classes 'tbl_df', 'tbl' and 'data.frame': 64 obs. of 12 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ points : num 11 11 11 10 10 9 9 9 9 9 ...
## $ state : Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ score : int 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 ...
## $ oppID_1: int 39 63 8 23 45 34 57 3 25 16 ...
## $ oppID_2: int 21 58 61 28 37 29 46 32 18 19 ...
## $ oppID_3: int 18 4 25 2 12 11 13 14 59 55 ...
## $ oppID_4: int 14 17 21 26 13 35 11 9 8 31 ...
## $ oppID_5: int 7 16 11 5 4 10 1 47 26 6 ...
## $ oppID_6: int 12 20 13 19 14 27 9 28 7 25 ...
## $ oppID_7: int 4 7 12 1 17 21 2 19 20 18 ...
# I kow there is a way to do this vectorized, probably with nested lappy(), but after many hours of head banging, I decided loops were good enough for this dataset.
rate_cols <- list()
# Loop thru each oppID_* column
for (i in 1:length(opp_ids)) {
# Create the column name that 'll hold the opponent rating
rate_col <- paste("oppRating_", i, sep="")
rate_cols <- c(rate_cols, rate_col)
# Create a new rating column (seed with id, we'll replace this with rating in the next step)
elo_df[rate_col] <- elo_df[opp_ids[[i]]]
# Loop thru all cells in the new rating column, look up the score for the id, then overwrite
# note, I'm make sure the ID isn't larger than our row count as a safety value
for (r in 1:nrow(elo_df)) {
if(!is.na(elo_df[[r, rate_col]]) && elo_df[[r, rate_col]] <= nrow(elo_df)) {
elo_df[[r, rate_col]] <- as.integer(elo_df[elo_df$id==elo_df[[r, rate_col]], 'score'])
}
}
}
# with Opponent scores populated, the mean is easy. Note that I left NA's in place on purpose so I could do quick mean calcualtion with na.rm=TRUE.
elo_df$avg_opp_ratings <- rowMeans(elo_df[unlist(rate_cols)], na.rm=TRUE)
elo_df$avg_opp_ratings## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714 1372.143
## [8] 1468.429 1523.143 1554.143 1467.571 1506.167 1497.857 1515.000
## [15] 1483.857 1385.800 1498.571 1480.000 1426.286 1410.857 1470.429
## [22] 1300.333 1213.857 1357.000 1363.286 1506.857 1221.667 1522.143
## [29] 1313.500 1144.143 1259.857 1378.714 1276.857 1375.286 1149.714
## [36] 1388.167 1384.800 1539.167 1429.571 1390.571 1248.500 1149.857
## [43] 1106.571 1327.000 1152.000 1357.714 1392.000 1355.800 1285.800
## [50] 1296.000 1356.143 1494.571 1345.333 1206.167 1406.000 1414.400
## [57] 1363.000 1391.000 1319.000 1330.200 1327.286 1186.000 1350.200
## [64] 1263.000
## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714 1372.143
## [8] 1468.429 1523.143 1554.143 1467.571 1506.167 1497.857 1515.000
## [15] 1483.857 1385.800 1498.571 1480.000 1426.286 1410.857 1470.429
## [22] 1300.333 1213.857 1357.000 1363.286 1506.857 1221.667 1522.143
## [29] 1313.500 1144.143 1259.857 1378.714 1276.857 1375.286 1149.714
## [36] 1388.167 1384.800 1539.167 1429.571 1390.571 1248.500 1149.857
## [43] 1106.571 1327.000 1152.000 1357.714 1392.000 1355.800 1285.800
## [50] 1296.000 1356.143 1494.571 1345.333 1206.167 1406.000 1414.400
## [57] 1363.000 1391.000 1319.000 1330.200 1327.286 1186.000 1350.200
## [64] 1263.000
## # A tibble: 64 x 5
## name state points score avg_opp_ratings
## <chr> <fct> <dbl> <int> <dbl>
## 1 GARY HUA ON 11 1794 1605.
## 2 DAKSHESH DARURI MI 11 1553 1469.
## 3 ADITYA BAJAJ MI 11 1384 1564.
## 4 PATRICK H SCHILLING MI 10 1716 1574.
## 5 HANSHI ZUO MI 10 1655 1501.
## 6 HANSEN SONG OH 9 1686 1519.
## 7 GARY DEE SWATHELL MI 9 1649 1372.
## 8 EZEKIEL HOUGHTON MI 9 1641 1468.
## 9 STEFANO LEE ON 9 1411 1523.
## 10 ANVIT RAO MI 9 1365 1554.
## # … with 54 more rows