In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of opp_players
For the first player, the information would be:
Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opp_players’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
# Initial Setup - Load required libraries
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(DT)
library(dplyr)
library(htmltools)Tournament data is located at remote location in a text file, top 4 lines are not relevant in the file so initil load will ignore these.
# Load Raw tournament data
dataURL='https://raw.githubusercontent.com/rnivas2028/MSDS/Data607/Project1/tournamentinfo.txt'
tournament_results <- read.table(url(dataURL), skip=4, header = FALSE, sep = ",", strip.white=T)
head(tournament_results)## V1
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 2 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 3 -----------------------------------------------------------------------------------------
## 4 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 5 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 6 -----------------------------------------------------------------------------------------
Below regular expressions will be used to extract expected data set from tournament data load
# To extract Pair Number
reg_ex_pair_num <- "\\d{1,2}(?=\\s\\|)"
# To extract Player's Name
reg_ex_player_name <- "\\w+[[:space:]]\\w+([[:space:]]\\w+)*"
# To extract Player's State
reg_ex_player_state <- "([[:upper:]]){2}\\s(?=\\|)"
# To extract Player's Point
reg_ex_players_point <- "\\d\\.\\d"
# To extract Player's Rating
reg_ex_pre_rating <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
# To extract Player's opp_players
reg_ex_players_opp_players <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"Chess tournament results are in textual format. Information extraction need parsing of row data and extraction or information using regular expressions. Below data points will be extracted from this data set:
* Player’s Name
* Player’s State
* Total Number of Points
* Player’s Pre-Rating
This step will extract players name using regular expresssion, there are 64 players
players <- unlist(str_extract_all(unlist(tournament_results), reg_ex_player_name))This step will extract players name using regular expression, there are 64 players
players_state <- unlist(str_extract_all(unlist(tournament_results), reg_ex_player_state))Number of players from these states is below
table(players_state)## players_state
## MI OH ON
## 55 1 8
Player’s Point is extracted in textual form and converted to numeric format
players_point <- unlist(str_extract_all(unlist(tournament_results), reg_ex_players_point))
as.numeric(players_point)## [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0
Pair Number is extracted in textual form and converted to numeric format
pair_number <- unlist(str_extract_all(unlist(tournament_results), reg_ex_pair_num))
as.numeric(pair_number)## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64
Players pre rating is extracted in textual form and converted to numeric number
players_pre_rating <- unlist(str_extract_all(unlist(tournament_results), reg_ex_pre_rating))
as.numeric(players_pre_rating)## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332 1283 1199 1242
## [46] 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153 1092 917 853 967
## [61] 955 1530 1175 1163
Player’s opp_players are extracted using from given information in the file
opp_players <- unlist(str_extract_all(unlist(tournament_results), reg_ex_players_opp_players))Player’s opp_players are extracted using from given information in the file
# Extract and clean opponent players
opp_player1 <- as.numeric(opp_players[seq(4, length(opp_players), 10)])
opp_player1 <- as.numeric(opp_player1[seq(1, length(opp_player1), 2)])
opp_player2 <- as.numeric(opp_players[seq(5, length(opp_players), 10)])
opp_player2 <- as.numeric(opp_player2[seq(1, length(opp_player2), 2)])
opp_player3 <- as.numeric(opp_players[seq(6, length(opp_players), 10)])
opp_player3 <- as.numeric(opp_player3[seq(1, length(opp_player3), 2)])
opp_player4 <- as.numeric(opp_players[seq(7, length(opp_players), 10)])
opp_player4 <- as.numeric(opp_player4[seq(1, length(opp_player4), 2)])
opp_player5 <- as.numeric(opp_players[seq(8, length(opp_players), 10)])
opp_player5 <- as.numeric(opp_player5[seq(1, length(opp_player5), 2)])
opp_player6 <- as.numeric(opp_players[seq(9, length(opp_players), 10)])
opp_player6 <- as.numeric(opp_player6[seq(1, length(opp_player6), 2)])
opp_player7 <- as.numeric(opp_players[seq(10, length(opp_players), 10)])
opp_player7 <- as.numeric(opp_player7[seq(1, length(opp_player7), 2)])
# Create a matrix with opp_player information
opp_players <- matrix(c(opp_player1, opp_player2, opp_player3,
opp_player4, opp_player5, opp_player6, opp_player7),
nrow = 64, ncol = 7)Player’s opp_players are matched using players id to extract opp_players in the collection below
Average Pre Chess Rating of opp_players is calculated by taking mean of opp_players Pre Chess Rating
avg_pre_chess_rating <- 0
for (i in 1:(length(pair_number)))
{
avg_pre_chess_rating[i] <- mean(as.numeric(players_pre_rating[opp_players[i,]]), na.rm = T)
}Lets create a data frame with - Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of opp_players
tournament_data_result <- 0
tournament_data_result <- data.frame(players, players_state, players_point, players_pre_rating, avg_pre_chess_rating)Lets visualize tournament data results
Plot average pre chess rating with players points
ggplot(data = tournament_data_result,
aes(x = avg_pre_chess_rating, y = players_point, color=players_state,
shape=players_state)) +
geom_point()+xlab("Average pre rating")+ylab("Player's point")+
ggtitle("Player's Points and opp_player's average pre rating")Plot Player count by state
ggplot(data = tournament_data_result) +
geom_bar(mapping = aes(x = players_point, fill = players_state))+
xlab("Player's Point")+ylab("Player Count")+
ggtitle("Player count by state")Plot opp_players Average pre rating with Player’s point
ggplot(data = tournament_data_result, aes(x = players_point, y = avg_pre_chess_rating)) +
geom_boxplot()+ylab("Average pre rating")+xlab("Player's point")Create data table with search and navigation capabilities
data_table<- data.frame(players, players_state, players_point, players_pre_rating,
round(avg_pre_chess_rating,2))
names(data_table)[0] <- "Player Id"
names(data_table)[1] <- "Name"
names(data_table)[2] <- "State"
names(data_table)[3] <- "Point"
names(data_table)[4] <- "Pre Rating"
names(data_table)[5] <- "Average Pre Rating of Opponent players"
datatable(data_table, filter = 'bottom',
options = list(pageLength = 10,searching = TRUE,filter=TRUE, pageLength = TRUE))This step will write result data to a .CSV file
write.csv(tournament_data_result, file = "tournament_data_result.csv")These instructions can be used to import .CSV file into MySQL database
DROP DATABASE IF EXISTS movie_ratings;
CREATE DATABASE `tournament_data_results`
USE tournament_data_results;
DROP TABLE IF EXISTS tournament_result;
CREATE TABLE `tournament_result` (
`player_id` int NOT NULL,
`player_name` varchar(100) DEFAULT NULL,
`players_state` varchar(10) DEFAULT NULL,
`players_point` float(10,2) DEFAULT NULL,
`players_pre_rating` float(10,2) DEFAULT NULL,
`avg_pre_chess_rating` float(10,2) DEFAULT NULL,
`addedOn` datetime DEFAULT NULL,
`addedBy` varchar(100) DEFAULT NULL,
`updatedOn` datetime DEFAULT NULL,
`updatedBy` varchar(100) DEFAULT NULL,
PRIMARY KEY (`player_id`)
);
LOAD DATA INFILE '<<File Location>>\\tournament_data_results.csv'
INTO TABLE tournament_result
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
library (readr)
library(aws.s3)
bucketlist(key = <AWS Key>, secret = <AWS Secret>)
# Upload files to S3 bucket
put_object(
file = file.path(tempdir(), "tournament_data_result.csv"),
object = "tournament_data_result.csv",
bucket = "msds-data607"
)
# Get buckets and contents
get_bucket(bucket = "msds-data607")
# Display contents
s3read_using(FUN = read.csv, bucket = "msds-data607", object = "tournament_data_result.csv")
s3read_using(FUN = read.csv, object = "s3://msds-data607/tournament_data_result.csv")
s3read_using(FUN = data.table::tournament_data_result,
object = "s3://msds-data607/tournament_data_result.csv")