Project 1: collaboration with Jack Russo. Jack did most of the extraction; I wrangled the data and learned how to integrate Python and Bash into R projects. I also normalized the data for future feature extraction and for maintaining data integrity as either a .psv or a .csv file.
Jack submitted on time, and I’ve been late to compile the collaboration and notes.
#install.packages("reticulate")
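#install.packages("pandas")  # NOTE: pandas is a Python package, not an R package; install it with reticulate::py_install("pandas")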
# coding: utf-8
import pandas as pd
import re
import os
# Flatten the tournament table into one long line so it can be re-split later.
# Input: myFile (original multi-line text). Output: newFile (single line).
myFile = '/Users/alejandro/documents/R/DATA607/data/tournamentinfo.txt'
newFile = '/Users/alejandro/documents/R/DATA607/data/ti_2.txt'

# None means "never truncate column text". The former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# NOTE(review): df is not used by anything else in this chunk; kept so the
# name stays available to whatever inspects it afterwards.
df = pd.read_csv(myFile, header=None)

# Strip leading/trailing whitespace from every line and join the lines with a
# single space.
with open(myFile) as f:
    s = " ".join(line.strip() for line in f)

# Create the output directory when it is missing. The empty-string guard keeps
# os.makedirs() from failing if newFile is ever a bare file name.
dirname = os.path.dirname(newFile)
if dirname and not os.path.exists(dirname):
    os.makedirs(dirname)

# The with-block closes the handle on exit, so no explicit close() is needed.
with open(newFile, 'w') as file:
    file.write(s)
library(stringr)
# Path of the flattened, single-line tournament file.
newFile <- "/Users/alejandro/documents/R/DATA607/data/ti_2.txt"
#my_data <- read.table(eloFile, header = TRUE)
# warn = FALSE: readLines() reports "incomplete final line" for this file.
# That warning only concerns a missing trailing newline; the text itself is
# still read in full, so the warning is noise here.
dat <- readLines(newFile, warn = FALSE)
incomplete final line found on '/Users/alejandro/documents/R/DATA607/data/ti_2.txt'
#dat
# Step 1: replace a trailing punctuation mark and every run of 4-100 dashes
# with a newline. Step 2: delete runs of 2-20 consecutive spaces.
nu <- str_replace_all(
  str_replace_all(dat, "[[:punct:]]$|[-]{4,100}", "\n"),
  "[ ]{2,20}+",
  ""
)
# Discard empty strings, then cut what is left at the inserted newlines.
nu <- strsplit(nu[nu != ""], "\n")
#nu <- str_replace_all(nu, '\"', "")
# Wrap the pieces in a data frame and drop its first two rows.
df <- tail(data.frame(nu), -2)
# Remove every double-quote character from each column.
df <- as.data.frame(sapply(df, gsub, pattern = '\"', replacement = ""))
df
#head(df)
#write.table((str_extract(nu, "[[:space:]]+[[:digit:]]+[[:space:]][|]")), file = "/Users/alejandro/documents/R/DATA607/data/hmm.csv",row.names=FALSE, na="",col.names=TRUE, sep="|")
#(str_extract(nu, "^[[:space:]]+[[:digit:]]+[[:space:]][|]"))
# Persist the cleaned rows as bare text: no row names, no header, no quoting.
write.table(
  df,
  file = "/Users/alejandro/documents/R/DATA607/data/ti_3.csv",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)
# Read Data in
# Location of the cleaned file, then load it without treating the first line
# as a header and without converting strings to factors.
ti_3 <- "/Users/alejandro/documents/R/DATA607/data/ti_3.csv"
x <- ti_3
data_sans_header <- read.delim(
  file = x,
  header = FALSE,
  stringsAsFactors = FALSE
)
data_sans_header
# Find Player Names
# A name is any run of 15+ characters made up of upper-case letters, blanks
# and commas. The first hit is discarded, and entry 28 is overwritten by hand:
# the character class has no hyphen, so a hyphenated surname cannot be
# captured in one piece.
regex_names <- replace(
  unlist(
    str_extract_all(data_sans_header, "[[:upper:][:blank:][:upper:], ]{15,}")
  )[-1],
  28,
  "SOFIA ADINA STANESCU-BELLU"
)
print(length(regex_names))
[1] 64
# Find Player States of Origin
# Only the codes MI, ON and OH are recognised. Every alternative in the
# pattern also captures the whitespace next to the code (" MI ", " ON",
# " OH"), so the hits are trimmed to leave the bare two-letter code.
regex_states <- str_trim(
  unlist(
    str_extract_all(
      data_sans_header,
      "[:space:]MI[:space:]|[:space:]ON|[:space:]OH"
    )
  )
)
print(length(regex_states))
[1] 64
# Find Total Scores
# Inner pass: digit, any character, digit, then two blanks.
# Outer pass: keep only digit-punctuation-digit from each hit, which drops the
# trailing blanks and any hit whose middle character is not punctuation.
regex_scores <- unlist(
  str_extract_all(
    unlist(
      str_extract_all(
        data_sans_header,
        "[[:digit:]].[[:digit:]][:blank:][:blank:]"
      )
    ),
    "[:digit:][:punct:][:digit:]"
  )
)
print(length(regex_scores))
[1] 64
# Find Pre-Tournament Ratings
# Candidate = one whitespace character, three digits, then up to four more
# characters: the first two may each be a digit or "P", the last two digits.
regex_pre_tournament_rating <- unlist(
  str_extract_all(
    string = data_sans_header,
    pattern = "[[:space:]][[:digit:]][[:digit:]][[:digit:]][[:digit:][P]]?[[:digit:][P]]?[[:digit:]]?[[:digit:]]?"
  )
)
argument is not an atomic vector; coercing
# Drop the candidates at positions 117, 120, 123 and 126 in one step. Removing
# them one at a time from the highest index down, as before, targets the same
# four elements.
regex_pre_tournament_rating <-
  regex_pre_tournament_rating[-c(117, 120, 123, 126)]
# Keep every second remaining candidate (positions 2, 4, ..., 128).
v <- seq(2, 128, by = 2)
regex_pre_tournament_rating <- regex_pre_tournament_rating[v]
# Pull every run of three or four digits out of each candidate, which leaves
# out the leading whitespace and any "P" marker, then convert to numbers.
regex_pre_tournament_rating_sans_P <- unlist(
  str_extract_all(
    regex_pre_tournament_rating,
    "[[:digit:]][[:digit:]][[:digit:]][[:digit:]]?"
  )
)
Pre_Tournament_Rating <- as.numeric(regex_pre_tournament_rating_sans_P)
# Build Initial Data Frame
Queen_Takes_Bishop <- data.frame(
  Player = regex_names,
  Player_State = regex_states,
  Total_Player_Score = regex_scores,
  Pre_Tournament_Rating
)
Queen_Takes_Bishop
#
# Find Average Opponent Rating
# Take every third entry of the data, starting at position 5.
# NOTE(review): data_sans_header is created by read.delim(), i.e. it is a data
# frame, and single-bracket indexing on a data frame selects COLUMNS. Confirm
# the data really has that layout; otherwise index its rows instead.
v <- seq(from = 5, to = 196, by = 3)
opponent_lines <- data_sans_header[v]

# Preallocate the result and iterate with seq_along(): 1:length(x) would run
# for i = 1 and i = 0 on empty input.
Average_Opponent_Rating <- numeric(length(opponent_lines))
for (i in seq_along(opponent_lines)) {
  # Every whitespace character followed by a one- or two-digit number.
  opponent_lines1 <- unlist(
    str_extract_all(opponent_lines[i], "[:space:][[:digit:]]?[[:digit:]]")
  )
  # Discard the first hit (presumably the player's own number - TODO confirm).
  opponent_lines1 <- opponent_lines1[-1]
  # Strip the leading whitespace so only the digits remain.
  opponent_lines1 <- unlist(
    str_extract_all(opponent_lines1, "[[:digit:]]?[[:digit:]]")
  )
  opponents_line_1_vector <- as.numeric(opponent_lines1)
  # The numbers index into the rating column; average those ratings and round
  # to the nearest whole number.
  Average_Opponent_Rating[i] <- round(
    sum(Queen_Takes_Bishop$Pre_Tournament_Rating[opponents_line_1_vector]) /
      length(opponents_line_1_vector)
  )
}
# Complete Data Frame
# All five columns must have the same length before they can be combined.
# Check every one of them, not only the first three, so a mismatch in either
# rating vector reaches the diagnostic branch instead of failing inside
# data.frame().
column_lengths <- c(
  Player = length(regex_names),
  Player_State = length(regex_states),
  Total_Player_Score = length(regex_scores),
  Pre_Tournament_Rating = length(Pre_Tournament_Rating),
  Average_Opponent_Rating = length(Average_Opponent_Rating)
)
if (length(unique(column_lengths)) == 1L) {
  Queen_Takes_Bishop <- data.frame(
    Player = regex_names,
    Player_State = regex_states,
    Total_Player_Score = regex_scores,
    Pre_Tournament_Rating,
    Average_Opponent_Rating
  )
} else {
  # Leave the existing frame untouched and show which column is off.
  print(column_lengths)
}
Queen_Takes_Bishop
# Export to CSV
# Note! The file name is relative, so it is written to the current working
# directory (normally the R project directory).
# row.names = FALSE keeps the automatic 1..n row index out of the file, so the
# CSV holds only the data columns.
write.csv(
  Queen_Takes_Bishop,
  file = "Chess_Tournament_Data.csv",
  row.names = FALSE
)
Lessons Learned: I will try to divide every group project into data ingestion | ETL | indexing / visualizing.
Optional parts would be analytics and system.
TODO: create a team git repo and combine code together through git; create automated R tests; learn to write boilerplate functions to collaborate and test efficiently.
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKUHJvamVjdCAxIENvbGxhYm9yYXRpb24gd2l0aCBKYWNrIFJ1c3NvLiAgSmFjayBkaWQgbW9zdCBvZiB0aGUgZXh0cmFjdGlvbiwgYW5kIEkgd3JhbmdsZWQgdGhlIGRhdGEgYW5kIGxlYXJuZWQgaG93IHRvIGludGVncmF0ZQpweXRob24gYW5kIGJhc2ggaW50byBSIHByb2plY3RzLiAgQWxzbyBub3JtYWxpemVkIHRoZSBkYXRhIGZvciBmdXR1cmUgZmVhdHVyZSBleHRyYWN0aW9uIGFuZCBtYWludGluZyBkYXRhIGludGVncml0eSBhcwplaXRoZXIgYSAucHN2IG9yIC5jc3YKCkphY2sgc3VibWl0dGVkIG9uIHRpbWUsIGFuZCBJJ3ZlIGJlZW4gbGF0ZSB0byBjb21waWxlIHRoZSBjb2xsYWJvcmF0aW9uIGFuZCBub3Rlcy4KCmBgYHtyIGluc3RhbGx9CiNpbnN0YWxsLnBhY2thZ2VzKCJyZXRpY3VsYXRlIikKI2luc3RhbGwucGFja2FnZXMoInBhbmRhcyIpCmBgYApgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0KbGlicmFyeShyZXRpY3VsYXRlKQp1c2VfdmlydHVhbGVudigici1yZXRpY3VsYXRlIikKCiN1c2VfcHl0aG9uKCcvdXNyL2xvY2FsL2Jpbi9weXRob24yNycsIHJlcXVpcmVkID0gVFJVRSkKYGBgCgpgYGB7cHl0aG9ufQojIGNvZGluZzogdXRmLTgKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcmUKaW1wb3J0IG9zCgpteUZpbGUgID0gJ2FsZWphbmRyb25pY3VsZXNjdS9jdW55L0RBVEE2MDcvdG91cm5hbWVudGluZm8udHh0JwpuZXdGaWxlID0gJy9Vc2Vycy9hbGVqYW5kcm8vZG9jdW1lbnRzL1IvREFUQTYwNy9kYXRhL3RpXzIudHh0JyAKCnBkLnNldF9vcHRpb24oJ2Rpc3BsYXkubWF4X2NvbHdpZHRoJywgLTEpCmRmID0gcGQucmVhZF9jc3YobXlGaWxlLCBoZWFkZXIgPSBOb25lKQoKcyA9ICIiCgp3aXRoIG9wZW4obXlGaWxlKSBhcyBmOgogICAgcyA9ICIgIi5qb2luKGxpbmUuc3RyaXAoKSBmb3IgbGluZSBpbiBmKSAKCmRpcm5hbWUgPSBvcy5wYXRoLmRpcm5hbWUobmV3RmlsZSkKaWYgbm90IG9zLnBhdGguZXhpc3RzKGRpcm5hbWUpOgogICAgb3MubWFrZWRpcnMoZGlybmFtZSkKd2l0aCBvcGVuKG5ld0ZpbGUsICd3JykgYXMgZmlsZToKICBmaWxlLndyaXRlKHMpCgpmaWxlLmNsb3NlKCkKYGBgCgpgYGB7ciBsb2FkLWRhdGF9CmxpYnJhcnkoc3RyaW5ncikKCm5ld0ZpbGUgPSAnL1VzZXJzL2FsZWphbmRyby9kb2N1bWVudHMvUi9EQVRBNjA3L2RhdGEvdGlfMi50eHQnIAoKI215X2RhdGEgPC0gcmVhZC50YWJsZShlbG9GaWxlLCBoZWFkZXIgPSBUUlVFKQpkYXQgPC0gcmVhZExpbmVzKG5ld0ZpbGUpCiNkYXQKbnUgPC0gc3RyX3JlcGxhY2VfYWxsKGRhdCwgIltbOnB1bmN0Ol1dJHxbLV17NCwxMDB9IiwgIlxuIikKbnUgPC0gc3RyX3JlcGxhY2VfYWxsKG51LCAiWyBdezIsMjB9KyIsICIiKQoKbnUgPC0gbnVbbnUgIT0gIiJdCgpudSA8LSBzdHJzcGxpdChudSwgIlxuIikKCiNudSA8LSBzdHJfcmVwbGFjZV9hbGwobnUs
ICdcIicsICIiKQoKZGYgPC0gZGF0YS5mcmFtZShudSkKCmRmIDwtIHRhaWwoZGYsIC0yKQoKZGYgPC0gYXMuZGF0YS5mcmFtZShzYXBwbHkoZGYsIGZ1bmN0aW9uKHgpIGdzdWIoJ1wiJywgIiIsIHgpKSkKCmRmCgojaGVhZChkZikKI3dyaXRlLnRhYmxlKChzdHJfZXh0cmFjdChudSwgIltbOnNwYWNlOl1dK1tbOmRpZ2l0Ol1dK1tbOnNwYWNlOl1dW3xdIikpLCBmaWxlID0gIi9Vc2Vycy9hbGVqYW5kcm8vZG9jdW1lbnRzL1IvREFUQTYwNy9kYXRhL2htbS5jc3YiLHJvdy5uYW1lcz1GQUxTRSwgbmE9IiIsY29sLm5hbWVzPVRSVUUsIHNlcD0ifCIpCgojKHN0cl9leHRyYWN0KG51LCAiXltbOnNwYWNlOl1dK1tbOmRpZ2l0Ol1dK1tbOnNwYWNlOl1dW3xdIikpCgp3cml0ZS50YWJsZShkZiwgZmlsZSA9ICIvVXNlcnMvYWxlamFuZHJvL2RvY3VtZW50cy9SL0RBVEE2MDcvZGF0YS90aV8zLmNzdiIsIHJvdy5uYW1lcz1GQUxTRSwgY29sLm5hbWVzPUZBTFNFLCBxdW90ZT1GQUxTRSkKYGBgCmBgYHtyIHJlYWQtY3N2fQp0aV8zID0gIi9Vc2Vycy9hbGVqYW5kcm8vZG9jdW1lbnRzL1IvREFUQTYwNy9kYXRhL3RpXzMuY3N2IgoKIyBSZWFkIERhdGEgaW4KeCA8LSAodGlfMykKZGF0YV9zYW5zX2hlYWRlciA8LSByZWFkLmRlbGltKHgsIGhlYWRlciA9IEZBTFNFLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpCmBgYAoKYGBge3IgcGxheWVyLW5hbWVzfQojIEZpbmQgUGxheWVyIE5hbWVzCnJlZ2V4X25hbWVzIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwoZGF0YV9zYW5zX2hlYWRlciwgIltbOnVwcGVyOl1bOmJsYW5rOl1bOnVwcGVyOl0sIF17MTUsfSIgKSkKcmVnZXhfbmFtZXMgPC0gcmVnZXhfbmFtZXNbLTFdCnJlZ2V4X25hbWVzWzI4XSA8LSAiU09GSUEgQURJTkEgU1RBTkVTQ1UtQkVMTFUiCnByaW50KGxlbmd0aChyZWdleF9uYW1lcykpCmBgYAoKYGBge3J9CiNKUgojIEZpbmQgUGxheWVyIFN0YXRlcyBvZiBPcmlnaW4KcmVnZXhfc3RhdGVzIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwoZGF0YV9zYW5zX2hlYWRlciwgIls6c3BhY2U6XU1JWzpzcGFjZTpdfFs6c3BhY2U6XU9OfFs6c3BhY2U6XU9IIiApKQpwcmludChsZW5ndGgocmVnZXhfc3RhdGVzKSkKYGBgCgpgYGB7cn0KI0pSCiMgRmluZCBUb3RhbCBTY29yZXMKcmVnZXhfc2NvcmVzIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwoZGF0YV9zYW5zX2hlYWRlciwgIltbOmRpZ2l0Ol1dLltbOmRpZ2l0Ol1dWzpibGFuazpdWzpibGFuazpdIiApKQpyZWdleF9zY29yZXMgPC0gdW5saXN0KHN0cl9leHRyYWN0X2FsbChyZWdleF9zY29yZXMsIls6ZGlnaXQ6XVs6cHVuY3Q6XVs6ZGlnaXQ6XSIpKQpwcmludChsZW5ndGgocmVnZXhfc2NvcmVzKSkKYGBgCgpgYGB7cn0KIyBGaW5kIFByZS1Ub3VybmFtZW50IFJhdGluZ3MKcmVnZXhfcHJlX3RvdXJuYW1lbnRfcmF0aW5nIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwoZGF0YV9zYW5zX2hlYWRlciwgIltbOnNwYWNlOl1dW1s6ZGln
aXQ6XV1bWzpkaWdpdDpdXVtbOmRpZ2l0Ol1dW1s6ZGlnaXQ6XVtQXV0/W1s6ZGlnaXQ6XVtQXV0/W1s6ZGlnaXQ6XV0/W1s6ZGlnaXQ6XV0/IikpCnJlZ2V4X3ByZV90b3VybmFtZW50X3JhdGluZyA8LSByZWdleF9wcmVfdG91cm5hbWVudF9yYXRpbmdbLTEyNl0KcmVnZXhfcHJlX3RvdXJuYW1lbnRfcmF0aW5nIDwtIHJlZ2V4X3ByZV90b3VybmFtZW50X3JhdGluZ1stMTIzXQpyZWdleF9wcmVfdG91cm5hbWVudF9yYXRpbmcgPC0gcmVnZXhfcHJlX3RvdXJuYW1lbnRfcmF0aW5nWy0xMjBdCnJlZ2V4X3ByZV90b3VybmFtZW50X3JhdGluZyA8LSByZWdleF9wcmVfdG91cm5hbWVudF9yYXRpbmdbLTExN10KdiA8LSBzZXEoZnJvbSA9IDIsIHRvPSAxMjgsIGJ5ID0gMikKcmVnZXhfcHJlX3RvdXJuYW1lbnRfcmF0aW5nIDwtIHJlZ2V4X3ByZV90b3VybmFtZW50X3JhdGluZ1t2XQpyZWdleF9wcmVfdG91cm5hbWVudF9yYXRpbmdfc2Fuc19QIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwocmVnZXhfcHJlX3RvdXJuYW1lbnRfcmF0aW5nLCAiW1s6ZGlnaXQ6XV1bWzpkaWdpdDpdXVtbOmRpZ2l0Ol1dW1s6ZGlnaXQ6XV0/IikpClByZV9Ub3VybmFtZW50X1JhdGluZyA8LSBhcy5udW1lcmljKHJlZ2V4X3ByZV90b3VybmFtZW50X3JhdGluZ19zYW5zX1ApCmBgYAoKYGBge3J9CiNKUgojIEJ1aWxkIEluaXRpYWwgRGF0YSBGcmFtZQpRdWVlbl9UYWtlc19CaXNob3AgPC0gZGF0YS5mcmFtZShQbGF5ZXIgPSByZWdleF9uYW1lcywgUGxheWVyX1N0YXRlID0gcmVnZXhfc3RhdGVzLCBUb3RhbF9QbGF5ZXJfU2NvcmUgPSByZWdleF9zY29yZXMsIFByZV9Ub3VybmFtZW50X1JhdGluZykKUXVlZW5fVGFrZXNfQmlzaG9wCmBgYApgYGB7cn0KIyBGaW5kIEF2ZXJhZ2UgT3Bwb25lbnQgUmF0aW5nIAp2IDwtIHNlcShmcm9tID0gNSwgdG89IDE5NiwgYnkgPSAzKQpvcHBvbmVudF9saW5lcyA8LSBkYXRhX3NhbnNfaGVhZGVyW3ZdIApBdmVyYWdlX09wcG9uZW50X1JhdGluZyA8LSBOVUxMCgpmb3IgKGkgaW4gMTpsZW5ndGgob3Bwb25lbnRfbGluZXMpKXsKICBvcHBvbmVudF9saW5lczEgPC0gdW5saXN0KHN0cl9leHRyYWN0X2FsbChvcHBvbmVudF9saW5lc1tpXSwgIls6c3BhY2U6XVtbOmRpZ2l0Ol1dP1tbOmRpZ2l0Ol1dIiApKQogIG9wcG9uZW50X2xpbmVzMSA8LSBvcHBvbmVudF9saW5lczFbLTFdCiAgb3Bwb25lbnRfbGluZXMxIDwtIHVubGlzdChzdHJfZXh0cmFjdF9hbGwob3Bwb25lbnRfbGluZXMxLCAiW1s6ZGlnaXQ6XV0/W1s6ZGlnaXQ6XV0iICkpCiAgb3Bwb25lbnRzX2xpbmVfMV92ZWN0b3IgPC0gYXMubnVtZXJpYyhvcHBvbmVudF9saW5lczEpIAogIEF2ZXJhZ2VfT3Bwb25lbnRfUmF0aW5nW2ldIDwtIHJvdW5kKHN1bShRdWVlbl9UYWtlc19CaXNob3AkUHJlX1RvdXJuYW1lbnRfUmF0aW5nW29wcG9uZW50c19saW5lXzFfdmVjdG9yXSkvbGVuZ3RoKG9wcG9uZW50c19saW5lXzFfdmVjdG9yKSkKfQpgYGAKCgoKYGBge3J9CiMgQ29t
cGxldGUgRGF0YSBGcmFtZQppZiAobGVuZ3RoKHJlZ2V4X25hbWVzKSA9PSBsZW5ndGgocmVnZXhfc3RhdGVzKSAmJiBsZW5ndGgocmVnZXhfc2NvcmVzKSA9PSBsZW5ndGgocmVnZXhfc3RhdGVzKSkgewpRdWVlbl9UYWtlc19CaXNob3AgPC0gZGF0YS5mcmFtZShQbGF5ZXIgPSByZWdleF9uYW1lcywgUGxheWVyX1N0YXRlID0gcmVnZXhfc3RhdGVzLCBUb3RhbF9QbGF5ZXJfU2NvcmUgPSByZWdleF9zY29yZXMsIFByZV9Ub3VybmFtZW50X1JhdGluZywgQXZlcmFnZV9PcHBvbmVudF9SYXRpbmcpCn0gZWxzZSB7CiAgcHJpbnQobGVuZ3RoKHJlZ2V4X25hbWVzKSkKICBwcmludChsZW5ndGgocmVnZXhfc3RhdGVzKSkKICBwcmludChsZW5ndGgocmVnZXhfc2NvcmVzKSkKICB9ClF1ZWVuX1Rha2VzX0Jpc2hvcApgYGAKCmBgYHtyfQojIEV4cG9ydCB0byBDU1YKIyBOb3RlISBXaWxsIGV4cG9ydCB0byBkaXJlY3Rvcnkgd2hlcmUgUiBwcm9qZWN0IGlzIHN0b3JlZC4gVXNlIHNldHdkKCkgdG8gY2hhbmdlIHRvIGRlc2lyZWQgZGlyZWN0b3J5Lgp3cml0ZS5jc3YoUXVlZW5fVGFrZXNfQmlzaG9wLCBmaWxlID0gIkNoZXNzX1RvdXJuYW1lbnRfRGF0YS5jc3YiKQpgYGAKCkxlc3NvbnMgTGVhcm5lZApXaWxsIHRyeSBhbmQgZGl2aWRlIGV2ZXJ5IGdyb3VwIHByb2plY3QgaW50byBkYXRhIGluZ2VzdGlvbiB8IEVUTCB8IGFuZCBpbmRleGluZyAvIHZpenVhbGl6aW5nLiAgCgpPcHRpb25hbCBwYXJ0cyB3b3VsZCBiZSBhbmF5bGl0Y3MgYW5kIHN5c3RlbQoKKlRPRE8qCmNyZWF0ZSBhIHRlYW0gZ2l0IHJlcG8gYW5kIGNvbWJpbmUgY29kZSB0b2dldGhlciB0aHJvdWdoIGdpdApjcmVhdGUgUiBhdXRvbWF0ZWQgdGVzdHMgCmxlYXJuIHRvIHdyaXRlIGJvaWxlciBwbGF0ZSBmdW5jdGlvbnMgdG8gY29sbGFib3JhdGUgYW5kIHRlc3QgZWZmaWNpZW50bHkKCgo=