txt file to an easier to manipulate data frame. The below image is the current structure of the txt file and the below code will convert it to the desired data frame.library(stringr)
library(sqldf)
library(knitr)
library(tcltk)
tinfo <- file(github.url, open="r" )
chess.data <- readLines(tinfo) # read lines
chess.data <- data.frame(chess.data, do.call(rbind, strsplit(chess.data, split = "|", fixed = TRUE)))
chess.data <- data.frame(lapply(chess.data, as.character), stringsAsFactors=FALSE)
chess.data <- chess.data[!grepl(".+--.+", chess.data$X1),] # removes "------" lines
chess.data <- chess.data[,-c(1,12)]
combine_rows <- function(dataframe){
df <- dataframe[0,] ##create empty dataframe copy
for (i in colnames(dataframe)){ ## for all columns in data frame
for (j in seq(1:length(dataframe[[i]]))){ ## for all rows in dataframe
df[j,i] <- gsub("\\s+"," ",(paste(dataframe[[i]][j:(j+1)], collapse = "")))
## gsub removes extra spaces, then collapses two rows
}
}
return(df) # return the dataframe
}
chess.data <- (combine_rows(chess.data)) # run function on data frame
colnames(chess.data) <- chess.data[1,] # create column names from first row
chess.data <- chess.data[-(1:2),] # remove header information
index <- !is.na(as.numeric(gsub( " .*$", "", str_trim(chess.data[,1])))) #gets the part of string, if a numeric value first it converts to True
chess.data <- chess.data[index,]
chess.data$ID <- str_trim(str_extract(chess.data[,1], "\\s[[0-9]]{1,}"))
chess.data$State <- str_trim(str_extract(chess.data[,1], "\\s[^[0-9]].+"))
chess.data$Name <- str_extract(chess.data[,2], ".[^[0-9]]{1,}")
chess.data$Pre_score <- str_extract(str_extract(chess.data[,2], ":.[[0-9]]{1,}.[^[-]]"), "[[0-9]]{1,}")
chess.data$Total_points <- str_extract(chess.data[,3], "[[0-9]].[[0-9]]\\s")
split.round.columns <- function(dataframe){
df.colname <- colnames(dataframe[!is.na(str_extract(colnames(dataframe), "Round.+"))])
df <- dataframe[!is.na(str_extract(colnames(dataframe), "Round.+"))]
df_prescore <- dataframe[!is.na(str_extract(colnames(dataframe), "^[[ID]]|Pre_score"))]
df.1 <- df.2 <- df.3 <- df[0,] #empty dataframe copies
for (i in df.colname){
for (j in seq(1:length(dataframe[[i]]))){
df.1[j,i] <- str_extract(dataframe[[i]][j], "[[:alpha:]]")
df.2[j,i] <- str_extract(dataframe[[i]][j], "[[0-9]]{1,}")
df.3[j,i] <- as.numeric(ifelse(!is.na(df.2[j,i]),
dataframe$Pre_score[dataframe$ID == df.2[j,i]],
NA))
}
}
colnames(df.1) <- str_c(str_trim(df.colname), " Results")
colnames(df.2) <- str_c(str_trim(df.colname), " Opponent")
colnames(df.3) <- str_c(str_trim(df.colname), " Opponent Prescore")
for (t in (seq(1:ncol(df)))){
dataframe <- cbind(dataframe, df.1[, t, drop = FALSE])
dataframe <- cbind(dataframe, df.2[, t, drop = FALSE])
dataframe <- cbind(dataframe, df.3[, t, drop = FALSE])
}
return(dataframe)
}
chess.data <- split.round.columns(chess.data)
chess.data <- chess.data[,-c(1:10)] # remove original columns
x <- t(head(chess.data, n = 4))
colnames(x) <- x[1, ]
x = x[-1,]
kable(x, align = "c", padding = 4) #check dataframe
| 1 | 2 | 3 | 4 | |
|---|---|---|---|---|
| State | ON | MI | MI | MI |
| Name | GARY HUA | DAKSHESH DARURI | ADITYA BAJAJ | PATRICK H SCHILLING |
| Pre_score | 1794 | 1553 | 1384 | 1716 |
| Total_points | 6.0 | 6.0 | 6.0 | 5.5 |
| Round 1 Results | W | W | L | W |
| Round 1 Opponent | 39 | 63 | 8 | 23 |
| Round 1 Opponent Prescore | 1436 | 1175 | 1641 | 1363 |
| Round 2 Results | W | W | W | D |
| Round 2 Opponent | 21 | 58 | 61 | 28 |
| Round 2 Opponent Prescore | 1563 | 917 | 955 | 1507 |
| Round 3 Results | W | L | W | W |
| Round 3 Opponent | 18 | 4 | 25 | 2 |
| Round 3 Opponent Prescore | 1600 | 1716 | 1745 | 1553 |
| Round 4 Results | W | W | W | W |
| Round 4 Opponent | 14 | 17 | 21 | 26 |
| Round 4 Opponent Prescore | 1610 | 1629 | 1563 | 1579 |
| Round 5 Results | W | W | W | D |
| Round 5 Opponent | 7 | 16 | 11 | 5 |
| Round 5 Opponent Prescore | 1649 | 1604 | 1712 | 1655 |
| Round 6 Results | D | W | W | W |
| Round 6 Opponent | 12 | 20 | 13 | 19 |
| Round 6 Opponent Prescore | 1663 | 1595 | 1666 | 1564 |
| Round 7 Results | D | W | W | D |
| Round 7 Opponent | 4 | 7 | 12 | 1 |
| Round 7 Opponent Prescore | 1716 | 1649 | 1663 | 1794 |
get.average.opp.prescore <- function(dataframe){
df_prescore <- dataframe[!is.na(str_extract(colnames(dataframe), "[[ID]]|Prescore"))]
df_prescore[] <- lapply(df_prescore, function(columns) as.numeric(columns))
df_prescore$avg_prescore <- rowMeans(df_prescore[c(2:(ncol(df_prescore)))], na.rm = TRUE)
dataframe <- df_prescore[c(1, (ncol(df_prescore)))]
return(dataframe)
}
ID.AverOPscore <- get.average.opp.prescore(chess.data)
kable(head(ID.AverOPscore), row.names = FALSE, align = "c")
| ID | avg_prescore |
|---|---|
| 1 | 1605.286 |
| 2 | 1469.286 |
| 3 | 1563.571 |
| 4 | 1573.571 |
| 5 | 1500.857 |
| 6 | 1518.714 |
Player's Name, Player's State, Total Number of Points, Player's Pre-Rating, and Average Pre Chess Rating of Opponents.results <- sqldf("SELECT Name `Player's Name`, State `Player's State`,
Total_points `Total Number of Points`, Pre_score `Player's Pre-Rating`,
avg_prescore `Average Pre Chess Rating of Opponents`
FROM 'chess.data' C LEFT OUTER JOIN 'ID.AverOPscore' I on C.ID = I.ID")
kable(results, align = "c")
| Player’s Name | Player’s State | Total Number of Points | Player’s Pre-Rating | Average Pre Chess Rating of Opponents |
|---|---|---|---|---|
| GARY HUA | ON | 6.0 | 1794 | 1605.286 |
| DAKSHESH DARURI | MI | 6.0 | 1553 | 1469.286 |
| ADITYA BAJAJ | MI | 6.0 | 1384 | 1563.571 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1573.571 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1500.857 |
| HANSEN SONG | OH | 5.0 | 1686 | 1518.714 |
| GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372.143 |
| EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468.429 |
| STEFANO LEE | ON | 5.0 | 1411 | 1523.143 |
| ANVIT RAO | MI | 5.0 | 1365 | 1554.143 |
| CAMERON WILLIAM MC LEMAN | MI | 4.5 | 1712 | 1467.571 |
| KENNETH J TACK | MI | 4.5 | 1663 | 1506.167 |
| TORRANCE HENRY JR | MI | 4.5 | 1666 | 1497.857 |
| BRADLEY SHAW | MI | 4.5 | 1610 | 1515.000 |
| ZACHARY JAMES HOUGHTON | MI | 4.5 | 1220 | 1483.857 |
| MIKE NIKITIN | MI | 4.0 | 1604 | 1385.800 |
| RONALD GRZEGORCZYK | MI | 4.0 | 1629 | 1498.571 |
| DAVID SUNDEEN | MI | 4.0 | 1600 | 1480.000 |
| DIPANKAR ROY | MI | 4.0 | 1564 | 1426.286 |
| JASON ZHENG | MI | 4.0 | 1595 | 1410.857 |
| DINH DANG BUI | ON | 4.0 | 1563 | 1470.429 |
| EUGENE L MCCLURE | MI | 4.0 | 1555 | 1300.333 |
| ALAN BUI | ON | 4.0 | 1363 | 1213.857 |
| MICHAEL R ALDRICH | MI | 4.0 | 1229 | 1357.000 |
| LOREN SCHWIEBERT | MI | 3.5 | 1745 | 1363.286 |
| MAX ZHU | ON | 3.5 | 1579 | 1506.857 |
| GAURAV GIDWANI | MI | 3.5 | 1552 | 1221.667 |
| SOFIA ADINA STANESCU-BELLU | MI | 3.5 | 1507 | 1522.143 |
| CHIEDOZIE OKORIE | MI | 3.5 | 1602 | 1313.500 |
| GEORGE AVERY JONES | ON | 3.5 | 1522 | 1144.143 |
| RISHI SHETTY | MI | 3.5 | 1494 | 1259.857 |
| JOSHUA PHILIP MATHEWS | ON | 3.5 | 1441 | 1378.714 |
| JADE GE | MI | 3.5 | 1449 | 1276.857 |
| MICHAEL JEFFERY THOMAS | MI | 3.5 | 1399 | 1375.286 |
| JOSHUA DAVID LEE | MI | 3.5 | 1438 | 1149.714 |
| SIDDHARTH JHA | MI | 3.5 | 1355 | 1388.167 |
| AMIYATOSH PWNANANDAM | MI | 3.5 | 980 | 1384.800 |
| BRIAN LIU | MI | 3.0 | 1423 | 1539.167 |
| JOEL R HENDON | MI | 3.0 | 1436 | 1429.571 |
| FOREST ZHANG | MI | 3.0 | 1348 | 1390.571 |
| KYLE WILLIAM MURPHY | MI | 3.0 | 1403 | 1248.500 |
| JARED GE | MI | 3.0 | 1332 | 1149.857 |
| ROBERT GLEN VASEY | MI | 3.0 | 1283 | 1106.571 |
| JUSTIN D SCHILLING | MI | 3.0 | 1199 | 1327.000 |
| DEREK YAN | MI | 3.0 | 1242 | 1152.000 |
| JACOB ALEXANDER LAVALLEY | MI | 3.0 | 377 | 1357.714 |
| ERIC WRIGHT | MI | 2.5 | 1362 | 1392.000 |
| DANIEL KHAIN | MI | 2.5 | 1382 | 1355.800 |
| MICHAEL J MARTIN | MI | 2.5 | 1291 | 1285.800 |
| SHIVAM JHA | MI | 2.5 | 1056 | 1296.000 |
| TEJAS AYYAGARI | MI | 2.5 | 1011 | 1356.143 |
| ETHAN GUO | MI | 2.5 | 935 | 1494.571 |
| JOSE C YBARRA | MI | 2.0 | 1393 | 1345.333 |
| LARRY HODGE | MI | 2.0 | 1270 | 1206.167 |
| ALEX KONG | MI | 2.0 | 1186 | 1406.000 |
| MARISA RICCI | MI | 2.0 | 1153 | 1414.400 |
| MICHAEL LU | MI | 2.0 | 1092 | 1363.000 |
| VIRAJ MOHILE | MI | 2.0 | 917 | 1391.000 |
| SEAN M MC CORMICK | MI | 2.0 | 853 | 1319.000 |
| JULIA SHEN | MI | 1.5 | 967 | 1330.200 |
| JEZZEL FARKAS | ON | 1.5 | 955 | 1327.286 |
| ASHWIN BALAJI | MI | 1.0 | 1530 | 1186.000 |
| THOMAS JOSEPH HOSMER | MI | 1.0 | 1175 | 1350.200 |
| BEN LI | MI | 1.0 | 1163 | 1263.000 |
write.csv(results, paste(as.character(Sys.Date()), "CHUNT Chess Data results.csv", collapse = ""))