Load data into R by importing .txt file into github and using the raw file as the import file.
chess <- read.table(file ="https://raw.githubusercontent.com/ntlrs/data607project1/master/tournamentinfo.txt", skip = 1, sep = "\n", header = FALSE, stringsAsFactors = FALSE)
library(stringr)
Build Data Frame out of .txt file
#extract player names from data
player.names <- unlist(str_extract_all(chess, "\\| [[:upper:]- ]{4,} \\|"))
player.names <- gsub("^\\W\\s+", "", player.names)
player.names <- gsub("\\s\\W$", "", player.names)
player.names <- str_trim(player.names, side = "right")
player.names <- as.character(player.names)
#extract players state
player.state <-unlist(str_extract_all(chess, "[[:UPPER:]]{2} \\|"))
player.state <- gsub("\\s\\W$", "", player.state)
player.state <- as.character(player.state)
#extract total number of players points
total.points <- unlist(str_extract_all(chess, "\\|[[:digit:].[:digit:]]{3}"))
total.points <- gsub("^\\W", "", total.points)
total.points <- as.numeric(total.points)
#extract player's pre-rating score
pre.rating <- unlist(str_extract_all(chess, "\\:\\s+\\d*"))
pre.rating <- gsub("\\:\\s", "", pre.rating)
pre.rating <-as.numeric(pre.rating)
#extract opponents wins, loses, draws
opponents <- unlist(str_extract_all(chess,"\\|[WDL]{1,}[[:space:]]+\\d+"))
#seems that I have left out some data (H,U). Going to build a data frame of the information I know will match and figure out adding the W,L,D,H,U after.
chess.df <- data.frame(player.name = player.names, player.state = player.state, total.points = total.points, pre.rating = pre.rating)
str(chess.df)
## 'data.frame': 64 obs. of 4 variables:
## $ player.name : Factor w/ 64 levels "ADITYA BAJAJ",..: 24 12 1 51 28 27 23 21 59 5 ...
## $ player.state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ total.points: num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ pre.rating : num 1794 1553 1384 1716 1655 ...
chess.df$player.num<- 1:nrow(chess.df)
chess.df$player.num<- as.numeric(chess.df$player.num)
chess.df <- subset(chess.df, select = c("player.name", "player.state", "total.points", "pre.rating", "player.num"))
#make a second data frame with name of player and wins/losses/draws
opponents1 <- unlist(str_extract_all(chess, "\\d{1,}\\s\\| [[:upper:]- ]{4,} \\|(.*?)[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*"))
str(opponents1)
## chr [1:64] "1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4" ...
#attempt to turn opponents1 into a data frame. reference this article: https://stackoverflow.com/questions/22455884/how-to-split-a-character-vector-into-data-frame
library(reshape2)
colsplit(opponents1, "\\|", names=c("player num", "player.name", "total points", "round 1", "round 2", "round 3", "round 4", "round 5", "round 6", "round 7"))
## player num player.name total points round 1
## 1 1 GARY HUA 6.0 W 39
## 2 2 DAKSHESH DARURI 6.0 W 63
## 3 3 ADITYA BAJAJ 6.0 L 8
## 4 4 PATRICK H SCHILLING 5.5 W 23
## 5 5 HANSHI ZUO 5.5 W 45
## 6 6 HANSEN SONG 5.0 W 34
## 7 7 GARY DEE SWATHELL 5.0 W 57
## 8 8 EZEKIEL HOUGHTON 5.0 W 3
## 9 9 STEFANO LEE 5.0 W 25
## 10 10 ANVIT RAO 5.0 D 16
## 11 11 CAMERON WILLIAM MC LEMAN 4.5 D 38
## 12 12 KENNETH J TACK 4.5 W 42
## 13 13 TORRANCE HENRY JR 4.5 W 36
## 14 14 BRADLEY SHAW 4.5 W 54
## 15 15 ZACHARY JAMES HOUGHTON 4.5 D 19
## 16 16 MIKE NIKITIN 4.0 D 10
## 17 17 RONALD GRZEGORCZYK 4.0 W 48
## 18 18 DAVID SUNDEEN 4.0 W 47
## 19 19 DIPANKAR ROY 4.0 D 15
## 20 20 JASON ZHENG 4.0 L 40
## 21 21 DINH DANG BUI 4.0 W 43
## 22 22 EUGENE L MCCLURE 4.0 W 64
## 23 23 ALAN BUI 4.0 L 4
## 24 24 MICHAEL R ALDRICH 4.0 L 28
## 25 25 LOREN SCHWIEBERT 3.5 L 9
## 26 26 MAX ZHU 3.5 W 49
## 27 27 GAURAV GIDWANI 3.5 W 51
## 28 28 SOFIA ADINA STANESCU-BELLU 3.5 W 24
## 29 29 CHIEDOZIE OKORIE 3.5 W 50
## 30 30 GEORGE AVERY JONES 3.5 L 52
## 31 31 RISHI SHETTY 3.5 L 58
## 32 32 JOSHUA PHILIP MATHEWS 3.5 W 61
## 33 33 JADE GE 3.5 W 60
## 34 34 MICHAEL JEFFERY THOMAS 3.5 L 6
## 35 35 JOSHUA DAVID LEE 3.5 L 46
## 36 36 SIDDHARTH JHA 3.5 L 13
## 37 37 AMIYATOSH PWNANANDAM 3.5 B
## 38 38 BRIAN LIU 3.0 D 11
## 39 39 JOEL R HENDON 3.0 L 1
## 40 40 FOREST ZHANG 3.0 W 20
## 41 41 KYLE WILLIAM MURPHY 3.0 W 59
## 42 42 JARED GE 3.0 L 12
## 43 43 ROBERT GLEN VASEY 3.0 L 21
## 44 44 JUSTIN D SCHILLING 3.0 B
## 45 45 DEREK YAN 3.0 L 5
## 46 46 JACOB ALEXANDER LAVALLEY 3.0 W 35
## 47 47 ERIC WRIGHT 2.5 L 18
## 48 48 DANIEL KHAIN 2.5 L 17
## 49 49 MICHAEL J MARTIN 2.5 L 26
## 50 50 SHIVAM JHA 2.5 L 29
## 51 51 TEJAS AYYAGARI 2.5 L 27
## 52 52 ETHAN GUO 2.5 W 30
## 53 53 JOSE C YBARRA 2.0 H
## 54 54 LARRY HODGE 2.0 L 14
## 55 55 ALEX KONG 2.0 L 62
## 56 56 MARISA RICCI 2.0 H
## 57 57 MICHAEL LU 2.0 L 7
## 58 58 VIRAJ MOHILE 2.0 W 31
## 59 59 SEAN M MC CORMICK 2.0 L 41
## 60 60 JULIA SHEN 1.5 L 33
## 61 61 JEZZEL FARKAS 1.5 L 32
## 62 62 ASHWIN BALAJI 1.0 W 55
## 63 63 THOMAS JOSEPH HOSMER 1.0 L 2
## 64 64 BEN LI 1.0 L 22
## round 2 round 3 round 4 round 5 round 6 round 7
## 1 W 21 W 18 W 14 W 7 D 12 D 4
## 2 W 58 L 4 W 17 W 16 W 20 W 7
## 3 W 61 W 25 W 21 W 11 W 13 W 12
## 4 D 28 W 2 W 26 D 5 W 19 D 1
## 5 W 37 D 12 D 13 D 4 W 14 W 17
## 6 D 29 L 11 W 35 D 10 W 27 W 21
## 7 W 46 W 13 W 11 L 1 W 9 L 2
## 8 W 32 L 14 L 9 W 47 W 28 W 19
## 9 L 18 W 59 W 8 W 26 L 7 W 20
## 10 L 19 W 55 W 31 D 6 W 25 W 18
## 11 W 56 W 6 L 7 L 3 W 34 W 26
## 12 W 33 D 5 W 38 H D 1 L 3
## 13 W 27 L 7 D 5 W 33 L 3 W 32
## 14 W 44 W 8 L 1 D 27 L 5 W 31
## 15 L 16 W 30 L 22 W 54 W 33 W 38
## 16 W 15 H W 39 L 2 W 36 U
## 17 W 41 L 26 L 2 W 23 W 22 L 5
## 18 W 9 L 1 W 32 L 19 W 38 L 10
## 19 W 10 W 52 D 28 W 18 L 4 L 8
## 20 W 49 W 23 W 41 W 28 L 2 L 9
## 21 L 1 W 47 L 3 W 40 W 39 L 6
## 22 D 52 L 28 W 15 H L 17 W 40
## 23 W 43 L 20 W 58 L 17 W 37 W 46
## 24 L 47 W 43 L 25 W 60 W 44 W 39
## 25 W 53 L 3 W 24 D 34 L 10 W 47
## 26 W 40 W 17 L 4 L 9 D 32 L 11
## 27 L 13 W 46 W 37 D 14 L 6 U
## 28 D 4 W 22 D 19 L 20 L 8 D 36
## 29 D 6 L 38 L 34 W 52 W 48 U
## 30 D 64 L 15 W 55 L 31 W 61 W 50
## 31 D 55 W 64 L 10 W 30 W 50 L 14
## 32 L 8 W 44 L 18 W 51 D 26 L 13
## 33 L 12 W 50 D 36 L 13 L 15 W 51
## 34 W 60 L 37 W 29 D 25 L 11 W 52
## 35 L 38 W 56 L 6 W 57 D 52 W 48
## 36 W 57 W 51 D 33 H L 16 D 28
## 37 L 5 W 34 L 27 H L 23 W 61
## 38 W 35 W 29 L 12 H L 18 L 15
## 39 W 54 W 40 L 16 W 44 L 21 L 24
## 40 L 26 L 39 W 59 L 21 W 56 L 22
## 41 L 17 W 58 L 20 X U U
## 42 L 50 L 57 D 60 D 61 W 64 W 56
## 43 L 23 L 24 W 63 W 59 L 46 W 55
## 44 L 14 L 32 W 53 L 39 L 24 W 59
## 45 L 51 D 60 L 56 W 63 D 55 W 58
## 46 L 7 L 27 L 50 W 64 W 43 L 23
## 47 W 24 L 21 W 61 L 8 D 51 L 25
## 48 W 63 H D 52 H L 29 L 35
## 49 L 20 D 63 D 64 W 58 H U
## 50 W 42 L 33 W 46 H L 31 L 30
## 51 W 45 L 36 W 57 L 32 D 47 L 33
## 52 D 22 L 19 D 48 L 29 D 35 L 34
## 53 L 25 H L 44 U W 57 U
## 54 L 39 L 61 B L 15 L 59 W 64
## 55 D 31 L 10 L 30 B D 45 L 43
## 56 L 11 L 35 W 45 H L 40 L 42
## 57 L 36 W 42 L 51 L 35 L 53 B
## 58 L 2 L 41 L 23 L 49 B L 45
## 59 B L 9 L 40 L 43 W 54 L 44
## 60 L 34 D 45 D 42 L 24 H U
## 61 L 3 W 54 L 47 D 42 L 30 L 37
## 62 U U U U U U
## 63 L 48 D 49 L 43 L 45 H U
## 64 D 30 L 31 D 49 L 46 L 42 L 54
df <- data.frame(opponents1)
df <- str_split_fixed(opponents1, "\\|", 10)
colnames(df) <- c("player number", "player name", "total point", "round 1", "round 2", "round 3", "round 4", "round 5", "round 6", "round 7")
df <- data.frame(df)
df$player.number<- as.numeric(df$player.number)
df$round.1 <- as.numeric(df$round.1)
df$round.2 <- as.numeric(df$round.2)
df$round.3 <- as.numeric(df$round.3)
df$round.4 <- as.numeric(df$round.4)
df$round.5 <- as.numeric(df$round.5)
df$round.6 <- as.numeric(df$round.6)
df$round.7 <- as.numeric(df$round.7)
str(df)
## 'data.frame': 64 obs. of 10 variables:
## $ player.number: num 1 12 23 34 45 56 62 63 64 2 ...
## $ player.name : Factor w/ 64 levels " ADITYA BAJAJ ",..: 24 12 1 51 28 27 23 21 59 5 ...
## $ total.point : Factor w/ 11 levels "1.0 ","1.5 ",..: 11 11 11 10 10 9 9 9 9 9 ...
## $ round.1 : num 46 61 15 38 49 43 57 36 40 5 ...
## $ round.2 : num 42 61 63 5 48 6 55 45 24 25 ...
## $ round.3 : num 41 10 44 36 2 14 39 15 61 58 ...
## $ round.4 : num 42 44 45 47 3 51 41 20 40 49 ...
## $ round.5 : num 35 37 36 3 2 5 13 46 40 4 ...
## $ round.6 : num 3 42 39 41 40 45 38 46 18 44 ...
## $ round.7 : num 3 32 33 2 34 38 6 36 37 35 ...
chess.df$player.num <- as.numeric(chess.df$player.num)
df2 <- subset(chess.df, select = c("player.name", "player.state", "player.num", "pre.rating"))
df3 <-merge(df, df2, by.x = "player.number", by.y = "player.num")
df4 <- subset(df3, select = c("player.number", "player.name.x", "player.state", "total.point", "round.1", "round.2", "round.3", "round.4", "round.5", "round.6", "round.7", "pre.rating"))
#I'd like to replace the opponent's number with their prerating within each round. I will try to do something like a VLOOKUP to replace the values. Shout out to Kai for helping me with this! and this stack overflow article: https://stackoverflow.com/questions/35636315/replace-values-in-a-dataframe-based-on-lookup-table
lookup <- subset(df4, select = c("player.number", "pre.rating"))
df5 <- lapply(df4, function(x) lookup$pre.rating[match(x, lookup$player.number)])
df5 <- data.frame(df5)
df5 <- subset(df5, select = c("round.1", "round.2", "round.3", "round.4", "round.5", "round.6", "round.7"))
opp.avg <- round(rowMeans(df5, na.rm=TRUE),0)
df6 <- data.frame(opp.avg)
#combine all data together
df7 <- data.frame(player.names, player.state, total.points, pre.rating, opp.avg)
head(df7)
## player.names player.state total.points pre.rating opp.avg
## 1 GARY HUA ON 6.0 1794 1236
## 2 DAKSHESH DARURI MI 6.0 1553 1423
## 3 ADITYA BAJAJ MI 6.0 1384 1298
## 4 PATRICK H SCHILLING MI 5.5 1716 1399
## 5 HANSHI ZUO MI 5.5 1655 1429
## 6 HANSEN SONG OH 5.0 1686 1458
#write CSV file for finished project
write.csv(df7, file= "/Users/ntlmllghn/Dropbox/Data/Data 607/project1.csv")