Data 607 Project 1

Project 1 Data 607

Load data into R by importing .txt file into github and using the raw file as the import file.

chess <- read.table(file ="https://raw.githubusercontent.com/ntlrs/data607project1/master/tournamentinfo.txt", skip = 1, sep = "\n", header = FALSE, stringsAsFactors = FALSE)
library(stringr)

Build Data Frame out of .txt file

#extract player names from data
player.names <- unlist(str_extract_all(chess, "\\| [[:upper:]- ]{4,} \\|"))
player.names <- gsub("^\\W\\s+", "", player.names)
player.names <- gsub("\\s\\W$", "", player.names)
player.names <- str_trim(player.names, side = "right")
player.names <- as.character(player.names)

#extract players state
player.state <-unlist(str_extract_all(chess, "[[:UPPER:]]{2} \\|"))
player.state <- gsub("\\s\\W$", "", player.state)
player.state <- as.character(player.state)

#extract total number of players points
total.points <- unlist(str_extract_all(chess, "\\|[[:digit:].[:digit:]]{3}"))
total.points <- gsub("^\\W", "", total.points)
total.points <- as.numeric(total.points)

#extract player's pre-rating score
pre.rating <- unlist(str_extract_all(chess, "\\:\\s+\\d*"))
pre.rating <- gsub("\\:\\s", "", pre.rating)
pre.rating <-as.numeric(pre.rating)

#extract opponents wins, loses, draws
opponents <- unlist(str_extract_all(chess,"\\|[WDL]{1,}[[:space:]]+\\d+"))

#seems that I have left out some data (H,U). Going to build a data frame of the information I know will match and figure out adding the W,L,D,H,U after.
chess.df <- data.frame(player.name = player.names, player.state = player.state, total.points = total.points, pre.rating = pre.rating)
str(chess.df)

## 'data.frame':    64 obs. of  4 variables:
##  $ player.name : Factor w/ 64 levels "ADITYA BAJAJ",..: 24 12 1 51 28 27 23 21 59 5 ...
##  $ player.state: Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
##  $ total.points: num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ pre.rating  : num  1794 1553 1384 1716 1655 ...

chess.df$player.num<- 1:nrow(chess.df)
chess.df$player.num<- as.numeric(chess.df$player.num)
chess.df <- subset(chess.df, select = c("player.name", "player.state", "total.points", "pre.rating", "player.num"))

#make a second data frame with name of player and wins/losses/draws
opponents1 <- unlist(str_extract_all(chess, "\\d{1,}\\s\\| [[:upper:]- ]{4,} \\|(.*?)[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*\\|*[WDLHUBXU]{1,}[[:space:]]+\\d*"))

str(opponents1)

##  chr [1:64] "1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4" ...

#attempt to turn opponents1 into a data frame. reference this article: https://stackoverflow.com/questions/22455884/how-to-split-a-character-vector-into-data-frame

library(reshape2)
colsplit(opponents1, "\\|", names=c("player num", "player.name", "total points", "round 1", "round 2", "round 3", "round 4", "round 5", "round 6", "round 7"))

##    player num                       player.name total points round 1
## 1           1  GARY HUA                                  6.0   W  39
## 2           2  DAKSHESH DARURI                           6.0   W  63
## 3           3  ADITYA BAJAJ                              6.0   L   8
## 4           4  PATRICK H SCHILLING                       5.5   W  23
## 5           5  HANSHI ZUO                                5.5   W  45
## 6           6  HANSEN SONG                               5.0   W  34
## 7           7  GARY DEE SWATHELL                         5.0   W  57
## 8           8  EZEKIEL HOUGHTON                          5.0   W   3
## 9           9  STEFANO LEE                               5.0   W  25
## 10         10  ANVIT RAO                                 5.0   D  16
## 11         11  CAMERON WILLIAM MC LEMAN                  4.5   D  38
## 12         12  KENNETH J TACK                            4.5   W  42
## 13         13  TORRANCE HENRY JR                         4.5   W  36
## 14         14  BRADLEY SHAW                              4.5   W  54
## 15         15  ZACHARY JAMES HOUGHTON                    4.5   D  19
## 16         16  MIKE NIKITIN                              4.0   D  10
## 17         17  RONALD GRZEGORCZYK                        4.0   W  48
## 18         18  DAVID SUNDEEN                             4.0   W  47
## 19         19  DIPANKAR ROY                              4.0   D  15
## 20         20  JASON ZHENG                               4.0   L  40
## 21         21  DINH DANG BUI                             4.0   W  43
## 22         22  EUGENE L MCCLURE                          4.0   W  64
## 23         23  ALAN BUI                                  4.0   L   4
## 24         24  MICHAEL R ALDRICH                         4.0   L  28
## 25         25  LOREN SCHWIEBERT                          3.5   L   9
## 26         26  MAX ZHU                                   3.5   W  49
## 27         27  GAURAV GIDWANI                            3.5   W  51
## 28         28  SOFIA ADINA STANESCU-BELLU                3.5   W  24
## 29         29  CHIEDOZIE OKORIE                          3.5   W  50
## 30         30  GEORGE AVERY JONES                        3.5   L  52
## 31         31  RISHI SHETTY                              3.5   L  58
## 32         32  JOSHUA PHILIP MATHEWS                     3.5   W  61
## 33         33  JADE GE                                   3.5   W  60
## 34         34  MICHAEL JEFFERY THOMAS                    3.5   L   6
## 35         35  JOSHUA DAVID LEE                          3.5   L  46
## 36         36  SIDDHARTH JHA                             3.5   L  13
## 37         37  AMIYATOSH PWNANANDAM                      3.5   B    
## 38         38  BRIAN LIU                                 3.0   D  11
## 39         39  JOEL R HENDON                             3.0   L   1
## 40         40  FOREST ZHANG                              3.0   W  20
## 41         41  KYLE WILLIAM MURPHY                       3.0   W  59
## 42         42  JARED GE                                  3.0   L  12
## 43         43  ROBERT GLEN VASEY                         3.0   L  21
## 44         44  JUSTIN D SCHILLING                        3.0   B    
## 45         45  DEREK YAN                                 3.0   L   5
## 46         46  JACOB ALEXANDER LAVALLEY                  3.0   W  35
## 47         47  ERIC WRIGHT                               2.5   L  18
## 48         48  DANIEL KHAIN                              2.5   L  17
## 49         49  MICHAEL J MARTIN                          2.5   L  26
## 50         50  SHIVAM JHA                                2.5   L  29
## 51         51  TEJAS AYYAGARI                            2.5   L  27
## 52         52  ETHAN GUO                                 2.5   W  30
## 53         53  JOSE C YBARRA                             2.0   H    
## 54         54  LARRY HODGE                               2.0   L  14
## 55         55  ALEX KONG                                 2.0   L  62
## 56         56  MARISA RICCI                              2.0   H    
## 57         57  MICHAEL LU                                2.0   L   7
## 58         58  VIRAJ MOHILE                              2.0   W  31
## 59         59  SEAN M MC CORMICK                         2.0   L  41
## 60         60  JULIA SHEN                                1.5   L  33
## 61         61  JEZZEL FARKAS                             1.5   L  32
## 62         62  ASHWIN BALAJI                             1.0   W  55
## 63         63  THOMAS JOSEPH HOSMER                      1.0   L   2
## 64         64  BEN LI                                    1.0   L  22
##    round 2 round 3 round 4 round 5 round 6 round 7
## 1    W  21   W  18   W  14   W   7   D  12   D   4
## 2    W  58   L   4   W  17   W  16   W  20   W   7
## 3    W  61   W  25   W  21   W  11   W  13   W  12
## 4    D  28   W   2   W  26   D   5   W  19   D   1
## 5    W  37   D  12   D  13   D   4   W  14   W  17
## 6    D  29   L  11   W  35   D  10   W  27   W  21
## 7    W  46   W  13   W  11   L   1   W   9   L   2
## 8    W  32   L  14   L   9   W  47   W  28   W  19
## 9    L  18   W  59   W   8   W  26   L   7   W  20
## 10   L  19   W  55   W  31   D   6   W  25   W  18
## 11   W  56   W   6   L   7   L   3   W  34   W  26
## 12   W  33   D   5   W  38   H       D   1   L   3
## 13   W  27   L   7   D   5   W  33   L   3   W  32
## 14   W  44   W   8   L   1   D  27   L   5   W  31
## 15   L  16   W  30   L  22   W  54   W  33   W  38
## 16   W  15   H       W  39   L   2   W  36   U    
## 17   W  41   L  26   L   2   W  23   W  22   L   5
## 18   W   9   L   1   W  32   L  19   W  38   L  10
## 19   W  10   W  52   D  28   W  18   L   4   L   8
## 20   W  49   W  23   W  41   W  28   L   2   L   9
## 21   L   1   W  47   L   3   W  40   W  39   L   6
## 22   D  52   L  28   W  15   H       L  17   W  40
## 23   W  43   L  20   W  58   L  17   W  37   W  46
## 24   L  47   W  43   L  25   W  60   W  44   W  39
## 25   W  53   L   3   W  24   D  34   L  10   W  47
## 26   W  40   W  17   L   4   L   9   D  32   L  11
## 27   L  13   W  46   W  37   D  14   L   6   U    
## 28   D   4   W  22   D  19   L  20   L   8   D  36
## 29   D   6   L  38   L  34   W  52   W  48   U    
## 30   D  64   L  15   W  55   L  31   W  61   W  50
## 31   D  55   W  64   L  10   W  30   W  50   L  14
## 32   L   8   W  44   L  18   W  51   D  26   L  13
## 33   L  12   W  50   D  36   L  13   L  15   W  51
## 34   W  60   L  37   W  29   D  25   L  11   W  52
## 35   L  38   W  56   L   6   W  57   D  52   W  48
## 36   W  57   W  51   D  33   H       L  16   D  28
## 37   L   5   W  34   L  27   H       L  23   W  61
## 38   W  35   W  29   L  12   H       L  18   L  15
## 39   W  54   W  40   L  16   W  44   L  21   L  24
## 40   L  26   L  39   W  59   L  21   W  56   L  22
## 41   L  17   W  58   L  20   X       U       U    
## 42   L  50   L  57   D  60   D  61   W  64   W  56
## 43   L  23   L  24   W  63   W  59   L  46   W  55
## 44   L  14   L  32   W  53   L  39   L  24   W  59
## 45   L  51   D  60   L  56   W  63   D  55   W  58
## 46   L   7   L  27   L  50   W  64   W  43   L  23
## 47   W  24   L  21   W  61   L   8   D  51   L  25
## 48   W  63   H       D  52   H       L  29   L  35
## 49   L  20   D  63   D  64   W  58   H       U    
## 50   W  42   L  33   W  46   H       L  31   L  30
## 51   W  45   L  36   W  57   L  32   D  47   L  33
## 52   D  22   L  19   D  48   L  29   D  35   L  34
## 53   L  25   H       L  44   U       W  57   U    
## 54   L  39   L  61   B       L  15   L  59   W  64
## 55   D  31   L  10   L  30   B       D  45   L  43
## 56   L  11   L  35   W  45   H       L  40   L  42
## 57   L  36   W  42   L  51   L  35   L  53   B    
## 58   L   2   L  41   L  23   L  49   B       L  45
## 59   B       L   9   L  40   L  43   W  54   L  44
## 60   L  34   D  45   D  42   L  24   H       U    
## 61   L   3   W  54   L  47   D  42   L  30   L  37
## 62   U       U       U       U       U       U    
## 63   L  48   D  49   L  43   L  45   H       U    
## 64   D  30   L  31   D  49   L  46   L  42   L  54

df <- data.frame(opponents1)

df <- str_split_fixed(opponents1, "\\|", 10)
colnames(df) <- c("player number", "player name", "total point", "round 1", "round 2", "round 3", "round 4", "round 5", "round 6", "round 7")
df <- data.frame(df)
df$player.number<- as.numeric(df$player.number)
df$round.1 <- as.numeric(df$round.1)
df$round.2 <- as.numeric(df$round.2)
df$round.3 <- as.numeric(df$round.3)
df$round.4 <- as.numeric(df$round.4)
df$round.5 <- as.numeric(df$round.5)
df$round.6 <- as.numeric(df$round.6)
df$round.7 <- as.numeric(df$round.7)
str(df)

## 'data.frame':    64 obs. of  10 variables:
##  $ player.number: num  1 12 23 34 45 56 62 63 64 2 ...
##  $ player.name  : Factor w/ 64 levels " ADITYA BAJAJ                    ",..: 24 12 1 51 28 27 23 21 59 5 ...
##  $ total.point  : Factor w/ 11 levels "1.0  ","1.5  ",..: 11 11 11 10 10 9 9 9 9 9 ...
##  $ round.1      : num  46 61 15 38 49 43 57 36 40 5 ...
##  $ round.2      : num  42 61 63 5 48 6 55 45 24 25 ...
##  $ round.3      : num  41 10 44 36 2 14 39 15 61 58 ...
##  $ round.4      : num  42 44 45 47 3 51 41 20 40 49 ...
##  $ round.5      : num  35 37 36 3 2 5 13 46 40 4 ...
##  $ round.6      : num  3 42 39 41 40 45 38 46 18 44 ...
##  $ round.7      : num  3 32 33 2 34 38 6 36 37 35 ...

chess.df$player.num <- as.numeric(chess.df$player.num)
df2 <- subset(chess.df, select = c("player.name", "player.state", "player.num", "pre.rating"))
df3 <-merge(df, df2, by.x = "player.number", by.y = "player.num")

df4 <- subset(df3, select = c("player.number", "player.name.x", "player.state", "total.point", "round.1", "round.2", "round.3", "round.4", "round.5", "round.6", "round.7", "pre.rating"))

#I'd like to replace the opponent's number with their prerating within each round. I will try to do something like a VLOOKUP to replace the values. Shout out to Kai for helping me with this! and this stack overflow article: https://stackoverflow.com/questions/35636315/replace-values-in-a-dataframe-based-on-lookup-table
lookup <- subset(df4, select = c("player.number", "pre.rating"))
df5 <- lapply(df4, function(x) lookup$pre.rating[match(x, lookup$player.number)])
df5 <- data.frame(df5)
df5 <- subset(df5, select = c("round.1", "round.2", "round.3", "round.4", "round.5", "round.6", "round.7"))

opp.avg <- round(rowMeans(df5, na.rm=TRUE),0)
df6 <- data.frame(opp.avg)
#combine all data together
df7 <- data.frame(player.names, player.state, total.points, pre.rating, opp.avg)
head(df7)

##          player.names player.state total.points pre.rating opp.avg
## 1            GARY HUA           ON          6.0       1794    1236
## 2     DAKSHESH DARURI           MI          6.0       1553    1423
## 3        ADITYA BAJAJ           MI          6.0       1384    1298
## 4 PATRICK H SCHILLING           MI          5.5       1716    1399
## 5          HANSHI ZUO           MI          5.5       1655    1429
## 6         HANSEN SONG           OH          5.0       1686    1458

#write CSV file for finished project
write.csv(df7, file= "/Users/ntlmllghn/Dropbox/Data/Data 607/project1.csv")

Data 607 Project 1

Natalie Mollaghan

9/22/2017

Project 1 Data 607