library(stringr)
setwd("~/CUNY/DATA 607 Data Acquisition and Management/project 1")
#Importing Tournament text file
projectraw <- read.delim("tournamentinfo.txt")
#Making copy of text file as a back up
rawcopy <- projectraw
#Removing empty rows 
projectnorows <- projectraw[-(seq(3,195, by=3)),]
#Extracting first and last names. Each first name is longer than 2 letters so making the first name at least 3 letters lets us miss the 2 letter state.
projectnames1 <- unlist(str_extract_all(projectnorows, "[A-Z]{2,}([\\s]?[A-Z]*-?)*([^\\s|/])"))
#Removing the extracted column name
projectnames <- projectnames1[-1]
#Extracting states using the vertical lines to stop phrase
projectstatedirty <- unlist(str_extract_all(projectnorows, "[A-Z]{2}[\\s]\\|"))
#Removing the extra space and vertical line
projectstate <- gsub('.{2}$', '', projectstatedirty)
#Extracting points, each as a decimal we can use to idenifty them
projectpoints <- unlist(str_extract_all(projectnorows, "\\d\\.\\d"))
#Extracting pre-rating using 'R:' to identify them
projectpreratingdirty <- unlist(str_extract_all(projectnorows, "R:\\s*\\d*[^\\s-[:alpha:]]"))
#Removing R, :, and spaces
projectprerating <- gsub('^.{3}', '', projectpreratingdirty)
#Extracting list of opponents
projectopponentsdirty <- unlist(str_extract_all(projectnorows, "\\..*?$"))
#Removing letters, lines, and spaces
projectopponentsdirty2 <- gsub('^.{3}', '', projectopponentsdirty)
projectopponentsdirty3 <- gsub('\\|', '', projectopponentsdirty2)
projectopponentsdirty4 <- gsub('[WLDBHUX]', '', projectopponentsdirty3)
projectopponentsdirty5 <- gsub('[ ]{1,}', ',', projectopponentsdirty4)
projectopponents <- gsub('^.{1}', '', projectopponentsdirty5)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
#Putting finished extracted variables into data frame
project <- data.frame(projectnames, projectstate, projectpoints, projectprerating)
#Putting dirty extracted variables into data frame
projectdirty <- data.frame(projectnames, projectstate, projectpoints, projectprerating, projectopponents)
#separate concatenated opponents into variables
projectdirty1 <- separate(projectdirty, projectopponents, c("opponent1", "opponent2", "opponent3", "opponent4", "opponent5", "opponent6", "opponent7"), sep = ",")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 20 rows [12,
## 16, 22, 36, 37, 38, 41, 44, 48, 49, 50, 53, 54, 55, 56, 58, 59, 60, 62,
## 63].
#replace na in opponents with 0
projectdirty2 <- replace(projectdirty1,is.na(projectdirty1),0)

#making opponents numeric in separate variables
opponent1 <- as.numeric(as.character(projectdirty2$opponent1))
opponent2 <- as.numeric(as.character(projectdirty2$opponent2))
opponent3 <- as.numeric(as.character(projectdirty2$opponent3))
opponent4 <- as.numeric(as.character(projectdirty2$opponent4))
opponent5 <- as.numeric(as.character(projectdirty2$opponent5))
opponent6 <- as.numeric(as.character(projectdirty2$opponent6))
opponent7 <- as.numeric(as.character(projectdirty2$opponent7))
numericopponents <- data.frame(opponent1, opponent2, opponent3, opponent4, opponent5, opponent6, opponent7)
#preping data for use as a row number where 0 = 1
numericopponents1 <- numericopponents - 1

#make preratingsum by summing prerating numbers from rows numbered by numericopponents1 value 
numericopponents2 <- mutate(numericopponents1, projectpreratingsum1 = projectdirty2$projectprerating[(row_number(opponent1))])
numericopponents3 <- mutate(numericopponents2, projectpreratingsum2 = projectdirty2$projectprerating[(row_number(opponent2))])
numericopponents4 <- mutate(numericopponents3, projectpreratingsum3 = projectdirty2$projectprerating[(row_number(opponent3))])
numericopponents5 <- mutate(numericopponents4, projectpreratingsum4 = projectdirty2$projectprerating[(row_number(opponent4))])
numericopponents6 <- mutate(numericopponents5, projectpreratingsum5 = projectdirty2$projectprerating[(row_number(opponent5))])
numericopponents7 <- mutate(numericopponents6, projectpreratingsum6 = projectdirty2$projectprerating[(row_number(opponent6))])
numericopponents8 <- mutate(numericopponents7, projectpreratingsum7 = projectdirty2$projectprerating[(row_number(opponent7))])
#making sums numeric
projectpreratingsum1 <- as.numeric(as.character(numericopponents8$projectpreratingsum1))
projectpreratingsum2 <- as.numeric(as.character(numericopponents8$projectpreratingsum2))
projectpreratingsum3 <- as.numeric(as.character(numericopponents8$projectpreratingsum3))
projectpreratingsum4 <- as.numeric(as.character(numericopponents8$projectpreratingsum4))
projectpreratingsum5 <- as.numeric(as.character(numericopponents8$projectpreratingsum5))
projectpreratingsum6 <- as.numeric(as.character(numericopponents8$projectpreratingsum6))
projectpreratingsum7 <- as.numeric(as.character(numericopponents8$projectpreratingsum7))

#Data frame of scores to be summed
projectpreratingsum <- data.frame(projectpreratingsum1, projectpreratingsum2, projectpreratingsum3, projectpreratingsum4, projectpreratingsum5, projectpreratingsum6, projectpreratingsum7)
projectpreratingsum1 <- replace(projectpreratingsum,is.na(projectpreratingsum),0)

#Summing pre rating scores
tobesummed <- c("projectpreratingsum1", "projectpreratingsum2", "projectpreratingsum3", "projectpreratingsum4", "projectpreratingsum5", "projectpreratingsum6", "projectpreratingsum7")
project$prerattingsum <- rowSums(projectpreratingsum1[, tobesummed])

#Renaming columns
names(project) = c("Names", "State", "Points", "Pre-rating", "Sum of Opponenets Pre-Rating")
#Create CSV to be used in SQL
write.csv(project, "~/CUNY/DATA 607 Data Acquisition and Management/project 1/project1.csv")
#Check CSV
read.csv("~/CUNY/DATA 607 Data Acquisition and Management/project 1/project1.csv")
##     X                      Names State Points Pre.rating
## 1   1                   GARY HUA    ON    6.0       1794
## 2   2            DAKSHESH DARURI    MI    6.0       1553
## 3   3               ADITYA BAJAJ    MI    6.0       1384
## 4   4        PATRICK H SCHILLING    MI    5.5       1716
## 5   5                 HANSHI ZUO    MI    5.5       1655
## 6   6                HANSEN SONG    OH    5.0       1686
## 7   7          GARY DEE SWATHELL    MI    5.0       1649
## 8   8           EZEKIEL HOUGHTON    MI    5.0       1641
## 9   9                STEFANO LEE    ON    5.0       1411
## 10 10                  ANVIT RAO    MI    5.0       1365
## 11 11   CAMERON WILLIAM MC LEMAN    MI    4.5       1712
## 12 12             KENNETH J TACK    MI    4.5       1663
## 13 13          TORRANCE HENRY JR    MI    4.5       1666
## 14 14               BRADLEY SHAW    MI    4.5       1610
## 15 15     ZACHARY JAMES HOUGHTON    MI    4.5       1220
## 16 16               MIKE NIKITIN    MI    4.0       1604
## 17 17         RONALD GRZEGORCZYK    MI    4.0       1629
## 18 18              DAVID SUNDEEN    MI    4.0       1600
## 19 19               DIPANKAR ROY    MI    4.0       1564
## 20 20                JASON ZHENG    MI    4.0       1595
## 21 21              DINH DANG BUI    ON    4.0       1563
## 22 22           EUGENE L MCCLURE    MI    4.0       1555
## 23 23                   ALAN BUI    ON    4.0       1363
## 24 24          MICHAEL R ALDRICH    MI    4.0       1229
## 25 25           LOREN SCHWIEBERT    MI    3.5       1745
## 26 26                    MAX ZHU    ON    3.5       1579
## 27 27             GAURAV GIDWANI    MI    3.5       1552
## 28 28 SOFIA ADINA STANESCU-BELLU    MI    3.5       1507
## 29 29           CHIEDOZIE OKORIE    MI    3.5       1602
## 30 30         GEORGE AVERY JONES    ON    3.5       1522
## 31 31               RISHI SHETTY    MI    3.5       1494
## 32 32      JOSHUA PHILIP MATHEWS    ON    3.5       1441
## 33 33                    JADE GE    MI    3.5       1449
## 34 34     MICHAEL JEFFERY THOMAS    MI    3.5       1399
## 35 35           JOSHUA DAVID LEE    MI    3.5       1438
## 36 36              SIDDHARTH JHA    MI    3.5       1355
## 37 37       AMIYATOSH PWNANANDAM    MI    3.5        980
## 38 38                  BRIAN LIU    MI    3.0       1423
## 39 39              JOEL R HENDON    MI    3.0       1436
## 40 40               FOREST ZHANG    MI    3.0       1348
## 41 41        KYLE WILLIAM MURPHY    MI    3.0       1403
## 42 42                   JARED GE    MI    3.0       1332
## 43 43          ROBERT GLEN VASEY    MI    3.0       1283
## 44 44         JUSTIN D SCHILLING    MI    3.0       1199
## 45 45                  DEREK YAN    MI    3.0       1242
## 46 46   JACOB ALEXANDER LAVALLEY    MI    3.0        377
## 47 47                ERIC WRIGHT    MI    2.5       1362
## 48 48               DANIEL KHAIN    MI    2.5       1382
## 49 49           MICHAEL J MARTIN    MI    2.5       1291
## 50 50                 SHIVAM JHA    MI    2.5       1056
## 51 51             TEJAS AYYAGARI    MI    2.5       1011
## 52 52                  ETHAN GUO    MI    2.5        935
## 53 53              JOSE C YBARRA    MI    2.0       1393
## 54 54                LARRY HODGE    MI    2.0       1270
## 55 55                  ALEX KONG    MI    2.0       1186
## 56 56               MARISA RICCI    MI    2.0       1153
## 57 57                 MICHAEL LU    MI    2.0       1092
## 58 58               VIRAJ MOHILE    MI    2.0        917
## 59 59          SEAN M MC CORMICK    MI    2.0        853
## 60 60                 JULIA SHEN    MI    1.5        967
## 61 61              JEZZEL FARKAS    ON    1.5        955
## 62 62              ASHWIN BALAJI    MI    1.0       1530
## 63 63       THOMAS JOSEPH HOSMER    MI    1.0       1175
## 64 64                     BEN LI    MI    1.0       1163
##    Sum.of.Opponenets.Pre.Rating
## 1                         10503
## 2                          9690
## 3                          9548
## 4                         10805
## 5                          9905
## 6                         10838
## 7                          9945
## 8                         10476
## 9                          9691
## 10                        10448
## 11                         9848
## 12                        10905
## 13                         9500
## 14                         9782
## 15                        10319
## 16                         9073
## 17                        10473
## 18                        10433
## 19                         9982
## 20                         9786
## 21                         9613
## 22                        10079
## 23                         9402
## 24                         8863
## 25                        10278
## 26                        10282
## 27                         7719
## 28                        10741
## 29                         7442
## 30                         8353
## 31                         8805
## 32                         9409
## 33                         9644
## 34                         9684
## 35                         9448
## 36                         9012
## 37                        10670
## 38                        11078
## 39                        10357
## 40                         8377
## 41                         7491
## 42                         7455
## 43                         9012
## 44                        10114
## 45                         8400
## 46                         9873
## 47                         9300
## 48                         9740
## 49                         7990
## 50                        10104
## 51                         7435
## 52                         9388
## 53                         8934
## 54                         9745
## 55                         8987
## 56                         9660
## 57                         8287
## 58                         9913
## 59                         8684
## 60                         7200
## 61                         9004
## 62                         9544
## 63                         8565
## 64                         9357