library(stringr)
setwd("~/CUNY/DATA 607 Data Acquisition and Management/project 1")
#Importing Tournament text file
projectraw <- read.delim("tournamentinfo.txt")
#Making copy of text file as a back up
rawcopy <- projectraw
#Removing empty rows
projectnorows <- projectraw[-(seq(3,195, by=3)),]
#Extracting first and last names. Each first name is longer than 2 letters so making the first name at least 3 letters lets us miss the 2 letter state.
projectnames1 <- unlist(str_extract_all(projectnorows, "[A-Z]{2,}([\\s]?[A-Z]*-?)*([^\\s|/])"))
#Removing the extracted column name
projectnames <- projectnames1[-1]
#Extracting states using the vertical lines to stop phrase
projectstatedirty <- unlist(str_extract_all(projectnorows, "[A-Z]{2}[\\s]\\|"))
#Removing the extra space and vertical line
projectstate <- gsub('.{2}$', '', projectstatedirty)
#Extracting points, each as a decimal we can use to idenifty them
projectpoints <- unlist(str_extract_all(projectnorows, "\\d\\.\\d"))
#Extracting pre-rating using 'R:' to identify them
projectpreratingdirty <- unlist(str_extract_all(projectnorows, "R:\\s*\\d*[^\\s-[:alpha:]]"))
#Removing R, :, and spaces
projectprerating <- gsub('^.{3}', '', projectpreratingdirty)
#Extracting list of opponents
projectopponentsdirty <- unlist(str_extract_all(projectnorows, "\\..*?$"))
#Removing letters, lines, and spaces
projectopponentsdirty2 <- gsub('^.{3}', '', projectopponentsdirty)
projectopponentsdirty3 <- gsub('\\|', '', projectopponentsdirty2)
projectopponentsdirty4 <- gsub('[WLDBHUX]', '', projectopponentsdirty3)
projectopponentsdirty5 <- gsub('[ ]{1,}', ',', projectopponentsdirty4)
projectopponents <- gsub('^.{1}', '', projectopponentsdirty5)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#Putting finished extracted variables into data frame
project <- data.frame(projectnames, projectstate, projectpoints, projectprerating)
#Putting dirty extracted variables into data frame
projectdirty <- data.frame(projectnames, projectstate, projectpoints, projectprerating, projectopponents)
#separate concatenated opponents into variables
projectdirty1 <- separate(projectdirty, projectopponents, c("opponent1", "opponent2", "opponent3", "opponent4", "opponent5", "opponent6", "opponent7"), sep = ",")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 20 rows [12,
## 16, 22, 36, 37, 38, 41, 44, 48, 49, 50, 53, 54, 55, 56, 58, 59, 60, 62,
## 63].
#replace na in opponents with 0
projectdirty2 <- replace(projectdirty1,is.na(projectdirty1),0)
#making opponents numeric in separate variables
opponent1 <- as.numeric(as.character(projectdirty2$opponent1))
opponent2 <- as.numeric(as.character(projectdirty2$opponent2))
opponent3 <- as.numeric(as.character(projectdirty2$opponent3))
opponent4 <- as.numeric(as.character(projectdirty2$opponent4))
opponent5 <- as.numeric(as.character(projectdirty2$opponent5))
opponent6 <- as.numeric(as.character(projectdirty2$opponent6))
opponent7 <- as.numeric(as.character(projectdirty2$opponent7))
numericopponents <- data.frame(opponent1, opponent2, opponent3, opponent4, opponent5, opponent6, opponent7)
#preping data for use as a row number where 0 = 1
numericopponents1 <- numericopponents - 1
#make preratingsum by summing prerating numbers from rows numbered by numericopponents1 value
numericopponents2 <- mutate(numericopponents1, projectpreratingsum1 = projectdirty2$projectprerating[(row_number(opponent1))])
numericopponents3 <- mutate(numericopponents2, projectpreratingsum2 = projectdirty2$projectprerating[(row_number(opponent2))])
numericopponents4 <- mutate(numericopponents3, projectpreratingsum3 = projectdirty2$projectprerating[(row_number(opponent3))])
numericopponents5 <- mutate(numericopponents4, projectpreratingsum4 = projectdirty2$projectprerating[(row_number(opponent4))])
numericopponents6 <- mutate(numericopponents5, projectpreratingsum5 = projectdirty2$projectprerating[(row_number(opponent5))])
numericopponents7 <- mutate(numericopponents6, projectpreratingsum6 = projectdirty2$projectprerating[(row_number(opponent6))])
numericopponents8 <- mutate(numericopponents7, projectpreratingsum7 = projectdirty2$projectprerating[(row_number(opponent7))])
#making sums numeric
projectpreratingsum1 <- as.numeric(as.character(numericopponents8$projectpreratingsum1))
projectpreratingsum2 <- as.numeric(as.character(numericopponents8$projectpreratingsum2))
projectpreratingsum3 <- as.numeric(as.character(numericopponents8$projectpreratingsum3))
projectpreratingsum4 <- as.numeric(as.character(numericopponents8$projectpreratingsum4))
projectpreratingsum5 <- as.numeric(as.character(numericopponents8$projectpreratingsum5))
projectpreratingsum6 <- as.numeric(as.character(numericopponents8$projectpreratingsum6))
projectpreratingsum7 <- as.numeric(as.character(numericopponents8$projectpreratingsum7))
#Data frame of scores to be summed
projectpreratingsum <- data.frame(projectpreratingsum1, projectpreratingsum2, projectpreratingsum3, projectpreratingsum4, projectpreratingsum5, projectpreratingsum6, projectpreratingsum7)
projectpreratingsum1 <- replace(projectpreratingsum,is.na(projectpreratingsum),0)
#Summing pre rating scores
tobesummed <- c("projectpreratingsum1", "projectpreratingsum2", "projectpreratingsum3", "projectpreratingsum4", "projectpreratingsum5", "projectpreratingsum6", "projectpreratingsum7")
project$prerattingsum <- rowSums(projectpreratingsum1[, tobesummed])
#Renaming columns
names(project) = c("Names", "State", "Points", "Pre-rating", "Sum of Opponenets Pre-Rating")
#Create CSV to be used in SQL
write.csv(project, "~/CUNY/DATA 607 Data Acquisition and Management/project 1/project1.csv")
#Check CSV
read.csv("~/CUNY/DATA 607 Data Acquisition and Management/project 1/project1.csv")
## X Names State Points Pre.rating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
## 7 7 GARY DEE SWATHELL MI 5.0 1649
## 8 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 9 STEFANO LEE ON 5.0 1411
## 10 10 ANVIT RAO MI 5.0 1365
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 12 KENNETH J TACK MI 4.5 1663
## 13 13 TORRANCE HENRY JR MI 4.5 1666
## 14 14 BRADLEY SHAW MI 4.5 1610
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 16 MIKE NIKITIN MI 4.0 1604
## 17 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 18 DAVID SUNDEEN MI 4.0 1600
## 19 19 DIPANKAR ROY MI 4.0 1564
## 20 20 JASON ZHENG MI 4.0 1595
## 21 21 DINH DANG BUI ON 4.0 1563
## 22 22 EUGENE L MCCLURE MI 4.0 1555
## 23 23 ALAN BUI ON 4.0 1363
## 24 24 MICHAEL R ALDRICH MI 4.0 1229
## 25 25 LOREN SCHWIEBERT MI 3.5 1745
## 26 26 MAX ZHU ON 3.5 1579
## 27 27 GAURAV GIDWANI MI 3.5 1552
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507
## 29 29 CHIEDOZIE OKORIE MI 3.5 1602
## 30 30 GEORGE AVERY JONES ON 3.5 1522
## 31 31 RISHI SHETTY MI 3.5 1494
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 33 33 JADE GE MI 3.5 1449
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 35 35 JOSHUA DAVID LEE MI 3.5 1438
## 36 36 SIDDHARTH JHA MI 3.5 1355
## 37 37 AMIYATOSH PWNANANDAM MI 3.5 980
## 38 38 BRIAN LIU MI 3.0 1423
## 39 39 JOEL R HENDON MI 3.0 1436
## 40 40 FOREST ZHANG MI 3.0 1348
## 41 41 KYLE WILLIAM MURPHY MI 3.0 1403
## 42 42 JARED GE MI 3.0 1332
## 43 43 ROBERT GLEN VASEY MI 3.0 1283
## 44 44 JUSTIN D SCHILLING MI 3.0 1199
## 45 45 DEREK YAN MI 3.0 1242
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 47 47 ERIC WRIGHT MI 2.5 1362
## 48 48 DANIEL KHAIN MI 2.5 1382
## 49 49 MICHAEL J MARTIN MI 2.5 1291
## 50 50 SHIVAM JHA MI 2.5 1056
## 51 51 TEJAS AYYAGARI MI 2.5 1011
## 52 52 ETHAN GUO MI 2.5 935
## 53 53 JOSE C YBARRA MI 2.0 1393
## 54 54 LARRY HODGE MI 2.0 1270
## 55 55 ALEX KONG MI 2.0 1186
## 56 56 MARISA RICCI MI 2.0 1153
## 57 57 MICHAEL LU MI 2.0 1092
## 58 58 VIRAJ MOHILE MI 2.0 917
## 59 59 SEAN M MC CORMICK MI 2.0 853
## 60 60 JULIA SHEN MI 1.5 967
## 61 61 JEZZEL FARKAS ON 1.5 955
## 62 62 ASHWIN BALAJI MI 1.0 1530
## 63 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 64 BEN LI MI 1.0 1163
## Sum.of.Opponenets.Pre.Rating
## 1 10503
## 2 9690
## 3 9548
## 4 10805
## 5 9905
## 6 10838
## 7 9945
## 8 10476
## 9 9691
## 10 10448
## 11 9848
## 12 10905
## 13 9500
## 14 9782
## 15 10319
## 16 9073
## 17 10473
## 18 10433
## 19 9982
## 20 9786
## 21 9613
## 22 10079
## 23 9402
## 24 8863
## 25 10278
## 26 10282
## 27 7719
## 28 10741
## 29 7442
## 30 8353
## 31 8805
## 32 9409
## 33 9644
## 34 9684
## 35 9448
## 36 9012
## 37 10670
## 38 11078
## 39 10357
## 40 8377
## 41 7491
## 42 7455
## 43 9012
## 44 10114
## 45 8400
## 46 9873
## 47 9300
## 48 9740
## 49 7990
## 50 10104
## 51 7435
## 52 9388
## 53 8934
## 54 9745
## 55 8987
## 56 9660
## 57 8287
## 58 9913
## 59 8684
## 60 7200
## 61 9004
## 62 9544
## 63 8565
## 64 9357