# Download the file from github and store it in your home directory
download <- download.file('https://raw.githubusercontent.com/kelloggjohnd/DATA607/master/tournamentinfo.txt', destfile = "tournamentinfo.txt", method = "wininet")
# manipulate the data into a data frame
data_raw <- data.frame(
n.readLines (paste("./tournamentinfo.txt", sep = "|"),
header = FALSE,
n = 192,
skip = 4)
)## Warning in readLines(fn, n + cc): incomplete final line found on './
## tournamentinfo.txt'
## raw
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 2 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 3 -----------------------------------------------------------------------------------------
## 4 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 5 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 6 -----------------------------------------------------------------------------------------
## 7 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|
## 8 MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## 9 -----------------------------------------------------------------------------------------
## 10 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|
## 11 MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |
## 12 -----------------------------------------------------------------------------------------
## 13 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|
## 14 MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |
## 15 -----------------------------------------------------------------------------------------
## 16 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|
## 17 OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |
## 18 -----------------------------------------------------------------------------------------
## 19 7 | GARY DEE SWATHELL |5.0 |W 57|W 46|W 13|W 11|L 1|W 9|L 2|
## 20 MI | 11146376 / R: 1649 ->1673 |N:3 |W |B |W |B |B |W |W |
# R will seperate the data frame in to seperate columns
data_raw <- data.frame(do.call('rbind', strsplit(as.character(data_raw$raw),'|',fixed=TRUE)))
names(data_raw) <- c("Pair_Num", "Player_Info", "Total_Pts", "Round1", "Round2","Round3","Round4","Round5","Round6","Round7")
head(data_raw, 20)## Pair_Num
## 1 1
## 2 ON
## 3 -----------------------------------------------------------------------------------------
## 4 2
## 5 MI
## 6 -----------------------------------------------------------------------------------------
## 7 3
## 8 MI
## 9 -----------------------------------------------------------------------------------------
## 10 4
## 11 MI
## 12 -----------------------------------------------------------------------------------------
## 13 5
## 14 MI
## 15 -----------------------------------------------------------------------------------------
## 16 6
## 17 OH
## 18 -----------------------------------------------------------------------------------------
## 19 7
## 20 MI
## Player_Info
## 1 GARY HUA
## 2 15445895 / R: 1794 ->1817
## 3 -----------------------------------------------------------------------------------------
## 4 DAKSHESH DARURI
## 5 14598900 / R: 1553 ->1663
## 6 -----------------------------------------------------------------------------------------
## 7 ADITYA BAJAJ
## 8 14959604 / R: 1384 ->1640
## 9 -----------------------------------------------------------------------------------------
## 10 PATRICK H SCHILLING
## 11 12616049 / R: 1716 ->1744
## 12 -----------------------------------------------------------------------------------------
## 13 HANSHI ZUO
## 14 14601533 / R: 1655 ->1690
## 15 -----------------------------------------------------------------------------------------
## 16 HANSEN SONG
## 17 15055204 / R: 1686 ->1687
## 18 -----------------------------------------------------------------------------------------
## 19 GARY DEE SWATHELL
## 20 11146376 / R: 1649 ->1673
## Total_Pts
## 1 6.0
## 2 N:2
## 3 -----------------------------------------------------------------------------------------
## 4 6.0
## 5 N:2
## 6 -----------------------------------------------------------------------------------------
## 7 6.0
## 8 N:2
## 9 -----------------------------------------------------------------------------------------
## 10 5.5
## 11 N:2
## 12 -----------------------------------------------------------------------------------------
## 13 5.5
## 14 N:2
## 15 -----------------------------------------------------------------------------------------
## 16 5.0
## 17 N:3
## 18 -----------------------------------------------------------------------------------------
## 19 5.0
## 20 N:3
## Round1
## 1 W 39
## 2 W
## 3 -----------------------------------------------------------------------------------------
## 4 W 63
## 5 B
## 6 -----------------------------------------------------------------------------------------
## 7 L 8
## 8 W
## 9 -----------------------------------------------------------------------------------------
## 10 W 23
## 11 W
## 12 -----------------------------------------------------------------------------------------
## 13 W 45
## 14 B
## 15 -----------------------------------------------------------------------------------------
## 16 W 34
## 17 W
## 18 -----------------------------------------------------------------------------------------
## 19 W 57
## 20 W
## Round2
## 1 W 21
## 2 B
## 3 -----------------------------------------------------------------------------------------
## 4 W 58
## 5 W
## 6 -----------------------------------------------------------------------------------------
## 7 W 61
## 8 B
## 9 -----------------------------------------------------------------------------------------
## 10 D 28
## 11 B
## 12 -----------------------------------------------------------------------------------------
## 13 W 37
## 14 W
## 15 -----------------------------------------------------------------------------------------
## 16 D 29
## 17 B
## 18 -----------------------------------------------------------------------------------------
## 19 W 46
## 20 B
## Round3
## 1 W 18
## 2 W
## 3 -----------------------------------------------------------------------------------------
## 4 L 4
## 5 B
## 6 -----------------------------------------------------------------------------------------
## 7 W 25
## 8 W
## 9 -----------------------------------------------------------------------------------------
## 10 W 2
## 11 W
## 12 -----------------------------------------------------------------------------------------
## 13 D 12
## 14 B
## 15 -----------------------------------------------------------------------------------------
## 16 L 11
## 17 W
## 18 -----------------------------------------------------------------------------------------
## 19 W 13
## 20 W
## Round4
## 1 W 14
## 2 B
## 3 -----------------------------------------------------------------------------------------
## 4 W 17
## 5 W
## 6 -----------------------------------------------------------------------------------------
## 7 W 21
## 8 B
## 9 -----------------------------------------------------------------------------------------
## 10 W 26
## 11 B
## 12 -----------------------------------------------------------------------------------------
## 13 D 13
## 14 W
## 15 -----------------------------------------------------------------------------------------
## 16 W 35
## 17 B
## 18 -----------------------------------------------------------------------------------------
## 19 W 11
## 20 B
## Round5
## 1 W 7
## 2 W
## 3 -----------------------------------------------------------------------------------------
## 4 W 16
## 5 B
## 6 -----------------------------------------------------------------------------------------
## 7 W 11
## 8 W
## 9 -----------------------------------------------------------------------------------------
## 10 D 5
## 11 W
## 12 -----------------------------------------------------------------------------------------
## 13 D 4
## 14 B
## 15 -----------------------------------------------------------------------------------------
## 16 D 10
## 17 B
## 18 -----------------------------------------------------------------------------------------
## 19 L 1
## 20 B
## Round6
## 1 D 12
## 2 B
## 3 -----------------------------------------------------------------------------------------
## 4 W 20
## 5 W
## 6 -----------------------------------------------------------------------------------------
## 7 W 13
## 8 B
## 9 -----------------------------------------------------------------------------------------
## 10 W 19
## 11 B
## 12 -----------------------------------------------------------------------------------------
## 13 W 14
## 14 W
## 15 -----------------------------------------------------------------------------------------
## 16 W 27
## 17 W
## 18 -----------------------------------------------------------------------------------------
## 19 W 9
## 20 W
## Round7
## 1 D 4
## 2 W
## 3 -----------------------------------------------------------------------------------------
## 4 W 7
## 5 B
## 6 -----------------------------------------------------------------------------------------
## 7 W 12
## 8 W
## 9 -----------------------------------------------------------------------------------------
## 10 D 1
## 11 B
## 12 -----------------------------------------------------------------------------------------
## 13 W 17
## 14 B
## 15 -----------------------------------------------------------------------------------------
## 16 W 21
## 17 B
## 18 -----------------------------------------------------------------------------------------
## 19 L 2
## 20 W
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI"
Player_number <- unlist(str_extract_all(data_raw$Pair_Num, "[0-9]{1,2}"))
Player_number <- as.integer(str_trim(Player_number, "both"))
head(Player_number,10)## [1] 1 2 3 4 5 6 7 8 9 10
Player_raw<- str_extract_all(data_raw$Player_Info,"[[:alpha:]-?[:alpha:] ?]{2,}")
Player_name <- str_detect(unlist(Player_raw),"[[:alpha:]]{3,}")
Player_name <- unlist(Player_raw)[Player_name]
Player_name <- as.character(str_trim(Player_name, "both"))
head(Player_name, 20)## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [1] "154458" "145989" "149596" "126160" "146015" "150552" "111463"
## [8] "151422" "149545" "141503" "125815" "126812" "150829" "101314"
## [15] "156191" "102950" "102977" "113420" "148623" "145290"
total_points<- (str_extract(data_raw$Total_Pts, "\\d\\.\\d*"))
total_points <-na.omit (total_points)
head(total_points,20)## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0"
# Grabbing the Player Info column from the Raw file and performing a sequence skip to only have second "row" of data
column2 <- data.frame(data_raw$Player_Info [seq(2,192,by= 3)])
names(column2)[1]<- "raw"
# Since the data is now in a single column, we can split it based on seperate delimiters
# Split between USCF_ID and Player Ratings
Data_split <- data.frame(do.call('rbind', strsplit(as.character(column2$raw),':',fixed=TRUE)))
#Split between Pre and Post ratings
rating_split <- data.frame(do.call('rbind', strsplit(as.character(Data_split$X2),'->',fixed=TRUE)))
#Splitting off the P ranks to work with later
prerating_split <- data.matrix(do.call('rbind', strsplit(as.character(rating_split$X1),'P',fixed=TRUE)))
postrating_split <- data.matrix(do.call('rbind', strsplit(as.character(rating_split$X2),'P',fixed=TRUE)))
#Trimming the final product of the Pre and Post numbers
Pre_num<- as.integer(str_trim(prerating_split[,1]))
Post_num <- as.integer(str_trim(postrating_split[,1]))
#Test
head(Pre_num, 20)## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595
## [1] 1817 1663 1640 1744 1690 1687 1673 1657 1564 1544 1696 1670 1662 1618
## [15] 1416 1613 1610 1600 1570 1569
pre_p<- data.matrix(as.integer(str_trim(prerating_split[,2])))
pre_p[,1] <- ifelse( pre_p[,1] > 100, NA, pre_p[,1])
Prerating_P <- pre_p[,1]
# Keeping only the P ratings below 100 which are the actual adjustments
Post_p<- data.matrix(as.integer(str_trim(postrating_split[,2])))
Post_p[,1] <- ifelse( Post_p[,1] > 100, NA, Post_p[,1])
Postrating_P <- Post_p[,1]
#Test
head(Prerating_P, 20)## [1] NA NA NA NA NA NA NA 17 NA NA NA NA NA NA 13 NA NA NA NA NA
## [1] NA NA NA NA NA NA NA 24 NA NA NA NA NA NA 20 NA NA NA NA NA
# Checking the work product thus far
Tournament<- data.frame(Player_number, USCF_ID, Player_name, location,total_points, Pre_num, Prerating_P, Post_num, Postrating_P)
head(Tournament, 10)## Player_number USCF_ID Player_name location total_points Pre_num
## 1 1 154458 GARY HUA ON 6.0 1794
## 2 2 145989 DAKSHESH DARURI MI 6.0 1553
## 3 3 149596 ADITYA BAJAJ MI 6.0 1384
## 4 4 126160 PATRICK H SCHILLING MI 5.5 1716
## 5 5 146015 HANSHI ZUO MI 5.5 1655
## 6 6 150552 HANSEN SONG OH 5.0 1686
## 7 7 111463 GARY DEE SWATHELL MI 5.0 1649
## 8 8 151422 EZEKIEL HOUGHTON MI 5.0 1641
## 9 9 149545 STEFANO LEE ON 5.0 1411
## 10 10 141503 ANVIT RAO MI 5.0 1365
## Prerating_P Post_num Postrating_P
## 1 NA 1817 NA
## 2 NA 1663 NA
## 3 NA 1640 NA
## 4 NA 1744 NA
## 5 NA 1690 NA
## 6 NA 1687 NA
## 7 NA 1673 NA
## 8 17 1657 24
## 9 NA 1564 NA
## 10 NA 1544 NA
# Extracting the round's opponents’ number.
round_1_opp <- unlist(str_extract_all(data_raw$Round1, "[0-9]{1,2}"))
# replaces any Zero in the data with NA
round_1_opp [round_1_opp == 0]<- NA
round_2_opp <- unlist(str_extract_all(data_raw$Round2, "[0-9]{1,2}"))
round_2_opp [round_2_opp == 0]<- NA
round_3_opp <- unlist(str_extract_all(data_raw$Round3, "[0-9]{1,2}"))
round_3_opp [round_3_opp == 0]<- NA
round_4_opp <- unlist(str_extract_all(data_raw$Round4, "[0-9]{1,2}"))
round_4_opp [round_4_opp == 0]<- NA
round_5_opp <- unlist(str_extract_all(data_raw$Round5, "[0-9]{1,2}"))
round_5_opp [round_5_opp == 0]<- NA
round_6_opp <- unlist(str_extract_all(data_raw$Round6, "[0-9]{1,2}"))
round_6_opp [round_6_opp == 0]<- NA
round_7_opp <- unlist(str_extract_all(data_raw$Round7, "[0-9]{1,2}"))
round_7_opp [round_7_opp == 0]<- NA
# Binding the opponents’ numbers to the master Data Frame
Tournament <- cbind(Tournament,round_1_opp,round_2_opp,round_3_opp,round_4_opp,round_5_opp,round_6_opp,round_7_opp)round1_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_1_opp)])
round2_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_2_opp)])
round3_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_3_opp)])
round4_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_4_opp)])
round5_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_5_opp)])
round6_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_6_opp)])
round7_opp_score<- as.integer(Tournament$Pre_num[as.integer(round_7_opp)])
# creating a sepetate data frame for ease of troubleshooting
# Ensure Round 3, line 16 and Round 5, line 12 are NA
per_round <- data.frame(round1_opp_score,round2_opp_score,round3_opp_score,round4_opp_score,round5_opp_score,round6_opp_score,round7_opp_score)
head(per_round,10)## round1_opp_score round2_opp_score round3_opp_score round4_opp_score
## 1 1436 1563 1600 1610
## 2 1175 917 1716 1629
## 3 1641 955 1745 1563
## 4 1363 1507 1553 1579
## 5 1242 980 1663 1666
## 6 1399 1602 1712 1438
## 7 1092 377 1666 1712
## 8 1384 1441 1610 1411
## 9 1745 1600 853 1641
## 10 1604 1564 1186 1494
## round5_opp_score round6_opp_score round7_opp_score
## 1 1649 1663 1716
## 2 1604 1595 1649
## 3 1712 1666 1663
## 4 1655 1564 1794
## 5 1716 1610 1629
## 6 1365 1552 1563
## 7 1794 1411 1553
## 8 1362 1507 1564
## 9 1579 1649 1595
## 10 1686 1745 1600
#create the average of each round and bind to the data frame from previous chunk
opp_average <- as.integer(round(rowMeans(subset(per_round, select = c(1,2,3,4,5,6,7)),na.rm = TRUE)))
per_round <- cbind(per_round,opp_average)
head(per_round,10)## round1_opp_score round2_opp_score round3_opp_score round4_opp_score
## 1 1436 1563 1600 1610
## 2 1175 917 1716 1629
## 3 1641 955 1745 1563
## 4 1363 1507 1553 1579
## 5 1242 980 1663 1666
## 6 1399 1602 1712 1438
## 7 1092 377 1666 1712
## 8 1384 1441 1610 1411
## 9 1745 1600 853 1641
## 10 1604 1564 1186 1494
## round5_opp_score round6_opp_score round7_opp_score opp_average
## 1 1649 1663 1716 1605
## 2 1604 1595 1649 1469
## 3 1712 1666 1663 1564
## 4 1655 1564 1794 1574
## 5 1716 1610 1629 1501
## 6 1365 1552 1563 1519
## 7 1794 1411 1553 1372
## 8 1362 1507 1564 1468
## 9 1579 1649 1595 1523
## 10 1686 1745 1600 1554
# final quality control of finished project
Tournament <- cbind(Tournament,per_round)
head(Tournament,10)## Player_number USCF_ID Player_name location total_points Pre_num
## 1 1 154458 GARY HUA ON 6.0 1794
## 2 2 145989 DAKSHESH DARURI MI 6.0 1553
## 3 3 149596 ADITYA BAJAJ MI 6.0 1384
## 4 4 126160 PATRICK H SCHILLING MI 5.5 1716
## 5 5 146015 HANSHI ZUO MI 5.5 1655
## 6 6 150552 HANSEN SONG OH 5.0 1686
## 7 7 111463 GARY DEE SWATHELL MI 5.0 1649
## 8 8 151422 EZEKIEL HOUGHTON MI 5.0 1641
## 9 9 149545 STEFANO LEE ON 5.0 1411
## 10 10 141503 ANVIT RAO MI 5.0 1365
## Prerating_P Post_num Postrating_P round_1_opp round_2_opp round_3_opp
## 1 NA 1817 NA 39 21 18
## 2 NA 1663 NA 63 58 4
## 3 NA 1640 NA 8 61 25
## 4 NA 1744 NA 23 28 2
## 5 NA 1690 NA 45 37 12
## 6 NA 1687 NA 34 29 11
## 7 NA 1673 NA 57 46 13
## 8 17 1657 24 3 32 14
## 9 NA 1564 NA 25 18 59
## 10 NA 1544 NA 16 19 55
## round_4_opp round_5_opp round_6_opp round_7_opp round1_opp_score
## 1 14 7 12 4 1436
## 2 17 16 20 7 1175
## 3 21 11 13 12 1641
## 4 26 5 19 1 1363
## 5 13 4 14 17 1242
## 6 35 10 27 21 1399
## 7 11 1 9 2 1092
## 8 9 47 28 19 1384
## 9 8 26 7 20 1745
## 10 31 6 25 18 1604
## round2_opp_score round3_opp_score round4_opp_score round5_opp_score
## 1 1563 1600 1610 1649
## 2 917 1716 1629 1604
## 3 955 1745 1563 1712
## 4 1507 1553 1579 1655
## 5 980 1663 1666 1716
## 6 1602 1712 1438 1365
## 7 377 1666 1712 1794
## 8 1441 1610 1411 1362
## 9 1600 853 1641 1579
## 10 1564 1186 1494 1686
## round6_opp_score round7_opp_score opp_average
## 1 1663 1716 1605
## 2 1595 1649 1469
## 3 1666 1663 1564
## 4 1564 1794 1574
## 5 1610 1629 1501
## 6 1552 1563 1519
## 7 1411 1553 1372
## 8 1507 1564 1468
## 9 1649 1595 1523
## 10 1745 1600 1554
ggplot(Tournament) + geom_histogram(aes(x = Pre_num, fill = total_points), binwidth = 150) +
facet_wrap(~ total_points)ggplot(Tournament) + geom_histogram(aes(x = Post_num, fill = total_points), binwidth = 150) +
facet_wrap(~ total_points)ggplot (data = Tournament, aes_string(x="total_points", y="Pre_num"))+
geom_boxplot(
color = "blue",
fill = "blue",
alpha=0.2,
outlier.color = "red",
outlier.size = 1,
notch = FALSE
)+
theme(axis.text.x = element_text(angle=90, vjust = .04, hjust = 1)
)ggplot (data = Tournament, aes_string(x="total_points", y="Post_num"))+
geom_boxplot(
color = "blue",
fill = "blue",
alpha=0.2,
outlier.color = "red",
outlier.size = 1,
notch = FALSE,
)+
theme(axis.text.x = element_text(angle=90, vjust = .04, hjust = 1)
)