R Markdown
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
#Load the data, delimited by "|", etc.
player_data <- read.delim('https://raw.githubusercontent.com/murphystout/data-607/master/tournamentinfo.txt',skip = 1, sep = "|", header = TRUE)
#Delete the annoying dash lines.
line_to_delete <- player_data[2,1]
player_data <- filter(player_data, Pair != line_to_delete)
#Take even number rows (i.e. player name and associated data), into own data frame
even_num_rows <- player_data[seq(2,length(player_data[,1]),2),]
#Same with odd, and give it appropriate column names
odd_num_rows <- player_data[seq(3,length(player_data[,1]),2),]
names(odd_num_rows) <- c('State', 'USCF_ID_Rating_Pre_Post','Total','Round1_Outcome','Round2_Outcome','Round3_Outcome','Round4_Outcome','Round5_Outcome','Round6_Outcome','Round7_Outcome')
#Combine the two data frame for ease of analysis.
player_data_wide <- cbind(even_num_rows, odd_num_rows)
#Rename row names to be sequential, as they kept the even numbered names from steps above
row.names(player_data_wide) <- 1:length(player_data_wide[,1])
#Extract Pre Rating
rating_split <- player_data_wide$USCF_ID_Rating_Pre_Post
pre_rating <- substring(rating_split,15,19)
pre_rating <- str_trim(pre_rating)
pre_rating <- as.numeric(pre_rating)
#For the heck of it, Extract End Rating as well
rating_split <- player_data_wide$USCF_ID_Rating_Pre_Post
end_rating <- str_extract_all(rating_split,">([0-9 ]*)")
end_rating <- str_trim(substring(end_rating,2))
#Add to new columns of dataframe
player_data_wide <- cbind(player_data_wide,pre_rating)
player_data_wide <- cbind(player_data_wide,end_rating)
#Extract Opponents IDs
round_1_opp_id <- str_extract(player_data_wide$Round,"[0-9]{1,2}")
round_2_opp_id <- str_extract(player_data_wide$Round.1,"[0-9]{1,2}")
round_3_opp_id <- str_extract(player_data_wide$Round.2,"[0-9]{1,2}")
round_4_opp_id <- str_extract(player_data_wide$Round.3,"[0-9]{1,2}")
round_5_opp_id <- str_extract(player_data_wide$Round.4,"[0-9]{1,2}")
round_6_opp_id <- str_extract(player_data_wide$Round.5,"[0-9]{1,2}")
round_7_opp_id <- str_extract(player_data_wide$Round.6,"[0-9]{1,2}")
#Add all these to the dataframe
player_data_wide <- cbind(player_data_wide,round_1_opp_id,round_2_opp_id,round_3_opp_id,round_4_opp_id,round_5_opp_id,round_6_opp_id, round_7_opp_id)
#Create a linking table with opponent IDs and Pre Ratings. Will use to look up ratings in next steps.
player_data_wide$Pair <- as.numeric(as.character(player_data_wide$Pair))
linking_table <- player_data_wide[c("Pair","pre_rating")]
#Series of joins to pull in data. Using the merge function. We also want to create a redundant data frame to avoid messing up the previous one.
player_data_wide_2 <- player_data_wide
colnames(linking_table)[1]<-"round_1_opp_id"
colnames(linking_table)[2]<-"round_1_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_1_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_1_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_2_opp_id"
colnames(linking_table)[2]<-"round_2_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_2_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_2_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_3_opp_id"
colnames(linking_table)[2]<-"round_3_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_3_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_3_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_4_opp_id"
colnames(linking_table)[2]<-"round_4_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_4_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_4_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_5_opp_id"
colnames(linking_table)[2]<-"round_5_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_5_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_5_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_6_opp_id"
colnames(linking_table)[2]<-"round_6_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_6_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_6_opp_id", : column name 'Total' is duplicated in the result
colnames(linking_table)[1]<-"round_7_opp_id"
colnames(linking_table)[2]<-"round_7_pre_rating"
player_data_wide_2<-merge(player_data_wide_2,linking_table,by="round_7_opp_id", all.x = TRUE)
## Warning in merge.data.frame(player_data_wide_2, linking_table, by =
## "round_7_opp_id", : column name 'Total' is duplicated in the result
# Calculate Means
player_data_wide_2$avg_opp_rating <- round(rowMeans(player_data_wide_2[c("round_1_pre_rating","round_2_pre_rating","round_3_pre_rating","round_4_pre_rating","round_5_pre_rating","round_6_pre_rating","round_7_pre_rating")],na.rm = TRUE))
#Put together final solution table
#Sort first
player_data_wide_2 <- player_data_wide_2[order(player_data_wide_2$Pair),]
solution <- player_data_wide_2[,c("Player.Name","State", "Total", "pre_rating","avg_opp_rating")]
colnames(solution) <- c("Player.Name", "State", "Total.Points", "Pre.Rating", "Opponent.Avg.Pre.Rating")
solution
## Player.Name State Total.Points Pre.Rating
## 31 GARY HUA ON 6.0 1794
## 52 DAKSHESH DARURI MI 6.0 1553
## 4 ADITYA BAJAJ MI 6.0 1384
## 1 PATRICK H SCHILLING MI 5.5 1716
## 8 HANSHI ZUO MI 5.5 1655
## 13 HANSEN SONG OH 5.0 1686
## 11 GARY DEE SWATHELL MI 5.0 1649
## 10 EZEKIEL HOUGHTON MI 5.0 1641
## 12 STEFANO LEE ON 5.0 1411
## 9 ANVIT RAO MI 5.0 1365
## 18 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 20 KENNETH J TACK MI 4.5 1663
## 23 TORRANCE HENRY JR MI 4.5 1666
## 22 BRADLEY SHAW MI 4.5 1610
## 29 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 58 MIKE NIKITIN MI 4.0 1604
## 40 RONALD GRZEGORCZYK MI 4.0 1629
## 2 DAVID SUNDEEN MI 4.0 1600
## 53 DIPANKAR ROY MI 4.0 1564
## 54 JASON ZHENG MI 4.0 1595
## 49 DINH DANG BUI ON 4.0 1563
## 32 EUGENE L MCCLURE MI 4.0 1555
## 37 ALAN BUI ON 4.0 1363
## 30 MICHAEL R ALDRICH MI 4.0 1229
## 38 LOREN SCHWIEBERT MI 3.5 1745
## 3 MAX ZHU ON 3.5 1579
## 55 GAURAV GIDWANI MI 3.5 1552
## 27 SOFIA ADINA STANESCU-BELLU MI 3.5 1507
## 57 CHIEDOZIE OKORIE MI 3.5 1602
## 41 GEORGE AVERY JONES ON 3.5 1522
## 6 RISHI SHETTY MI 3.5 1494
## 5 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 42 JADE GE MI 3.5 1449
## 43 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 39 JOSHUA DAVID LEE MI 3.5 1438
## 19 SIDDHARTH JHA MI 3.5 1355
## 50 AMIYATOSH PWNANANDAM MI 3.5 980
## 7 BRIAN LIU MI 3.0 1423
## 16 JOEL R HENDON MI 3.0 1436
## 14 FOREST ZHANG MI 3.0 1348
## 60 KYLE WILLIAM MURPHY MI 3.0 1403
## 46 JARED GE MI 3.0 1332
## 45 ROBERT GLEN VASEY MI 3.0 1283
## 48 JUSTIN D SCHILLING MI 3.0 1199
## 47 DEREK YAN MI 3.0 1242
## 15 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 17 ERIC WRIGHT MI 2.5 1362
## 26 DANIEL KHAIN MI 2.5 1382
## 56 MICHAEL J MARTIN MI 2.5 1291
## 21 SHIVAM JHA MI 2.5 1056
## 24 TEJAS AYYAGARI MI 2.5 1011
## 25 ETHAN GUO MI 2.5 935
## 64 JOSE C YBARRA MI 2.0 1393
## 51 LARRY HODGE MI 2.0 1270
## 34 ALEX KONG MI 2.0 1186
## 33 MARISA RICCI MI 2.0 1153
## 61 MICHAEL LU MI 2.0 1092
## 36 VIRAJ MOHILE MI 2.0 917
## 35 SEAN M MC CORMICK MI 2.0 853
## 59 JULIA SHEN MI 1.5 967
## 28 JEZZEL FARKAS ON 1.5 955
## 63 ASHWIN BALAJI MI 1.0 1530
## 62 THOMAS JOSEPH HOSMER MI 1.0 1175
## 44 BEN LI MI 1.0 1163
## Opponent.Avg.Pre.Rating
## 31 1605
## 52 1469
## 4 1564
## 1 1574
## 8 1501
## 13 1519
## 11 1372
## 10 1468
## 12 1523
## 9 1554
## 18 1468
## 20 1506
## 23 1498
## 22 1515
## 29 1484
## 58 1386
## 40 1499
## 2 1480
## 53 1426
## 54 1411
## 49 1470
## 32 1300
## 37 1214
## 30 1357
## 38 1363
## 3 1507
## 55 1222
## 27 1522
## 57 1314
## 41 1144
## 6 1260
## 5 1379
## 42 1277
## 43 1375
## 39 1150
## 19 1388
## 50 1385
## 7 1539
## 16 1430
## 14 1391
## 60 1248
## 46 1150
## 45 1107
## 48 1327
## 47 1152
## 15 1358
## 17 1392
## 26 1356
## 56 1286
## 21 1296
## 24 1356
## 25 1495
## 64 1345
## 51 1206
## 34 1406
## 33 1414
## 61 1363
## 36 1391
## 35 1319
## 59 1330
## 28 1327
## 63 1186
## 62 1350
## 44 1263