This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
** Project 1 * In this project, you’re given a text file with chess tournament results where the information has some structure.
Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum.
Data science, like chess, is a game of back and forth… The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments. You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script. **
#install.packages("readr")
#install.packages("tidyverse")
#install.packages("DBI")
#install.packages("odbc")
#install.packages("RSQLite")
#install.packages("dblyr")
#install.packages("RMySQL")
#install.packages("stringr")
library(readr)
library(stringr)
library(DBI)
library(odbc)
library(RSQLite)
library(dbplyr)
library(RMySQL)
##
## Attaching package: 'RMySQL'
## The following object is masked from 'package:RSQLite':
##
## isIdCurrent
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ lubridate 1.9.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::ident() masks dbplyr::ident()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::sql() masks dbplyr::sql()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RODBC)
library(readxl)
library(dplyr)
library(odbc)
tournamentinfo_data <- readLines("https://raw.githubusercontent.com/asadny82/Data607/refs/heads/main/tournamentinfo.txt")
## Warning in
## readLines("https://raw.githubusercontent.com/asadny82/Data607/refs/heads/main/tournamentinfo.txt"):
## incomplete final line found on
## 'https://raw.githubusercontent.com/asadny82/Data607/refs/heads/main/tournamentinfo.txt'
player_number <- as.numeric(unlist(str_extract_all(tournamentinfo_data,"(?<=\\s{3,4})\\d{1,2}(?=\\s)")))
player_name <- unlist(str_extract_all(tournamentinfo_data,"(?<=\\d\\s\\|\\s)([A-z, -]*\\s){1,}[[:alpha:]]*(?=\\s*\\|)"))
player_state <- unlist(str_extract_all(tournamentinfo_data, "[[:upper:]]{2}(?=\\s\\|)"))
total_pts <- as.numeric(unlist(str_extract_all(tournamentinfo_data, "(?<=\\|)\\d\\.\\d")))
player_pre_rat <- as.numeric(unlist(str_extract_all(tournamentinfo_data, "(?<=R:\\s{1,2})(\\d{3,4}(?=\\s))|(\\d{3,4}(?=P\\d{1,2}\\s*-))")))
# Create a data frame
dataframe <- data.frame(player_number, player_name, player_state, total_pts, player_pre_rat)
dataframe
## player_number player_name player_state total_pts
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## 6 6 HANSEN SONG OH 5.0
## 7 7 GARY DEE SWATHELL MI 5.0
## 8 8 EZEKIEL HOUGHTON MI 5.0
## 9 9 STEFANO LEE ON 5.0
## 10 10 ANVIT RAO MI 5.0
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5
## 12 12 KENNETH J TACK MI 4.5
## 13 13 TORRANCE HENRY JR MI 4.5
## 14 14 BRADLEY SHAW MI 4.5
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5
## 16 16 MIKE NIKITIN MI 4.0
## 17 17 RONALD GRZEGORCZYK MI 4.0
## 18 18 DAVID SUNDEEN MI 4.0
## 19 19 DIPANKAR ROY MI 4.0
## 20 20 JASON ZHENG MI 4.0
## 21 21 DINH DANG BUI ON 4.0
## 22 22 EUGENE L MCCLURE MI 4.0
## 23 23 ALAN BUI ON 4.0
## 24 24 MICHAEL R ALDRICH MI 4.0
## 25 25 LOREN SCHWIEBERT MI 3.5
## 26 26 MAX ZHU ON 3.5
## 27 27 GAURAV GIDWANI MI 3.5
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5
## 29 29 CHIEDOZIE OKORIE MI 3.5
## 30 30 GEORGE AVERY JONES ON 3.5
## 31 31 RISHI SHETTY MI 3.5
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5
## 33 33 JADE GE MI 3.5
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5
## 35 35 JOSHUA DAVID LEE MI 3.5
## 36 36 SIDDHARTH JHA MI 3.5
## 37 37 AMIYATOSH PWNANANDAM MI 3.5
## 38 38 BRIAN LIU MI 3.0
## 39 39 JOEL R HENDON MI 3.0
## 40 40 FOREST ZHANG MI 3.0
## 41 41 KYLE WILLIAM MURPHY MI 3.0
## 42 42 JARED GE MI 3.0
## 43 43 ROBERT GLEN VASEY MI 3.0
## 44 44 JUSTIN D SCHILLING MI 3.0
## 45 45 DEREK YAN MI 3.0
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0
## 47 47 ERIC WRIGHT MI 2.5
## 48 48 DANIEL KHAIN MI 2.5
## 49 49 MICHAEL J MARTIN MI 2.5
## 50 50 SHIVAM JHA MI 2.5
## 51 51 TEJAS AYYAGARI MI 2.5
## 52 52 ETHAN GUO MI 2.5
## 53 53 JOSE C YBARRA MI 2.0
## 54 54 LARRY HODGE MI 2.0
## 55 55 ALEX KONG MI 2.0
## 56 56 MARISA RICCI MI 2.0
## 57 57 MICHAEL LU MI 2.0
## 58 58 VIRAJ MOHILE MI 2.0
## 59 59 SEAN M MC CORMICK MI 2.0
## 60 60 JULIA SHEN MI 1.5
## 61 61 JEZZEL FARKAS ON 1.5
## 62 62 ASHWIN BALAJI MI 1.0
## 63 63 THOMAS JOSEPH HOSMER MI 1.0
## 64 64 BEN LI MI 1.0
## player_pre_rat
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
## 6 1686
## 7 1649
## 8 1641
## 9 1411
## 10 1365
## 11 1712
## 12 1663
## 13 1666
## 14 1610
## 15 1220
## 16 1604
## 17 1629
## 18 1600
## 19 1564
## 20 1595
## 21 1563
## 22 1555
## 23 1363
## 24 1229
## 25 1745
## 26 1579
## 27 1552
## 28 1507
## 29 1602
## 30 1522
## 31 1494
## 32 1441
## 33 1449
## 34 1399
## 35 1438
## 36 1355
## 37 980
## 38 1423
## 39 1436
## 40 1348
## 41 1403
## 42 1332
## 43 1283
## 44 1199
## 45 1242
## 46 377
## 47 1362
## 48 1382
## 49 1291
## 50 1056
## 51 1011
## 52 935
## 53 1393
## 54 1270
## 55 1186
## 56 1153
## 57 1092
## 58 917
## 59 853
## 60 967
## 61 955
## 62 1530
## 63 1175
## 64 1163
numbers_rows <- tournamentinfo_data[seq(5,length(tournamentinfo_data),3)]
opponent_player_numbers <- as.numeric(unlist(str_extract_all(numbers_rows, "(?<=\\|(W|L|D)\\s{2,3})[[:digit:]]{1,2}(?=\\|)|((?<!->)(?<=\\|(U|H|B|X))\\s{4}(?=\\|))")))
opponent_player_numbers
## [1] 39 21 18 14 7 12 4 63 58 4 17 16 20 7 8 61 25 21 11 13 12 23 28 2 26
## [26] 5 19 1 45 37 12 13 4 14 17 34 29 11 35 10 27 21 57 46 13 11 1 9 2 3
## [51] 32 14 9 47 28 19 25 18 59 8 26 7 20 16 19 55 31 6 25 18 38 56 6 7 3
## [76] 34 26 42 33 5 38 NA 1 3 36 27 7 5 33 3 32 54 44 8 1 27 5 31 19 16
## [101] 30 22 54 33 38 10 15 NA 39 2 36 NA 48 41 26 2 23 22 5 47 9 1 32 19 38
## [126] 10 15 10 52 28 18 4 8 40 49 23 41 28 2 9 43 1 47 3 40 39 6 64 52 28
## [151] 15 NA 17 40 4 43 20 58 17 37 46 28 47 43 25 60 44 39 9 53 3 24 34 10 47
## [176] 49 40 17 4 9 32 11 51 13 46 37 14 6 NA 24 4 22 19 20 8 36 50 6 38 34
## [201] 52 48 NA 52 64 15 55 31 61 50 58 55 64 10 30 50 14 61 8 44 18 51 26 13 60
## [226] 12 50 36 13 15 51 6 60 37 29 25 11 52 46 38 56 6 57 52 48 13 57 51 33 NA
## [251] 16 28 NA 5 34 27 NA 23 61 11 35 29 12 NA 18 15 1 54 40 16 44 21 24 20 26
## [276] 39 59 21 56 22 59 17 58 20 NA NA NA 12 50 57 60 61 64 56 21 23 24 63 59 46
## [301] 55 NA 14 32 53 39 24 59 5 51 60 56 63 55 58 35 7 27 50 64 43 23 18 24 21
## [326] 61 8 51 25 17 63 NA 52 NA 29 35 26 20 63 64 58 NA NA 29 42 33 46 NA 31 30
## [351] 27 45 36 57 32 47 33 30 22 19 48 29 35 34 NA 25 NA 44 NA 57 NA 14 39 61 NA
## [376] 15 59 64 62 31 10 30 NA 45 43 NA 11 35 45 NA 40 42 7 36 42 51 35 53 NA 31
## [401] 2 41 23 49 NA 45 41 NA 9 40 43 54 44 33 34 45 42 24 NA NA 32 3 54 47 42
## [426] 30 37 55 NA NA NA NA NA NA 2 48 49 43 45 NA NA 22 30 31 49 46 42 54
mTournament <- matrix(unlist(tournamentinfo_data), byrow=TRUE)
matrix1 <- mTournament[seq(5,(length(mTournament)),3)]
head(matrix1)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
matrix2 <- mTournament[seq(6,length(mTournament),3)]
head(matrix2)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
pcr_matrix <- matrix(data = NA, nrow = 64, ncol = 2)
colnames(pcr_matrix) <- c("total_opp_pcr", "average_opp_pcr")
row_count <- 0
for(i in seq(from=1, to=length(opponent_player_numbers)-6, by=7)){
row_count <- row_count + 1
pcr_matrix[row_count, 1] <- (sum(subset(dataframe$player_pre_rat, dataframe$player_number %in% opponent_player_numbers[seq(from=i, to=i+6, by=1)])))
pcr_matrix[row_count, 2] <- pcr_matrix[row_count, 1] / length(subset(opponent_player_numbers[seq(from=i, to=i+6, by=1)],!is.na(opponent_player_numbers[seq(from=i, to=i+6, by=1)])))
}
head(pcr_matrix, 5)
## total_opp_pcr average_opp_pcr
## [1,] 11237 1605.286
## [2,] 10285 1469.286
## [3,] 10945 1563.571
## [4,] 11015 1573.571
## [5,] 10506 1500.857
# Round the nearest whole number
Rounds <- str_extract_all(matrix1, '[A-Z]\\s{2,}\\d+')
# matching numbers
Rounds <- str_extract_all(Rounds, '\\d+')
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
AvgOppPreChessRating <- c()
for(i in c(1:length(Rounds))){
AvgOppPreChessRating[i] <- round(mean(player_pre_rat[as.numeric(Rounds[[i]])]),0)
}
AvgOppPreChessRating
## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515 1484
## [16] 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522 1314 1144
## [31] 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150 1107 1327 1152
## [46] 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414 1363 1391 1319 1330
## [61] 1327 1186 1350 1263
Project1 <- head(dataframe,5)
head(Project1)
## player_number player_name player_state total_pts
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## player_pre_rat
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
dt<-write_csv(Project1, 'tournament.csv' , append = FALSE)
dt
## player_number player_name player_state total_pts
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## player_pre_rat
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
library(odbc)
# Creating connection
mysqldbTBConn = dbConnect(RMySQL::MySQL(),
dbname= 'md.asaduzzaman39',
host= 'cunydata607sql.mysql.database.azure.com',
port=3306,
user='md.asaduzzaman39',
password='c1706f410226ffca')
dbListTables(mysqldbTBConn)
## [1] "airline_data" "color" "manufacturer" "model" "product"
## [6] "tournament"
Tournament_data = dbSendQuery(mysqldbTBConn,"select * from tournament")
TournamentData <- fetch(Tournament_data)
print(TournamentData)
## player_num player_name player_state total_pts
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## player_pre_rat
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655