Title: Data607 Project 01

Author: Md. Tanzil Ehsan

# Install any package attached by the library() calls below that is not yet
# available. requireNamespace() only checks availability without attaching;
# the library() calls that follow do the attaching.
# devtools is included because it is attached further down with
# library(devtools) and would otherwise fail on a machine that lacks it.
required_packages <- c(
  "rstudioapi", "readr", "ggplot2", "tidyverse", "tidyr", "devtools"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(tibble)
library(tidyr)
library(ggplot2)
library(devtools)

## Warning: package 'devtools' was built under R version 4.4.3

## Loading required package: usethis

# Read the tournament cross-table from GitHub; readLines() returns one
# character element per line of the file.
tournament_url <- "https://raw.githubusercontent.com/tanzil64/DATA607-Project-01/refs/heads/main/tournamentinfo.txt"
raw_data <- readLines(tournament_url)

# Local copy, kept for reference (disabled):
## raw_data <- readLines("C:/Users/tanzi/OneDrive/DATA/607/Week4/tournamentinfo.txt")

Extract each player's number, name, state, total points, and pre-tournament rating from the raw text.

#| echo: false
# Extract the required fields with regular expressions. Every pattern is run
# over all lines of raw_data and the matches are flattened into one vector per
# field; data.frame() below stops with an error if the vectors differ in
# length, which guards against a pattern matching too often or too rarely.

# Pair number: one or two digits preceded by 3-4 spaces and followed by space.
player_num <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")))

# Player name: the text between "<digit> | " and the next "|".
# The character class is spelled A-Za-z on purpose: the range A-z also covers
# the punctuation characters that sit between "Z" and "a" in ASCII.
# The match includes the padding that fills the fixed-width column, so it is
# trimmed to keep that padding out of the data frame and the exported CSV.
player_name <- str_trim(unlist(str_extract_all(raw_data, "(?<=\\d\\s\\|\\s)([A-Za-z, -]*\\s){1,}[[:alpha:]]*(?=\\s*\\|)")))

# State / province: two upper-case letters directly before " |".
player_state <- unlist(str_extract_all(raw_data, "[[:upper:]]{2}(?=\\s\\|)"))

# Total points: a "d.d" number directly after a "|".
total_pts <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\|)\\d\\.\\d")))

# Pre-tournament rating: 3-4 digits after "R:", either followed by whitespace
# or by a provisional-rating suffix of the form "P<games>" before the "-".
player_pre_rat <- as.numeric(unlist(str_extract_all(raw_data, "(?<=R:\\s{1,2})(\\d{3,4}(?=\\s))|(\\d{3,4}(?=P\\d{1,2}\\s*-))")))

# Combine the extracted vectors into one data frame, one row per player.
processed_data <- data.frame(player_num, player_name, player_state, total_pts, player_pre_rat)

# Inspect the structure: numeric fields should be numeric, text fields
# character, and the row count should equal the number of players.
str(processed_data)

## 'data.frame':    64 obs. of  5 variables:
##  $ player_num    : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name   : chr  "GARY HUA                        " "DAKSHESH DARURI                 " "ADITYA BAJAJ                    " "PATRICK H SCHILLING             " ...
##  $ player_state  : chr  "ON" "MI" "MI" "MI" ...
##  $ total_pts     : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ player_pre_rat: num  1794 1553 1384 1716 1655 ...

# Extracting the opponents from the full text with a single regex proved
# fragile, so the search is restricted to the lines that carry the round
# results: every third line, starting at line 5.
# The last line is taken from the file itself rather than hard-coded, so a
# file with a different number of players is handled as well.
stopifnot(length(raw_data) >= 5)
secondary_rows <- raw_data[seq(from = 5, to = length(raw_data), by = 3)]

# opponent_num is a flat vector of opponent numbers, in the order the round
# results appear in secondary_rows.
# First alternative: the 1-2 digit opponent number after a W/L/D result code.
# Second alternative: the four blanks after a U/H/B/X code, where no opponent
# number is given; as.numeric() turns those blank matches into NA, which keeps
# one entry per round in the vector.
opponent_num <- as.numeric(unlist(str_extract_all(secondary_rows, "(?<=\\|(W|L|D)\\s{2,3})[[:digit:]]{1,2}(?=\\|)|((?<!->)(?<=\\|(U|H|B|X))\\s{4}(?=\\|))")))

# Compute, for every player, the total and the average pre-tournament rating
# of the opponents they played. The result is a numeric matrix with one row
# per player and two columns:
#   "total_opp_pcr" - sum of the opponents' pre-tournament ratings
#   "avg_opp_pcr"   - that sum divided by the number of rated games
rounds_per_player <- 7
player_count <- nrow(processed_data)

# opponent_num must hold exactly one entry per player per round (NA for a
# round without an opponent); otherwise the rows below would be misaligned.
stopifnot(length(opponent_num) == player_count * rounds_per_player)

# Reshape the flat vector row-wise: row = player, column = round.
opponent_matrix <- matrix(
  opponent_num,
  nrow = player_count,
  ncol = rounds_per_player,
  byrow = TRUE
)

# Look up the pre-rating of the opponent of every single game. match() returns
# one position per game (NA where there was no opponent), so an opponent that
# appears more than once is counted once per game rather than once overall.
opponent_ratings <- matrix(
  processed_data$player_pre_rat[
    match(opponent_matrix, processed_data$player_num)
  ],
  nrow = player_count,
  ncol = rounds_per_player
)

# Sum the available ratings and divide by how many ratings were actually
# found, so a round without a rating affects neither numerator nor divisor.
total_opp_pcr <- rowSums(opponent_ratings, na.rm = TRUE)
rated_games <- rowSums(!is.na(opponent_ratings))

pcr_matrix <- cbind(
  total_opp_pcr = total_opp_pcr,
  avg_opp_pcr = total_opp_pcr / rated_games
)

# Verify the result by looking at the first few rows.
head(pcr_matrix, 5)

##      total_opp_pcr avg_opp_pcr
## [1,]         11237    1605.286
## [2,]         10285    1469.286
## [3,]         10945    1563.571
## [4,]         11015    1573.571
## [5,]         10506    1500.857

Round the average opponent ratings to the nearest whole number and add them to the data frame.

# Round the average opponent rating to a whole number, in place in the matrix.
pcr_matrix[, "avg_opp_pcr"] <- round(pcr_matrix[, "avg_opp_pcr"], digits = 0)

# Attach the rounded averages to the processed data under a readable name.
# Assigning the column by name avoids depending on the column name that
# cbind() would derive from the expression text, and it simply overwrites the
# column if this step is run a second time.
processed_data$avg_opp_pcr <- pcr_matrix[, "avg_opp_pcr"]

processed_data

##    player_num                      player_name player_state total_pts
## 1           1 GARY HUA                                   ON       6.0
## 2           2 DAKSHESH DARURI                            MI       6.0
## 3           3 ADITYA BAJAJ                               MI       6.0
## 4           4 PATRICK H SCHILLING                        MI       5.5
## 5           5 HANSHI ZUO                                 MI       5.5
## 6           6 HANSEN SONG                                OH       5.0
## 7           7 GARY DEE SWATHELL                          MI       5.0
## 8           8 EZEKIEL HOUGHTON                           MI       5.0
## 9           9 STEFANO LEE                                ON       5.0
## 10         10 ANVIT RAO                                  MI       5.0
## 11         11 CAMERON WILLIAM MC LEMAN                   MI       4.5
## 12         12 KENNETH J TACK                             MI       4.5
## 13         13 TORRANCE HENRY JR                          MI       4.5
## 14         14 BRADLEY SHAW                               MI       4.5
## 15         15 ZACHARY JAMES HOUGHTON                     MI       4.5
## 16         16 MIKE NIKITIN                               MI       4.0
## 17         17 RONALD GRZEGORCZYK                         MI       4.0
## 18         18 DAVID SUNDEEN                              MI       4.0
## 19         19 DIPANKAR ROY                               MI       4.0
## 20         20 JASON ZHENG                                MI       4.0
## 21         21 DINH DANG BUI                              ON       4.0
## 22         22 EUGENE L MCCLURE                           MI       4.0
## 23         23 ALAN BUI                                   ON       4.0
## 24         24 MICHAEL R ALDRICH                          MI       4.0
## 25         25 LOREN SCHWIEBERT                           MI       3.5
## 26         26 MAX ZHU                                    ON       3.5
## 27         27 GAURAV GIDWANI                             MI       3.5
## 28         28 SOFIA ADINA STANESCU-BELLU                 MI       3.5
## 29         29 CHIEDOZIE OKORIE                           MI       3.5
## 30         30 GEORGE AVERY JONES                         ON       3.5
## 31         31 RISHI SHETTY                               MI       3.5
## 32         32 JOSHUA PHILIP MATHEWS                      ON       3.5
## 33         33 JADE GE                                    MI       3.5
## 34         34 MICHAEL JEFFERY THOMAS                     MI       3.5
## 35         35 JOSHUA DAVID LEE                           MI       3.5
## 36         36 SIDDHARTH JHA                              MI       3.5
## 37         37 AMIYATOSH PWNANANDAM                       MI       3.5
## 38         38 BRIAN LIU                                  MI       3.0
## 39         39 JOEL R HENDON                              MI       3.0
## 40         40 FOREST ZHANG                               MI       3.0
## 41         41 KYLE WILLIAM MURPHY                        MI       3.0
## 42         42 JARED GE                                   MI       3.0
## 43         43 ROBERT GLEN VASEY                          MI       3.0
## 44         44 JUSTIN D SCHILLING                         MI       3.0
## 45         45 DEREK YAN                                  MI       3.0
## 46         46 JACOB ALEXANDER LAVALLEY                   MI       3.0
## 47         47 ERIC WRIGHT                                MI       2.5
## 48         48 DANIEL KHAIN                               MI       2.5
## 49         49 MICHAEL J MARTIN                           MI       2.5
## 50         50 SHIVAM JHA                                 MI       2.5
## 51         51 TEJAS AYYAGARI                             MI       2.5
## 52         52 ETHAN GUO                                  MI       2.5
## 53         53 JOSE C YBARRA                              MI       2.0
## 54         54 LARRY HODGE                                MI       2.0
## 55         55 ALEX KONG                                  MI       2.0
## 56         56 MARISA RICCI                               MI       2.0
## 57         57 MICHAEL LU                                 MI       2.0
## 58         58 VIRAJ MOHILE                               MI       2.0
## 59         59 SEAN M MC CORMICK                          MI       2.0
## 60         60 JULIA SHEN                                 MI       1.5
## 61         61 JEZZEL FARKAS                              ON       1.5
## 62         62 ASHWIN BALAJI                              MI       1.0
## 63         63 THOMAS JOSEPH HOSMER                       MI       1.0
## 64         64 BEN LI                                     MI       1.0
##    player_pre_rat avg_opp_pcr
## 1            1794        1605
## 2            1553        1469
## 3            1384        1564
## 4            1716        1574
## 5            1655        1501
## 6            1686        1519
## 7            1649        1372
## 8            1641        1468
## 9            1411        1523
## 10           1365        1554
## 11           1712        1468
## 12           1663        1506
## 13           1666        1498
## 14           1610        1515
## 15           1220        1484
## 16           1604        1386
## 17           1629        1499
## 18           1600        1480
## 19           1564        1426
## 20           1595        1411
## 21           1563        1470
## 22           1555        1300
## 23           1363        1214
## 24           1229        1357
## 25           1745        1363
## 26           1579        1507
## 27           1552        1222
## 28           1507        1522
## 29           1602        1314
## 30           1522        1144
## 31           1494        1260
## 32           1441        1379
## 33           1449        1277
## 34           1399        1375
## 35           1438        1150
## 36           1355        1388
## 37            980        1385
## 38           1423        1539
## 39           1436        1430
## 40           1348        1391
## 41           1403        1248
## 42           1332        1150
## 43           1283        1107
## 44           1199        1327
## 45           1242        1152
## 46            377        1358
## 47           1362        1392
## 48           1382        1356
## 49           1291        1286
## 50           1056        1296
## 51           1011        1356
## 52            935        1495
## 53           1393        1345
## 54           1270        1206
## 55           1186        1406
## 56           1153        1414
## 57           1092        1363
## 58            917        1391
## 59            853        1319
## 60            967        1330
## 61            955        1327
## 62           1530        1186
## 63           1175        1350
## 64           1163        1263

Get the working directory path and export the processed data to a CSV file.

# Export the processed data to the current working directory.
# file.path() joins the directory and file name with the separator that is
# appropriate for the platform, instead of pasting one in by hand.
path <- getwd()
output_file <- file.path(path, "chess_processed_data.csv")
write.csv(processed_data, output_file, row.names = FALSE)

# Show the first rows of what was written.
head(processed_data, 5)

##   player_num                      player_name player_state total_pts
## 1          1 GARY HUA                                   ON       6.0
## 2          2 DAKSHESH DARURI                            MI       6.0
## 3          3 ADITYA BAJAJ                               MI       6.0
## 4          4 PATRICK H SCHILLING                        MI       5.5
## 5          5 HANSHI ZUO                                 MI       5.5
##   player_pre_rat avg_opp_pcr
## 1           1794        1605
## 2           1553        1469
## 3           1384        1564
## 4           1716        1574
## 5           1655        1501

Conclusion:

In Project 1, we read a chess tournament cross-table, stored as a structured text file, directly from a GitHub repository. Using regular expressions, we extracted each player's number, name, state, total points, and pre-tournament rating, then looked up every player's opponents to compute the average pre-tournament rating of the opposition they faced. The result was exported as a CSV file. This process demonstrates essential skills in data acquisition and preprocessing—key steps in any data science workflow. The resulting dataset is now ready for exploratory analysis or modeling in future projects.