df<- read.delim(url, sep="|", header=FALSE,skip=4)
head(df)
## V1
## 1 1
## 2 ON
## 3 -----------------------------------------------------------------------------------------
## 4 2
## 5 MI
## 6 -----------------------------------------------------------------------------------------
## V2 V3 V4 V5 V6 V7 V8 V9
## 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12
## 2 15445895 / R: 1794 ->1817 N:2 W B W B W B
## 3
## 4 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20
## 5 14598900 / R: 1553 ->1663 N:2 B W B W B W
## 6
## V10 V11
## 1 D 4 NA
## 2 W NA
## 3 NA
## 4 W 7 NA
## 5 B NA
## 6 NA
rows_to_remove<- seq(from= 0, to = nrow(df), by=3) #removing extra separator lines
clean_df<- df[-rows_to_remove, ] #anything before the "," indicates the row
head(clean_df)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B W
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W B W B
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B W B W
## V9 V10 V11
## 1 D 12 D 4 NA
## 2 B W NA
## 4 W 20 W 7 NA
## 5 W B NA
## 7 W 13 W 12 NA
## 8 B W NA
#filter out every second row and merge two data frames together into one:
second_row<- seq(2, nrow(clean_df), by =2)
second_row_df<-clean_df[second_row, ]
head(data.frame(second_row_df))
## V1 V2 V3 V4 V5 V6 V7 V8
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B W
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W B W B
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B W B W
## 11 MI 12616049 / R: 1716 ->1744 N:2 W B W B W
## 14 MI 14601533 / R: 1655 ->1690 N:2 B W B W B
## 17 OH 15055204 / R: 1686 ->1687 N:3 W B W B B
## V9 V10 V11
## 2 B W NA
## 5 W B NA
## 8 B W NA
## 11 B B NA
## 14 W B NA
## 17 W B NA
first_row_df<-clean_df[-second_row, ]
head(first_row_df)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 10 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 13 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## 16 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10
## V9 V10 V11
## 1 D 12 D 4 NA
## 4 W 20 W 7 NA
## 7 W 13 W 12 NA
## 10 W 19 D 1 NA
## 13 W 14 W 17 NA
## 16 W 27 W 21 NA
#to combine the two dataframes:
com_df<- cbind(first_row_df, second_row_df)
head(com_df)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 10 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 13 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## 16 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10
## V9 V10 V11 V1 V2 V3 V4 V5
## 1 D 12 D 4 NA ON 15445895 / R: 1794 ->1817 N:2 W B
## 4 W 20 W 7 NA MI 14598900 / R: 1553 ->1663 N:2 B W
## 7 W 13 W 12 NA MI 14959604 / R: 1384 ->1640 N:2 W B
## 10 W 19 D 1 NA MI 12616049 / R: 1716 ->1744 N:2 W B
## 13 W 14 W 17 NA MI 14601533 / R: 1655 ->1690 N:2 B W
## 16 W 27 W 21 NA OH 15055204 / R: 1686 ->1687 N:3 W B
## V6 V7 V8 V9 V10 V11
## 1 W B W B W NA
## 4 B W B W B NA
## 7 W B W B W NA
## 10 W B W B B NA
## 13 B W B W B NA
## 16 W B B W B NA
com_df<- com_df %>%
subset(select=c(1:10, 12:13))
colnames(com_df)<- c("PairID", "Name", "Points", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7", "State", "USCF_ID")
head(com_df)
## PairID Name Points Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 10 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 13 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 16 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## Round5 Round6 Round7 State USCF_ID
## 1 W 7 D 12 D 4 ON 15445895 / R: 1794 ->1817
## 4 W 16 W 20 W 7 MI 14598900 / R: 1553 ->1663
## 7 W 11 W 13 W 12 MI 14959604 / R: 1384 ->1640
## 10 D 5 W 19 D 1 MI 12616049 / R: 1716 ->1744
## 13 D 4 W 14 W 17 MI 14601533 / R: 1655 ->1690
## 16 D 10 W 27 W 21 OH 15055204 / R: 1686 ->1687
com_df<- com_df %>%
separate_wider_delim('USCF_ID',
delim="/",
names= c('USCF_ID',
'Pre_rating'))
com_df$Pre_rating<- trimws(str_sub(com_df$Pre_rating, 4,8))
print(com_df)
## # A tibble: 64 × 13
## PairID Name Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 " 1 " " GAR… "6.0 … W 39 W 21 W 18 W 14 W 7 D 12 D 4 " …
## 2 " 2 " " DAK… "6.0 … W 63 W 58 L 4 W 17 W 16 W 20 W 7 " …
## 3 " 3 " " ADI… "6.0 … L 8 W 61 W 25 W 21 W 11 W 13 W 12 " …
## 4 " 4 " " PAT… "5.5 … W 23 D 28 W 2 W 26 D 5 W 19 D 1 " …
## 5 " 5 " " HAN… "5.5 … W 45 W 37 D 12 D 13 D 4 W 14 W 17 " …
## 6 " 6 " " HAN… "5.0 … W 34 D 29 L 11 W 35 D 10 W 27 W 21 " …
## 7 " 7 " " GAR… "5.0 … W 57 W 46 W 13 W 11 L 1 W 9 L 2 " …
## 8 " 8 " " EZE… "5.0 … W 3 W 32 L 14 L 9 W 47 W 28 W 19 " …
## 9 " 9 " " STE… "5.0 … W 25 L 18 W 59 W 8 W 26 L 7 W 20 " …
## 10 " 10 " " ANV… "5.0 … D 16 L 19 W 55 W 31 D 6 W 25 W 18 " …
## # ℹ 54 more rows
## # ℹ 2 more variables: USCF_ID <chr>, Pre_rating <chr>
com_df[, 4:10]<- com_df[, 4:10] %>%
lapply(function(x) trimws(gsub("[^0-9]", "", x)))
#to convert columns to be numeric:
com_df[,c(1, 3:10, 12:13)]<- lapply(com_df[,c(1, 3:10, 12:13)], as.numeric)
print(com_df)
## # A tibble: 64 × 13
## PairID Name Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 " GARY … 6 39 21 18 14 7 12 4 " …
## 2 2 " DAKSH… 6 63 58 4 17 16 20 7 " …
## 3 3 " ADITY… 6 8 61 25 21 11 13 12 " …
## 4 4 " PATRI… 5.5 23 28 2 26 5 19 1 " …
## 5 5 " HANSH… 5.5 45 37 12 13 4 14 17 " …
## 6 6 " HANSE… 5 34 29 11 35 10 27 21 " …
## 7 7 " GARY … 5 57 46 13 11 1 9 2 " …
## 8 8 " EZEKI… 5 3 32 14 9 47 28 19 " …
## 9 9 " STEFA… 5 25 18 59 8 26 7 20 " …
## 10 10 " ANVIT… 5 16 19 55 31 6 25 18 " …
## # ℹ 54 more rows
## # ℹ 2 more variables: USCF_ID <dbl>, Pre_rating <dbl>
com_df[, 4:10]<- com_df[, 4:10] %>% lapply(function(x) trimws(str_sub(x, 3,5))) view(com_df)
fetch<- function(df) {
lapply(df, function(x) {
x<- x[!is.na(x)] #to remove NA from matching
if (length(x)==0) {
return(NA)
}
P_rating<- com_df$Pre_rating[com_df$PairID == x]
if (length(P_rating) > 0) {
return(P_rating)
} else {
return(NA)}
})}
#transformation to make each round PairID in exchange for the pre_rating
com_df[, 4:10]<- lapply(com_df[, 4:10], fetch)
view(com_df)
# need to convert the Rounds back to numeric again:
com_df[, 4:10]<- lapply(com_df[, 4:10], as.numeric)
summary(com_df)
## PairID Name Points Round1
## Min. : 1.00 Length:64 Min. :1.000 Min. : 377
## 1st Qu.:16.75 Class :character 1st Qu.:2.500 1st Qu.:1263
## Median :32.50 Mode :character Median :3.500 Median :1430
## Mean :32.50 Mean :3.438 Mean :1392
## 3rd Qu.:48.25 3rd Qu.:4.000 3rd Qu.:1596
## Max. :64.00 Max. :6.000 Max. :1794
## NA's :4
## Round2 Round3 Round4 Round5 Round6
## Min. : 377 Min. : 377 Min. : 377 Min. : 377 Min. : 377
## 1st Qu.:1232 1st Qu.:1215 1st Qu.:1222 1st Qu.:1239 1st Qu.:1249
## Median :1407 Median :1407 Median :1407 Median :1437 Median :1430
## Mean :1385 Mean :1372 Mean :1378 Mean :1387 Mean :1396
## 3rd Qu.:1591 3rd Qu.:1583 3rd Qu.:1591 3rd Qu.:1600 3rd Qu.:1599
## Max. :1794 Max. :1794 Max. :1794 Max. :1794 Max. :1794
## NA's :2 NA's :4 NA's :2 NA's :12 NA's :6
## Round7 State USCF_ID Pre_rating
## Min. : 377 Length:64 Min. :10131499 Min. : 377
## 1st Qu.:1232 Class :character 1st Qu.:12902535 1st Qu.:1227
## Median :1417 Mode :character Median :14736764 Median :1407
## Mean :1382 Mean :14167956 Mean :1378
## 3rd Qu.:1591 3rd Qu.:15117878 3rd Qu.:1583
## Max. :1794 Max. :15771592 Max. :1794
## NA's :10
com_df<- com_df %>%
mutate(Avg_op_rating = floor(rowMeans(com_df[1:64, 4:10], na.rm = TRUE)))
print(com_df)
## # A tibble: 64 × 14
## PairID Name Points Round1 Round2 Round3 Round4 Round5 Round6 Round7 State
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 " GARY … 6 1436 1563 1600 1610 1649 1663 1716 " …
## 2 2 " DAKSH… 6 1175 917 1716 1629 1604 1595 1649 " …
## 3 3 " ADITY… 6 1641 955 1745 1563 1712 1666 1663 " …
## 4 4 " PATRI… 5.5 1363 1507 1553 1579 1655 1564 1794 " …
## 5 5 " HANSH… 5.5 1242 980 1663 1666 1716 1610 1629 " …
## 6 6 " HANSE… 5 1399 1602 1712 1438 1365 1552 1563 " …
## 7 7 " GARY … 5 1092 377 1666 1712 1794 1411 1553 " …
## 8 8 " EZEKI… 5 1384 1441 1610 1411 1362 1507 1564 " …
## 9 9 " STEFA… 5 1745 1600 853 1641 1579 1649 1595 " …
## 10 10 " ANVIT… 5 1604 1564 1186 1494 1686 1745 1600 " …
## # ℹ 54 more rows
## # ℹ 3 more variables: USCF_ID <dbl>, Pre_rating <dbl>, Avg_op_rating <dbl>
final<- com_df %>%
subset(select=c(Name, State, Points, Pre_rating, Avg_op_rating))
print(final)
## # A tibble: 64 × 5
## Name State Points Pre_rating Avg_op_rating
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 " GARY HUA " " ON " 6 1794 1605
## 2 " DAKSHESH DARURI " " MI " 6 1553 1469
## 3 " ADITYA BAJAJ " " MI " 6 1384 1563
## 4 " PATRICK H SCHILLING " " MI " 5.5 1716 1573
## 5 " HANSHI ZUO " " MI " 5.5 1655 1500
## 6 " HANSEN SONG " " OH " 5 1686 1518
## 7 " GARY DEE SWATHELL " " MI " 5 1649 1372
## 8 " EZEKIEL HOUGHTON " " MI " 5 1641 1468
## 9 " STEFANO LEE " " ON " 5 1411 1523
## 10 " ANVIT RAO " " MI " 5 1365 1554
## # ℹ 54 more rows
write.csv(final, file='Project_Chess.csv', row.names = FALSE)