This assignment is about using R to tidy the given data, moving between “wide” and “long” formats to best support our analysis. Following libraries will be used “base R”, tidyr and dplyr to complete the assignment.
Load the libraries
library(tidyr) # gather() and spread() functions are used to tidy the data
library(dplyr) # functions select(), summarise() and filter() are from this library
library(stringr) # used for pattern matching and data cleansing
Sample CSV file is created using data instructions provided in Assignment Tidying and Transforming Data.pdf.
localDir <- getwd()
flightsFile <- paste(str_trim(localDir),"//Flights.csv",sep="")
# Data into a data frame
flights.raw <- read.csv(flightsFile, header = T)
# Rename the column names accordingly
flights.raw <- rename(flights.raw, `Los Angeles` = `Los.Angeles`, `San Diego` =`San.Diego`, `San Francisco` = `San.Francisco`, Airline = X, ArrivalInfo = `X.1`)
flights.raw
## Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Remove rows that are empty.
# flights.raw <- flights.raw %>% filter(`Los Angeles` != "NA")
flights.raw <- flights.raw %>%
filter(!is.na(`Los Angeles`))
flights.raw
## Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
# Format data
# 2 rows make up 1 row, following command populate Airline name in second row.
# Fill in missing data.
flights.raw[seq(2,nrow(flights.raw),2),1] <- flights.raw[seq(1,nrow(flights.raw),2),1]
flights.raw
## Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# tidy data
# flights.raw dataset has 3 variables(Airline, ArrivalInfo, Cities) and 2 observations(on time, delayed).
# 1. Airline has 2 possible values (Alaska, AM West)
# 2. ArrivalInfo has 2 possible values(on time and delayed)
# 3. Cities has 5 possible values(Los Angeles, Phoenix, San Diego, San Francisco, Seattle)
# 4. Observations have 20 values (497,221,212,503,1841,62,12,20,102,305,694,4840,383,320,201,117,415,65,129,61)
flights.table1 <- flights.raw %>%
gather(City, Count, -Airline, -ArrivalInfo, na.rm = TRUE)
# A tidy version of the flights.raw data looks like this
flights.table1
## Airline ArrivalInfo City Count
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Data set is mostly tidy except for ArrivalInfo column. ArrivalInfo stores the names of variables.
# Expand rows into columns
flights.table2 <- flights.table1 %>%
spread(ArrivalInfo, Count) %>%
select (Airline, City,`on time`, delayed)
flights.table2
## Airline City on time delayed
## 1 ALASKA Los Angeles 497 62
## 2 ALASKA Phoenix 221 12
## 3 ALASKA San Diego 212 20
## 4 ALASKA San Francisco 503 102
## 5 ALASKA Seattle 1841 305
## 6 AM WEST Los Angeles 694 117
## 7 AM WEST Phoenix 4840 415
## 8 AM WEST San Diego 383 65
## 9 AM WEST San Francisco 320 129
## 10 AM WEST Seattle 201 61
Analysis of the data.
# Add a new column to find total flights by airline using mutate function
flights.table3 <- flights.table2 %>%
mutate(TotalFlights = flights.table2$`on time`+flights.table2$delayed ,
PercentageOfOnTime = round(flights.table2$`on time` * 100/TotalFlights,2) ,
PercentageOfDelay = round(flights.table2$delayed * 100/TotalFlights,2)
) %>%
filter(City %in% c("Los Angeles","Phoenix")) %>%
select(Airline,City,PercentageOfOnTime,PercentageOfDelay) %>%
arrange(City,Airline,PercentageOfDelay)
# By looking at the data for cities Los Angeles and Phoenix both airlines perform close enough. However, ALASKA airlines is better with on time arrival.
flights.table3
## Airline City PercentageOfOnTime PercentageOfDelay
## 1 ALASKA Los Angeles 88.91 11.09
## 2 AM WEST Los Angeles 85.57 14.43
## 3 ALASKA Phoenix 94.85 5.15
## 4 AM WEST Phoenix 92.10 7.90
# Data for San Diego, San Francisco and Seattle
flights.table3 <- flights.table2 %>%
mutate(TotalFlights = flights.table2$`on time`+flights.table2$delayed ,
PercentageOfOnTime = round(flights.table2$`on time` * 100/TotalFlights,2) ,
PercentageOfDelay = round(flights.table2$delayed * 100/TotalFlights,2)
) %>%
filter(City %in% c("San Diego","San Francisco","Seattle")) %>%
select(Airline,City,PercentageOfOnTime,PercentageOfDelay) %>%
arrange(City,Airline,PercentageOfDelay)
# By looking at the data for cities San Diego, San Francisco and Seattle, ALASKA airlines is better with on time arrival and also has less delays.
flights.table3
## Airline City PercentageOfOnTime PercentageOfDelay
## 1 ALASKA San Diego 91.38 8.62
## 2 AM WEST San Diego 85.49 14.51
## 3 ALASKA San Francisco 83.14 16.86
## 4 AM WEST San Francisco 71.27 28.73
## 5 ALASKA Seattle 85.79 14.21
## 6 AM WEST Seattle 76.72 23.28
Additional analysis, Seattle airport is used as example.
# Data for Seattle
flights.table3 <- flights.table1 %>%
filter(City == "Seattle") %>%
select(Airline,ArrivalInfo,Count) %>%
spread(ArrivalInfo,Count)
# Bind row totals
flights.table3$Totals <- rowSums(flights.table3[,-1] )
# Bind column totals
flights.table3 %<>%
summarise(Airline = "Totals",
delayed = sum(delayed),
`on time` = sum(`on time`),
Totals = sum(Totals)) %>%
bind_rows(flights.table3,.)
flights.table3
## Airline delayed on time Totals
## 1 ALASKA 305 1841 2146
## 2 AM WEST 61 201 262
## 3 Totals 366 2042 2408
Assuming ALASKA and AM WEST are the olny airlines flying to Seattle,
Given random flight being delayed at Seattle airport, probability that it belongs to ALASKA airlines : 0.14
Given random flight being delayed at Seattle airport, probability that it belongs to AM WEST airlines : 0.23
Given random flight being on time at Seattle airport, probability that it belongs to ALASKA airlines : 0.86
Given random flight being on time at Seattle airport, probability that it belongs to AM WEST airlines : 0.77
Of all the airlines flying to Seattle, percentage of ALASKA airlines flights : 89.12
Of all the airlines flying to Seattle, percentage of AM WEST airlines flights : 10.88
It is fair to say ALASKA airlines has better on time flights over AM WEST airlines.
Libraries used for this project.
library(readr) # used for reading the text file
library(stringr) # used for extracting text using regular expressions
library(tidyr) # replaced sqldf
library(dplyr) # replaced sqldf
library(utils) # used for generating CSV file
Load Chess Tournament Cross Table file.
# Get local directory and load the file
localDir <- getwd()
chessFile <- paste(str_trim(localDir),"//tournamentinfo.txt",sep="")
# Skip top 3 lines as it contains header info
chess.text<-read_lines(chessFile,skip = 3)
head(chess.text,10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [3] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [9] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [10] "-----------------------------------------------------------------------------------------"
There are total of 64 players. File needs formatting in order to perform calculations and generate CSV output.
# First level of formatting removes hypen("-") characters
chess.text.level1 <- gsub("-","",chess.text)
head(chess.text.level1, 6)
## [1] ""
## [2] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [3] " ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W |"
## [4] ""
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B |"
# Second level of formatting removes empty lines
# This makes a continous data without newline characters
chess.text.level2 <- paste(unlist(chess.text.level1), collapse = " ")
head(chess.text.level2,10)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W | 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B | 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 >1640 |N:2 |W |B |W |B |W |B |W | 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1| MI | 12616049 / R: 1716 >1744 |N:2 |W |B |W |B |W |B |B | 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17| MI | 14601533 / R: 1655 >1690 |N:2 |B |W |B |W |B |W |B | 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21| OH | 15055204 / R: 1686 >1687 |N:3 |W |B |W |B |B |W |B | 7 | GARY DEE SWATHELL |5.0 |W 57|W 46|W 13|W 11|L 1|W 9|L 2| MI | 11146376 / R: 1649 >1673 |N:3 |W |B |W |B |B |W |W | 8 | EZEKIEL HOUGHTON |5.0 |W 3|W 32|L 14|L 9|W 47|W 28|W 19| MI | 15142253 / R: 1641P17>1657P24 |N:3 |B |W |B |W |B |W |W | 9 | STEFANO LEE |5.0 |W 25|L 18|W 59|W 8|W 26|L 7|W 20| ON | 14954524 / R: 1411 >1564 |N:2 |W |B |W |B |W |B |B | 10 | ANVIT RAO |5.0 |D 16|L 19|W 55|W 31|D 6|W 25|W 18| MI | 14150362 / R: 1365 >1544 |N:3 |W |W |B |B |W |B |W | 11 | CAMERON WILLIAM MC LEMAN |4.5 |D 38|W 56|W 6|L 7|L 3|W 34|W 26| MI | 12581589 / R: 1712 >1696 |N:3 |B |W |B |W |B |W |B | 12 | KENNETH J TACK |4.5 |W 42|W 33|D 5|W 38|H |D 1|L 3| MI | 12681257 / R: 1663 >1670 |N:3 |W |B |W |B | |W |B | 13 | TORRANCE HENRY JR |4.5 |W 36|W 27|L 7|D 5|W 33|L 3|W 32| MI | 15082995 / R: 1666 >1662 |N:3 |B |W |B |B |W |W |B | 14 | BRADLEY SHAW |4.5 |W 54|W 44|W 8|L 1|D 27|L 5|W 31| MI | 10131499 / R: 1610 >1618 |N:3 |W |B |W |W |B |B |W | 15 | ZACHARY JAMES HOUGHTON |4.5 |D 19|L 16|W 30|L 22|W 54|W 33|W 38| MI | 15619130 / R: 1220P13>1416P20 |N:3 |B |B |W |W |B |B |W | 16 | MIKE NIKITIN |4.0 |D 10|W 15|H |W 39|L 2|W 36|U | MI | 10295068 / R: 1604 >1613 |N:3 |B |W | |B |W |B | | 17 | RONALD GRZEGORCZYK |4.0 |W 48|W 41|L 26|L 2|W 23|W 22|L 5| MI | 10297702 / R: 1629 >1610 |N:3 |W |B |W |B |W |B |W | 18 | DAVID SUNDEEN |4.0 |W 47|W 9|L 1|W 32|L 19|W 38|L 10| MI | 11342094 / R: 1600 >1600 |N:3 |B |W |B |W |B |W |B | 19 | DIPANKAR ROY |4.0 |D 15|W 10|W 52|D 28|W 18|L 4|L 8| MI | 14862333 / R: 1564 >1570 |N:3 |W |B |W |B |W |W |B | 20 | JASON ZHENG |4.0 |L 40|W 49|W 23|W 41|W 28|L 2|L 9| MI | 14529060 / R: 1595 >1569 |N:4 |W |B |W |B |W |B |W | 21 | DINH DANG BUI |4.0 |W 43|L 1|W 47|L 3|W 40|W 39|L 6| ON | 15495066 / R: 1563P22>1562 |N:3 |B |W |B |W |W |B |W | 22 | EUGENE L MCCLURE |4.0 |W 64|D 52|L 28|W 15|H |L 17|W 40| MI | 12405534 / R: 1555 >1529 |N:4 |W |B |W |B | |W |B | 23 | ALAN BUI |4.0 |L 4|W 43|L 20|W 58|L 17|W 37|W 46| ON | 15030142 / R: 1363 >1371 | |B |W |B |W |B |W |B | 24 | MICHAEL R ALDRICH |4.0 |L 28|L 47|W 43|L 25|W 60|W 44|W 39| MI | 13469010 / R: 1229 >1300 |N:4 |B |W |B |B |W |W |B | 25 | LOREN SCHWIEBERT |3.5 |L 9|W 53|L 3|W 24|D 34|L 10|W 47| MI | 12486656 / R: 1745 >1681 |N:4 |B |W |B |W |B |W |B | 26 | MAX ZHU |3.5 |W 49|W 40|W 17|L 4|L 9|D 32|L 11| ON | 15131520 / R: 1579 >1564 |N:4 |B |W |B |W |B |W |W | 27 | GAURAV GIDWANI |3.5 |W 51|L 13|W 46|W 37|D 14|L 6|U | MI | 14476567 / R: 1552 >1539 |N:4 |W |B |W |B |W |B | | 28 | SOFIA ADINA STANESCUBELLU |3.5 |W 24|D 4|W 22|D 19|L 20|L 8|D 36| MI | 14882954 / R: 1507 >1513 |N:3 |W |W |B |W |B |B |W | 29 | CHIEDOZIE OKORIE |3.5 |W 50|D 6|L 38|L 34|W 52|W 48|U | MI | 15323285 / R: 1602P6 >1508P12 |N:4 |B |W |B |W |W |B | | 30 | GEORGE AVERY JONES |3.5 |L 52|D 64|L 15|W 55|L 31|W 61|W 50| ON | 12577178 / R: 1522 >1444 | |W |B |B |W |W |B |B | 31 | RISHI SHETTY |3.5 |L 58|D 55|W 64|L 10|W 30|W 50|L 14| MI | 15131618 / R: 1494 >1444 | |B |W |B |W |B |W |B | 32 | JOSHUA PHILIP MATHEWS |3.5 |W 61|L 8|W 44|L 18|W 51|D 26|L 13| ON | 14073750 / R: 1441 >1433 |N:4 |W |B |W |B |W |B |W | 33 | JADE GE |3.5 |W 60|L 12|W 50|D 36|L 13|L 15|W 51| MI | 14691842 / R: 1449 >1421 | |B |W |B |W |B |W |B | 34 | MICHAEL JEFFERY THOMAS |3.5 |L 6|W 60|L 37|W 29|D 25|L 11|W 52| MI | 15051807 / R: 1399 >1400 | |B |W |B |B |W |B |W | 35 | JOSHUA DAVID LEE |3.5 |L 46|L 38|W 56|L 6|W 57|D 52|W 48| MI | 14601397 / R: 1438 >1392 | |W |W |B |W |B |B |W | 36 | SIDDHARTH JHA |3.5 |L 13|W 57|W 51|D 33|H |L 16|D 28| MI | 14773163 / R: 1355 >1367 |N:4 |W |B |W |B | |W |B | 37 | AMIYATOSH PWNANANDAM |3.5 |B |L 5|W 34|L 27|H |L 23|W 61| MI | 15489571 / R: 980P12>1077P17 | | |B |W |W | |B |W | 38 | BRIAN LIU |3.0 |D 11|W 35|W 29|L 12|H |L 18|L 15| MI | 15108523 / R: 1423 >1439 |N:4 |W |B |W |W | |B |B | 39 | JOEL R HENDON |3.0 |L 1|W 54|W 40|L 16|W 44|L 21|L 24| MI | 12923035 / R: 1436P23>1413 |N:4 |B |W |B |W |B |W |W | 40 | FOREST ZHANG |3.0 |W 20|L 26|L 39|W 59|L 21|W 56|L 22| MI | 14892710 / R: 1348 >1346 | |B |B |W |W |B |W |W | 41 | KYLE WILLIAM MURPHY |3.0 |W 59|L 17|W 58|L 20|X |U |U | MI | 15761443 / R: 1403P5 >1341P9 | |B |W |B |W | | | | 42 | JARED GE |3.0 |L 12|L 50|L 57|D 60|D 61|W 64|W 56| MI | 14462326 / R: 1332 >1256 | |B |W |B |B |W |W |B | 43 | ROBERT GLEN VASEY |3.0 |L 21|L 23|L 24|W 63|W 59|L 46|W 55| MI | 14101068 / R: 1283 >1244 | |W |B |W |W |B |B |W | 44 | JUSTIN D SCHILLING |3.0 |B |L 14|L 32|W 53|L 39|L 24|W 59| MI | 15323504 / R: 1199 >1199 | | |W |B |B |W |B |W | 45 | DEREK YAN |3.0 |L 5|L 51|D 60|L 56|W 63|D 55|W 58| MI | 15372807 / R: 1242 >1191 | |W |B |W |B |W |B |W | 46 | JACOB ALEXANDER LAVALLEY |3.0 |W 35|L 7|L 27|L 50|W 64|W 43|L 23| MI | 15490981 / R: 377P3 >1076P10 | |B |W |B |W |B |W |W | 47 | ERIC WRIGHT |2.5 |L 18|W 24|L 21|W 61|L 8|D 51|L 25| MI | 12533115 / R: 1362 >1341 | |W |B |W |B |W |B |W | 48 | DANIEL KHAIN |2.5 |L 17|W 63|H |D 52|H |L 29|L 35| MI | 14369165 / R: 1382 >1335 | |B |W | |B | |W |B | 49 | MICHAEL J MARTIN |2.5 |L 26|L 20|D 63|D 64|W 58|H |U | MI | 12531685 / R: 1291P12>1259P17 | |W |W |B |W |B | | | 50 | SHIVAM JHA |2.5 |L 29|W 42|L 33|W 46|H |L 31|L 30| MI | 14773178 / R: 1056 >1111 | |W |B |W |B | |B |W | 51 | TEJAS AYYAGARI |2.5 |L 27|W 45|L 36|W 57|L 32|D 47|L 33| MI | 15205474 / R: 1011 >1097 | |B |W |B |W |B |W |W | 52 | ETHAN GUO |2.5 |W 30|D 22|L 19|D 48|L 29|D 35|L 34| MI | 14918803 / R: 935 >1092 |N:4 |B |W |B |W |B |W |B | 53 | JOSE C YBARRA |2.0 |H |L 25|H |L 44|U |W 57|U | MI | 12578849 / R: 1393 >1359 | | |B | |W | |W | | 54 | LARRY HODGE |2.0 |L 14|L 39|L 61|B |L 15|L 59|W 64| MI | 12836773 / R: 1270 >1200 | |B |B |W | |W |B |W | 55 | ALEX KONG |2.0 |L 62|D 31|L 10|L 30|B |D 45|L 43| MI | 15412571 / R: 1186 >1163 | |W |B |W |B | |W |B | 56 | MARISA RICCI |2.0 |H |L 11|L 35|W 45|H |L 40|L 42| MI | 14679887 / R: 1153 >1140 | | |B |W |W | |B |W | 57 | MICHAEL LU |2.0 |L 7|L 36|W 42|L 51|L 35|L 53|B | MI | 15113330 / R: 1092 >1079 | |B |W |W |B |W |B | | 58 | VIRAJ MOHILE |2.0 |W 31|L 2|L 41|L 23|L 49|B |L 45| MI | 14700365 / R: 917 > 941 | |W |B |W |B |W | |B | 59 | SEAN M MC CORMICK |2.0 |L 41|B |L 9|L 40|L 43|W 54|L 44| MI | 12841036 / R: 853 > 878 | |W | |B |B |W |W |B | 60 | JULIA SHEN |1.5 |L 33|L 34|D 45|D 42|L 24|H |U | MI | 14579262 / R: 967 > 984 | |W |B |B |W |B | | | 61 | JEZZEL FARKAS |1.5 |L 32|L 3|W 54|L 47|D 42|L 30|L 37| ON | 15771592 / R: 955P11> 979P18 | |B |W |B |W |B |W |B | 62 | ASHWIN BALAJI |1.0 |W 55|U |U |U |U |U |U | MI | 15219542 / R: 1530 >1535 | |B | | | | | | | 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U | MI | 15057092 / R: 1175 >1125 | |W |B |W |B |B | | | 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54| MI | 15006561 / R: 1163 >1112 | |B |W |W |B |W |B |B | "
# Add extra characters at the end of text data. This helps read complete data.
# If extra characters are not added, last line will be skipped.
chess.text.level2 <- paste(chess.text.level2,"_*_*_*_")
# Extract the text using pattern repeating 180 characters
# Idea is to combine 2 lines of each player's info into single line
# Once combined, information about single player takes 180 characters
pattern <- "[[:print:]]{180}"
chess.text.level3 <- unlist(str_extract_all(chess.text.level2, pattern = pattern))
head(chess.text.level3,3)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W | "
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B | "
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 >1640 |N:2 |W |B |W |B |W |B |W | "
# Each player has about 20 columns of information. Split the single string into 20 columns
chess.data <- data.frame(unlist(str_split_fixed(chess.text.level3, "\\|", 21)))
head(chess.data, 5)
## X1 X2 X3 X4 X5 X6 X7
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 5 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## X8 X9 X10 X11 X12 X13 X14
## 1 W 7 D 12 D 4 ON 15445895 / R: 1794 >1817 N:2 W
## 2 W 16 W 20 W 7 MI 14598900 / R: 1553 >1663 N:2 B
## 3 W 11 W 13 W 12 MI 14959604 / R: 1384 >1640 N:2 W
## 4 D 5 W 19 D 1 MI 12616049 / R: 1716 >1744 N:2 W
## 5 D 4 W 14 W 17 MI 14601533 / R: 1655 >1690 N:2 B
## X15 X16 X17 X18 X19 X20 X21
## 1 B W B W B W
## 2 W B W B W B
## 3 B W B W B W
## 4 B W B W B B
## 5 W B W B W B
# Using regular expressions extract the pre match rating for each player
pattern = "(R:\\s+[[:digit:]]{3,4})"
chess.preGameRating <- unlist(str_extract(string = chess.data[,12], pattern = pattern))
# Get the pre-game rating
pattern = "[[:digit:]]{3,4}"
chess.preGameRating <- str_extract(string = chess.preGameRating, pattern = pattern)
chess.preGameRating <- data.frame(chess.data, preGameRating = chess.preGameRating)
head(chess.preGameRating, 5)
## X1 X2 X3 X4 X5 X6 X7
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 5 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## X8 X9 X10 X11 X12 X13 X14
## 1 W 7 D 12 D 4 ON 15445895 / R: 1794 >1817 N:2 W
## 2 W 16 W 20 W 7 MI 14598900 / R: 1553 >1663 N:2 B
## 3 W 11 W 13 W 12 MI 14959604 / R: 1384 >1640 N:2 W
## 4 D 5 W 19 D 1 MI 12616049 / R: 1716 >1744 N:2 W
## 5 D 4 W 14 W 17 MI 14601533 / R: 1655 >1690 N:2 B
## X15 X16 X17 X18 X19 X20 X21 preGameRating
## 1 B W B W B W 1794
## 2 W B W B W B 1553
## 3 B W B W B W 1384
## 4 B W B W B B 1716
## 5 W B W B W B 1655
# Format the column type
chess.preGameRating$X1 <- as.numeric(as.character(chess.preGameRating$X1))
chess.preGameRating$preGameRating <- as.numeric(as.character(chess.preGameRating$preGameRating))
# Extract the information about opponent for each round
# Opponent player info is left as "NA" values if player did not play or got bye
# Get opponet playerID
pattern = "[[:digit:]]{1,2}"
chess.preGameRating$G1opponent = as.numeric(str_match(string = chess.preGameRating$X4, pattern = pattern))
chess.preGameRating$G2opponent = as.numeric(str_match(string = chess.preGameRating$X5, pattern = pattern))
chess.preGameRating$G3opponent = as.numeric(str_match(string = chess.preGameRating$X6, pattern = pattern))
chess.preGameRating$G4opponent = as.numeric(str_match(string = chess.preGameRating$X7, pattern = pattern))
chess.preGameRating$G5opponent = as.numeric(str_match(string = chess.preGameRating$X8, pattern = pattern))
chess.preGameRating$G6opponent = as.numeric(str_match(string = chess.preGameRating$X9, pattern = pattern))
chess.preGameRating$G7opponent = as.numeric(str_match(string = chess.preGameRating$X10, pattern = pattern))
# Get required data
chess.data <- chess.preGameRating %>%
select(playerID = as.numeric(X1),playerName = X2, playerPoints = as.numeric(X3), playerState = X11, preGameRating, G1opponent,G2opponent,G3opponent,G4opponent,G5opponent,G6opponent,G7opponent)
# Convert data from factors to character
chess.data$playerName <- as.character(chess.data$playerName)
chess.data$playerState <- as.character(chess.data$playerState)
chess.data$playerPoints <- as.character(chess.data$playerPoints)
# Remove extra spaces
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
chess.data$playerState <- trim(chess.data$playerState)
chess.data$playerName <- trim(chess.data$playerName)
chess.data$playerPoints <- trim(chess.data$playerPoints)
# Player 62 did not play 2, 3 or 4 games
tail(chess.data,10)
## playerID playerName playerPoints playerState preGameRating
## 55 55 ALEX KONG 2.0 MI 1186
## 56 56 MARISA RICCI 2.0 MI 1153
## 57 57 MICHAEL LU 2.0 MI 1092
## 58 58 VIRAJ MOHILE 2.0 MI 917
## 59 59 SEAN M MC CORMICK 2.0 MI 853
## 60 60 JULIA SHEN 1.5 MI 967
## 61 61 JEZZEL FARKAS 1.5 ON 955
## 62 62 ASHWIN BALAJI 1.0 MI 1530
## 63 63 THOMAS JOSEPH HOSMER 1.0 MI 1175
## 64 64 BEN LI 1.0 MI 1163
## G1opponent G2opponent G3opponent G4opponent G5opponent G6opponent
## 55 62 31 10 30 NA 45
## 56 NA 11 35 45 NA 40
## 57 7 36 42 51 35 53
## 58 31 2 41 23 49 NA
## 59 41 NA 9 40 43 54
## 60 33 34 45 42 24 NA
## 61 32 3 54 47 42 30
## 62 55 NA NA NA NA NA
## 63 2 48 49 43 45 NA
## 64 22 30 31 49 46 42
## G7opponent
## 55 43
## 56 42
## 57 NA
## 58 45
## 59 44
## 60 NA
## 61 37
## 62 NA
## 63 NA
## 64 54
# Tidy all data, convert all the opponet values into rows
# opponent is new column with values(G1opponent, G2opponent, ... G7opponent)
# n is the column that stored values for opponetID
chess.tidy.data <- chess.data %>%
gather(opponent, n, -playerID, -playerName, -playerState, -preGameRating, -playerPoints, na.rm = TRUE) %>%
arrange(playerID)
# Rename the column
chess.tidy.data <- rename(chess.tidy.data, opponentID = n)
head(chess.tidy.data,15)
## playerID playerName playerPoints playerState preGameRating
## 1 1 GARY HUA 6.0 ON 1794
## 2 1 GARY HUA 6.0 ON 1794
## 3 1 GARY HUA 6.0 ON 1794
## 4 1 GARY HUA 6.0 ON 1794
## 5 1 GARY HUA 6.0 ON 1794
## 6 1 GARY HUA 6.0 ON 1794
## 7 1 GARY HUA 6.0 ON 1794
## 8 2 DAKSHESH DARURI 6.0 MI 1553
## 9 2 DAKSHESH DARURI 6.0 MI 1553
## 10 2 DAKSHESH DARURI 6.0 MI 1553
## 11 2 DAKSHESH DARURI 6.0 MI 1553
## 12 2 DAKSHESH DARURI 6.0 MI 1553
## 13 2 DAKSHESH DARURI 6.0 MI 1553
## 14 2 DAKSHESH DARURI 6.0 MI 1553
## 15 3 ADITYA BAJAJ 6.0 MI 1384
## opponent opponentID
## 1 G1opponent 39
## 2 G2opponent 21
## 3 G3opponent 18
## 4 G4opponent 14
## 5 G5opponent 7
## 6 G6opponent 12
## 7 G7opponent 4
## 8 G1opponent 63
## 9 G2opponent 58
## 10 G3opponent 4
## 11 G4opponent 17
## 12 G5opponent 16
## 13 G6opponent 20
## 14 G7opponent 7
## 15 G1opponent 8
# Remove the column that is not being used.
chess.tidy.data$opponent <- NULL
# Following code joins chess.tidy.data and chess.data using "opponentID" = "playerID" and extracts opponentID pre-game rating.
# It summarises data using playerID and playerName, this gives sum of opponent ratings and number games played.
# Finally uses mutate to calculate average opponent ratings
# And saves required data to chess.rating data frame
chess.rating <- chess.tidy.data %>% inner_join(chess.data, by = c("opponentID" = "playerID")) %>%
select (playerID, playerName = playerName.x, opponentID, opponentPlayer = playerName.y , opponentRating = as.numeric(preGameRating.y)) %>%
group_by (playerID,playerName) %>%
summarise(opponentRatingSum = sum(opponentRating), numGames = n()) %>%
inner_join(chess.data, by = ("playerID")) %>%
select (playerID, playerName = playerName.x, playerPoints, playerState, preGameRating, opponentRatingSum, numGames) %>%
mutate(avgOppPreGameRating = round(opponentRatingSum / numGames,0)) %>%
select (playerID, playerName, playerPoints, playerState, preGameRating, avgOppPreGameRating)
head(chess.rating,10)
## Source: local data frame [10 x 6]
## Groups: playerID [10]
##
## playerID playerName playerPoints playerState preGameRating
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 GARY HUA 6.0 ON 1794
## 2 2 DAKSHESH DARURI 6.0 MI 1553
## 3 3 ADITYA BAJAJ 6.0 MI 1384
## 4 4 PATRICK H SCHILLING 5.5 MI 1716
## 5 5 HANSHI ZUO 5.5 MI 1655
## 6 6 HANSEN SONG 5.0 OH 1686
## 7 7 GARY DEE SWATHELL 5.0 MI 1649
## 8 8 EZEKIEL HOUGHTON 5.0 MI 1641
## 9 9 STEFANO LEE 5.0 ON 1411
## 10 10 ANVIT RAO 5.0 MI 1365
## # ... with 1 more variables: avgOppPreGameRating <dbl>
# Generate CSV files
chessOutFile <- paste(str_trim(localDir),"//chessOutFile.csv",sep="")
write.table(chess.rating, file = chessOutFile, row.names = FALSE, sep = ",")