This assignment is about using R to tidy the given data, moving between “wide” and “long” formats to best support our analysis. Following libraries will be used “base R”, tidyr and dplyr to complete the assignment.

Load the libraries

library(tidyr)    # gather() and spread() functions are used to tidy the data
library(dplyr)    # functions select(), summarise() and filter() are from this library
library(stringr)  # used for pattern matching and data cleansing 

Sample CSV file is created using data instructions provided in Assignment Tidying and Transforming Data.pdf.

localDir <- getwd()
flightsFile <- paste(str_trim(localDir),"//Flights.csv",sep="")

# Data into a data frame
flights.raw <- read.csv(flightsFile, header = T)


# Rename the column names accordingly
flights.raw <- rename(flights.raw, `Los Angeles` = `Los.Angeles`, `San Diego` =`San.Diego`, `San Francisco` = `San.Francisco`, Airline = X, ArrivalInfo = `X.1`)

flights.raw
##   Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA     on time         497     221       212           503    1841
## 2             delayed          62      12        20           102     305
## 3                              NA      NA        NA            NA      NA
## 4 AM WEST     on time         694    4840       383           320     201
## 5             delayed         117     415        65           129      61
# Remove rows that are empty.
# flights.raw <- flights.raw %>% filter(`Los Angeles` != "NA")

flights.raw <- flights.raw %>%
  filter(!is.na(`Los Angeles`))

flights.raw
##   Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA     on time         497     221       212           503    1841
## 2             delayed          62      12        20           102     305
## 3 AM WEST     on time         694    4840       383           320     201
## 4             delayed         117     415        65           129      61
# Format data
# 2 rows make up 1 row, following command populate Airline name in second row.
# Fill in missing data.

flights.raw[seq(2,nrow(flights.raw),2),1] <- flights.raw[seq(1,nrow(flights.raw),2),1]

flights.raw
##   Airline ArrivalInfo Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA     on time         497     221       212           503    1841
## 2  ALASKA     delayed          62      12        20           102     305
## 3 AM WEST     on time         694    4840       383           320     201
## 4 AM WEST     delayed         117     415        65           129      61
# tidy data
# flights.raw dataset has 3 variables(Airline, ArrivalInfo, Cities) and 2 observations(on time, delayed).
# 1. Airline has 2 possible values (Alaska, AM West)
# 2. ArrivalInfo has 2 possible values(on time and delayed)
# 3. Cities has 5 possible values(Los Angeles, Phoenix, San Diego, San Francisco, Seattle)
# 4. Observations have 20 values (497,221,212,503,1841,62,12,20,102,305,694,4840,383,320,201,117,415,65,129,61)

flights.table1 <- flights.raw %>% 
  gather(City, Count, -Airline, -ArrivalInfo, na.rm = TRUE)

# A tidy version of the flights.raw data looks like this
flights.table1
##    Airline ArrivalInfo          City Count
## 1   ALASKA     on time   Los Angeles   497
## 2   ALASKA     delayed   Los Angeles    62
## 3  AM WEST     on time   Los Angeles   694
## 4  AM WEST     delayed   Los Angeles   117
## 5   ALASKA     on time       Phoenix   221
## 6   ALASKA     delayed       Phoenix    12
## 7  AM WEST     on time       Phoenix  4840
## 8  AM WEST     delayed       Phoenix   415
## 9   ALASKA     on time     San Diego   212
## 10  ALASKA     delayed     San Diego    20
## 11 AM WEST     on time     San Diego   383
## 12 AM WEST     delayed     San Diego    65
## 13  ALASKA     on time San Francisco   503
## 14  ALASKA     delayed San Francisco   102
## 15 AM WEST     on time San Francisco   320
## 16 AM WEST     delayed San Francisco   129
## 17  ALASKA     on time       Seattle  1841
## 18  ALASKA     delayed       Seattle   305
## 19 AM WEST     on time       Seattle   201
## 20 AM WEST     delayed       Seattle    61

Data set is mostly tidy except for ArrivalInfo column. ArrivalInfo stores the names of variables.

# Expand rows into columns

flights.table2 <- flights.table1 %>% 
  spread(ArrivalInfo, Count) %>%
  select (Airline, City,`on time`, delayed)

flights.table2
##    Airline          City on time delayed
## 1   ALASKA   Los Angeles     497      62
## 2   ALASKA       Phoenix     221      12
## 3   ALASKA     San Diego     212      20
## 4   ALASKA San Francisco     503     102
## 5   ALASKA       Seattle    1841     305
## 6  AM WEST   Los Angeles     694     117
## 7  AM WEST       Phoenix    4840     415
## 8  AM WEST     San Diego     383      65
## 9  AM WEST San Francisco     320     129
## 10 AM WEST       Seattle     201      61

Analysis of the data.

# Add a new column to find total flights by airline using mutate function

flights.table3 <- flights.table2 %>%
  mutate(TotalFlights = flights.table2$`on time`+flights.table2$delayed , 
         PercentageOfOnTime = round(flights.table2$`on time` * 100/TotalFlights,2) ,
         PercentageOfDelay = round(flights.table2$delayed * 100/TotalFlights,2)
        ) %>%
  filter(City %in% c("Los Angeles","Phoenix")) %>%
  select(Airline,City,PercentageOfOnTime,PercentageOfDelay) %>%
  arrange(City,Airline,PercentageOfDelay)


# By looking at the data for cities Los Angeles and Phoenix both airlines perform close enough. However, ALASKA airlines is better with on time arrival.
flights.table3
##   Airline        City PercentageOfOnTime PercentageOfDelay
## 1  ALASKA Los Angeles              88.91             11.09
## 2 AM WEST Los Angeles              85.57             14.43
## 3  ALASKA     Phoenix              94.85              5.15
## 4 AM WEST     Phoenix              92.10              7.90
# Data for San Diego, San Francisco and Seattle

flights.table3 <- flights.table2 %>%
  mutate(TotalFlights = flights.table2$`on time`+flights.table2$delayed , 
         PercentageOfOnTime = round(flights.table2$`on time` * 100/TotalFlights,2) ,
         PercentageOfDelay = round(flights.table2$delayed * 100/TotalFlights,2)
        ) %>%
  filter(City %in% c("San Diego","San Francisco","Seattle")) %>%
  select(Airline,City,PercentageOfOnTime,PercentageOfDelay) %>%
  arrange(City,Airline,PercentageOfDelay)


# By looking at the data for cities San Diego, San Francisco and Seattle, ALASKA airlines is better with on time arrival and also has less delays.
flights.table3
##   Airline          City PercentageOfOnTime PercentageOfDelay
## 1  ALASKA     San Diego              91.38              8.62
## 2 AM WEST     San Diego              85.49             14.51
## 3  ALASKA San Francisco              83.14             16.86
## 4 AM WEST San Francisco              71.27             28.73
## 5  ALASKA       Seattle              85.79             14.21
## 6 AM WEST       Seattle              76.72             23.28

Additional analysis, Seattle airport is used as example.

# Data for Seattle

flights.table3 <- flights.table1 %>% 
  filter(City == "Seattle") %>%
  select(Airline,ArrivalInfo,Count) %>%
  spread(ArrivalInfo,Count)

# Bind row totals

flights.table3$Totals <- rowSums(flights.table3[,-1] )

# Bind column totals

flights.table3 %<>%
  summarise(Airline = "Totals",
            delayed = sum(delayed),
            `on time` = sum(`on time`),
            Totals = sum(Totals)) %>%
  bind_rows(flights.table3,.)

flights.table3
##   Airline delayed on time Totals
## 1  ALASKA     305    1841   2146
## 2 AM WEST      61     201    262
## 3  Totals     366    2042   2408

Assuming ALASKA and AM WEST are the olny airlines flying to Seattle,

  1. Given random flight being delayed at Seattle airport, probability that it belongs to ALASKA airlines : 0.14

  2. Given random flight being delayed at Seattle airport, probability that it belongs to AM WEST airlines : 0.23

  3. Given random flight being on time at Seattle airport, probability that it belongs to ALASKA airlines : 0.86

  4. Given random flight being on time at Seattle airport, probability that it belongs to AM WEST airlines : 0.77

  5. Of all the airlines flying to Seattle, percentage of ALASKA airlines flights : 89.12

  6. Of all the airlines flying to Seattle, percentage of AM WEST airlines flights : 10.88

Conclusion:

It is fair to say ALASKA airlines has better on time flights over AM WEST airlines.

Last week’s project without using sqldf library

Libraries used for this project.

library(readr)      # used for reading the text file
library(stringr)    # used for extracting text using regular expressions
library(tidyr)      # replaced sqldf 
library(dplyr)      # replaced sqldf
library(utils)      # used for generating CSV file

Load Chess Tournament Cross Table file.

# Get local directory and load the file

localDir <- getwd()
chessFile <- paste(str_trim(localDir),"//tournamentinfo.txt",sep="")

# Skip top 3 lines as it contains header info

chess.text<-read_lines(chessFile,skip = 3)
head(chess.text,10)
##  [1] "-----------------------------------------------------------------------------------------"
##  [2] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
##  [3] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [4] "-----------------------------------------------------------------------------------------"
##  [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
##  [6] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
##  [7] "-----------------------------------------------------------------------------------------"
##  [8] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
##  [9] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [10] "-----------------------------------------------------------------------------------------"

There are total of 64 players. File needs formatting in order to perform calculations and generate CSV output.

# First level of formatting removes hypen("-") characters

chess.text.level1 <- gsub("-","",chess.text)
head(chess.text.level1, 6)
## [1] ""                                                                                         
## [2] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [3] "   ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [4] ""                                                                                         
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [6] "   MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
# Second level of formatting removes empty lines
# This makes a continous data without newline characters 

chess.text.level2 <- paste(unlist(chess.text.level1), collapse = " ")
head(chess.text.level2,10)
## [1] "     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|    ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |      2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|    MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |      3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|    MI | 14959604 / R: 1384   >1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |      4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|    MI | 12616049 / R: 1716   >1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |      5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|    MI | 14601533 / R: 1655   >1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |      6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|    OH | 15055204 / R: 1686   >1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |      7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|    MI | 11146376 / R: 1649   >1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |      8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|    MI | 15142253 / R: 1641P17>1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |      9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|    ON | 14954524 / R: 1411   >1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |     10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|    MI | 14150362 / R: 1365   >1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |     11 | CAMERON WILLIAM MC LEMAN        |4.5  |D  38|W  56|W   6|L   7|L   3|W  34|W  26|    MI | 12581589 / R: 1712   >1696     |N:3  |B    |W    |B    |W    |B    |W    |B    |     12 | KENNETH J TACK                  |4.5  |W  42|W  33|D   5|W  38|H    |D   1|L   3|    MI | 12681257 / R: 1663   >1670     |N:3  |W    |B    |W    |B    |     |W    |B    |     13 | TORRANCE HENRY JR               |4.5  |W  36|W  27|L   7|D   5|W  33|L   3|W  32|    MI | 15082995 / R: 1666   >1662     |N:3  |B    |W    |B    |B    |W    |W    |B    |     14 | BRADLEY SHAW                    |4.5  |W  54|W  44|W   8|L   1|D  27|L   5|W  31|    MI | 10131499 / R: 1610   >1618     |N:3  |W    |B    |W    |W    |B    |B    |W    |     15 | ZACHARY JAMES HOUGHTON          |4.5  |D  19|L  16|W  30|L  22|W  54|W  33|W  38|    MI | 15619130 / R: 1220P13>1416P20  |N:3  |B    |B    |W    |W    |B    |B    |W    |     16 | MIKE NIKITIN                    |4.0  |D  10|W  15|H    |W  39|L   2|W  36|U    |    MI | 10295068 / R: 1604   >1613     |N:3  |B    |W    |     |B    |W    |B    |     |     17 | RONALD GRZEGORCZYK              |4.0  |W  48|W  41|L  26|L   2|W  23|W  22|L   5|    MI | 10297702 / R: 1629   >1610     |N:3  |W    |B    |W    |B    |W    |B    |W    |     18 | DAVID SUNDEEN                   |4.0  |W  47|W   9|L   1|W  32|L  19|W  38|L  10|    MI | 11342094 / R: 1600   >1600     |N:3  |B    |W    |B    |W    |B    |W    |B    |     19 | DIPANKAR ROY                    |4.0  |D  15|W  10|W  52|D  28|W  18|L   4|L   8|    MI | 14862333 / R: 1564   >1570     |N:3  |W    |B    |W    |B    |W    |W    |B    |     20 | JASON ZHENG                     |4.0  |L  40|W  49|W  23|W  41|W  28|L   2|L   9|    MI | 14529060 / R: 1595   >1569     |N:4  |W    |B    |W    |B    |W    |B    |W    |     21 | DINH DANG BUI                   |4.0  |W  43|L   1|W  47|L   3|W  40|W  39|L   6|    ON | 15495066 / R: 1563P22>1562     |N:3  |B    |W    |B    |W    |W    |B    |W    |     22 | EUGENE L MCCLURE                |4.0  |W  64|D  52|L  28|W  15|H    |L  17|W  40|    MI | 12405534 / R: 1555   >1529     |N:4  |W    |B    |W    |B    |     |W    |B    |     23 | ALAN BUI                        |4.0  |L   4|W  43|L  20|W  58|L  17|W  37|W  46|    ON | 15030142 / R: 1363   >1371     |     |B    |W    |B    |W    |B    |W    |B    |     24 | MICHAEL R ALDRICH               |4.0  |L  28|L  47|W  43|L  25|W  60|W  44|W  39|    MI | 13469010 / R: 1229   >1300     |N:4  |B    |W    |B    |B    |W    |W    |B    |     25 | LOREN SCHWIEBERT                |3.5  |L   9|W  53|L   3|W  24|D  34|L  10|W  47|    MI | 12486656 / R: 1745   >1681     |N:4  |B    |W    |B    |W    |B    |W    |B    |     26 | MAX ZHU                         |3.5  |W  49|W  40|W  17|L   4|L   9|D  32|L  11|    ON | 15131520 / R: 1579   >1564     |N:4  |B    |W    |B    |W    |B    |W    |W    |     27 | GAURAV GIDWANI                  |3.5  |W  51|L  13|W  46|W  37|D  14|L   6|U    |    MI | 14476567 / R: 1552   >1539     |N:4  |W    |B    |W    |B    |W    |B    |     |     28 | SOFIA ADINA STANESCUBELLU      |3.5  |W  24|D   4|W  22|D  19|L  20|L   8|D  36|    MI | 14882954 / R: 1507   >1513     |N:3  |W    |W    |B    |W    |B    |B    |W    |     29 | CHIEDOZIE OKORIE                |3.5  |W  50|D   6|L  38|L  34|W  52|W  48|U    |    MI | 15323285 / R: 1602P6 >1508P12  |N:4  |B    |W    |B    |W    |W    |B    |     |     30 | GEORGE AVERY JONES              |3.5  |L  52|D  64|L  15|W  55|L  31|W  61|W  50|    ON | 12577178 / R: 1522   >1444     |     |W    |B    |B    |W    |W    |B    |B    |     31 | RISHI SHETTY                    |3.5  |L  58|D  55|W  64|L  10|W  30|W  50|L  14|    MI | 15131618 / R: 1494   >1444     |     |B    |W    |B    |W    |B    |W    |B    |     32 | JOSHUA PHILIP MATHEWS           |3.5  |W  61|L   8|W  44|L  18|W  51|D  26|L  13|    ON | 14073750 / R: 1441   >1433     |N:4  |W    |B    |W    |B    |W    |B    |W    |     33 | JADE GE                         |3.5  |W  60|L  12|W  50|D  36|L  13|L  15|W  51|    MI | 14691842 / R: 1449   >1421     |     |B    |W    |B    |W    |B    |W    |B    |     34 | MICHAEL JEFFERY THOMAS          |3.5  |L   6|W  60|L  37|W  29|D  25|L  11|W  52|    MI | 15051807 / R: 1399   >1400     |     |B    |W    |B    |B    |W    |B    |W    |     35 | JOSHUA DAVID LEE                |3.5  |L  46|L  38|W  56|L   6|W  57|D  52|W  48|    MI | 14601397 / R: 1438   >1392     |     |W    |W    |B    |W    |B    |B    |W    |     36 | SIDDHARTH JHA                   |3.5  |L  13|W  57|W  51|D  33|H    |L  16|D  28|    MI | 14773163 / R: 1355   >1367     |N:4  |W    |B    |W    |B    |     |W    |B    |     37 | AMIYATOSH PWNANANDAM            |3.5  |B    |L   5|W  34|L  27|H    |L  23|W  61|    MI | 15489571 / R:  980P12>1077P17  |     |     |B    |W    |W    |     |B    |W    |     38 | BRIAN LIU                       |3.0  |D  11|W  35|W  29|L  12|H    |L  18|L  15|    MI | 15108523 / R: 1423   >1439     |N:4  |W    |B    |W    |W    |     |B    |B    |     39 | JOEL R HENDON                   |3.0  |L   1|W  54|W  40|L  16|W  44|L  21|L  24|    MI | 12923035 / R: 1436P23>1413     |N:4  |B    |W    |B    |W    |B    |W    |W    |     40 | FOREST ZHANG                    |3.0  |W  20|L  26|L  39|W  59|L  21|W  56|L  22|    MI | 14892710 / R: 1348   >1346     |     |B    |B    |W    |W    |B    |W    |W    |     41 | KYLE WILLIAM MURPHY             |3.0  |W  59|L  17|W  58|L  20|X    |U    |U    |    MI | 15761443 / R: 1403P5 >1341P9   |     |B    |W    |B    |W    |     |     |     |     42 | JARED GE                        |3.0  |L  12|L  50|L  57|D  60|D  61|W  64|W  56|    MI | 14462326 / R: 1332   >1256     |     |B    |W    |B    |B    |W    |W    |B    |     43 | ROBERT GLEN VASEY               |3.0  |L  21|L  23|L  24|W  63|W  59|L  46|W  55|    MI | 14101068 / R: 1283   >1244     |     |W    |B    |W    |W    |B    |B    |W    |     44 | JUSTIN D SCHILLING              |3.0  |B    |L  14|L  32|W  53|L  39|L  24|W  59|    MI | 15323504 / R: 1199   >1199     |     |     |W    |B    |B    |W    |B    |W    |     45 | DEREK YAN                       |3.0  |L   5|L  51|D  60|L  56|W  63|D  55|W  58|    MI | 15372807 / R: 1242   >1191     |     |W    |B    |W    |B    |W    |B    |W    |     46 | JACOB ALEXANDER LAVALLEY        |3.0  |W  35|L   7|L  27|L  50|W  64|W  43|L  23|    MI | 15490981 / R:  377P3 >1076P10  |     |B    |W    |B    |W    |B    |W    |W    |     47 | ERIC WRIGHT                     |2.5  |L  18|W  24|L  21|W  61|L   8|D  51|L  25|    MI | 12533115 / R: 1362   >1341     |     |W    |B    |W    |B    |W    |B    |W    |     48 | DANIEL KHAIN                    |2.5  |L  17|W  63|H    |D  52|H    |L  29|L  35|    MI | 14369165 / R: 1382   >1335     |     |B    |W    |     |B    |     |W    |B    |     49 | MICHAEL J MARTIN                |2.5  |L  26|L  20|D  63|D  64|W  58|H    |U    |    MI | 12531685 / R: 1291P12>1259P17  |     |W    |W    |B    |W    |B    |     |     |     50 | SHIVAM JHA                      |2.5  |L  29|W  42|L  33|W  46|H    |L  31|L  30|    MI | 14773178 / R: 1056   >1111     |     |W    |B    |W    |B    |     |B    |W    |     51 | TEJAS AYYAGARI                  |2.5  |L  27|W  45|L  36|W  57|L  32|D  47|L  33|    MI | 15205474 / R: 1011   >1097     |     |B    |W    |B    |W    |B    |W    |W    |     52 | ETHAN GUO                       |2.5  |W  30|D  22|L  19|D  48|L  29|D  35|L  34|    MI | 14918803 / R:  935   >1092     |N:4  |B    |W    |B    |W    |B    |W    |B    |     53 | JOSE C YBARRA                   |2.0  |H    |L  25|H    |L  44|U    |W  57|U    |    MI | 12578849 / R: 1393   >1359     |     |     |B    |     |W    |     |W    |     |     54 | LARRY HODGE                     |2.0  |L  14|L  39|L  61|B    |L  15|L  59|W  64|    MI | 12836773 / R: 1270   >1200     |     |B    |B    |W    |     |W    |B    |W    |     55 | ALEX KONG                       |2.0  |L  62|D  31|L  10|L  30|B    |D  45|L  43|    MI | 15412571 / R: 1186   >1163     |     |W    |B    |W    |B    |     |W    |B    |     56 | MARISA RICCI                    |2.0  |H    |L  11|L  35|W  45|H    |L  40|L  42|    MI | 14679887 / R: 1153   >1140     |     |     |B    |W    |W    |     |B    |W    |     57 | MICHAEL LU                      |2.0  |L   7|L  36|W  42|L  51|L  35|L  53|B    |    MI | 15113330 / R: 1092   >1079     |     |B    |W    |W    |B    |W    |B    |     |     58 | VIRAJ MOHILE                    |2.0  |W  31|L   2|L  41|L  23|L  49|B    |L  45|    MI | 14700365 / R:  917   > 941     |     |W    |B    |W    |B    |W    |     |B    |     59 | SEAN M MC CORMICK               |2.0  |L  41|B    |L   9|L  40|L  43|W  54|L  44|    MI | 12841036 / R:  853   > 878     |     |W    |     |B    |B    |W    |W    |B    |     60 | JULIA SHEN                      |1.5  |L  33|L  34|D  45|D  42|L  24|H    |U    |    MI | 14579262 / R:  967   > 984     |     |W    |B    |B    |W    |B    |     |     |     61 | JEZZEL FARKAS                   |1.5  |L  32|L   3|W  54|L  47|D  42|L  30|L  37|    ON | 15771592 / R:  955P11> 979P18  |     |B    |W    |B    |W    |B    |W    |B    |     62 | ASHWIN BALAJI                   |1.0  |W  55|U    |U    |U    |U    |U    |U    |    MI | 15219542 / R: 1530   >1535     |     |B    |     |     |     |     |     |     |     63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |    MI | 15057092 / R: 1175   >1125     |     |W    |B    |W    |B    |B    |     |     |     64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|    MI | 15006561 / R: 1163   >1112     |     |B    |W    |W    |B    |W    |B    |B    | "
# Add extra characters at the end of text data. This helps read complete data. 
# If extra characters are not added, last line will be skipped. 

chess.text.level2 <- paste(chess.text.level2,"_*_*_*_")

# Extract the text using pattern repeating 180 characters
# Idea is to combine 2 lines of each player's info into single line
# Once combined, information about single player takes 180 characters

pattern <- "[[:print:]]{180}"
chess.text.level3 <- unlist(str_extract_all(chess.text.level2, pattern = pattern))
head(chess.text.level3,3)
## [1] "     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|    ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    | "
## [2] "     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|    MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    | "
## [3] "     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|    MI | 14959604 / R: 1384   >1640     |N:2  |W    |B    |W    |B    |W    |B    |W    | "
# Each player has about 20 columns of information. Split the single string into 20 columns

chess.data <- data.frame(unlist(str_split_fixed(chess.text.level3, "\\|", 21)))
head(chess.data, 5)
##        X1                                X2    X3    X4    X5    X6    X7
## 1      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14
## 2      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17
## 3      3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21
## 4      4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26
## 5      5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13
##      X8    X9   X10     X11                              X12   X13   X14
## 1 W   7 D  12 D   4     ON   15445895 / R: 1794   >1817      N:2   W    
## 2 W  16 W  20 W   7     MI   14598900 / R: 1553   >1663      N:2   B    
## 3 W  11 W  13 W  12     MI   14959604 / R: 1384   >1640      N:2   W    
## 4 D   5 W  19 D   1     MI   12616049 / R: 1716   >1744      N:2   W    
## 5 D   4 W  14 W  17     MI   14601533 / R: 1655   >1690      N:2   B    
##     X15   X16   X17   X18   X19   X20 X21
## 1 B     W     B     W     B     W        
## 2 W     B     W     B     W     B        
## 3 B     W     B     W     B     W        
## 4 B     W     B     W     B     B        
## 5 W     B     W     B     W     B
# Using regular expressions extract the pre match rating for each player

pattern = "(R:\\s+[[:digit:]]{3,4})"
chess.preGameRating <- unlist(str_extract(string = chess.data[,12], pattern = pattern))

# Get the pre-game rating
pattern = "[[:digit:]]{3,4}"
chess.preGameRating <- str_extract(string = chess.preGameRating, pattern = pattern)
chess.preGameRating <- data.frame(chess.data, preGameRating = chess.preGameRating)

head(chess.preGameRating, 5)
##        X1                                X2    X3    X4    X5    X6    X7
## 1      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14
## 2      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17
## 3      3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21
## 4      4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26
## 5      5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13
##      X8    X9   X10     X11                              X12   X13   X14
## 1 W   7 D  12 D   4     ON   15445895 / R: 1794   >1817      N:2   W    
## 2 W  16 W  20 W   7     MI   14598900 / R: 1553   >1663      N:2   B    
## 3 W  11 W  13 W  12     MI   14959604 / R: 1384   >1640      N:2   W    
## 4 D   5 W  19 D   1     MI   12616049 / R: 1716   >1744      N:2   W    
## 5 D   4 W  14 W  17     MI   14601533 / R: 1655   >1690      N:2   B    
##     X15   X16   X17   X18   X19   X20 X21 preGameRating
## 1 B     W     B     W     B     W                  1794
## 2 W     B     W     B     W     B                  1553
## 3 B     W     B     W     B     W                  1384
## 4 B     W     B     W     B     B                  1716
## 5 W     B     W     B     W     B                  1655
# Format the column type

chess.preGameRating$X1 <- as.numeric(as.character(chess.preGameRating$X1))
chess.preGameRating$preGameRating <- as.numeric(as.character(chess.preGameRating$preGameRating))
# Extract the information about opponent for each round
# Opponent player info is left as "NA" values if player did not play or got bye

# Get opponet playerID
pattern = "[[:digit:]]{1,2}"

chess.preGameRating$G1opponent = as.numeric(str_match(string = chess.preGameRating$X4, pattern = pattern))
chess.preGameRating$G2opponent = as.numeric(str_match(string = chess.preGameRating$X5, pattern = pattern))
chess.preGameRating$G3opponent = as.numeric(str_match(string = chess.preGameRating$X6, pattern = pattern))
chess.preGameRating$G4opponent = as.numeric(str_match(string = chess.preGameRating$X7, pattern = pattern))
chess.preGameRating$G5opponent = as.numeric(str_match(string = chess.preGameRating$X8, pattern = pattern))
chess.preGameRating$G6opponent = as.numeric(str_match(string = chess.preGameRating$X9, pattern = pattern))
chess.preGameRating$G7opponent = as.numeric(str_match(string = chess.preGameRating$X10, pattern = pattern))

# Get required data
chess.data <- chess.preGameRating %>% 
  select(playerID = as.numeric(X1),playerName = X2, playerPoints = as.numeric(X3), playerState = X11, preGameRating, G1opponent,G2opponent,G3opponent,G4opponent,G5opponent,G6opponent,G7opponent)

# Convert data from factors to character

chess.data$playerName <- as.character(chess.data$playerName)
chess.data$playerState <- as.character(chess.data$playerState)
chess.data$playerPoints <- as.character(chess.data$playerPoints)

# Remove extra spaces

trim <- function (x) gsub("^\\s+|\\s+$", "", x)
chess.data$playerState <- trim(chess.data$playerState)
chess.data$playerName <- trim(chess.data$playerName)
chess.data$playerPoints <- trim(chess.data$playerPoints)

# Player 62 did not play 2, 3 or 4 games
tail(chess.data,10)
##    playerID           playerName playerPoints playerState preGameRating
## 55       55            ALEX KONG          2.0          MI          1186
## 56       56         MARISA RICCI          2.0          MI          1153
## 57       57           MICHAEL LU          2.0          MI          1092
## 58       58         VIRAJ MOHILE          2.0          MI           917
## 59       59    SEAN M MC CORMICK          2.0          MI           853
## 60       60           JULIA SHEN          1.5          MI           967
## 61       61        JEZZEL FARKAS          1.5          ON           955
## 62       62        ASHWIN BALAJI          1.0          MI          1530
## 63       63 THOMAS JOSEPH HOSMER          1.0          MI          1175
## 64       64               BEN LI          1.0          MI          1163
##    G1opponent G2opponent G3opponent G4opponent G5opponent G6opponent
## 55         62         31         10         30         NA         45
## 56         NA         11         35         45         NA         40
## 57          7         36         42         51         35         53
## 58         31          2         41         23         49         NA
## 59         41         NA          9         40         43         54
## 60         33         34         45         42         24         NA
## 61         32          3         54         47         42         30
## 62         55         NA         NA         NA         NA         NA
## 63          2         48         49         43         45         NA
## 64         22         30         31         49         46         42
##    G7opponent
## 55         43
## 56         42
## 57         NA
## 58         45
## 59         44
## 60         NA
## 61         37
## 62         NA
## 63         NA
## 64         54
# Tidy all data, convert all the opponet values into rows
# opponent is new column with values(G1opponent, G2opponent, ... G7opponent)
# n is the column that stored values for opponetID

chess.tidy.data <- chess.data %>% 
  gather(opponent, n, -playerID, -playerName, -playerState, -preGameRating, -playerPoints, na.rm = TRUE) %>%
  arrange(playerID)

# Rename the column
chess.tidy.data <-  rename(chess.tidy.data, opponentID = n)
head(chess.tidy.data,15)
##    playerID      playerName playerPoints playerState preGameRating
## 1         1        GARY HUA          6.0          ON          1794
## 2         1        GARY HUA          6.0          ON          1794
## 3         1        GARY HUA          6.0          ON          1794
## 4         1        GARY HUA          6.0          ON          1794
## 5         1        GARY HUA          6.0          ON          1794
## 6         1        GARY HUA          6.0          ON          1794
## 7         1        GARY HUA          6.0          ON          1794
## 8         2 DAKSHESH DARURI          6.0          MI          1553
## 9         2 DAKSHESH DARURI          6.0          MI          1553
## 10        2 DAKSHESH DARURI          6.0          MI          1553
## 11        2 DAKSHESH DARURI          6.0          MI          1553
## 12        2 DAKSHESH DARURI          6.0          MI          1553
## 13        2 DAKSHESH DARURI          6.0          MI          1553
## 14        2 DAKSHESH DARURI          6.0          MI          1553
## 15        3    ADITYA BAJAJ          6.0          MI          1384
##      opponent opponentID
## 1  G1opponent         39
## 2  G2opponent         21
## 3  G3opponent         18
## 4  G4opponent         14
## 5  G5opponent          7
## 6  G6opponent         12
## 7  G7opponent          4
## 8  G1opponent         63
## 9  G2opponent         58
## 10 G3opponent          4
## 11 G4opponent         17
## 12 G5opponent         16
## 13 G6opponent         20
## 14 G7opponent          7
## 15 G1opponent          8
# Remove the column that is not being used.
chess.tidy.data$opponent <- NULL
# Following code joins chess.tidy.data and chess.data using "opponentID" = "playerID" and extracts opponentID pre-game rating.
# It summarises data using playerID and playerName, this gives sum of opponent ratings and number games played.
# Finally uses mutate to calculate average opponent ratings
# And saves required data to chess.rating data frame

chess.rating <- chess.tidy.data %>% inner_join(chess.data, by = c("opponentID" = "playerID")) %>%
  select (playerID, playerName = playerName.x, opponentID, opponentPlayer = playerName.y , opponentRating = as.numeric(preGameRating.y)) %>%
  group_by (playerID,playerName) %>% 
  summarise(opponentRatingSum = sum(opponentRating), numGames = n()) %>% 
  inner_join(chess.data, by = ("playerID")) %>% 
  select (playerID, playerName = playerName.x, playerPoints, playerState, preGameRating, opponentRatingSum, numGames) %>% 
  mutate(avgOppPreGameRating = round(opponentRatingSum / numGames,0)) %>% 
  select (playerID, playerName, playerPoints, playerState, preGameRating, avgOppPreGameRating)


head(chess.rating,10)
## Source: local data frame [10 x 6]
## Groups: playerID [10]
## 
##    playerID          playerName playerPoints playerState preGameRating
##       <dbl>               <chr>        <chr>       <chr>         <dbl>
## 1         1            GARY HUA          6.0          ON          1794
## 2         2     DAKSHESH DARURI          6.0          MI          1553
## 3         3        ADITYA BAJAJ          6.0          MI          1384
## 4         4 PATRICK H SCHILLING          5.5          MI          1716
## 5         5          HANSHI ZUO          5.5          MI          1655
## 6         6         HANSEN SONG          5.0          OH          1686
## 7         7   GARY DEE SWATHELL          5.0          MI          1649
## 8         8    EZEKIEL HOUGHTON          5.0          MI          1641
## 9         9         STEFANO LEE          5.0          ON          1411
## 10       10           ANVIT RAO          5.0          MI          1365
## # ... with 1 more variables: avgOppPreGameRating <dbl>
# Generate CSV files

chessOutFile <- paste(str_trim(localDir),"//chessOutFile.csv",sep="")
write.table(chess.rating, file = chessOutFile, row.names = FALSE, sep = ",")