RevoScaleR Reading and Transforming Data

This notebook follows the tutorial at

https://docs.microsoft.com/en-us/machine-learning-server/r/tutorial-revoscaler-data-import-transform

library(readr)
library(RevoScaleR)
list.files(rxGetOption("sampleDataDir"))

##  [1] "AirlineDemo1kNoMissing.csv" "AirlineDemoSmall.csv"      
##  [3] "AirlineDemoSmall.xdf"       "AirlineDemoSmallComposite" 
##  [5] "AirlineDemoSmallOrc"        "AirlineDemoSmallParquet"   
##  [7] "AirlineDemoSmallSplit"      "AirlineDemoSmallUC.xdf"    
##  [9] "ccFraudScoreSmall.csv"      "ccFraudSmall.csv"          
## [11] "CensusWorkers.xdf"          "claims.dat"                
## [13] "claims.sas7bdat"            "claims.sav"                
## [15] "claims.sd7"                 "claims.sqlite"             
## [17] "claims.sts"                 "claims.txt"                
## [19] "claims.xdf"                 "claims_.txt"               
## [21] "claims4blocks.xdf"          "claimsExtra.txt"           
## [23] "claimsParquet"              "claimsQuote.txt"           
## [25] "claimsTab.txt"              "claimsTxt"                 
## [27] "claimsXdf"                  "CustomerSurvey.xdf"        
## [29] "DJIAdaily.xdf"              "fourthgraders.xdf"         
## [31] "hyphens.txt"                "Kyphosis.xdf"              
## [33] "mortDefaultSmall.xdf"       "mortDefaultSmall2000.csv"  
## [35] "mortDefaultSmall2001.csv"   "mortDefaultSmall2002.csv"  
## [37] "mortDefaultSmall2003.csv"   "mortDefaultSmall2004.csv"  
## [39] "mortDefaultSmall2005.csv"   "mortDefaultSmall2006.csv"  
## [41] "mortDefaultSmall2007.csv"   "mortDefaultSmall2008.csv"  
## [43] "mortDefaultSmall2009.csv"   "mrsDebugParquet"           
## [45] "README"                     "testAvro4.bin"             
## [47] "Utf16leDb.sqlite"

Where is my working directory?

getwd()

## [1] "D:/Dropbox/Documents/SMU/CSC 360/Fall 2017 MSSA"

 # Build the path to the airline sample CSV inside RevoScaleR's sample data
 # directory, then import it. No outFile is supplied for this first import.
 mysource <- file.path(
   rxGetOption("sampleDataDir"),
   "AirlineDemoSmall.csv"
 )
 airXdfData <- rxImport(inData = mysource)

## Rows Read: 500000, Total Rows Processed: 500000, Total Chunk Time: 0.989 seconds
## Rows Read: 100000, Total Rows Processed: 600000, Total Chunk Time: 0.270 seconds

Note that the data was read in chunks.

Let’s also import this data using RStudio’s import facility.

# Read the same sample CSV with readr. Reuse `mysource` (built earlier from
# rxGetOption("sampleDataDir")) instead of a hard-coded, machine-specific
# install path, so the notebook runs wherever RevoScaleR is installed.
# ArrDelay is forced to integer; the file marks missing delays with "M",
# so readr reports those rows as parsing failures and stores them as NA.
AirlineDemoSmall_df <- read_csv(
  mysource,
  col_types = cols(ArrDelay = col_integer())
)

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 17372 parsing failures.
## row # A tibble: 5 x 5 col     row      col   expected actual expected   <int>    <chr>      <chr>  <chr> actual 1    99 ArrDelay an integer      M file 2   312 ArrDelay an integer      M row 3   325 ArrDelay an integer      M col 4   611 ArrDelay an integer      M expected 5   781 ArrDelay an integer      M actual # ... with 1 more variables: file <chr>
## ... ................. ... .................................. ........ .................................. ...... .................................. .... .................................. ... .................................. ... .................................. ........ .................................. ...... .......................................
## See problems(...) for more details.

Do a summary of the dataframe

summary(AirlineDemoSmall_df)

##     ArrDelay         CRSDepTime        DayOfWeek        
##  Min.   : -86.00   Min.   : 0.01667   Length:600000     
##  1st Qu.:  -9.00   1st Qu.: 9.41667   Class :character  
##  Median :   0.00   Median :13.41667   Mode  :character  
##  Mean   :  11.32   Mean   :13.48227                     
##  3rd Qu.:  16.00   3rd Qu.:17.33333                     
##  Max.   :1490.00   Max.   :23.98333                     
##  NA's   :17372

Make a copy of the xdf file on disk

# Import again, this time writing the result to airExample.xdf (a relative
# path, so it lands in the working directory). overwrite = TRUE replaces any
# existing file of that name.
airXdfData <- rxImport(
  inData = mysource,
  outFile = "airExample.xdf",
  overwrite = TRUE
)

## Rows Read: 500000, Total Rows Processed: 500000, Total Chunk Time: 1.003 seconds
## Rows Read: 100000, Total Rows Processed: 600000, Total Chunk Time: 0.270 seconds

What is airXdfData?

str(airXdfData)

## Formal class 'RxXdfData' [package "RevoScaleR"] with 19 slots
##   ..@ fileSystem            :List of 1
##   .. ..$ fileSystemType: chr "native"
##   .. ..- attr(*, "class")= chr [1:2] "RxNativeFileSystem" "RxFileSystem"
##   ..@ createCompositeSet    : NULL
##   ..@ createPartitionSet    : NULL
##   ..@ blocksPerCompositeFile: int 3
##   ..@ readByBlock           : logi TRUE
##   ..@ xdfUuid               : chr "005759F5D17B4E48BF0D832E394664F7"
##   ..@ cache                 : logi FALSE
##   ..@ dfName                : chr "df-9BFD30DC8CAE4247AF2481278DE0C463"
##   ..@ dfType                : chr "xdf"
##   ..@ dfSource              : chr "airExample.xdf"
##   ..@ file                  : chr "airExample.xdf"
##   ..@ colNames              : chr ""
##   ..@ id                    :<externalptr> 
##   ..@ colClasses            : NULL
##   ..@ colInfo               : NULL
##   ..@ returnDataFrame       : logi TRUE
##   ..@ stringsAsFactors      : logi FALSE
##   ..@ rowsOrBlocksPerRead   : int 1
##   ..@ compatibilityRequest  :Classes 'CompatibilityRequest', 'R6' <CompatibilityRequest>
##   Public:
##     assertServerCapability: function (capability, notSupported, notKnown) 
##     clone: function (deep = FALSE) 
##     deferredAssertServerCapability: function (capability, notSupported, notKnown) 
##     getRequestedCapabilities: function () 
##     initialize: function (server, notSupported = capabilityNotSupported, notKnown = serverNotKnown) 
##     merge: function (request) 
##     requestCapability: function (capability) 
##     runDeferredAssertions: function (server) 
##     serialize: function (file) 
##   Private:
##     deferredRequests: list
##     notKnown: function (server, capability, warningMessage) 
##     notSupported: function (server, capability, errorMessage) 
##     requestedCapabilities: 
##     runCallback: function (type, server, capability, userHandler) 
##     server: ServerDefinition, AbstractServerDefinition

It’s definitely not a dataframe!

We do have a utility function that looks a lot like summary.

rxGetInfo(airXdfData, getVarInfo = TRUE)

## File name: D:\Dropbox\Documents\SMU\CSC 360\Fall 2017 MSSA\airExample.xdf 
## Number of observations: 6e+05 
## Number of variables: 3 
## Number of blocks: 2 
## Compression type: zlib 
## Variable information: 
## Var 1: ArrDelay, Type: character
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek, Type: character

What happens if we try to run summary?

summary(airXdfData)

## Call:
## rxSummary(formula = form, data = object, byTerm = TRUE, reportProgress = 0L)
## 
## Summary Statistics Results for: ~ArrDelay + CRSDepTime + DayOfWeek
## Data: object (RxXdfData Data Source)
## File name: airExample.xdf
## Number of valid observations: 6e+05 
##  
##  Name       Mean     StdDev   Min      Max      ValidObs MissingObs
##  CRSDepTime 13.48227 4.697566 0.016667 23.98333 6e+05    0

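Note that this is different from what we would get from running summary() on a dataframe. summary() is a generic function: it looks at the class of its input object and dispatches to the summary method for that class. Here it calls rxSummary(), which reports statistics only for the numeric variable CRSDepTime, because ArrDelay and DayOfWeek were imported as character.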

Also note that the median and quantiles are missing.

It is probably more convenient to use a different rx function if we just want the variable information.

rxGetVarInfo(airXdfData)

## Var 1: ArrDelay, Type: character
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek, Type: character

Reimport the data using different parameters.

# Re-import into the same .xdf file with explicit options:
#   stringsAsFactors   - store character columns (DayOfWeek) as factors
#   missingValueString - treat "M" in the CSV as a missing value, so
#                        ArrDelay can be read as an integer column
#   rowsPerRead        - read the file in chunks of 200000 rows
airXdfData <- rxImport(
  inData = mysource,
  outFile = "airExample.xdf",
  stringsAsFactors = TRUE,
  missingValueString = "M",
  rowsPerRead = 200000,
  overwrite = TRUE
)

## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.377 seconds
## Rows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.355 seconds
## Rows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.362 seconds

Look at the results again.

rxGetInfo(airXdfData)

## File name: D:\Dropbox\Documents\SMU\CSC 360\Fall 2017 MSSA\airExample.xdf 
## Number of observations: 6e+05 
## Number of variables: 3 
## Number of blocks: 3 
## Compression type: zlib

This is different from the output in the Microsoft document: no variable information is shown. Use rxGetVarInfo() instead.

rxGetVarInfo(airXdfData)

## Var 1: ArrDelay, Type: integer, Low/High: (-86, 1490)
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek
##        7 factor levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday

Get descriptive statistics for ArrDelay

rxSummary(~ ArrDelay, data = airXdfData)

## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.004 seconds
## Rows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.005 seconds
## Rows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.004 seconds 
## Computation time: 0.016 seconds.

## Call:
## rxSummary(formula = ~ArrDelay, data = airXdfData)
## 
## Summary Statistics Results for: ~ArrDelay
## Data: airXdfData (RxXdfData Data Source)
## File name: airExample.xdf
## Number of valid observations: 6e+05 
##  
##  Name     Mean     StdDev   Min Max  ValidObs MissingObs
##  ArrDelay 11.31794 40.68854 -86 1490 582628   17372

Visualize the data

# Histogram of arrival delay, with one panel per day of the week.
rxHistogram(~ ArrDelay | DayOfWeek, data = airXdfData)

## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.080 seconds
## Rows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.077 seconds
## Rows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.082 seconds 
## Computation time: 0.250 seconds.