This notebook follows the tutorial at
https://docs.microsoft.com/en-us/machine-learning-server/r/tutorial-revoscaler-data-import-transform
library(readr)
library(RevoScaleR)
# List the sample data sets found in RevoScaleR's sample data directory.
list.files(path = rxGetOption("sampleDataDir"))
## [1] "AirlineDemo1kNoMissing.csv" "AirlineDemoSmall.csv"
## [3] "AirlineDemoSmall.xdf" "AirlineDemoSmallComposite"
## [5] "AirlineDemoSmallOrc" "AirlineDemoSmallParquet"
## [7] "AirlineDemoSmallSplit" "AirlineDemoSmallUC.xdf"
## [9] "ccFraudScoreSmall.csv" "ccFraudSmall.csv"
## [11] "CensusWorkers.xdf" "claims.dat"
## [13] "claims.sas7bdat" "claims.sav"
## [15] "claims.sd7" "claims.sqlite"
## [17] "claims.sts" "claims.txt"
## [19] "claims.xdf" "claims_.txt"
## [21] "claims4blocks.xdf" "claimsExtra.txt"
## [23] "claimsParquet" "claimsQuote.txt"
## [25] "claimsTab.txt" "claimsTxt"
## [27] "claimsXdf" "CustomerSurvey.xdf"
## [29] "DJIAdaily.xdf" "fourthgraders.xdf"
## [31] "hyphens.txt" "Kyphosis.xdf"
## [33] "mortDefaultSmall.xdf" "mortDefaultSmall2000.csv"
## [35] "mortDefaultSmall2001.csv" "mortDefaultSmall2002.csv"
## [37] "mortDefaultSmall2003.csv" "mortDefaultSmall2004.csv"
## [39] "mortDefaultSmall2005.csv" "mortDefaultSmall2006.csv"
## [41] "mortDefaultSmall2007.csv" "mortDefaultSmall2008.csv"
## [43] "mortDefaultSmall2009.csv" "mrsDebugParquet"
## [45] "README" "testAvro4.bin"
## [47] "Utf16leDb.sqlite"
Where is my working directory?
# Print the current working directory.
getwd()
## [1] "D:/Dropbox/Documents/SMU/CSC 360/Fall 2017 MSSA"
# Locate the airline demo CSV inside RevoScaleR's sample data directory.
mysource <- file.path(
  rxGetOption("sampleDataDir"),
  "AirlineDemoSmall.csv"
)

# Import the CSV with default settings (no output file is requested here).
airXdfData <- rxImport(inData = mysource)
## Rows Read: 500000, Total Rows Processed: 500000, Total Chunk Time: 0.989 seconds
## Rows Read: 100000, Total Rows Processed: 600000, Total Chunk Time: 0.270 seconds
Note that the data was read in chunks.
Let’s also import this data using RStudio’s import facility.
# Read the same CSV with readr for comparison. Reuse `mysource` instead of a
# hard-coded install path so this runs wherever RevoScaleR is installed.
# ArrDelay is forced to integer; the file marks missing delays with "M", which
# is why readr reports parsing failures and stores NA for those rows.
AirlineDemoSmall_df <- read_csv(
  mysource,
  col_types = cols(ArrDelay = col_integer())
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 17372 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 99 ArrDelay an integer M file 2 312 ArrDelay an integer M row 3 325 ArrDelay an integer M col 4 611 ArrDelay an integer M expected 5 781 ArrDelay an integer M actual # ... with 1 more variables: file <chr>
## ... ................. ... .................................. ........ .................................. ...... .................................. .... .................................. ... .................................. ... .................................. ........ .................................. ...... .......................................
## See problems(...) for more details.
Do a summary of the dataframe
# Per-column summary of the data frame read by readr.
summary(AirlineDemoSmall_df)
## ArrDelay CRSDepTime DayOfWeek
## Min. : -86.00 Min. : 0.01667 Length:600000
## 1st Qu.: -9.00 1st Qu.: 9.41667 Class :character
## Median : 0.00 Median :13.41667 Mode :character
## Mean : 11.32 Mean :13.48227
## 3rd Qu.: 16.00 3rd Qu.:17.33333
## Max. :1490.00 Max. :23.98333
## NA's :17372
Make a copy of the xdf file on disk
# Import again, this time writing the result to airExample.xdf (a path
# relative to the working directory) and replacing any existing file.
airXdfData <- rxImport(
  inData = mysource,
  outFile = "airExample.xdf",
  overwrite = TRUE
)
## Rows Read: 500000, Total Rows Processed: 500000, Total Chunk Time: 1.003 seconds
## Rows Read: 100000, Total Rows Processed: 600000, Total Chunk Time: 0.270 seconds
What is airXdfData?
# Inspect the internal structure of the object returned by rxImport().
str(airXdfData)
## Formal class 'RxXdfData' [package "RevoScaleR"] with 19 slots
## ..@ fileSystem :List of 1
## .. ..$ fileSystemType: chr "native"
## .. ..- attr(*, "class")= chr [1:2] "RxNativeFileSystem" "RxFileSystem"
## ..@ createCompositeSet : NULL
## ..@ createPartitionSet : NULL
## ..@ blocksPerCompositeFile: int 3
## ..@ readByBlock : logi TRUE
## ..@ xdfUuid : chr "005759F5D17B4E48BF0D832E394664F7"
## ..@ cache : logi FALSE
## ..@ dfName : chr "df-9BFD30DC8CAE4247AF2481278DE0C463"
## ..@ dfType : chr "xdf"
## ..@ dfSource : chr "airExample.xdf"
## ..@ file : chr "airExample.xdf"
## ..@ colNames : chr ""
## ..@ id :<externalptr>
## ..@ colClasses : NULL
## ..@ colInfo : NULL
## ..@ returnDataFrame : logi TRUE
## ..@ stringsAsFactors : logi FALSE
## ..@ rowsOrBlocksPerRead : int 1
## ..@ compatibilityRequest :Classes 'CompatibilityRequest', 'R6' <CompatibilityRequest>
## Public:
## assertServerCapability: function (capability, notSupported, notKnown)
## clone: function (deep = FALSE)
## deferredAssertServerCapability: function (capability, notSupported, notKnown)
## getRequestedCapabilities: function ()
## initialize: function (server, notSupported = capabilityNotSupported, notKnown = serverNotKnown)
## merge: function (request)
## requestCapability: function (capability)
## runDeferredAssertions: function (server)
## serialize: function (file)
## Private:
## deferredRequests: list
## notKnown: function (server, capability, warningMessage)
## notSupported: function (server, capability, errorMessage)
## requestedCapabilities:
## runCallback: function (type, server, capability, userHandler)
## server: ServerDefinition, AbstractServerDefinition
It’s definitely not a dataframe!
We do have a utility function that looks a lot like summary.
# Show file-level information plus the name and type of every variable.
rxGetInfo(data = airXdfData, getVarInfo = TRUE)
## File name: D:\Dropbox\Documents\SMU\CSC 360\Fall 2017 MSSA\airExample.xdf
## Number of observations: 6e+05
## Number of variables: 3
## Number of blocks: 2
## Compression type: zlib
## Variable information:
## Var 1: ArrDelay, Type: character
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek, Type: character
What happens if we try to run summary?
# Call the generic summary() on the xdf data source; the echoed call shows it
# is forwarded to rxSummary().
summary(airXdfData)
## Call:
## rxSummary(formula = form, data = object, byTerm = TRUE, reportProgress = 0L)
##
## Summary Statistics Results for: ~ArrDelay + CRSDepTime + DayOfWeek
## Data: object (RxXdfData Data Source)
## File name: airExample.xdf
## Number of valid observations: 6e+05
##
## Name Mean StdDev Min Max ValidObs MissingObs
## CRSDepTime 13.48227 4.697566 0.016667 23.98333 6e+05 0
Note that this is different from what we would get from running summary on a dataframe. summary() is a generic function: it looks at the class of its input object and runs the summary method particular to that type of object.
Also note that the median and quantiles are missing, and that the character columns ArrDelay and DayOfWeek are not summarized at all.
It is probably more convenient to use a different rx function if we just want the variable information.
# Print only the name and type of each variable in the xdf data source.
rxGetVarInfo(airXdfData)
## Var 1: ArrDelay, Type: character
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek, Type: character
Reimport the data using different parameters.
# Re-import with explicit options: character columns become factors, "M" is
# treated as a missing value, and the file is read 200000 rows at a time.
airXdfData <- rxImport(
  inData = mysource,
  outFile = "airExample.xdf",
  stringsAsFactors = TRUE,
  missingValueString = "M",
  rowsPerRead = 200000,
  overwrite = TRUE
)
## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.377 seconds
## Rows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.355 seconds
## Rows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.362 seconds
Look at the results again.
# Print file-level information for the re-imported xdf file; variable details
# are not requested here.
rxGetInfo(airXdfData)
## File name: D:\Dropbox\Documents\SMU\CSC 360\Fall 2017 MSSA\airExample.xdf
## Number of observations: 6e+05
## Number of variables: 3
## Number of blocks: 3
## Compression type: zlib
This is different from the output in the Microsoft document. Use rxGetVarInfo() instead to see the variable information.
# Print the variable information after the re-import to check the new types.
rxGetVarInfo(airXdfData)
## Var 1: ArrDelay, Type: integer, Low/High: (-86, 1490)
## Var 2: CRSDepTime, Type: numeric, Storage: float32, Low/High: (0.0167, 23.9833)
## Var 3: DayOfWeek
## 7 factor levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday
Get descriptive statistics for ArrDelay
# Descriptive statistics for the arrival-delay column of the xdf file.
rxSummary(formula = ~ArrDelay, data = airXdfData)
## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.004 seconds
## Rows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.005 seconds
## Rows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.004 seconds
## Computation time: 0.016 seconds.
## Call:
## rxSummary(formula = ~ArrDelay, data = airXdfData)
##
## Summary Statistics Results for: ~ArrDelay
## Data: airXdfData (RxXdfData Data Source)
## File name: airExample.xdf
## Number of valid observations: 6e+05
##
## Name Mean StdDev Min Max ValidObs MissingObs
## ArrDelay 11.31794 40.68854 -86 1490 582628 17372
Visualize the data
# Histogram of arrival delay, conditioned on day of week.
rxHistogram(formula = ~ ArrDelay | DayOfWeek, data = airXdfData)
## Rows Read: 200000, Total Rows Processed: 200000, Total Chunk Time: 0.080 secondsRows Read: 200000, Total Rows Processed: 400000, Total Chunk Time: 0.077 secondsRows Read: 200000, Total Rows Processed: 600000, Total Chunk Time: 0.082 seconds
## Computation time: 0.250 seconds.