Housekeeping
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(bit64)
## Warning: package 'bit64' was built under R version 3.5.2
## Loading required package: bit
## Warning: package 'bit' was built under R version 3.5.2
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
##
## Attaching package: 'bit'
## The following object is masked from 'package:data.table':
##
## setattr
## The following object is masked from 'package:base':
##
## xor
## Attaching package bit64
## package:bit64 (c) 2011-2012 Jens Oehlschlaegel
## creators: integer64 seq :
## coercion: as.integer64 as.vector as.logical as.integer as.double as.character as.bin
## logical operator: ! & | xor != == < <= >= >
## arithmetic operator: + - * / %/% %% ^
## math: sign abs sqrt log log2 log10
## math: floor ceiling trunc round
## querying: is.integer64 is.vector [is.atomic} [length] format print str
## values: is.na is.nan is.finite is.infinite
## aggregation: any all min max range sum prod
## cumulation: diff cummin cummax cumsum cumprod
## access: length<- [ [<- [[ [[<-
## combine: c rep cbind rbind as.data.frame
## WARNING don't use as subscripts
## WARNING semantics differ from integer
## for more help type ?bit64
##
## Attaching package: 'bit64'
## The following object is masked from 'package:bit':
##
## still.identical
## The following objects are masked from 'package:base':
##
## %in%, :, is.double, match, order, rank
library(ff)
## Warning: package 'ff' was built under R version 3.5.3
## Attaching package ff
## - getOption("fftempdir")=="C:/Users/Hiroshi/AppData/Local/Temp/RtmpkH3NIQ"
## - getOption("ffextension")=="ff"
## - getOption("ffdrop")==TRUE
## - getOption("fffinonexit")==TRUE
## - getOption("ffpagesize")==65536
## - getOption("ffcaching")=="mmnoflush" -- consider "ffeachflush" if your system stalls on large writes
## - getOption("ffbatchbytes")==171431690.24 -- consider a different value for tuning your system
## - getOption("ffmaxbytes")==8571584512 -- consider a different value for tuning your system
##
## Attaching package: 'ff'
## The following objects are masked from 'package:bit':
##
## clone, clone.default, clone.list
## The following objects are masked from 'package:utils':
##
## write.csv, write.csv2
## The following objects are masked from 'package:base':
##
## is.factor, is.ordered
library(ffbase)
## Warning: package 'ffbase' was built under R version 3.5.3
##
## Attaching package: 'ffbase'
## The following objects are masked from 'package:ff':
##
## [.ff, [.ffdf, [<-.ff, [<-.ffdf
## The following object is masked from 'package:bit64':
##
## %in%
## The following objects are masked from 'package:base':
##
## %in%, table
library(ggplot2)
library(biglm)
## Warning: package 'biglm' was built under R version 3.5.3
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.5.2
library(tidyr)
setwd("L:/Kaggle/Proj3")
Part 1: read.csv
system.time(read.csv("ss13hus.csv"))
## user system elapsed
## 403.47 160.28 625.23
Part 2: scan
system.time(scan(file = "ss13hus.csv", what = "character"))
## user system elapsed
## 89.12 10.50 102.56
Part 3: readLines
system.time(readLines("ss13hus.csv"))
## user system elapsed
## 53.36 8.96 63.74
df <- fread(file = "ss13hus.csv")
After various workarounds were tried, we determined that the most efficient way to subset the file was simply to read the entire thing into R first, using the fastest reader available, and subset afterwards. fread() (from the data.table package) was the most efficient method, taking a mere 38.74 seconds to read the file into R.
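As a rough sketch of that approach (exact timings will vary by machine), the full read can be timed directly; fread() also accepts a select argument, so reading only the 14 columns of interest up front would be a reasonable alternative:
# Sketch: time the full read with data.table's fread
system.time(df <- fread(file = "ss13hus.csv"))
# Alternative sketch: fread's select argument reads only the listed columns,
# using the same 14 variables defined below
df_subset <- fread(file = "ss13hus.csv",
                   select = c("REGION", "ST", "ADJHSG", "ADJINC", "NP", "ACR", "BDSP",
                              "ELEP", "GASP", "RMSP", "VEH", "WATP", "FINCP", "HINCP"))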
vars <- c("REGION", "ST", "ADJHSG", "ADJINC", "NP", "ACR", "BDSP",
          "ELEP", "GASP", "RMSP", "VEH", "WATP", "FINCP", "HINCP")
df <- select(df, vars)
Next, we filtered the original data frame down to our 14 variables; dplyr's select() let us do this efficiently. Finally, we used dplyr's slice() on 1,000,000 row indices drawn (with replacement) from our random number generator. df1 now contains a sample of 1,000,000 households with the appropriate 14 variables, which we exported using data.table's efficient fwrite() function.
set.seed(1000)
numbers <- sample(1:7219000, 1000000, replace = TRUE)
df1 <- slice(df, numbers)
fwrite(df1, "ss13sample.csv")
Method 1: read.csv (base R)
system.time(read.csv("ss13sample.csv"))
## user system elapsed
## 2.80 0.16 3.02
Method 2: read.table (from base R's utils package)
system.time(read.table("ss13sample.csv"))
## user system elapsed
## 22.67 0.13 23.26
Method 3: read.table.ffdf (from the ff package)
system.time(read.table.ffdf(file = "ss13sample.csv"))
## user system elapsed
## 24.34 0.16 24.73
Curiously, read.csv took the least time of the three methods this time around. This may be related to the size of the dataset: 1 million rows here versus 7.2 million earlier. Regardless, the time is relatively low for all three methods.
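A minimal sketch of how this comparison could be gathered into a single table, assuming the same sample file; the elapsed seconds are taken from the "elapsed" component of system.time():
# Sketch: collect the elapsed time of each reader into one comparison table
timings <- data.frame(
  method  = c("read.csv", "read.table", "read.table.ffdf"),
  elapsed = c(system.time(read.csv("ss13sample.csv"))[["elapsed"]],
              system.time(read.table("ss13sample.csv", sep = ",", header = TRUE))[["elapsed"]],
              system.time(read.table.ffdf(file = "ss13sample.csv"))[["elapsed"]])
)
timings[order(timings$elapsed), ]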
# Replaced NA values in each variable with its mean or median (na.rm = TRUE so the statistic itself is not NA).
df1 <- mutate(df1, FINCP = replace_na(FINCP, mean(FINCP, na.rm = TRUE)))
df1 <- mutate(df1, BDSP = replace_na(BDSP, median(BDSP, na.rm = TRUE)))
df1 <- mutate(df1, ADJINC = replace_na(ADJINC, median(ADJINC, na.rm = TRUE)))
#Removed households with zero income.
df2 <- subset(df1, FINCP != 0)
# Calculated a new variable, constant dollars (CDollars), from FINCP and ADJINC. ADJINC is stored with six implied decimal places, so dividing by 10^6 gives the actual adjustment factor.
df2 <- mutate(df2, CDollars = FINCP * (ADJINC / 10^6))
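To make the conversion concrete, here is a worked example with hypothetical values chosen purely for illustration:
# Hypothetical example values, for illustration only
fincp_example  <- 50000      # reported family income
adjinc_example <- 1010000    # stored adjustment factor; / 10^6 gives the multiplier 1.01
fincp_example * (adjinc_example / 10^6)   # 50500 constant dollars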
# Took a subset of the data to prevent overplotting.
df3 <- df2[1:10000,]
# Create plot
pl <- ggplot(df3, aes(x = BDSP, y = CDollars)) +
  geom_smooth(method = "loess", alpha = 1.0, size = 1.2) +
  geom_jitter(alpha = 0.2, color = "black") +
  theme_gray() +
  xlim(0, 6) +
  ylim(0, 200000) +
  ggtitle("Number of Bedrooms versus Adjusted Household Income") +
  xlab("Household Bedrooms") +
  ylab("Dollars (adjusted)")
pl
## Warning: Removed 711 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 3
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning: Removed 753 rows containing missing values (geom_point).
The graph is displayed above. First, we limited the sample to 10,000 rows to prevent overplotting. Since the data were already a random sample of 1,000,000, there was no problem with just taking the first 10,000 rows, although we could easily take a random sample of 10,000 from the 1,000,000 if needed. Since the number of household bedrooms takes only discrete values, we used a jitter plot to spread the households within each discrete bin.
The data show a mild correlation between the number of bedrooms and adjusted household income. The loess smoother curve shows that income increases from approximately 50,000 to 100,000 as the number of bedrooms increases from 2 to 4. Past 4 bedrooms there is less correlation, mostly because the sample becomes much sparser. That is why we elected to show only the data between 0 and 6 bedrooms and incomes from 0 to 200,000.
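As noted above, a random subsample would serve just as well as the first 10,000 rows; a minimal dplyr sketch, reusing the seed convention from earlier:
# Sketch: draw a random 10,000-row subsample instead of taking the first 10,000 rows
set.seed(1000)
df3_random <- sample_n(df2, 10000)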
# Replaced NA vehicle counts with the median before fitting the regression.
df2 <- mutate(df2, VEH = replace_na(VEH, median(VEH, na.rm = TRUE)))
eq <- CDollars ~ BDSP + VEH
regression <- bigglm(eq, data = df2, chunksize = 5000, sandwich = T)
summary(regression)
## Large data regression model: bigglm(eq, data = df2, chunksize = 5000, sandwich = T)
## Sample size = 555889
## Coef (95% CI) SE p
## (Intercept) -2393.596 -3289.259 -1497.932 447.8318 0
## BDSP 19426.421 19107.565 19745.277 159.4280 0
## VEH 14418.818 14190.917 14646.719 113.9504 0
## Sandwich (model-robust) standard errors
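As a quick, purely illustrative sanity check on the coefficients, consider a hypothetical household with 3 bedrooms and 2 vehicles; its predicted adjusted income can be computed directly from the coefficient vector:
# Hypothetical household, for illustration: 3 bedrooms, 2 vehicles
b <- coef(regression)
b[["(Intercept)"]] + 3 * b[["BDSP"]] + 2 * b[["VEH"]]
# roughly -2394 + 3 * 19426 + 2 * 14419, i.e. about 84,700 adjusted dollars
This is broadly consistent with the income range the loess curve shows for three-bedroom households in the plot above.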