library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(ggthemes)
# Load the Zillow training labels and the per-parcel property attributes.
# NOTE(review): these are absolute, machine-specific paths; the script only
# runs where this directory exists.
train <- fread("/Users/kailukowiak/Data606_Proposal/Zillow/train_2016.csv")
# fread guesses column types from a sample of rows. For column 50
# (fireplaceflag) the guess is not character, so fread has to bump the column
# to character when it meets 'true' on a later row, and that coercion of the
# already-read values may be lossy. Declaring the class up front avoids the
# bump, which is what the fread warning itself recommends.
properties <- fread(
  "/Users/kailukowiak/Data606_Proposal/Zillow/properties_2016.csv",
  colClasses = list(character = "fireplaceflag")
)
## Warning in fread("/Users/kailukowiak/Data606_Proposal/Zillow/
## properties_2016.csv"): Bumped column 50 to type character on data row
## 10354, field contains 'true'. Coercing previously read values in this
## column from logical, integer or numeric back to character which may not
## be lossless; e.g., if '00' and '000' occurred before they will now be just
## '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too
## (if they occurred in this column before the bump). If this matters please
## rerun and set 'colClasses' to 'character' for this column. Please note that
## column type detection uses a sample of 1,000 rows (100 rows at 10 points)
## so hopefully this message should be very rare. If reporting to datatable-
## help, please rerun and include the output from verbose=TRUE.
##
Read 3.3% of 2985217 rows
Read 17.4% of 2985217 rows
Read 31.5% of 2985217 rows
Read 46.9% of 2985217 rows
Read 61.3% of 2985217 rows
Read 75.0% of 2985217 rows
Read 90.1% of 2985217 rows
Read 2985217 rows and 58 (of 58) columns from 0.604 GB file in 00:00:09
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Can basic statistical models predict housing prices better than basic markers such as the mean price for an area or the inflation-adjusted previous price?
What are the cases, and how many are there?
Describe the method of data collection.
Data collection was easy since the datasets were posted on Kaggle. Files can be downloaded here: https://www.kaggle.com/c/zillow-prize-1/data
What type of study is this (observational/experiment)?
This is an observational study based on previous housing prices and attributes.
If you collected the data, state self-collected. If not, provide a citation/link.
What is the response variable, and what type is it (numerical/categorical)?
The response variable is numeric (log error or housing price error).
What is the explanatory variable, and what type is it (numerical/categorical)?
There is a mix of numeric and categorical variables (nothing OLS can’t handle).
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Histogram of the training-set log error, zoomed to [-0.5, 0.5].
# ggplot() + geom_histogram() replaces qplot(), which is deprecated in
# current ggplot2 releases.
ggplot(train, aes(x = logerror)) +
  geom_histogram(bins = 400) +
  ylab("Count") +
  ggtitle("Counts of log error", subtitle = "For the Train Dataset") +
  # coord_cartesian() zooms the view without dropping rows outside the
  # window, so the bin counts are unaffected. `xlim` is spelled out rather
  # than relying on partial matching of `x`.
  coord_cartesian(xlim = c(-0.5, 0.5)) +
  theme_economist()
# Print a compact column-by-column overview (name, type, first values) of
# the properties table.
dplyr::glimpse(properties)
## Observations: 2,985,217
## Variables: 58
## $ parcelid <int> 10754147, 10759547, 10843547, 108...
## $ airconditioningtypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ architecturalstyletypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ basementsqft <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ bathroomcnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ bedroomcnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ buildingclasstypeid <int> NA, NA, NA, 3, 4, 4, NA, NA, NA, ...
## $ buildingqualitytypeid <int> NA, NA, NA, 7, NA, 7, NA, NA, NA,...
## $ calculatedbathnbr <dbl> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ decktypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ finishedfloor1squarefeet <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ calculatedfinishedsquarefeet <dbl> NA, NA, 73026, 5068, 1776, 2400, ...
## $ finishedsquarefeet12 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ finishedsquarefeet13 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ finishedsquarefeet15 <int> NA, NA, 73026, 5068, 1776, 2400, ...
## $ finishedsquarefeet50 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ finishedsquarefeet6 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ fips <int> 6037, 6037, 6037, 6037, 6037, 603...
## $ fireplacecnt <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ fullbathcnt <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ garagecarcnt <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ garagetotalsqft <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ hashottuborspa <chr> "", "", "", "", "", "", "", "", "...
## $ heatingorsystemtypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ latitude <int> 34144442, 34140430, 33989359, 341...
## $ longitude <int> -118654084, -118625364, -11839463...
## $ lotsizesquarefeet <dbl> 85768, 4083, 63085, 7521, 8512, 2...
## $ poolcnt <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ poolsizesum <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pooltypeid10 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pooltypeid2 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pooltypeid7 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ propertycountylandusecode <chr> "010D", "0109", "1200", "1200", "...
## $ propertylandusetypeid <int> 269, 261, 47, 47, 31, 31, 260, 31...
## $ propertyzoningdesc <chr> "", "LCA11*", "LAC2", "LAC2", "LA...
## $ rawcensustractandblock <dbl> 60378002, 60378001, 60377030, 603...
## $ regionidcity <int> 37688, 37688, 51617, 12447, 12447...
## $ regionidcounty <int> 3101, 3101, 3101, 3101, 3101, 310...
## $ regionidneighborhood <int> NA, NA, NA, 27080, 46795, 46795, ...
## $ regionidzip <int> 96337, 96337, 96095, 96424, 96450...
## $ roomcnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ storytypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ threequarterbathnbr <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ typeconstructiontypeid <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ unitcnt <int> NA, NA, 2, NA, 1, NA, NA, NA, NA,...
## $ yardbuildingsqft17 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ yardbuildingsqft26 <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ yearbuilt <dbl> NA, NA, NA, 1948, 1947, 1943, NA,...
## $ numberofstories <int> NA, NA, NA, 1, NA, 1, NA, 1, NA, ...
## $ fireplaceflag <chr> "", "", "", "", "", "", "", "", "...
## $ structuretaxvaluedollarcnt <dbl> NA, NA, 650756, 571346, 193796, 1...
## $ taxvaluedollarcnt <dbl> 9, 27516, 1413387, 1156834, 43349...
## $ assessmentyear <int> 2015, 2015, 2015, 2015, 2015, 201...
## $ landtaxvaluedollarcnt <dbl> 9, 27516, 762631, 585488, 239695,...
## $ taxamount <dbl> NA, NA, 20800.37, 14557.57, 5725....
## $ taxdelinquencyflag <chr> "", "", "", "", "", "", "", "", "...
## $ taxdelinquencyyear <int> NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ censustractandblock <S3: integer64> 0, 0, 0, 0, 0, 0, 0, 0,...
There are a lot of NAs.
# Fraction of missing (NA) entries in each column of `properties`, reshaped
# to one row per variable.
naVals <- properties %>%
  # mean() of a logical vector is the share of TRUE values, i.e. the same
  # quantity as sum(is.na(.)) / n(). A plain function is passed because
  # funs() is deprecated in dplyr.
  summarise_all(function(col) mean(is.na(col))) %>%
  gather(key = "Variable", value = "missingPercent")

# Horizontal bar chart of the missing share per variable, ordered by that
# share.
naVals %>%
  ggplot(aes(x = reorder(Variable, missingPercent), y = missingPercent)) +
  geom_col() +
  ylim(0, 1) +
  # The bars show the share of values that ARE missing, on a 0-1 scale, so
  # the title and axis label name missing (not non-missing) proportions.
  ggtitle("Proportion of Missing Values") +
  labs(x = "Variable", y = "Proportion missing") +
  coord_flip() +
  theme_economist()