#load data
setwd("F:/tiger/zillow")
train<-read.csv('train_property.csv',stringsAsFactors = F)
#remove variables with missing values more than 20%
NA_rate<-colMeans(sapply(train,is.na))
remain_col<-names(train)[which(NA_rate<.2)]
train2<-train[,remain_col]
train2<-train2[,-1]
#remained variables
names(train2)
## [1] "parcelid" "logerror"
## [3] "transactiondate" "bathroomcnt"
## [5] "bedroomcnt" "calculatedbathnbr"
## [7] "calculatedfinishedsquarefeet" "finishedsquarefeet12"
## [9] "fips" "fullbathcnt"
## [11] "hashottuborspa" "latitude"
## [13] "longitude" "lotsizesquarefeet"
## [15] "propertycountylandusecode" "propertylandusetypeid"
## [17] "propertyzoningdesc" "rawcensustractandblock"
## [19] "regionidcity" "regionidcounty"
## [21] "regionidzip" "roomcnt"
## [23] "yearbuilt" "fireplaceflag"
## [25] "structuretaxvaluedollarcnt" "taxvaluedollarcnt"
## [27] "assessmentyear" "landtaxvaluedollarcnt"
## [29] "taxamount" "taxdelinquencyflag"
## [31] "censustractandblock"
str(train2)
## 'data.frame': 90275 obs. of 31 variables:
## $ parcelid : int 10711738 10711755 10711805 10711816 10711858 10711910 10712086 10712162 10712163 10712195 ...
## $ logerror : num 0.0276 -0.0182 -0.1009 -0.0121 -0.0481 ...
## $ transactiondate : chr "2016-08-02" "2016-08-02" "2016-05-03" "2016-04-05" ...
## $ bathroomcnt : num 3 3 2 2 2 2 2 3 3 3 ...
## $ bedroomcnt : int 4 3 3 4 4 3 4 3 4 3 ...
## $ calculatedbathnbr : num 3 3 2 2 2 2 2 3 3 3 ...
## $ calculatedfinishedsquarefeet: int 2538 1589 2411 2232 1882 1477 1850 3193 2421 1678 ...
## $ finishedsquarefeet12 : int 2538 1589 2411 2232 1882 1477 1850 3193 2421 1678 ...
## $ fips : int 6037 6037 6037 6037 6037 6037 6037 6037 6037 6037 ...
## $ fullbathcnt : int 3 3 2 2 2 2 2 3 3 3 ...
## $ hashottuborspa : chr "" "" "" "" ...
## $ latitude : int 34220381 34222040 34220427 34222390 34222544 34221864 34226039 34226833 34226843 34223689 ...
## $ longitude : int -118620802 -118622240 -118618549 -118618631 -118617961 -118615739 -118618527 -118612917 -118612422 -118612746 ...
## $ lotsizesquarefeet : num 11012 11010 11723 9002 9002 ...
## $ propertycountylandusecode : chr "0101" "0101" "0101" "0100" ...
## $ propertylandusetypeid : int 261 261 261 261 261 261 261 261 261 261 ...
## $ propertyzoningdesc : chr "LARE11" "LARE11" "LARE9" "LARE9" ...
## $ rawcensustractandblock : num 60371132 60371132 60371132 60371132 60371132 ...
## $ regionidcity : int 12447 12447 12447 12447 12447 12447 12447 12447 12447 12447 ...
## $ regionidcounty : int 3101 3101 3101 3101 3101 3101 3101 3101 3101 3101 ...
## $ regionidzip : int 96339 96339 96339 96339 96339 96339 96339 96339 96339 96339 ...
## $ roomcnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ yearbuilt : int 1978 1959 1973 1973 1973 1960 1974 1964 1962 1961 ...
## $ fireplaceflag : chr "" "" "" "" ...
## $ structuretaxvaluedollarcnt : num 245180 254691 235114 262309 232037 ...
## $ taxvaluedollarcnt : num 567112 459844 384787 437176 382055 ...
## $ assessmentyear : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ landtaxvaluedollarcnt : num 321932 205153 149673 174867 150018 ...
## $ taxamount : num 7219 6901 4877 5560 4878 ...
## $ taxdelinquencyflag : chr "" "" "" "" ...
## $ censustractandblock : num 6.04e+13 6.04e+13 6.04e+13 6.04e+13 6.04e+13 ...
#missing rate of the remained variables from the highest to the lowest
miss<-data.frame(var=remain_col,NA_rate=NA_rate[remain_col],row.names = NULL,stringsAsFactors = F)
miss<-miss[order(miss$NA_rate,decreasing = T),]
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
ggplot(miss,aes(x=reorder(var,-NA_rate),y=NA_rate))+geom_bar(stat='identity',fill='red')+
labs(x='remained variables',title='missing rate of the remained variables')+
theme(axis.text.x = element_text(angle=90,hjust=1))
The original train dataset has 90275 observations and 61 variables. After removing data with missing rate more than 20%, 31 variables are remained.
Among these 31 variables, 18 are continuous variables. Except the outcome variablE logerror, the other 17 variables can be grouped into 5 categories.
room count related variables: bathroomcnt, bedroomcnt, caculatedbathnbr, fullbathcnt, roomcnt
house size related variables: calculatedfinishedsquarefeet, finishedsquarefeet12, lotsizesquarefeet
house location related: longitute, latitute
4)house value related: taxvaluedollarcnt, landtaxvaluedollarcnt, taxamount,structuretaxvaluedollarcnt
There are 13 categorical variables:
parcelid
house adress related: fips, regionidcity, regionidcounty, regionidzip
house feature related: hashottuborspa, fireplaceflag
property use variable: propertycountylandusecode, propertyzoningdesc, propertylandusetypeid
tax : taxdelinquencyflag
census track and block variables: censustractandblock and rawcensustractandblock
summary(train2$logerror)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.60500 -0.02530 0.00600 0.01146 0.03920 4.73700
plot(density(train2$logerror),main='logerror')
Findings: in general, the logerror between estimated house price and the actual sale price is very small
Roomcount related
names(train2)
## [1] "parcelid" "logerror"
## [3] "transactiondate" "bathroomcnt"
## [5] "bedroomcnt" "calculatedbathnbr"
## [7] "calculatedfinishedsquarefeet" "finishedsquarefeet12"
## [9] "fips" "fullbathcnt"
## [11] "hashottuborspa" "latitude"
## [13] "longitude" "lotsizesquarefeet"
## [15] "propertycountylandusecode" "propertylandusetypeid"
## [17] "propertyzoningdesc" "rawcensustractandblock"
## [19] "regionidcity" "regionidcounty"
## [21] "regionidzip" "roomcnt"
## [23] "yearbuilt" "fireplaceflag"
## [25] "structuretaxvaluedollarcnt" "taxvaluedollarcnt"
## [27] "assessmentyear" "landtaxvaluedollarcnt"
## [29] "taxamount" "taxdelinquencyflag"
## [31] "censustractandblock"
summary(train2[,c(4,5,6,10,22)])
## bathroomcnt bedroomcnt calculatedbathnbr fullbathcnt
## Min. : 0.000 Min. : 0.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 2.000
## Median : 2.000 Median : 3.000 Median : 2.000 Median : 2.000
## Mean : 2.279 Mean : 3.032 Mean : 2.309 Mean : 2.241
## 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 3.000
## Max. :20.000 Max. :16.000 Max. :20.000 Max. :20.000
## NA's :1182 NA's :1182
## roomcnt
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.479
## 3rd Qu.: 0.000
## Max. :18.000
##
par(mfrow=c(2,3))
hist(train2[,4],xlab='bathroomcnt',main='number of bathroom')
hist(train2[,5],xlab='bedroomcnt',main='number of bedroom')
hist(train2[,6],xlab='calculatedbathnbr',main='number of calculatedbathnbr')
hist(train2[,10],xlab='fullbathcnt',main='number o fullbathcnt')
hist(train2[,22],xlab='roomcnt',main='number of roomcnt')
Findings: 1) the distribution of bathroom and bedroom are positively skewed 2) most houses have less than 5 bedroom and bathrooms 3) apart from some houses with only one room, a lot of houses have 5~8 rooms in total 4) bathroomcnt, bedroomcnt and roomcnt have better data quality cuz they do not have NAs
House adress related variables
#fips distribution
table(train2$fips)/nrow(train2)
##
## 6037 6059 6111
## 0.64883966 0.27144835 0.07971199
#6037 for LA county, 6059 for Orange county and 6111 for ventura
#city distribution
#table(train2$regionidcity)
#county distribution
#table(train2$regionidcounty)
#zip
#table(train2$regionidzip)
table(train2$hashottuborspa)/nrow(train2)
##
## true
## 0.97380227 0.02619773
table(train2$fireplaceflag)/nrow(train2)
##
## true
## 0.997540847 0.002459153
Findings: very few houses have fireplace or spa tub
table(train2$propertycountylandusecode)
##
## 0 010 0100 0101 0102 0103 0104 0108 0109 010C 010D
## 1 1 1 30846 7435 3 100 348 46 27 10264 2209
## 010E 010F 010G 010H 010M 010V 0110 0111 0114 012C 012D 012E
## 2286 28 80 72 59 201 4 1 1 523 4 7
## 0130 0131 01DC 01DD 01HC 0200 0201 020G 020M 0210 0300 0301
## 1 1 251 1 148 2153 43 4 1 1 578 1
## 0303 030G 0400 0401 040A 040V 0700 070D 1 100V 1011 1012
## 1 2 747 4 1 6 54 7 2915 5 2 2
## 1014 105 1110 1111 1112 1116 1117 1128 1129 1200 1210 122
## 32 4 1117 3883 5 11 46 356 1643 1 47 15383
## 1222 1310 1321 1333 135 1410 1420 1421 1432 1720 1722 200
## 28 4 7 1 35 13 1 2 2 7 1 1
## 34 38 6050 73 8800 96
## 5946 106 1 11 1 104
#table(train2$propertyzoningdesc)
corr<-cor(train2[,c(2,4:8,10,12:14,22,23,25,26,28,29)],use='pairwise.complete.obs')
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
corrplot(corr)
cor_logerror<-sort(corr[,1],decreasing = T)
cor_logerror
## logerror finishedsquarefeet12
## 1.000000000 0.041922367
## calculatedfinishedsquarefeet calculatedbathnbr
## 0.038784069 0.029447685
## fullbathcnt bathroomcnt
## 0.028845122 0.027889287
## bedroomcnt structuretaxvaluedollarcnt
## 0.025467090 0.022084970
## yearbuilt taxvaluedollarcnt
## 0.017312211 0.006507999
## roomcnt latitude
## 0.005759796 0.004915466
## lotsizesquarefeet landtaxvaluedollarcnt
## 0.004835250 -0.003051035
## longitude taxamount
## -0.003432217 -0.006671116
#corelations are all soooo weak
#show corelation between each variable and the logerror
par(mfrow=c(4,2))
plot(train2$finishedsquarefeet12,train2$logerror)
plot(train2$calculatedfinishedsquarefeet,train2$logerror)
plot(train2$calculatedbathnbr,train2$logerror)
plot(train2$fullbathcnt,train2$logerror)
plot(train2$bathroomcnt,train2$logerror)
plot(train2$bedroomcnt,train2$logerror)
plot(train2$structuretaxvaluedollarcnt,train2$logerror)
the points are randomly scattered and indicates the corelation is weak
library(lattice)
## Warning: package 'lattice' was built under R version 3.4.3
train2$transMonth<-sapply(strsplit(train2$transactiondate,'-'),function(x) x[2])
bwplot(logerror~transMonth,data=train2,xlab='month')
bwplot(logerror[which(abs(logerror)<0.05)]~transMonth,data=train2,xlab='month')
err.month<-by(train2,train2$transMonth,function(x){return(mean(x$logerror))})
plot(names(err.month),err.month,type='l',xlab='month',ylab='average error')
Findings: distributon of logerror is similar, but average logerror differs in the month of april, may, june
train2$transDay<-sapply(strsplit(train2$transactiondate,'-'),function(x) x[3])
err.day<-by(train2,train2$transDay,function(x){return(mean(x$logerror))})
plot(names(err.day),err.day,type='l',xlab='day',ylab='average logerror')
For categorical variables , I would explore the relationship between the logerror and the fips, hashottuborspa,fireplaceflag,propertylanusetypeid, taxdelinquency
library(lattice)
library(gridExtra)
fipsPlot1<-bwplot(logerror~as.character(fips),data=train2,xlab='fips')
fipsPlot2<-bwplot(logerror~as.character(fips),data=subset(train2,abs(logerror)<.05),
xlab='fips with logerror less than 0.05')
grid.arrange(fipsPlot1,fipsPlot2,ncol=2)
spaPlot1<-bwplot(logerror~hashottuborspa,data=train2,xlab='hashottuborspa')
spaPlot2<-bwplot(logerror~hashottuborspa,data=subset(train2,abs(logerror)<.05),
xlab='hashottuborspa with logerror less than 0.05')
grid.arrange(spaPlot1,spaPlot2,ncol=2)
firePlot1<-bwplot(logerror~fireplaceflag,data=train2,xlab='fireplaceflag')
firePlot2<-bwplot(logerror~fireplaceflag,data=subset(train2,abs(logerror)<.05),
xlab='fireplace with logerror less tha 0.05')
grid.arrange(firePlot1,firePlot2,ncol=2)
taxPlot1<-bwplot(logerror~taxdelinquencyflag,data=train2,xlab='taxdelinquency')
taxPlot2<-bwplot(logerror~taxdelinquencyflag,data=subset(train2,abs(logerror)<.05),
xlab='taxdelinquency with logerror less tha 0.05')
grid.arrange(taxPlot1,taxPlot2,ncol=2)
landPlot1<-bwplot(logerror~as.character(propertylandusetypeid),data=train2,xlab='landtype')
landPlot2<-bwplot(logerror~as.character(propertylandusetypeid),data=subset(train2,abs(logerror)<.05),
xlab='landtype with logerror less tha 0.05')
grid.arrange(landPlot1,landPlot2,ncol=2)
anova(with(train2,lm(logerror~as.character(fips))))
## Analysis of Variance Table
##
## Response: logerror
## Df Sum Sq Mean Sq F value Pr(>F)
## as.character(fips) 2 0.27 0.136935 5.2781 0.005104 **
## Residuals 90272 2342.01 0.025944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(with(train2,lm(logerror~as.character(propertylandusetypeid))))
## Analysis of Variance Table
##
## Response: logerror
## Df Sum Sq Mean Sq F value
## as.character(propertylandusetypeid) 13 2.8 0.215373 8.3094
## Residuals 90261 2339.5 0.025919
## Pr(>F)
## as.character(propertylandusetypeid) < 2.2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(with(train2,lm(logerror~hashottuborspa)))
## Analysis of Variance Table
##
## Response: logerror
## Df Sum Sq Mean Sq F value Pr(>F)
## hashottuborspa 1 0.06 0.062835 2.4218 0.1197
## Residuals 90273 2342.22 0.025946
anova(with(train2,lm(logerror~fireplaceflag)))
## Analysis of Variance Table
##
## Response: logerror
## Df Sum Sq Mean Sq F value Pr(>F)
## fireplaceflag 1 0.0 0.0032303 0.1245 0.7242
## Residuals 90273 2342.3 0.0259466
anova(with(train2,lm(logerror~taxdelinquencyflag)))
## Analysis of Variance Table
##
## Response: logerror
## Df Sum Sq Mean Sq F value Pr(>F)
## taxdelinquencyflag 1 0.84 0.83984 32.38 1.272e-08 ***
## Residuals 90273 2341.44 0.02594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Findings: taxdelinquence, fips and propertylandusetypeid really matters
number of houses sold by city
#calculate the number of houses sold by city
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.3
house.city<-ddply(train2,.(regionidcity),summarise,cityhousecnt=length(regionidcity))
train3<-merge(train2,house.city,by='regionidcity',all.x=T)
with(train3,plot(cityhousecnt,logerror,main='logerror vs city house count' ))
train3_sub<-train3[train3$cityhousecnt != max(train3$cityhousecnt),]
with(train3_sub,plot(cityhousecnt,logerror))
####number of house sold by zip
house.zip<-ddply(train2,.(regionidzip),summarise,ziphousecnt=length(regionidzip))
train4<-merge(train2,house.zip,by='regionidzip',all.x=T)
with(train4,plot(ziphousecnt,logerror,main='logerror vs zip house count' ))
cor(train4$ziphousecnt,train$logerror)
## [1] 0.0002476373
price.zip<-ddply(train4,.(regionidzip),summarize,avgtax=mean(taxamount,na.rm = T))
train5<-merge(train4,price.zip,by='regionidzip',all.x=T)
with(train5,plot(avgtax,logerror,main='avgtax vs error ' ))