We are provided with data on all residential home sales in Ames, Iowa between 2006 and 2010. The data set contains many explanatory variables on the quality and quantity of physical attributes of the homes sold over these five years. Most of the variables describe information a typical home buyer would want to know about a property: square footage, number of bedrooms and bathrooms, lot size, and so on.
The aim of this task is to predict the sale price of housing in Ames, Iowa as accurately as possible by building advanced regression models.
The main goal is to apply the learnings of the course ADEC730201 and, where possible, go a step further in learning the tricks of the trade of data analysis/science, including feature engineering and advanced forecasting techniques.
train<-read.csv("D:/Boston College/MS AE Courses/Data Analysis/train.csv",header=TRUE)
test <-read.csv("D:/Boston College/MS AE Courses/Data Analysis/test.csv",header=TRUE)
#length(colnames(train))
#length(colnames(test))
hist(train$SalePrice/1000)
hist(log(train$SalePrice))
#cor(train$SalePrice,train$Neighborhood)
#AmesHousingData
The original sale price, as we can see, is heavily right-skewed. A log transformation brings it close to a normal distribution.
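To quantify the skew rather than just eyeball it, a quick sketch (assuming the e1071 package is available; base R has no built-in skewness function):
library(e1071)                   # assumed installed; provides skewness()
skewness(train$SalePrice)        # strongly positive, i.e. right-skewed
skewness(log(train$SalePrice))   # near zero after the log transform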
The current analysis uses the dataset available on Kaggle, which has already been split into train and test sets of 1460 and 1459 observations respectively, each with 79 explanatory variables. Of those 79 variables (excluding the Id and SalePrice columns), 46 are categorical and 33 are continuous.
The original source of the data, however, is this; it has many more records and features.
A crisp description of all the variables is available here. The response variable for our problem is SalePrice.
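As a quick sanity check on the categorical/continuous split, a minimal sketch (the exact counts depend on how read.csv typed each column):
# tabulate column classes, excluding the Id and SalePrice columns
table(sapply(train[, setdiff(names(train), c("Id", "SalePrice"))], class))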
# this command will summarize the dataframe.
#summary(train)
# decimal places - only 4
options(digits = 4)
# checking for number of NAs for each variable
x<-data.frame(miss_cnt=sapply(train, function(x) sum(is.na(x))))
x1<-subset(x,x$miss_cnt>=600)
miss_vars<-rownames(x1)
# 40% of the 1460 training rows is 584; variables with 600+ missing values are dropped
# column indices ordered by missing count (most missing first)
order(x$miss_cnt, decreasing = TRUE)
## [1] 73 75 7 74 58 4 59 60 61 64 65 33 36 31 32 34 26 27 43 1 2 3 5
## [24] 6 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 28 29 30 35
## [47] 37 38 39 40 41 42 44 45 46 47 48 49 50 51 52 53 54 55 56 57 62 63 66
## [70] 67 68 69 70 71 72 76 77 78 79 80 81
x2 <- x[order(x$miss_cnt, decreasing = TRUE), ]
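The raw counts are easier to judge as percentages of the 1460 training rows; a small sketch:
# missing-value share per variable, highest first
miss_pct <- sort(sapply(train, function(col) mean(is.na(col))), decreasing = TRUE)
round(head(miss_pct, 6) * 100, 1)  # the worst offenders sit well above the 40% cutoff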
t <- train[, setdiff(names(train), miss_vars)]  # drop the high-missingness variables
list_vars <- sapply(t, class)
a <- 1
b <- 1
num_vars <- NULL
cat_vars <- NULL
# split the remaining variables into numeric and categorical by column class
for(i in 1:length(list_vars)){
  if(list_vars[i] == "integer"){
    num_vars[a] <- names(list_vars[i])
    a <- a + 1
  } else {
    cat_vars[b] <- names(list_vars[i])
    b <- b + 1
  }
}
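The same split can be written without an explicit loop; an equivalent one-liner per vector:
num_vars <- names(list_vars)[list_vars == "integer"]  # numeric columns
cat_vars <- names(list_vars)[list_vars != "integer"]  # everything else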
num_vars <- unique(append(num_vars, "SalePrice"))  # SalePrice is integer, so guard against a duplicate entry
cor_mat <- NULL
for(i in 1:length(num_vars)){
  # correlation of each numeric variable with SalePrice; train[[...]] avoids eval(parse(...))
  cor_mat[i] <- cor(train$SalePrice, train[[num_vars[i]]])
}
cor_file <- data.frame(num_var = num_vars, corr = cor_mat, stringsAsFactors = FALSE)
cor_file$select <- ifelse(abs(cor_file$corr) >= 0.5, 1, 0)
# Numerical Variables selected
sum(cor_file$select,na.rm=TRUE)
## [1] 12
cor_file1 <- subset(cor_file, cor_file$select == 1)["num_var"]
final_num_vars <- as.character(cor_file1$num_var)  # as.character so the names index columns, not rows
#final_num_vars<-cor_file[cor_file["select"]==1,"num_var"]
10 of the numeric variables have an absolute correlation of at least 0.5 with SalePrice (the count of 12 above includes SalePrice itself), so we start our simple model with these features. This is also visible as the darker blue (positive) and red (negative) cells in the correlogram shown below.
for(i in 1:length(final_num_vars)){
  # index final_num_vars (not num_vars) so each plot matches the selected feature
  plot(train[[final_num_vars[i]]], train$SalePrice/1000,
       main = paste("SalePrice vs", final_num_vars[i]),
       xlab = final_num_vars[i], ylab = "SalePrice (thousands)", pch = 19)
}
#plot(train$PoolArea, train$SalePrice/1000, main="Scatterplot Example", xlab="Neighborhood ", ylab="SalePrice ", pch=19)
#cor(train[,num_vars], use="complete.obs",method="kendall")
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
res1 <- cor.mtest(train[,num_vars], conf.level = .95)
res1$p<-round(res1$p,2)
res2 <- cor.mtest(train[,num_vars], conf.level = .99)
res2$p<-round(res2$p,2)
# the p-value matrix must be computed on the same variable set as the correlation matrix
res_sel <- cor.mtest(train[, final_num_vars], conf.level = .95)
corrplot(cor(train[, final_num_vars]), type = "lower", p.mat = res_sel$p,
         insig = "p-value", tl.srt = 45, order = "hclust")
#diag(corMat) <- 0
corrplot(cor(train[, num_vars], use = "pairwise.complete.obs"), type = "lower",
         p.mat = res1$p, insig = "blank")
# method = "circle",type = "lower")
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.2
corrgram(train[,num_vars], order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt, main="Ames Housing Data ")
The correlogram confirms that SalePrice is strongly correlated with the selected variables, notably GarageArea, YearBuilt, YearRemodAdd, and FullBath.
chi_df <- NULL
#i=1
# chi-squared test of independence between SalePrice and each categorical variable;
# with an (almost) continuous response the expected cell counts are tiny, which is
# why every iteration throws the "approximation may be incorrect" warning below
for(i in 1:length(cat_vars)){
  c <- chisq.test(train[,"SalePrice"], train[,cat_vars[i]])
  #chi_df$cat_var[i] = cat_vars[i]
  chi_df$pval[i] <- c$p.value
}
## Warning in chisq.test(train[, "SalePrice"], train[, cat_vars[i]]): Chi-
## squared approximation may be incorrect
## (the same warning is emitted once per categorical variable; duplicates omitted)
chi_df1 <- data.frame(cat_vars, pval = chi_df$pval)
chi_df1$select <- ifelse(chi_df1$pval >= 0.05, 0, 1)
sum(chi_df1$select)
## [1] 19
final_cat_vars<-subset(chi_df1,select==1)["cat_vars"]
final_cat_vars<-as.matrix(final_cat_vars)
19 of the retained categorical variables have a statistically significant association with SalePrice at the 5% level.
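That said, a chi-squared test between an essentially continuous response and a factor is exactly what triggers all the warnings above; a one-way ANOVA per categorical predictor is a more conventional screen. A minimal sketch of that alternative (not the approach used in the rest of this analysis):
# alternative screen: one-way ANOVA of SalePrice on each retained categorical variable
anova_pvals <- sapply(cat_vars, function(v) {
  fit <- aov(train$SalePrice ~ factor(train[[v]]))
  summary(fit)[[1]][["Pr(>F)"]][1]  # p-value of the factor term
})
names(anova_pvals)[anova_pvals < 0.05]  # variables passing the 5% screen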
Let's see whether we really need to treat the missing values and outliers before fitting a baseline linear model.
model_reg <- lm(SalePrice ~ MSSubClass + LotArea + LandContour +
Utilities + LotConfig + Neighborhood + BldgType +
HouseStyle + OverallQual + OverallCond + YearBuilt +
MasVnrType + Foundation + BsmtCond +
BsmtFinType1 + BsmtFinSF1 + TotalBsmtSF + Heating +
X1stFlrSF + X2ndFlrSF + FullBath + HalfBath + KitchenQual +
GarageCars + GarageArea + GarageCond + SaleType +
SaleCondition, data = train)
summary(model_reg)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + LandContour +
## Utilities + LotConfig + Neighborhood + BldgType + HouseStyle +
## OverallQual + OverallCond + YearBuilt + MasVnrType + Foundation +
## BsmtCond + BsmtFinType1 + BsmtFinSF1 + TotalBsmtSF + Heating +
## X1stFlrSF + X2ndFlrSF + FullBath + HalfBath + KitchenQual +
## GarageCars + GarageArea + GarageCond + SaleType + SaleCondition,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -370902 -11972 -8 11201 228938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.90e+05 1.79e+05 -2.18 0.0294 *
## MSSubClass -2.18e+02 1.09e+02 -2.00 0.0457 *
## LotArea 4.75e-01 1.04e-01 4.57 5.2e-06 ***
## LandContourHLS 3.02e+04 6.70e+03 4.51 7.2e-06 ***
## LandContourLow 2.39e+04 8.08e+03 2.96 0.0032 **
## LandContourLvl 2.11e+04 4.85e+03 4.35 1.5e-05 ***
## UtilitiesNoSeWa -5.43e+04 3.28e+04 -1.66 0.0977 .
## LotConfigCulDSac 1.02e+04 4.02e+03 2.54 0.0113 *
## LotConfigFR2 -1.14e+04 5.30e+03 -2.14 0.0323 *
## LotConfigFR3 -2.19e+04 1.58e+04 -1.39 0.1662
## LotConfigInside -1.96e+02 2.27e+03 -0.09 0.9313
## NeighborhoodBlueste -2.20e+03 2.37e+04 -0.09 0.9260
## NeighborhoodBrDale 6.50e+03 1.26e+04 0.51 0.6073
## NeighborhoodBrkSide -1.37e+03 1.09e+04 -0.13 0.8998
## NeighborhoodClearCr 2.43e+03 1.12e+04 0.22 0.8287
## NeighborhoodCollgCr -7.35e+03 8.83e+03 -0.83 0.4053
## NeighborhoodCrawfor 1.61e+04 1.04e+04 1.55 0.1209
## NeighborhoodEdwards -2.57e+04 9.70e+03 -2.65 0.0081 **
## NeighborhoodGilbert -1.19e+04 9.49e+03 -1.25 0.2114
## NeighborhoodIDOTRR -7.99e+03 1.16e+04 -0.69 0.4898
## NeighborhoodMeadowV -1.32e+03 1.25e+04 -0.11 0.9158
## NeighborhoodMitchel -1.28e+04 9.99e+03 -1.28 0.2022
## NeighborhoodNAmes -1.79e+04 9.36e+03 -1.91 0.0565 .
## NeighborhoodNoRidge 4.29e+04 1.00e+04 4.27 2.1e-05 ***
## NeighborhoodNPkVill 7.46e+02 1.37e+04 0.05 0.9564
## NeighborhoodNridgHt 3.66e+04 9.06e+03 4.03 5.8e-05 ***
## NeighborhoodNWAmes -2.26e+04 9.55e+03 -2.37 0.0181 *
## NeighborhoodOldTown -2.02e+04 1.04e+04 -1.95 0.0517 .
## NeighborhoodSawyer -1.78e+04 9.86e+03 -1.81 0.0707 .
## NeighborhoodSawyerW -8.01e+03 9.45e+03 -0.85 0.3970
## NeighborhoodSomerst 9.61e+03 8.97e+03 1.07 0.2842
## NeighborhoodStoneBr 4.39e+04 1.02e+04 4.32 1.7e-05 ***
## NeighborhoodSWISU -1.01e+04 1.23e+04 -0.82 0.4119
## NeighborhoodTimber -2.90e+03 1.01e+04 -0.29 0.7734
## NeighborhoodVeenker 1.50e+04 1.29e+04 1.17 0.2442
## BldgType2fmCon 1.71e+04 1.66e+04 1.03 0.3022
## BldgTypeDuplex -2.10e+04 8.36e+03 -2.51 0.0122 *
## BldgTypeTwnhs -1.22e+04 1.27e+04 -0.97 0.3344
## BldgTypeTwnhsE -4.58e+03 1.14e+04 -0.40 0.6888
## HouseStyle1.5Unf 4.03e+03 1.09e+04 0.37 0.7115
## HouseStyle1Story 1.52e+04 5.54e+03 2.75 0.0061 **
## HouseStyle2.5Fin 2.31e+04 1.41e+04 1.64 0.1012
## HouseStyle2.5Unf -1.14e+03 1.16e+04 -0.10 0.9221
## HouseStyle2Story -8.33e+03 4.39e+03 -1.90 0.0580 .
## HouseStyleSFoyer 2.47e+04 8.37e+03 2.95 0.0033 **
## HouseStyleSLvl 1.96e+04 6.88e+03 2.85 0.0044 **
## OverallQual 1.28e+04 1.26e+03 10.12 < 2e-16 ***
## OverallCond 5.87e+03 1.01e+03 5.83 7.1e-09 ***
## YearBuilt 1.70e+02 9.00e+01 1.89 0.0588 .
## MasVnrTypeBrkFace 1.21e+04 8.53e+03 1.42 0.1550
## MasVnrTypeNone 1.24e+04 8.40e+03 1.48 0.1391
## MasVnrTypeStone 1.63e+04 9.01e+03 1.80 0.0715 .
## FoundationCBlock 7.62e+03 4.29e+03 1.78 0.0760 .
## FoundationPConc 9.84e+03 4.71e+03 2.09 0.0370 *
## FoundationStone 9.29e+03 1.34e+04 0.69 0.4889
## FoundationWood -1.49e+04 1.88e+04 -0.79 0.4268
## BsmtCondGd 3.16e+03 6.86e+03 0.46 0.6455
## BsmtCondPo 1.27e+04 3.53e+04 0.36 0.7182
## BsmtCondTA 7.04e+03 5.55e+03 1.27 0.2048
## BsmtFinType1BLQ 3.00e+02 3.53e+03 0.09 0.9322
## BsmtFinType1GLQ 5.01e+03 3.22e+03 1.55 0.1204
## BsmtFinType1LwQ -3.92e+03 4.57e+03 -0.86 0.3904
## BsmtFinType1Rec -1.67e+03 3.78e+03 -0.44 0.6587
## BsmtFinType1Unf -8.89e+03 3.58e+03 -2.48 0.0132 *
## BsmtFinSF1 7.53e+00 3.20e+00 2.36 0.0187 *
## TotalBsmtSF 2.05e+00 5.55e+00 0.37 0.7126
## HeatingGasW -6.49e+03 8.52e+03 -0.76 0.4464
## HeatingGrav 2.49e+04 2.20e+04 1.13 0.2581
## HeatingOthW -4.02e+04 3.33e+04 -1.21 0.2271
## X1stFlrSF 5.16e+01 6.03e+00 8.56 < 2e-16 ***
## X2ndFlrSF 6.91e+01 5.98e+00 11.55 < 2e-16 ***
## FullBath 4.32e+03 2.81e+03 1.54 0.1236
## HalfBath 3.44e+03 2.66e+03 1.29 0.1965
## KitchenQualFa -4.19e+04 8.36e+03 -5.01 6.1e-07 ***
## KitchenQualGd -3.98e+04 4.06e+03 -9.81 < 2e-16 ***
## KitchenQualTA -3.97e+04 4.71e+03 -8.43 < 2e-16 ***
## GarageCars 1.66e+04 2.85e+03 5.83 7.2e-09 ***
## GarageArea -1.65e+01 9.38e+00 -1.76 0.0782 .
## GarageCondFa -9.07e+03 2.30e+04 -0.40 0.6927
## GarageCondGd -6.75e+03 2.45e+04 -0.28 0.7828
## GarageCondPo -2.32e+04 2.69e+04 -0.86 0.3878
## GarageCondTA -7.04e+03 2.21e+04 -0.32 0.7506
## SaleTypeCon 3.97e+04 2.30e+04 1.73 0.0846 .
## SaleTypeConLD 1.37e+04 1.47e+04 0.93 0.3509
## SaleTypeConLI 1.13e+04 1.65e+04 0.69 0.4913
## SaleTypeConLw 3.41e+03 1.67e+04 0.20 0.8388
## SaleTypeCWD 1.61e+04 1.66e+04 0.97 0.3331
## SaleTypeNew 4.69e+04 1.97e+04 2.38 0.0173 *
## SaleTypeOth 3.73e+04 3.11e+04 1.20 0.2303
## SaleTypeWD 1.79e+03 5.44e+03 0.33 0.7420
## SaleConditionAdjLand 3.41e+04 3.14e+04 1.09 0.2779
## SaleConditionAlloca 1.76e+04 1.32e+04 1.33 0.1852
## SaleConditionFamily 1.22e+03 7.99e+03 0.15 0.8791
## SaleConditionNormal 7.64e+03 3.85e+03 1.98 0.0475 *
## SaleConditionPartial -2.44e+04 1.90e+04 -1.29 0.1988
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30500 on 1246 degrees of freedom
## (119 observations deleted due to missingness)
## Multiple R-squared: 0.861, Adjusted R-squared: 0.851
## F-statistic: 82.4 on 94 and 1246 DF, p-value: <2e-16
The baseline linear model achieves an adjusted R-squared of about 0.85.
#pred=predict(model_reg,train)
#rmse=RMSE(pred,test)
# residuals versus fitted values (fitted on the x-axis, by convention)
plot(model_reg$fitted.values, model_reg$residuals,
     xlab = "Fitted values", ylab = "Residuals", pch = 19)
Plotting residuals against fitted values checks the constant-variance assumption; assessing normality of the residuals requires a Q-Q plot.
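Base R's plot method for lm objects produces the standard diagnostic panel, including that Q-Q plot; a minimal sketch:
# standard lm diagnostics: residuals vs fitted, normal Q-Q, scale-location, leverage
par(mfrow = c(2, 2))
plot(model_reg)
par(mfrow = c(1, 1))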
# select the chosen numeric columns (the earlier version indexed rows by name)
# and keep SalePrice out of the predictor matrix
x_train <- as.matrix(train[, setdiff(final_num_vars, "SalePrice")])
y_train <- train$SalePrice
x_test  <- as.matrix(test[, setdiff(final_num_vars, "SalePrice")])  # same columns in test
Next we perform lasso regression: with many candidate predictors, the L1 penalty shrinks coefficients toward zero and drops weak ones entirely, which helps this and subsequent regularized models avoid overfitting.
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
# LASSO
#cv=cv.glmnet(x_train,y_train,alpha=1)
#penalty=cv$lambda.min
#glm.lasso=glmnet(x=x_train,y=y_train,lambda = penalty)
#pred_lasso=as.numeric(predict(glm.lasso,x_train))
#rmse_lasso=RMSE(pred_lasso,y_train)
#predict_lasso=as.numeric(predict(glm.lasso,x_test))
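A runnable version of the commented sketch above, assuming x_train and x_test are the numeric matrices built earlier; the RMSE is computed inline rather than relying on caret's RMSE():
# LASSO with cross-validated lambda (alpha = 1 selects the lasso penalty)
cv <- cv.glmnet(x_train, y_train, alpha = 1)
glm.lasso <- glmnet(x_train, y_train, alpha = 1, lambda = cv$lambda.min)
pred_lasso <- as.numeric(predict(glm.lasso, x_train))
rmse_lasso <- sqrt(mean((pred_lasso - y_train)^2))  # in-sample RMSE
# predict_lasso <- as.numeric(predict(glm.lasso, x_test))  # test rows with NAs need imputing first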