library(zoo)
library(lubridate)
# Preview the first rows of `train`. `train` is not created in the visible
# part of this script, so it must already exist in the session at this point.
head(train)
# Clean a sales data set before modelling.
#
# Args:
#   data:      data frame to clean. Must contain a SALES_PRICE column when
#              `trainData` is TRUE.
#   trainData: TRUE when `data` carries the target column; in that case
#              SALES_PRICE is replaced by its natural log. When FALSE the
#              data frame is returned untouched.
#
# Returns:
#   The cleaned data frame (the caller's object is not modified in place,
#   so the result must be assigned).
cleanData <- function(data, trainData = TRUE) {
  if (trainData) {
    if (is.null(data[["SALES_PRICE"]])) {
      stop("`data` must contain a SALES_PRICE column when trainData = TRUE",
           call. = FALSE)
    }
    # Transform the argument's own column, not a global `train` object.
    data[["SALES_PRICE"]] <- log(data[["SALES_PRICE"]])
  }
  data
}
# Distribution of the sale price as stored in `train`.
hist(train$SALES_PRICE)

# Same column after a natural-log transform, for visual comparison with the
# histogram above.
hist(log(train$SALES_PRICE))

# --- Feature engineering on `train` (modified in place) ---

# Model the natural log of the sale price.
train[["SALES_PRICE"]] <- log(train[["SALES_PRICE"]])

# Room counts are handled as categorical levels rather than numbers.
train[["N_ROOM"]] <- as.factor(train[["N_ROOM"]])
train[["N_BATHROOM"]] <- as.factor(train[["N_BATHROOM"]])
train[["N_BEDROOM"]] <- as.factor(train[["N_BEDROOM"]])

# Property age: year-month of the fixed reference date 11.01.2018 minus the
# year-month of DATE_BUILD (parsed as day-month-year).
# NOTE(review): a yearmon difference is presumably expressed in years - confirm.
train[["PROPERTY_AGE"]] <- local({
  reference_month <- as.yearmon(strptime("11.01.2018", format = "%d.%m.%Y"))
  build_month <- as.yearmon(strptime(train$DATE_BUILD, format = "%d-%m-%Y"))
  reference_month - build_month
})

# Characters 7-10 of DATE_SALE become the sale-year factor.
# NOTE(review): assumes DATE_SALE is a dd-mm-yyyy style string - confirm.
train[["SALE_YEAR"]] <- as.factor(
  substr(as.character(train[["DATE_SALE"]]), 7L, 10L)
)

# Drop the identifier and the raw date columns now that the derived features
# exist.
cleaned_data <- subset(
  train,
  select = -c(PRT_ID, DATE_BUILD, DATE_SALE)
)
# Classify every column of `cleaned_data` as factor / non-factor.
# Any factor (including ordered factors, whose class vector has two entries)
# is reported as "factor"; other columns report their first class. vapply
# guarantees a named character vector whatever the column types are.
feature_classes <- vapply(
  cleaned_data,
  function(col) if (is.factor(col)) "factor" else class(col)[1L],
  character(1)
)
# NOTE: every non-factor column (whatever its class) lands in numeric_feats.
numeric_feats <- names(feature_classes[feature_classes != "factor"])
categorical_feats <- names(feature_classes[feature_classes == "factor"])

library(moments)

# Skewness of each numeric predictor, computed once. SALES_PRICE is left out:
# it has already been log-transformed above and must not be transformed twice.
skew_candidates <- setdiff(numeric_feats, "SALES_PRICE")
skewed_feats <- vapply(
  skew_candidates,
  function(feat) skewness(cleaned_data[[feat]], na.rm = TRUE),
  numeric(1)
)
# Keep only clearly right-skewed features; an undefined (NA/NaN) skewness is
# treated as "not skewed" instead of producing NA subscripts.
skewed_feats <- skewed_feats[!is.na(skewed_feats) & skewed_feats > 0.75]

# log(1 + x) on the skewed features; log1p is the numerically stable form.
for (feat in names(skewed_feats)) {
  cleaned_data[[feat]] <- log1p(cleaned_data[[feat]])
}

# Missing-value count per column (printed).
vapply(cleaned_data, function(col) sum(is.na(col)), integer(1))
## AREA INT_SQFT DIST_MAINROAD N_BEDROOM N_BATHROOM
## 0 0 0 1 5
## N_ROOM SALE_COND PARK_FACIL BUILDTYPE UTILITY_AVAIL
## 0 0 0 0 0
## STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM
## 0 0 0 0 0
## QS_OVERALL REG_FEE COMMIS SALES_PRICE PROPERTY_AGE
## 48 0 0 0 0
## SALE_YEAR
## 0
# Most frequent value of a vector, as a character string.
#
# Args:
#   x:     vector to tabulate (factor, character, numeric, ...).
#   na.rm: if TRUE (the default) missing values are ignored; if FALSE, NA is
#          counted like any other value and can itself be the mode.
#
# Returns:
#   A length-one character vector: the label of the most frequent value,
#   the literal string ">1 mode" when several values tie for first place,
#   or NA_character_ when there is nothing to count (empty input, or only
#   NAs with na.rm = TRUE).
Mode <- function(x, na.rm = TRUE) {
  xtab <- table(x, useNA = if (isTRUE(na.rm)) "no" else "ifany")
  if (length(xtab) == 0L) {
    # Nothing tabulated: avoid max() on an empty table (warning + -Inf).
    return(NA_character_)
  }
  xmode <- names(which(xtab == max(xtab)))
  if (length(xmode) > 1) xmode <- ">1 mode"
  xmode
}
# Impute missing values column by column:
#   * numeric columns (double or integer) get the column mean,
#   * character / factor columns get the most frequent value from Mode().
# seq_along() is safe for a zero-column data frame, and is.numeric() /
# is.factor() / is.character() work for columns whose class vector has more
# than one entry (where `class(x) == "..."` would fail inside `if`).
# NOTE: Mode() returns the literal ">1 mode" on ties; for a factor column that
# string is not an existing level, so those cells stay NA (with a warning).
for (var in seq_along(cleaned_data)) {
  column <- cleaned_data[[var]]
  na_rows <- is.na(column)
  if (!any(na_rows)) {
    next # nothing to fill in this column
  }
  if (is.numeric(column)) {
    cleaned_data[na_rows, var] <- mean(column, na.rm = TRUE)
  } else if (is.character(column) || is.factor(column)) {
    cleaned_data[na_rows, var] <- Mode(column, na.rm = TRUE)
  }
}

# Missing-value count per column after imputation (printed).
vapply(cleaned_data, function(col) sum(is.na(col)), integer(1))
## AREA INT_SQFT DIST_MAINROAD N_BEDROOM N_BATHROOM
## 0 0 0 0 0
## N_ROOM SALE_COND PARK_FACIL BUILDTYPE UTILITY_AVAIL
## 0 0 0 0 0
## STREET MZZONE QS_ROOMS QS_BATHROOM QS_BEDROOM
## 0 0 0 0 0
## QS_OVERALL REG_FEE COMMIS SALES_PRICE PROPERTY_AGE
## 0 0 0 0 0
## SALE_YEAR
## 0
library(caret)

# One-hot encode the factor columns: fit the encoder on them, then apply it to
# the same columns to obtain the indicator matrix.
dummies <- dummyVars(~ ., data = cleaned_data[categorical_feats])
categorical_1_hot <- predict(dummies, newdata = cleaned_data[categorical_feats])

# Modelling table: non-factor columns followed by the indicator columns.
final <- cbind(cleaned_data[numeric_feats], categorical_1_hot)

# Predictors (everything except the target) and the target itself.
x <- final[, names(final) != "SALES_PRICE", drop = FALSE]
y <- final[["SALES_PRICE"]]

# Resampling scheme shared by the models below: 5-fold CV repeated 5 times,
# keeping the resamples of the final model only.
CARET.TRAIN.CTRL <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  returnResamp = "final",
  verboseIter = FALSE
)
## LINEAR REGRESSION
# Ordinary least squares on all columns of `final`, evaluated with the shared
# repeated-CV control. The seed is fixed (same value as the ridge and lasso
# fits) so the resampling folds, and therefore the cross-validated RMSE, are
# reproducible and comparable across models.
# NOTE: the RMSE recorded in the output comments of this script was produced
# without a seed and may differ slightly from a seeded run.
set.seed(123) # for reproducibility
model_linear <- train(
  SALES_PRICE ~ .,
  data = final,
  method = "lm",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL
)
summary(model_linear)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38396 -0.02400 -0.00028 0.02213 0.20143
##
## Coefficients: (13 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.404e+01 4.975e-02 282.294 < 2e-16 ***
## INT_SQFT 3.734e-04 5.126e-06 72.858 < 2e-16 ***
## DIST_MAINROAD -2.319e-06 8.852e-06 -0.262 0.793349
## QS_ROOMS -4.573e-03 1.563e-03 -2.927 0.003435 **
## QS_BATHROOM -4.572e-03 1.674e-03 -2.731 0.006328 **
## QS_BEDROOM -3.926e-03 1.909e-03 -2.056 0.039813 *
## QS_OVERALL 2.034e-02 4.877e-03 4.172 3.06e-05 ***
## REG_FEE 1.129e-01 3.828e-03 29.499 < 2e-16 ***
## COMMIS 8.361e-03 1.111e-03 7.525 5.94e-14 ***
## PROPERTY_AGE -1.758e-03 4.418e-05 -39.788 < 2e-16 ***
## AREA.Adyar 1.554e-01 4.119e-03 37.736 < 2e-16 ***
## AREA.AnnaNagar 1.845e-01 3.269e-03 56.457 < 2e-16 ***
## AREA.Chrompet 1.293e-01 3.609e-03 35.839 < 2e-16 ***
## AREA.Karapakkam -1.213e-01 3.742e-03 -32.414 < 2e-16 ***
## AREA.KKNagar -6.847e-02 2.565e-03 -26.691 < 2e-16 ***
## AREA.TNagar 1.884e-01 3.404e-03 55.342 < 2e-16 ***
## AREA.Velachery NA NA NA NA
## N_BEDROOM.1 8.426e-02 6.698e-03 12.579 < 2e-16 ***
## N_BEDROOM.2 8.091e-02 5.504e-03 14.701 < 2e-16 ***
## N_BEDROOM.3 5.269e-02 3.900e-03 13.512 < 2e-16 ***
## N_BEDROOM.4 NA NA NA NA
## N_BATHROOM.1 -5.760e-03 3.472e-03 -1.659 0.097162 .
## N_BATHROOM.2 NA NA NA NA
## N_ROOM.2 -7.010e-02 4.166e-03 -16.825 < 2e-16 ***
## N_ROOM.3 -1.916e-02 3.571e-03 -5.367 8.28e-08 ***
## N_ROOM.4 NA NA NA NA
## N_ROOM.5 NA NA NA NA
## N_ROOM.6 NA NA NA NA
## SALE_COND.AbNormal 2.444e-02 1.613e-03 15.150 < 2e-16 ***
## SALE_COND.AdjLand 5.578e-02 1.620e-03 34.439 < 2e-16 ***
## SALE_COND.Family 1.548e-02 1.614e-03 9.589 < 2e-16 ***
## SALE_COND.NormalSale 2.765e-02 1.610e-03 17.179 < 2e-16 ***
## SALE_COND.Partial NA NA NA NA
## PARK_FACIL.No -8.995e-02 1.105e-03 -81.400 < 2e-16 ***
## PARK_FACIL.Yes NA NA NA NA
## BUILDTYPE.Commercial 2.909e-01 1.824e-03 159.522 < 2e-16 ***
## BUILDTYPE.House -6.385e-02 1.274e-03 -50.135 < 2e-16 ***
## BUILDTYPE.Others NA NA NA NA
## UTILITY_AVAIL.AllPub 1.100e-02 1.407e-03 7.822 5.96e-15 ***
## UTILITY_AVAIL.ELO -1.752e-02 1.512e-03 -11.585 < 2e-16 ***
## UTILITY_AVAIL.NoSeWa -1.160e-02 1.409e-03 -8.238 < 2e-16 ***
## UTILITY_AVAIL.NoSewr NA NA NA NA
## STREET.Gravel 4.560e-02 1.218e-03 37.432 < 2e-16 ***
## STREET.NoAccess -5.915e-02 1.340e-03 -44.136 < 2e-16 ***
## STREET.Paved NA NA NA NA
## MZZONE.A -3.193e-01 2.703e-03 -118.111 < 2e-16 ***
## MZZONE.C -2.427e-01 2.509e-03 -96.741 < 2e-16 ***
## MZZONE.I -1.658e-01 2.467e-03 -67.220 < 2e-16 ***
## MZZONE.RH -9.931e-02 1.484e-03 -66.914 < 2e-16 ***
## MZZONE.RL -4.893e-02 1.429e-03 -34.244 < 2e-16 ***
## MZZONE.RM NA NA NA NA
## SALE_YEAR.2004 2.787e-02 7.446e-03 3.743 0.000183 ***
## SALE_YEAR.2005 1.844e-02 7.522e-03 2.451 0.014255 *
## SALE_YEAR.2006 2.032e-02 6.677e-03 3.044 0.002345 **
## SALE_YEAR.2007 1.701e-02 6.254e-03 2.720 0.006535 **
## SALE_YEAR.2008 1.663e-02 6.222e-03 2.672 0.007549 **
## SALE_YEAR.2009 1.428e-02 6.179e-03 2.310 0.020895 *
## SALE_YEAR.2010 1.266e-02 6.171e-03 2.051 0.040319 *
## SALE_YEAR.2011 1.114e-02 6.173e-03 1.804 0.071254 .
## SALE_YEAR.2012 1.053e-02 6.245e-03 1.686 0.091928 .
## SALE_YEAR.2013 7.284e-03 6.398e-03 1.138 0.254960
## SALE_YEAR.2014 2.755e-03 6.437e-03 0.428 0.668698
## SALE_YEAR.2015 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04269 on 7059 degrees of freedom
## Multiple R-squared: 0.9853, Adjusted R-squared: 0.9852
## F-statistic: 9639 on 49 and 7059 DF, p-value: < 2.2e-16
# Average RMSE over the stored resamples of the linear model (printed).
mean(model_linear[["resample"]][["RMSE"]])
## [1] 0.04285586
## RIDGE REGRESSION
# Ridge fit: glmnet with a single fixed tuning point (alpha = 0, lambda = 0.039),
# evaluated with the shared repeated-CV control.
set.seed(123) # fixed seed for reproducibility
model_ridge <- train(
  x = x,
  y = y,
  method = "glmnet",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 0, lambda = 0.039) # alpha = 0 -> ridge penalty
)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
# Average RMSE over the stored resamples of the ridge model (printed).
mean(model_ridge[["resample"]][["RMSE"]])
## [1] 0.05149145
## LASSO
# Lasso fit: glmnet with a single fixed tuning point (alpha = 1, lambda = 0.01),
# evaluated with the shared repeated-CV control.
set.seed(123) # fixed seed for reproducibility
model_lasso <- train(
  x = x, y = y,
  method = "glmnet",
  metric = "RMSE", maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 1, lambda = 0.01) # alpha = 1 -> lasso penalty
)
# Print the resampling summary of the fitted model.
model_lasso
## glmnet
##
## 7109 samples
## 62 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 5687, 5686, 5687, 5687, 5689, 5686, ...
## Resampling results:
##
## RMSE Rsquared
## 0.07106875 0.9622713
##
## Tuning parameter 'alpha' was held constant at a value of 1
##
## Tuning parameter 'lambda' was held constant at a value of 0.01
##
# Average RMSE over the stored resamples of the lasso model (printed).
mean(model_lasso[["resample"]][["RMSE"]])
## [1] 0.07106875