library(zoo)
library(lubridate)

# Preview the first six rows of the training data. `train` must already exist
# in the session; it is not created in the visible part of this script.
head(train, n = 6L)
# Prepare a housing data frame for modelling.
#
# Args:
#   data:      data frame to clean.
#   trainData: TRUE when `data` carries the SALES_PRICE target, which is then
#              replaced by its natural log; FALSE leaves `data` untouched.
#
# Returns:
#   The (possibly transformed) copy of `data`.
cleanData <- function(data, trainData=TRUE) {
  if (trainData) {
    # Transform the column of the data frame that was passed in, not the
    # global `train` object.
    data$SALES_PRICE <- log(data$SALES_PRICE)
  }

  # Hand the data frame back to the caller; previously the function's value
  # was whatever the `if` expression produced (a bare vector or NULL).
  data
}

# Compare the distribution of the target on the raw and on the log scale.
hist(train$SALES_PRICE)
hist(log(train$SALES_PRICE))

# From here on the target is kept on the log scale.
train[["SALES_PRICE"]] <- log(train[["SALES_PRICE"]])

# Room counts are treated as categories rather than as numbers.
train[c("N_ROOM", "N_BATHROOM", "N_BEDROOM")] <-
  lapply(train[c("N_ROOM", "N_BATHROOM", "N_BEDROOM")], as.factor)

# Property age: year-month of the fixed reference date (11 Jan 2018) minus the
# year-month of DATE_BUILD, which is parsed as day-month-year.
# NOTE(review): zoo yearmon differences are presumably in fractional years.
train[["PROPERTY_AGE"]] <-
  as.yearmon(strptime("11.01.2018", format = "%d.%m.%Y")) -
  as.yearmon(strptime(train[["DATE_BUILD"]], format = "%d-%m-%Y"))

# Characters 7 to 10 of DATE_SALE become the sale-year category.
train[["SALE_YEAR"]] <- as.factor(
  substr(as.character(train[["DATE_SALE"]]), 7, 10)
)

# Drop the identifier and the raw date columns now that the derived features
# exist.
cleaned_data <- train[
  ,
  !(names(train) %in% c("PRT_ID", "DATE_BUILD", "DATE_SALE")),
  drop = FALSE
]

# Label every column as "factor" or by its primary class. vapply() keeps the
# result a named character vector even when a column carries several classes.
feature_classes <- vapply(
  cleaned_data,
  function(col) if (is.factor(col)) "factor" else class(col)[1L],
  character(1)
)
numeric_feats <- names(feature_classes[feature_classes != "factor"])

library(moments)
# Skewness of every non-factor column, computed once (this call used to be
# repeated verbatim).
skewed_feats <- vapply(
  numeric_feats,
  function(feat) skewness(cleaned_data[[feat]], na.rm = TRUE),
  numeric(1)
)
# Keep only clearly right-skewed features. which() discards NA skewness values,
# which would otherwise yield NA names and break the `[[` lookup below.
skewed_feats <- skewed_feats[which(skewed_feats > 0.75)]

# Compress the right tail of each skewed feature with log(x + 1).
for (feat in names(skewed_feats)) {
  cleaned_data[[feat]] <- log(cleaned_data[[feat]] + 1)
}

categorical_feats <- names(feature_classes[feature_classes == "factor"])

# Missing-value count per column, before imputation.
vapply(cleaned_data, function(col) sum(is.na(col)), integer(1))
##          AREA      INT_SQFT DIST_MAINROAD     N_BEDROOM    N_BATHROOM 
##             0             0             0             1             5 
##        N_ROOM     SALE_COND    PARK_FACIL     BUILDTYPE UTILITY_AVAIL 
##             0             0             0             0             0 
##        STREET        MZZONE      QS_ROOMS   QS_BATHROOM    QS_BEDROOM 
##             0             0             0             0             0 
##    QS_OVERALL       REG_FEE        COMMIS   SALES_PRICE  PROPERTY_AGE 
##            48             0             0             0             0 
##     SALE_YEAR 
##             0
# Most frequent value of a vector, returned as a character string.
#
# Args:
#   x:     vector (typically factor or character) to summarise.
#   na.rm: accepted for call compatibility only; missing values are never
#          counted because table() drops them by default.
#
# Returns:
#   The label of the most frequent value, the sentinel ">1 mode" when several
#   values tie for first place, or NA_character_ when `x` contains no
#   non-missing values.
Mode <- function (x, na.rm = TRUE) {
    xtab <- table(x)
    # With nothing to count there is no mode. Bail out before max() runs on an
    # empty table (warning) or every zero-count level is reported as a tie.
    if (sum(xtab) == 0L) {
        return(NA_character_)
    }
    xmode <- names(which(xtab == max(xtab)))
    if (length(xmode) > 1) xmode <- ">1 mode"
    xmode
}

# Impute missing values column by column: numeric columns (integer or double)
# get the column mean, character/factor columns get the value returned by
# Mode(). Columns of any other type are left untouched.
for (var in seq_len(ncol(cleaned_data))) {
    column <- cleaned_data[[var]]
    is_missing <- is.na(column)
    # Nothing to fill in: skip, which also avoids a zero-row factor assignment
    # that can warn about an invalid level.
    if (!any(is_missing)) {
        next
    }
    if (is.numeric(column)) {
        # is.numeric() also covers integer columns, which the former
        # class(...) == "numeric" test silently skipped.
        cleaned_data[is_missing, var] <- mean(column, na.rm = TRUE)
    } else if (is.character(column) || is.factor(column)) {
        cleaned_data[is_missing, var] <- Mode(column, na.rm = TRUE)
    }
}

# Missing-value count per column, after imputation.
vapply(cleaned_data, function(col) sum(is.na(col)), integer(1))
##          AREA      INT_SQFT DIST_MAINROAD     N_BEDROOM    N_BATHROOM 
##             0             0             0             0             0 
##        N_ROOM     SALE_COND    PARK_FACIL     BUILDTYPE UTILITY_AVAIL 
##             0             0             0             0             0 
##        STREET        MZZONE      QS_ROOMS   QS_BATHROOM    QS_BEDROOM 
##             0             0             0             0             0 
##    QS_OVERALL       REG_FEE        COMMIS   SALES_PRICE  PROPERTY_AGE 
##             0             0             0             0             0 
##     SALE_YEAR 
##             0
library(caret)
# One-hot encode every categorical column: build the encoder from the factor
# columns, then apply it to those same columns.
dummies <- dummyVars(~ ., data = cleaned_data[categorical_feats])
categorical_1_hot <- predict(dummies, newdata = cleaned_data[categorical_feats])

# Modelling table: non-factor columns followed by the indicator columns.
final <- cbind(cleaned_data[numeric_feats], categorical_1_hot)
# Predictors are every column except the target; the target is taken on its
# own.
x <- final[, names(final) != "SALES_PRICE", drop = FALSE]
y <- final[["SALES_PRICE"]]
# Resampling scheme shared by all models below: 5-fold CV repeated 5 times.
CARET.TRAIN.CTRL <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  returnResamp = "final",
  verboseIter = FALSE
)

## LINEAR REGRESSION
# Seed the RNG before resampling, as the ridge and lasso fits do, so the
# repeated-CV folds and the resampled RMSE are reproducible across runs.
# NOTE(review): output recorded in this file was produced without this seed, so
# the resampled RMSE may differ slightly on a re-run.
set.seed(123) # for reproducibility
model_linear <- train(
  SALES_PRICE ~ .,
  data = final,
  method = "lm",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL
)
summary(model_linear)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38396 -0.02400 -0.00028  0.02213  0.20143 
## 
## Coefficients: (13 not defined because of singularities)
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.404e+01  4.975e-02  282.294  < 2e-16 ***
## INT_SQFT              3.734e-04  5.126e-06   72.858  < 2e-16 ***
## DIST_MAINROAD        -2.319e-06  8.852e-06   -0.262 0.793349    
## QS_ROOMS             -4.573e-03  1.563e-03   -2.927 0.003435 ** 
## QS_BATHROOM          -4.572e-03  1.674e-03   -2.731 0.006328 ** 
## QS_BEDROOM           -3.926e-03  1.909e-03   -2.056 0.039813 *  
## QS_OVERALL            2.034e-02  4.877e-03    4.172 3.06e-05 ***
## REG_FEE               1.129e-01  3.828e-03   29.499  < 2e-16 ***
## COMMIS                8.361e-03  1.111e-03    7.525 5.94e-14 ***
## PROPERTY_AGE         -1.758e-03  4.418e-05  -39.788  < 2e-16 ***
## AREA.Adyar            1.554e-01  4.119e-03   37.736  < 2e-16 ***
## AREA.AnnaNagar        1.845e-01  3.269e-03   56.457  < 2e-16 ***
## AREA.Chrompet         1.293e-01  3.609e-03   35.839  < 2e-16 ***
## AREA.Karapakkam      -1.213e-01  3.742e-03  -32.414  < 2e-16 ***
## AREA.KKNagar         -6.847e-02  2.565e-03  -26.691  < 2e-16 ***
## AREA.TNagar           1.884e-01  3.404e-03   55.342  < 2e-16 ***
## AREA.Velachery               NA         NA       NA       NA    
## N_BEDROOM.1           8.426e-02  6.698e-03   12.579  < 2e-16 ***
## N_BEDROOM.2           8.091e-02  5.504e-03   14.701  < 2e-16 ***
## N_BEDROOM.3           5.269e-02  3.900e-03   13.512  < 2e-16 ***
## N_BEDROOM.4                  NA         NA       NA       NA    
## N_BATHROOM.1         -5.760e-03  3.472e-03   -1.659 0.097162 .  
## N_BATHROOM.2                 NA         NA       NA       NA    
## N_ROOM.2             -7.010e-02  4.166e-03  -16.825  < 2e-16 ***
## N_ROOM.3             -1.916e-02  3.571e-03   -5.367 8.28e-08 ***
## N_ROOM.4                     NA         NA       NA       NA    
## N_ROOM.5                     NA         NA       NA       NA    
## N_ROOM.6                     NA         NA       NA       NA    
## SALE_COND.AbNormal    2.444e-02  1.613e-03   15.150  < 2e-16 ***
## SALE_COND.AdjLand     5.578e-02  1.620e-03   34.439  < 2e-16 ***
## SALE_COND.Family      1.548e-02  1.614e-03    9.589  < 2e-16 ***
## SALE_COND.NormalSale  2.765e-02  1.610e-03   17.179  < 2e-16 ***
## SALE_COND.Partial            NA         NA       NA       NA    
## PARK_FACIL.No        -8.995e-02  1.105e-03  -81.400  < 2e-16 ***
## PARK_FACIL.Yes               NA         NA       NA       NA    
## BUILDTYPE.Commercial  2.909e-01  1.824e-03  159.522  < 2e-16 ***
## BUILDTYPE.House      -6.385e-02  1.274e-03  -50.135  < 2e-16 ***
## BUILDTYPE.Others             NA         NA       NA       NA    
## UTILITY_AVAIL.AllPub  1.100e-02  1.407e-03    7.822 5.96e-15 ***
## UTILITY_AVAIL.ELO    -1.752e-02  1.512e-03  -11.585  < 2e-16 ***
## UTILITY_AVAIL.NoSeWa -1.160e-02  1.409e-03   -8.238  < 2e-16 ***
## UTILITY_AVAIL.NoSewr         NA         NA       NA       NA    
## STREET.Gravel         4.560e-02  1.218e-03   37.432  < 2e-16 ***
## STREET.NoAccess      -5.915e-02  1.340e-03  -44.136  < 2e-16 ***
## STREET.Paved                 NA         NA       NA       NA    
## MZZONE.A             -3.193e-01  2.703e-03 -118.111  < 2e-16 ***
## MZZONE.C             -2.427e-01  2.509e-03  -96.741  < 2e-16 ***
## MZZONE.I             -1.658e-01  2.467e-03  -67.220  < 2e-16 ***
## MZZONE.RH            -9.931e-02  1.484e-03  -66.914  < 2e-16 ***
## MZZONE.RL            -4.893e-02  1.429e-03  -34.244  < 2e-16 ***
## MZZONE.RM                    NA         NA       NA       NA    
## SALE_YEAR.2004        2.787e-02  7.446e-03    3.743 0.000183 ***
## SALE_YEAR.2005        1.844e-02  7.522e-03    2.451 0.014255 *  
## SALE_YEAR.2006        2.032e-02  6.677e-03    3.044 0.002345 ** 
## SALE_YEAR.2007        1.701e-02  6.254e-03    2.720 0.006535 ** 
## SALE_YEAR.2008        1.663e-02  6.222e-03    2.672 0.007549 ** 
## SALE_YEAR.2009        1.428e-02  6.179e-03    2.310 0.020895 *  
## SALE_YEAR.2010        1.266e-02  6.171e-03    2.051 0.040319 *  
## SALE_YEAR.2011        1.114e-02  6.173e-03    1.804 0.071254 .  
## SALE_YEAR.2012        1.053e-02  6.245e-03    1.686 0.091928 .  
## SALE_YEAR.2013        7.284e-03  6.398e-03    1.138 0.254960    
## SALE_YEAR.2014        2.755e-03  6.437e-03    0.428 0.668698    
## SALE_YEAR.2015               NA         NA       NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04269 on 7059 degrees of freedom
## Multiple R-squared:  0.9853, Adjusted R-squared:  0.9852 
## F-statistic:  9639 on 49 and 7059 DF,  p-value: < 2.2e-16
# Average RMSE over the resamples kept for the linear model.
mean(model_linear$resample[["RMSE"]])
## [1] 0.04285586
## RIDGE REGRESSION
set.seed(123) # for reproducibility
# alpha = 0 selects ridge regression; lambda is held at one fixed value, so the
# tuning grid has a single row.
model_ridge <- train(
  x = x,
  y = y,
  method = "glmnet",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 0, lambda = 0.039)
)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
# Average RMSE over the resamples kept for the ridge model.
mean(model_ridge$resample[["RMSE"]])
## [1] 0.05149145
## LASSO
set.seed(123) # for reproducibility
# alpha = 1 selects lasso regression; lambda is held at one fixed value, so the
# tuning grid has a single row.
model_lasso <- train(
  x = x,
  y = y,
  method = "glmnet",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 1, lambda = 0.01)
)

# Show the resampling summary of the fitted lasso model.
model_lasso
## glmnet 
## 
## 7109 samples
##   62 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 5687, 5686, 5687, 5687, 5689, 5686, ... 
## Resampling results:
## 
##   RMSE        Rsquared 
##   0.07106875  0.9622713
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## 
## Tuning parameter 'lambda' was held constant at a value of 0.01
## 
# Average RMSE over the resamples kept for the lasso model.
mean(model_lasso$resample[["RMSE"]])
## [1] 0.07106875