General Data preperation

  • Load in data
  • Take out columns with substantial NA’s
  • Split DF into 3 exploratory sets
    • Qualitative variables
    • Quantitative variables with normal distributions
    • Quantitative variables with right skewed distributions
df <- read.csv("housing.csv")
df <- as.data.frame(df)
df2 <- df[,-c(6,7,73,74,75,10,40)]
full_df <- subset(df_22, select = -c(GrLivArea,TotalBsmtSF,Exterior1st))
kaggle_test <- as.data.frame(read.csv("test_housing.csv"))
Id_column <- kaggle_test$Id
  • I have eliminated all the columns which predominantly had only one value or predominantly had NA as values.
    • This broad brush stoke approach may lead to losing some hidden value, but with so many predictors I do not believe it should be much of an issue

Split data for exploration

  • I have noticed that several of the independent variables are heavily zero inflated.
    • I am going to separate them into a different df, and take a look at them separately from the rest of the variables
  • I also want to separate the categorical columns into their own df
## Get right skewed col 
a <- colSums(df2 == 0)
right_skewed_cols <- names(a[a>200& is.na(a)==FALSE])
total_df_names <- names(a)

## get difference in columns
results1 = setdiff(total_df_names, right_skewed_cols)

## create new df
r_skew_df <- df2[,right_skewed_cols]
normal_df <- df2[,results1]


## Later analysis revealed several columns that should be categorized as factors
normal_df[,c("MSSubClass","OverallQual","OverallCond")] <- lapply(normal_df[,c("MSSubClass","OverallQual","OverallCond")], factor) 

r_skew_df[,c("BsmtHalfBath","HalfBath",
        "BsmtFullBath","Fireplaces")] <- lapply(r_skew_df[,c("BsmtHalfBath","HalfBath","BsmtFullBath","Fireplaces")], factor) 
r_skew_df <- subset( r_skew_df, select = -c(BsmtHalfBath,HalfBath,
        BsmtFullBath,Fireplaces) )

# get factor cols
is.fact <- sapply(normal_df, is.factor)
is.fact2 <- sapply(r_skew_df, is.factor)
factors_df <- normal_df[, is.fact]
factors_df2 <- r_skew_df[,is.fact2]
factors_df <- bind_cols(factors_df,factors_df2)
factors_df$SalePrice <- normal_df$SalePrice

##
factor_names <- names(factors_df)
normal_names <- names(normal_df)
results2 = setdiff(normal_names, factor_names)
numeric_predictors <- normal_df[,results2]
total_df <- bind_cols(numeric_predictors,factors_df,r_skew_df)
numeric_predictors$SalePrice <- normal_df$SalePrice
r_skew_df$SalePrice <- numeric_predictors$SalePrice

## Grab big df for later
#summary(total_df)

Normally Distributed Variables

Correlation matrix

res <- cor(numeric_predictors,use= "complete.obs")
corrplot(res)

a <- 1:18
chunk <- function(x, n) split(x, sort(rank(x) %% n))
my_list <- chunk(a,4)
  • Above graphs show us that there are several issues with collinearity that may need to be addressed in our model
    • In particular we can see that many of the square footage variables are highly correlated with each other.
    • many of garage areas are highly collinear
  • there does appear that many of our variables are in fact correlated with our target variable, sale price
  • Lets just take a closer look at these correlations

Closer look Normally distributed Correlations

  • below chart prints our correlation values as well as shows us the distributions of our variables through histogram
    • Almost every independent variable has a significant relationship with our dependent variable
      • Only month sold and year sold show little correlation. I believe this may need to be looked at later as our data is during the time period of the housing crisis. Logically this would mean we should have some strong time series effect.
par(mfrow=c(3,3))
chart.Correlation(numeric_predictors[,c(my_list$'0',20)], histogram=TRUE, pch=19)

chart.Correlation(numeric_predictors[,c(my_list$'1',20)], histogram=TRUE, pch=19)

chart.Correlation(numeric_predictors[,c(my_list$'2',20)], histogram=TRUE, pch=19)

chart.Correlation(numeric_predictors[,c(my_list$'3',20)], histogram=TRUE, pch=19)

Multicollinearity

  • Vif chart below shows several variables approaching 5 and over 5
  • We need to figure out a way to reduce the dimensionality of the data, so lets take a closer look at the above statistics, and start dropping some columns manually
my_fit <- lm(SalePrice~.,data= numeric_predictors)
vif(my_fit)
##           Id  LotFrontage      LotArea    YearBuilt YearRemodAdd 
##     1.015087     1.574285     1.314655     4.050863     2.059486 
##   MasVnrArea    BsmtUnfSF  TotalBsmtSF    X1stFlrSF    GrLivArea 
##     1.405308     1.354039     4.424999     4.258494     5.015906 
##     FullBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd  GarageYrBlt 
##     2.470655     2.072899     1.275040     4.429492     4.327836 
##   GarageCars   GarageArea       MoSold       YrSold 
##     4.093638     4.292926     1.034259     1.035333

Columns to drop

  • basementsq and x1stsf extremely collinear, drop x1st
  • basementunitsf highly correlated with basement sf-drop
  • Garagecars and garage yrs built highly collinear with garage area-drop both
  • lot area lot frontage- Lotfrontage has decent amount NA so drop that
  • living area and rooms above ground+ drop rooms above ground
  • Full bath greater living area- drop full bath
  • bedroom above ground
numeric_predictors_2 <- subset( numeric_predictors, 
                                select = - c(BsmtUnfSF,X1stFlrSF,GarageCars,GarageYrBlt,LotFrontage,TotRmsAbvGrd,FullBath,BedroomAbvGr,KitchenAbvGr))
res <- cor(numeric_predictors_2[,2:11],use= "complete.obs")
corrplot(res)

chart.Correlation(numeric_predictors_2[,2:11], histogram=TRUE, pch=19)

my_fit <- lm(SalePrice~.,data= numeric_predictors_2)
vif(my_fit)
##           Id      LotArea    YearBuilt YearRemodAdd   MasVnrArea 
##     1.007435     1.123161     1.923149     1.642460     1.331410 
##  TotalBsmtSF    GrLivArea   GarageArea       MoSold       YrSold 
##     1.581703     1.598778     1.719565     1.025619     1.027127
df2 <- subset(df2, select = -c(Id,FireplaceQu))

finishing results

  • Our VIF matrix, seems much more reasonable now, hopefully I dint overdue it in eliminating variables, but we ca now proceed onto inspection of the skewed variables

Right skewed Variables

Correlation chart for right skewed variables

corrplot(cor(r_skew_df,method = 'pearson', use = 'complete.obs'))

chart.Correlation(r_skew_df[,c(1,2,3,4,5,6,12)], histogram=TRUE, pch=19)

chart.Correlation(r_skew_df[,c(6,7,8,9,10,11,12)], histogram=TRUE, pch=19)

  • Our right skewed variables have very low levels of correlation with our target
  • Lets try log transform see what that looks like

Log transform

  • Log doesn’t seem to help all that much, I believe the zero counts are causing issues
logr_skew_df <- log(r_skew_df+.0001)
corrplot(cor(logr_skew_df,method = 'pearson', use = 'complete.obs'))

chart.Correlation(logr_skew_df[,c(1,2,3,4,5,6,12)], histogram=TRUE, pch=19)

chart.Correlation(logr_skew_df[,c(6,7,8,9,10,11,12)], histogram=TRUE, pch=19)

temp_df <- r_skew_df
temp_df[temp_df == 0] <- NA
table(is.na(temp_df))
## 
## FALSE  TRUE 
##  5187 12333

Plot right skewed variables without zero values

  • This seems much more valuable
    • This shows us several of these variables are useless
    • How I can use the leftover variables if still to be determined
  • Columns to drop
    • BSmtFinsf2,lowqualfinsf, enclosed porch(maybe),x3ssnporch, screenPorch,Pool area,Miscval
  • I removed one outlier where the building had a giant basement that was pulling on the regression line
names_1 <- colnames(temp_df)
lapply(names_1,function(x){ggplot(temp_df, 
        aes_string(x = x, y = 'SalePrice')) +
        geom_point()   })
## [[1]]
## Warning: Removed 467 rows containing missing values (geom_point).

## 
## [[2]]
## Warning: Removed 1293 rows containing missing values (geom_point).

## 
## [[3]]
## Warning: Removed 829 rows containing missing values (geom_point).

## 
## [[4]]
## Warning: Removed 1434 rows containing missing values (geom_point).

## 
## [[5]]
## Warning: Removed 761 rows containing missing values (geom_point).

## 
## [[6]]
## Warning: Removed 656 rows containing missing values (geom_point).

## 
## [[7]]
## Warning: Removed 1252 rows containing missing values (geom_point).

## 
## [[8]]
## Warning: Removed 1436 rows containing missing values (geom_point).

## 
## [[9]]
## Warning: Removed 1344 rows containing missing values (geom_point).

## 
## [[10]]
## Warning: Removed 1453 rows containing missing values (geom_point).

## 
## [[11]]
## Warning: Removed 1408 rows containing missing values (geom_point).

## 
## [[12]]

temp_df <- subset( temp_df, select = -c(BsmtFinSF2,LowQualFinSF,
        X3SsnPorch,EnclosedPorch,PoolArea,MiscVal,ScreenPorch) )
chart.Correlation(temp_df, histogram=TRUE, pch=19)

temp_df <- temp_df %>% 
    filter(BsmtFinSF1<5600)


summary(temp_df$BsmtFinSF1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   370.8   604.0   647.2   866.2  2260.0
  • Data looks much cleaner, Something needs to be done about the 0 values though.
    • Correlations absent 0 (.52, .66, .19, .056)
    • with ( (.39,.32,.32,.32))
    • the zeroes are adding noise and hurting the variables we need to keep
r_skew_df_2 <- subset( r_skew_df, select = -c(BsmtFinSF2,LowQualFinSF,
        X3SsnPorch,EnclosedPorch,PoolArea,MiscVal,ScreenPorch) )
chart.Correlation(r_skew_df_2, histogram=TRUE, pch=19)

my_fit <- lm(SalePrice~.,data= numeric_predictors_2)

Summarize quantatative exploration

  • Check on what we have left
r_skew_df_2 <- r_skew_df_2[,1:2]
post_analysis_df <- bind_cols(r_skew_df_2,numeric_predictors_2)
corrplot(cor(logr_skew_df,method = 'pearson', use = 'complete.obs'))

chart.Correlation(post_analysis_df[,c(1,2,4,5,6,7,8,9,10,13)], histogram=TRUE, pch=19)

my_fit <- lm(SalePrice~.,data= post_analysis_df)
vif(my_fit)
##   BsmtFinSF1    X2ndFlrSF           Id      LotArea    YearBuilt 
##     1.420428     4.553223     1.007970     1.141775     1.953314 
## YearRemodAdd   MasVnrArea  TotalBsmtSF    GrLivArea   GarageArea 
##     1.648109     1.344100     3.575395     6.003085     1.738801 
##       MoSold       YrSold 
##     1.026079     1.028034
  • Our VIF seems to show a problem with GrLivArea, I am going to leave it for now, lets see how it looks later with the factors explored
  • we have greatly reduced the dimensionality of the db.

Project Specific questions

: 5 points. Descriptive and Inferential Statistics

  • The above charts give us several options, I chose
    • garageArea: looks normally distributed and has high correlation .62
    • Lot frontage: looks normally distributed and has correlation of .35
my_correlation_mat <- cor(numeric_predictors[,c(2,17,20)],method = 'pearson', use = 'complete.obs')
kable(my_correlation_mat,caption=" Correlation matrix" )
Correlation matrix
LotFrontage GarageArea SalePrice
LotFrontage 1.0000000 0.3449967 0.3517991
GarageArea 0.3449967 1.0000000 0.6317615
SalePrice 0.3517991 0.6317615 1.0000000
cor.test(numeric_predictors$LotFrontage,
         numeric_predictors$SalePrice,
         method= "pearson",
         conf.level=.80)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_predictors$LotFrontage and numeric_predictors$SalePrice
## t = 13.013, df = 1199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.3189425 0.3838111
## sample estimates:
##       cor 
## 0.3517991
cor.test(numeric_predictors$GarageArea,
         numeric_predictors$SalePrice,
         method= "pearson",
         conf.level=.80)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_predictors$GarageArea and numeric_predictors$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
cor.test(numeric_predictors$GarageArea,
         numeric_predictors$LotFrontage,
         method= "pearson",
         conf.level=.80)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_predictors$GarageArea and numeric_predictors$LotFrontage
## t = 12.727, df = 1199, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.3119708 0.3771899
## sample estimates:
##       cor 
## 0.3449967
  • All of the above confidence intervals show that with .80 confidence there is a correlation between the target variable and the two independent variables
  • In this case I wouldn’t be all that worried about familywise error. Our P values are extremely significant. Family class error is more likely to occur when our P values are close to our threshold. With P value less than 10^-2, we should worry about classwise error. But our P values are all around 10^-16. We could make a significance threshold much higher than typical scientific thresholds(99.99), and we would likely still have a high p value.

For an extra test

  • take two variables that our descriptive statistics dint declare are correlated
    • Mosold and garage area
    • As expected, our p value doesn’t allow us to reject the null
cor.test(numeric_predictors$MoSold,
         numeric_predictors$GarageArea,
         method= "pearson",
         conf.level=.80)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_predictors$MoSold and numeric_predictors$GarageArea
## t = 1.0686, df = 1458, p-value = 0.2854
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.005593091  0.061477721
## sample estimates:
##       cor 
## 0.0279738

5 points. Linear Algebra and Correlation. Invert your 3 x 3 correlation matrix from above.

my_correlation_mat
##             LotFrontage GarageArea SalePrice
## LotFrontage   1.0000000  0.3449967 0.3517991
## GarageArea    0.3449967  1.0000000 0.6317615
## SalePrice     0.3517991  0.6317615 1.0000000
prec_matrix <- solve(my_correlation_mat)
prec_matrix
##             LotFrontage GarageArea  SalePrice
## LotFrontage   1.1748616 -0.2399936 -0.2616965
## GarageArea   -0.2399936  1.7132574 -0.9979405
## SalePrice    -0.2616965 -0.9979405  1.7225250
my_correlation_mat%*%prec_matrix
##               LotFrontage GarageArea     SalePrice
## LotFrontage  1.000000e+00          0  0.000000e+00
## GarageArea  -2.775558e-17          1 -2.220446e-16
## SalePrice   -5.551115e-17          0  1.000000e+00
prec_matrix%*%my_correlation_mat
##             LotFrontage    GarageArea     SalePrice
## LotFrontage           1 -5.551115e-17  0.000000e+00
## GarageArea            0  1.000000e+00 -1.110223e-16
## SalePrice             0  0.000000e+00  1.000000e+00
lu.decomposition(my_correlation_mat)
## $L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.3449967 1.0000000    0
## [3,] 0.3517991 0.5793475    1
## 
## $U
##      [,1]      [,2]      [,3]
## [1,]    1 0.3449967 0.3517991
## [2,]    0 0.8809773 0.5103920
## [3,]    0 0.0000000 0.5805431

Playing around with VIF and quantative variables

  • VIF can point us to issues of collinearity
    • Clearly there are issues with the garage columns, although they are not terribly high
my_fit <- lm(SalePrice~.,data= numeric_predictors)
my_fit_2 <- update(my_fit,.~.-GrLivArea)
my_fit_2 <- update(my_fit_2,.~.-TotalBsmtSF)
vif(my_fit)
##           Id  LotFrontage      LotArea    YearBuilt YearRemodAdd 
##     1.015087     1.574285     1.314655     4.050863     2.059486 
##   MasVnrArea    BsmtUnfSF  TotalBsmtSF    X1stFlrSF    GrLivArea 
##     1.405308     1.354039     4.424999     4.258494     5.015906 
##     FullBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd  GarageYrBlt 
##     2.470655     2.072899     1.275040     4.429492     4.327836 
##   GarageCars   GarageArea       MoSold       YrSold 
##     4.093638     4.292926     1.034259     1.035333
vif(my_fit_2)
##           Id  LotFrontage      LotArea    YearBuilt YearRemodAdd 
##     1.014369     1.573987     1.293743     3.783964     2.027873 
##   MasVnrArea    BsmtUnfSF    X1stFlrSF     FullBath BedroomAbvGr 
##     1.344090     1.204705     1.914545     2.193704     2.062311 
## KitchenAbvGr TotRmsAbvGrd  GarageYrBlt   GarageCars   GarageArea 
##     1.193188     2.809577     4.314251     4.078585     4.205734 
##       MoSold       YrSold 
##     1.032221     1.035018

5 points. Calculus-Based Probability & Statistics.

  • lets grab one of our variables from right skewed df
    • OpenPorchSF
fitted_dist2 <- fitdistr(r_skew_df$OpenPorchSF,densfun ="negative binomial") 
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
fitted_dist <- fitdistr(r_skew_df$OpenPorchSF,densfun ="exponential") 
lambda <- fitted_dist$estimate
fitted_dist$estimate
##       rate 
## 0.02143151
par(mfrow=c(2,2))
hist(rexp(1000,lambda),breaks=40)
hist(rnegbin(1000,theta=fitted_dist2$estimate[1]),breaks=40)
hist(r_skew_df$OpenPorchSF, breaks=40)
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"normal")))
##      n    logLik     AIC      BIC
## 1 1460 -8193.699 16391.4 16401.97
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"exponential")))
##      n    logLik      AIC      BIC
## 1 1460 -7070.624 14143.25 14148.53
suppressWarnings(broom::glance(MASS::fitdistr(r_skew_df$OpenPorchSF,"negative binomial")))
##      n    logLik      AIC      BIC
## 1 1460 -5790.731 11585.46 11596.03

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

\[CDF_{exp}(1 ??? e)^{\lambda x}\]

lower_sim <-log(1-0.05)/-lambda
upper_sim <-log(1-0.95)/-lambda
actual_lower <- quantile(r_skew_df$OpenPorchSF,.05)
actual_upper <- quantile(r_skew_df$OpenPorchSF,.95) 
comparison <- as.data.frame(rbind(c(lower_sim,upper_sim),c(actual_lower,actual_upper) ) )
row.names(comparison) <- c("sim","actual")
colnames(comparison) <- c("lower","upper")
kable(comparison)
lower upper
sim 2.393359 139.7817
actual 0.000000 175.0500

Build CI

  • lower bound 43.26
  • upper bound 50.05
a <- mean(r_skew_df$OpenPorchSF)
n <- as.numeric(length(r_skew_df$OpenPorchSF))
mysd <- sd(r_skew_df$OpenPorchSF, na.rm = FALSE)
error <- qnorm(0.975)*mysd/sqrt(n)
my_lower <- a-error
my_upper <- a+error
my_lower
## [1] 43.2617
my_upper
## [1] 50.05885

Back to Kaggle

Explore factor columns

chooseOne = function(question){
    factors_df %>%
        filter(!UQ(sym(question)) == "") %>% 
        dplyr::group_by_(question) %>% 
        dplyr::summarise(count = n()) %>% 
        dplyr::mutate(percent = (count / sum(count)) * 100) %>% 
        dplyr::arrange(desc(count)) 
}

my_names <- colnames(factors_df)
the_names <- my_names[(c(2, 3, 10, 11, 12, 15, 19, 20, 25))]
lapply(my_names[1:5], function(x) chooseOne(x))
## [[1]]
## # A tibble: 15 x 3
##    MSSubClass count percent
##    <fct>      <int>   <dbl>
##  1 20           536  36.7  
##  2 60           299  20.5  
##  3 50           144   9.86 
##  4 120           87   5.96 
##  5 30            69   4.73 
##  6 160           63   4.32 
##  7 70            60   4.11 
##  8 80            58   3.97 
##  9 90            52   3.56 
## 10 190           30   2.05 
## 11 85            20   1.37 
## 12 75            16   1.10 
## 13 45            12   0.822
## 14 180           10   0.685
## 15 40             4   0.274
## 
## [[2]]
## # A tibble: 5 x 3
##   MSZoning count percent
##   <fct>    <int>   <dbl>
## 1 RL        1151  78.8  
## 2 RM         218  14.9  
## 3 FV          65   4.45 
## 4 RH          16   1.10 
## 5 C (all)     10   0.685
## 
## [[3]]
## # A tibble: 4 x 3
##   LotShape count percent
##   <fct>    <int>   <dbl>
## 1 Reg        925  63.4  
## 2 IR1        484  33.2  
## 3 IR2         41   2.81 
## 4 IR3         10   0.685
## 
## [[4]]
## # A tibble: 4 x 3
##   LandContour count percent
##   <fct>       <int>   <dbl>
## 1 Lvl          1311   89.8 
## 2 Bnk            63    4.32
## 3 HLS            50    3.42
## 4 Low            36    2.47
## 
## [[5]]
## # A tibble: 5 x 3
##   LotConfig count percent
##   <fct>     <int>   <dbl>
## 1 Inside     1052  72.1  
## 2 Corner      263  18.0  
## 3 CulDSac      94   6.44 
## 4 FR2          47   3.22 
## 5 FR3           4   0.274
par(mfrow=c(2,2))
for(x in 1:40){
plot(factors_df[,x],factors_df$SalePrice,main=my_names[x])
}

  • When I look at these variables for selection, I want to see that the means differ and that the quartile ranges are compact. Obviously factors with less levels will appear to have more similar means so eyeballing that needs to be taken into account
  • What can I eliminate?
    • Landslope, roofstyle, exterior qual and condition seem to be very similar(remove one), basement exposure,bsmtFintype1,bsmntfintype2,functional, GarageCond
  • strong predictors
    • OverallQual, looks very strong
factors_df <- subset(factors_df, select = -c(LandSlope,Exterior2nd,BldgType,FireplaceQu,RoofStyle,RoofMatl,SaleType,OverallCond,
        ExterQual,GarageQual,Electrical,BsmtCond,ExterCond, HeatingQC,  BsmtExposure,BsmtFinType1,BsmtFinType2,Functional,GarageCond,Condition2,Exterior1st))

Impute Data

  • The imputation was done, commented out, and the results were loaded at beginning of the script.
    • Therefore most of this section is commented out
  • Imputation was done post exploratory analysis
#impute missing values, hand picked 
post_analysis_2 <- bind_cols(post_analysis_df,factors_df)
post_analysis_2 <- post_analysis_2[,-13]
 mice_plot <- aggr(post_analysis_2, col=c('navyblue','yellow'),
                    numbers=TRUE, sortVars=TRUE,
                    labels=names(df2), cex.axis=.7,
                    gap=3, ylab=c("Missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##      Variable       Count
##      BsmtQual 0.055479452
##      BsmtCond 0.055479452
##     ExterQual 0.025342466
##     LotConfig 0.005479452
##    MasVnrType 0.005479452
##    MSSubClass 0.000000000
##      MSZoning 0.000000000
##   LotFrontage 0.000000000
##       LotArea 0.000000000
##      LotShape 0.000000000
##   LandContour 0.000000000
##     LandSlope 0.000000000
##  Neighborhood 0.000000000
##    Condition1 0.000000000
##    Condition2 0.000000000
##      BldgType 0.000000000
##    HouseStyle 0.000000000
##   OverallQual 0.000000000
##   OverallCond 0.000000000
##     YearBuilt 0.000000000
##  YearRemodAdd 0.000000000
##     RoofStyle 0.000000000
##      RoofMatl 0.000000000
##   Exterior1st 0.000000000
##   Exterior2nd 0.000000000
##    MasVnrArea 0.000000000
##     ExterCond 0.000000000
##    Foundation 0.000000000
##  BsmtExposure 0.000000000
##  BsmtFinType1 0.000000000
##    BsmtFinSF1 0.000000000
#df4 <- missForest(post_analysis_2)
#hand_selected__df<- df4$ximp
#save(hand_selected__df,file="handselected.Rda")

## Impute full df
#df4 <- missForest(post_analysis_2)
#whole_df<- df4$ximp
#save(hand_selected__df,file="handselected.Rda")
#save(df_22,file="wholedf.Rda")

Run Models

  • Models are paired with predcitor df
  • Two df from used
  • Hand_selected_df
    • hand selected input variables form data exploration
    • model 1- all hand picked vairables plugged into LM
    • model 2- backwards selected hand selected input variable
    • model 6- hand selected variables plugged into a gridsearch on elastic net
  • full_df
    • DF is the entire set of variables minus very few which resulted in collinearity issues with LM model
    • model3- all variables
    • model4- backwards selected variables
    • model 5- elastic net on all variables

Models ran on hand picked variables

  • Model 1 hand picked
    • log rmse= .1414
  • Model 2 hand picked backwards selected
    • log rmse = .1428
# df_model <- subset(df_44, select = -c(Id,YearRemodAdd))
# #df_model 
library(MASS)



# model 1 on selected variables
set.seed(123)
smp_size <- floor(0.75 * nrow(hand_selected__df))
train_ind <- sample(seq_len(nrow(hand_selected__df)), size = smp_size)
train <- hand_selected__df[train_ind, ]
test <- hand_selected__df[-train_ind, ]
mod1 <- lm(data=hand_selected__df,SalePrice1~.)
predictions <- predict(mod1, test)
model_1_rmse <- rmse(log(test$SalePrice1), log(predictions))   ## .3429
vif(mod1)
##                       GVIF Df GVIF^(1/(2*Df))
## BsmtFinSF1    1.812840e+00  1        1.346417
## X2ndFlrSF     1.766844e+01  1        4.203384
## Id            1.092547e+00  1        1.045250
## LotArea       1.795199e+00  1        1.339851
## YearBuilt     1.304753e+01  1        3.612137
## YearRemodAdd  2.531173e+00  1        1.590966
## MasVnrArea    2.705630e+00  1        1.644880
## TotalBsmtSF   6.209359e+00  1        2.491858
## GrLivArea     1.067143e+01  1        3.266715
## GarageArea    2.332788e+00  1        1.527347
## MoSold        1.126890e+00  1        1.061551
## YrSold        1.146123e+00  1        1.070571
## MSSubClass    2.417886e+07 14        1.835245
## MSZoning      4.215254e+01  4        1.596258
## LotShape      2.131249e+00  3        1.134416
## LandContour   2.635871e+00  3        1.175314
## LotConfig     1.837355e+00  4        1.079007
## Neighborhood  3.860187e+04 24        1.246104
## Condition1    3.156546e+00  8        1.074486
## HouseStyle    7.766806e+05  7        2.634703
## OverallQual   3.235302e+01  9        1.213065
## MasVnrType    4.434212e+00  3        1.281748
## Foundation    1.406771e+01  5        1.302634
## BsmtQual      9.595879e+00  3        1.457742
## CentralAir    1.697293e+00  1        1.302802
## KitchenQual   6.369658e+00  3        1.361505
## GarageType    7.935278e+00  5        1.230145
## GarageFinish  3.314388e+00  2        1.349276
## PavedDrive    1.782207e+00  2        1.155419
## SaleCondition 2.914113e+00  5        1.112886
summary(mod1)
## 
## Call:
## lm(formula = SalePrice1 ~ ., data = hand_selected__df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -400357  -12019    -302   11069  202275 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.034e+05  1.254e+06   0.481 0.630478    
## BsmtFinSF1            1.450e+01  2.266e+00   6.400 2.14e-10 ***
## X2ndFlrSF             2.683e+01  7.390e+00   3.630 0.000294 ***
## Id                   -8.680e-01  1.903e+00  -0.456 0.648326    
## LotArea               4.951e-01  1.030e-01   4.806 1.71e-06 ***
## YearBuilt             1.048e+02  9.179e+01   1.142 0.253694    
## YearRemodAdd          1.799e+02  5.914e+01   3.041 0.002403 ** 
## MasVnrArea            1.180e+01  6.976e+00   1.692 0.090904 .  
## TotalBsmtSF           1.193e+00  4.359e+00   0.274 0.784310    
## GrLivArea             4.365e+01  4.771e+00   9.149  < 2e-16 ***
## GarageArea            2.091e+01  5.483e+00   3.813 0.000143 ***
## MoSold               -4.458e+02  3.014e+02  -1.479 0.139295    
## YrSold               -5.893e+02  6.187e+02  -0.953 0.341007    
## MSSubClass30         -7.113e+03  5.525e+03  -1.287 0.198193    
## MSSubClass40         -5.683e+03  1.529e+04  -0.372 0.710280    
## MSSubClass45         -1.337e+04  2.397e+04  -0.558 0.577231    
## MSSubClass50          4.236e+03  1.018e+04   0.416 0.677300    
## MSSubClass60          3.047e+03  8.846e+03   0.344 0.730578    
## MSSubClass70          4.956e+03  9.517e+03   0.521 0.602590    
## MSSubClass75          4.672e+02  1.729e+04   0.027 0.978447    
## MSSubClass80         -1.195e+02  1.393e+04  -0.009 0.993161    
## MSSubClass85         -7.587e+03  1.248e+04  -0.608 0.543266    
## MSSubClass90         -1.234e+04  6.036e+03  -2.044 0.041125 *  
## MSSubClass120        -2.591e+04  4.641e+03  -5.583 2.85e-08 ***
## MSSubClass160        -2.373e+04  1.058e+04  -2.243 0.025090 *  
## MSSubClass180        -1.757e+04  1.536e+04  -1.144 0.252762    
## MSSubClass190        -5.386e+03  8.625e+03  -0.624 0.532418    
## MSZoningFV            3.499e+04  1.471e+04   2.378 0.017545 *  
## MSZoningRH            2.783e+04  1.470e+04   1.893 0.058560 .  
## MSZoningRL            2.978e+04  1.232e+04   2.417 0.015762 *  
## MSZoningRM            3.117e+04  1.152e+04   2.707 0.006885 ** 
## LotShapeIR2           1.251e+04  5.152e+03   2.429 0.015278 *  
## LotShapeIR3          -3.518e+04  1.037e+04  -3.393 0.000712 ***
## LotShapeReg           3.956e+02  1.973e+03   0.201 0.841095    
## LandContourHLS        2.469e+04  6.228e+03   3.965 7.73e-05 ***
## LandContourLow        2.147e+04  7.371e+03   2.913 0.003637 ** 
## LandContourLvl        1.694e+04  4.309e+03   3.931 8.90e-05 ***
## LotConfigCulDSac      1.062e+04  3.899e+03   2.725 0.006521 ** 
## LotConfigFR2         -9.687e+03  4.918e+03  -1.970 0.049082 *  
## LotConfigFR3         -1.770e+04  1.565e+04  -1.131 0.258336    
## LotConfigInside       6.962e+02  2.141e+03   0.325 0.745132    
## NeighborhoodBlueste  -9.775e+03  2.398e+04  -0.408 0.683646    
## NeighborhoodBrDale   -1.309e+04  1.373e+04  -0.954 0.340448    
## NeighborhoodBrkSide  -1.612e+04  1.121e+04  -1.437 0.150845    
## NeighborhoodClearCr  -1.023e+04  1.089e+04  -0.939 0.347643    
## NeighborhoodCollgCr  -1.649e+04  8.943e+03  -1.844 0.065341 .  
## NeighborhoodCrawfor   9.182e+03  1.034e+04   0.888 0.374767    
## NeighborhoodEdwards  -3.739e+04  9.688e+03  -3.859 0.000119 ***
## NeighborhoodGilbert  -1.668e+04  9.507e+03  -1.754 0.079653 .  
## NeighborhoodIDOTRR   -2.540e+04  1.276e+04  -1.991 0.046718 *  
## NeighborhoodMeadowV  -2.841e+04  1.396e+04  -2.035 0.042022 *  
## NeighborhoodMitchel  -2.661e+04  9.958e+03  -2.672 0.007637 ** 
## NeighborhoodNAmes    -2.332e+04  9.456e+03  -2.466 0.013775 *  
## NeighborhoodNoRidge   3.080e+04  1.031e+04   2.988 0.002857 ** 
## NeighborhoodNPkVill   2.387e+03  1.360e+04   0.176 0.860633    
## NeighborhoodNridgHt   1.467e+04  9.276e+03   1.582 0.113901    
## NeighborhoodNWAmes   -1.983e+04  9.680e+03  -2.048 0.040711 *  
## NeighborhoodOldTown  -3.235e+04  1.145e+04  -2.826 0.004786 ** 
## NeighborhoodSawyer   -2.330e+04  9.878e+03  -2.359 0.018458 *  
## NeighborhoodSawyerW  -1.257e+04  9.512e+03  -1.321 0.186646    
## NeighborhoodSomerst  -7.414e+03  1.123e+04  -0.660 0.509197    
## NeighborhoodStoneBr   2.812e+04  1.026e+04   2.742 0.006196 ** 
## NeighborhoodSWISU    -1.990e+04  1.177e+04  -1.691 0.091144 .  
## NeighborhoodTimber   -1.456e+04  1.010e+04  -1.442 0.149501    
## NeighborhoodVeenker   1.617e+04  1.254e+04   1.290 0.197409    
## Condition1Feedr      -7.865e+03  5.856e+03  -1.343 0.179517    
## Condition1Norm        5.910e+03  4.854e+03   1.218 0.223621    
## Condition1PosA        1.109e+04  1.188e+04   0.933 0.351005    
## Condition1PosN       -8.980e+03  8.558e+03  -1.049 0.294220    
## Condition1RRAe       -1.897e+04  1.073e+04  -1.769 0.077190 .  
## Condition1RRAn        5.772e+03  7.914e+03   0.729 0.465924    
## Condition1RRNe       -6.435e+03  2.192e+04  -0.294 0.769097    
## Condition1RRNn        5.735e+03  1.515e+04   0.379 0.705115    
## HouseStyle1.5Unf      3.097e+04  2.348e+04   1.319 0.187387    
## HouseStyle1Story      2.378e+04  1.013e+04   2.346 0.019118 *  
## HouseStyle2.5Fin     -9.881e+03  1.822e+04  -0.542 0.587597    
## HouseStyle2.5Unf     -2.873e+03  1.783e+04  -0.161 0.871974    
## HouseStyle2Story     -9.497e+03  9.473e+03  -1.003 0.316269    
## HouseStyleSFoyer      2.532e+04  1.342e+04   1.887 0.059377 .  
## HouseStyleSLvl        1.875e+04  1.532e+04   1.224 0.221277    
## OverallQual2          1.088e+04  2.864e+04   0.380 0.704099    
## OverallQual3          1.747e+04  2.366e+04   0.739 0.460323    
## OverallQual4          2.173e+04  2.307e+04   0.942 0.346348    
## OverallQual5          2.396e+04  2.320e+04   1.033 0.302014    
## OverallQual6          3.351e+04  2.326e+04   1.441 0.149910    
## OverallQual7          4.296e+04  2.338e+04   1.837 0.066400 .  
## OverallQual8          6.458e+04  2.366e+04   2.729 0.006427 ** 
## OverallQual9          1.086e+05  2.440e+04   4.451 9.24e-06 ***
## OverallQual10         1.250e+05  2.534e+04   4.932 9.16e-07 ***
## MasVnrTypeBrkFace     1.262e+04  8.198e+03   1.540 0.123809    
## MasVnrTypeNone        1.506e+04  8.235e+03   1.829 0.067567 .  
## MasVnrTypeStone       1.586e+04  8.713e+03   1.821 0.068857 .  
## FoundationCBlock      4.715e+03  3.825e+03   1.233 0.217824    
## FoundationPConc       5.202e+03  4.202e+03   1.238 0.215930    
## FoundationSlab       -9.702e+03  8.699e+03  -1.115 0.264934    
## FoundationStone       8.753e+03  1.280e+04   0.684 0.494312    
## FoundationWood       -2.098e+04  1.827e+04  -1.148 0.251159    
## BsmtQualFa           -1.897e+04  7.537e+03  -2.517 0.011953 *  
## BsmtQualGd           -1.872e+04  4.169e+03  -4.491 7.71e-06 ***
## BsmtQualTA           -2.012e+04  5.063e+03  -3.974 7.43e-05 ***
## CentralAirY           7.372e+03  4.053e+03   1.819 0.069130 .  
## KitchenQualFa        -2.739e+04  7.293e+03  -3.756 0.000180 ***
## KitchenQualGd        -1.869e+04  4.304e+03  -4.343 1.51e-05 ***
## KitchenQualTA        -2.505e+04  4.774e+03  -5.247 1.79e-07 ***
## GarageTypeAttchd      2.488e+04  1.306e+04   1.905 0.057040 .  
## GarageTypeBasment     2.911e+04  1.517e+04   1.919 0.055181 .  
## GarageTypeBuiltIn     2.200e+04  1.368e+04   1.608 0.108119    
## GarageTypeCarPort     1.830e+04  1.657e+04   1.105 0.269430    
## GarageTypeDetchd      2.452e+04  1.302e+04   1.883 0.059853 .  
## GarageFinishRFn      -1.808e+03  2.440e+03  -0.741 0.458841    
## GarageFinishUnf      -4.266e+03  2.902e+03  -1.470 0.141766    
## PavedDriveP          -3.748e+03  6.602e+03  -0.568 0.570291    
## PavedDriveY          -8.789e+02  3.967e+03  -0.222 0.824698    
## SaleConditionAdjLand  2.033e+04  1.605e+04   1.266 0.205639    
## SaleConditionAlloca   1.731e+04  1.019e+04   1.699 0.089607 .  
## SaleConditionFamily   6.068e+02  7.434e+03   0.082 0.934958    
## SaleConditionNormal   9.610e+03  3.218e+03   2.986 0.002877 ** 
## SaleConditionPartial  1.833e+04  4.646e+03   3.944 8.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29320 on 1342 degrees of freedom
## Multiple R-squared:  0.8747, Adjusted R-squared:  0.8638 
## F-statistic:  80.1 on 117 and 1342 DF,  p-value: < 2.2e-16
# Model 2 backwards select on selected variables
fit2 <- lm(data=hand_selected__df,SalePrice1~.)
step.model <- stepAIC(fit2, direction = "both", 
                      trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = SalePrice1 ~ BsmtFinSF1 + X2ndFlrSF + LotArea + 
##     YearBuilt + YearRemodAdd + MasVnrArea + GrLivArea + GarageArea + 
##     MoSold + MSSubClass + LotShape + LandContour + LotConfig + 
##     Neighborhood + Condition1 + OverallQual + BsmtQual + CentralAir + 
##     KitchenQual + SaleCondition, data = hand_selected__df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -406923  -11950    -196   11290  210168 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.700e+05  1.917e+05  -3.494 0.000490 ***
## BsmtFinSF1            1.605e+01  2.114e+00   7.595 5.68e-14 ***
## X2ndFlrSF             1.425e+01  5.424e+00   2.627 0.008717 ** 
## LotArea               5.047e-01  1.016e-01   4.968 7.63e-07 ***
## YearBuilt             1.960e+02  8.149e+01   2.405 0.016292 *  
## YearRemodAdd          1.761e+02  5.764e+01   3.055 0.002290 ** 
## MasVnrArea            8.251e+00  5.520e+00   1.495 0.135213    
## GrLivArea             4.563e+01  3.398e+00  13.429  < 2e-16 ***
## GarageArea            1.788e+01  5.196e+00   3.440 0.000598 ***
## MoSold               -4.747e+02  2.964e+02  -1.602 0.109484    
## MSSubClass30         -4.173e+03  5.267e+03  -0.792 0.428301    
## MSSubClass40         -4.799e+03  1.516e+04  -0.317 0.751659    
## MSSubClass45         -3.022e+03  9.367e+03  -0.323 0.747018    
## MSSubClass50         -1.306e+04  4.611e+03  -2.833 0.004684 ** 
## MSSubClass60         -2.016e+04  4.856e+03  -4.152 3.50e-05 ***
## MSSubClass70         -1.693e+04  6.702e+03  -2.526 0.011632 *  
## MSSubClass75         -1.504e+04  9.693e+03  -1.552 0.120874    
## MSSubClass80         -2.873e+03  4.304e+03  -0.667 0.504657    
## MSSubClass85         -5.017e+03  6.972e+03  -0.720 0.471903    
## MSSubClass90         -2.056e+04  4.953e+03  -4.151 3.52e-05 ***
## MSSubClass120        -2.512e+04  4.336e+03  -5.793 8.58e-09 ***
## MSSubClass160        -4.924e+04  6.772e+03  -7.271 5.97e-13 ***
## MSSubClass180        -1.649e+04  1.184e+04  -1.393 0.163895    
## MSSubClass190        -1.931e+04  6.948e+03  -2.779 0.005526 ** 
## LotShapeIR2           1.175e+04  5.086e+03   2.310 0.021028 *  
## LotShapeIR3          -3.525e+04  1.021e+04  -3.451 0.000575 ***
## LotShapeReg           3.809e+01  1.951e+03   0.020 0.984429    
## LandContourHLS        2.371e+04  6.100e+03   3.887 0.000106 ***
## LandContourLow        1.937e+04  7.250e+03   2.672 0.007623 ** 
## LandContourLvl        1.726e+04  4.208e+03   4.101 4.35e-05 ***
## LotConfigCulDSac      9.348e+03  3.860e+03   2.422 0.015571 *  
## LotConfigFR2         -1.115e+04  4.886e+03  -2.282 0.022626 *  
## LotConfigFR3         -1.862e+04  1.554e+04  -1.198 0.230970    
## LotConfigInside      -7.418e+00  2.120e+03  -0.003 0.997208    
## NeighborhoodBlueste  -1.600e+03  2.313e+04  -0.069 0.944871    
## NeighborhoodBrDale   -1.211e+04  1.263e+04  -0.959 0.337796    
## NeighborhoodBrkSide  -1.339e+04  1.018e+04  -1.315 0.188719    
## NeighborhoodClearCr  -5.837e+03  1.058e+04  -0.552 0.581375    
## NeighborhoodCollgCr  -1.551e+04  8.546e+03  -1.815 0.069694 .  
## NeighborhoodCrawfor   1.147e+04  1.003e+04   1.144 0.253026    
## NeighborhoodEdwards  -3.479e+04  9.271e+03  -3.752 0.000183 ***
## NeighborhoodGilbert  -1.492e+04  9.212e+03  -1.620 0.105558    
## NeighborhoodIDOTRR   -2.794e+04  1.063e+04  -2.629 0.008667 ** 
## NeighborhoodMeadowV  -2.293e+04  1.255e+04  -1.827 0.067842 .  
## NeighborhoodMitchel  -2.555e+04  9.563e+03  -2.672 0.007638 ** 
## NeighborhoodNAmes    -2.068e+04  9.091e+03  -2.275 0.023079 *  
## NeighborhoodNoRidge   3.382e+04  9.940e+03   3.403 0.000686 ***
## NeighborhoodNPkVill   4.068e+03  1.309e+04   0.311 0.755999    
## NeighborhoodNridgHt   1.546e+04  8.815e+03   1.754 0.079607 .  
## NeighborhoodNWAmes   -1.690e+04  9.331e+03  -1.811 0.070356 .  
## NeighborhoodOldTown  -2.909e+04  9.769e+03  -2.978 0.002951 ** 
## NeighborhoodSawyer   -2.112e+04  9.525e+03  -2.217 0.026796 *  
## NeighborhoodSawyerW  -1.233e+04  9.158e+03  -1.346 0.178468    
## NeighborhoodSomerst  -2.091e+03  8.771e+03  -0.238 0.811595    
## NeighborhoodStoneBr   3.026e+04  9.969e+03   3.036 0.002444 ** 
## NeighborhoodSWISU    -1.930e+04  1.126e+04  -1.714 0.086671 .  
## NeighborhoodTimber   -1.457e+04  9.866e+03  -1.477 0.139932    
## NeighborhoodVeenker   2.001e+04  1.222e+04   1.638 0.101572    
## Condition1Feedr      -9.173e+03  5.768e+03  -1.590 0.111953    
## Condition1Norm        5.633e+03  4.770e+03   1.181 0.237795    
## Condition1PosA        1.085e+04  1.184e+04   0.917 0.359563    
## Condition1PosN       -9.299e+03  8.505e+03  -1.093 0.274418    
## Condition1RRAe       -1.935e+04  1.054e+04  -1.835 0.066741 .  
## Condition1RRAn        5.164e+03  7.789e+03   0.663 0.507489    
## Condition1RRNe       -7.446e+03  2.189e+04  -0.340 0.733781    
## Condition1RRNn        5.585e+03  1.477e+04   0.378 0.705426    
## OverallQual2          1.052e+04  2.807e+04   0.375 0.707785    
## OverallQual3          1.897e+04  2.342e+04   0.810 0.418079    
## OverallQual4          2.572e+04  2.280e+04   1.128 0.259390    
## OverallQual5          2.999e+04  2.288e+04   1.311 0.190050    
## OverallQual6          3.969e+04  2.293e+04   1.731 0.083721 .  
## OverallQual7          4.998e+04  2.307e+04   2.166 0.030454 *  
## OverallQual8          7.223e+04  2.335e+04   3.093 0.002018 ** 
## OverallQual9          1.167e+05  2.408e+04   4.848 1.39e-06 ***
## OverallQual10         1.346e+05  2.505e+04   5.372 9.12e-08 ***
## BsmtQualFa           -1.940e+04  7.416e+03  -2.617 0.008974 ** 
## BsmtQualGd           -1.976e+04  4.125e+03  -4.790 1.85e-06 ***
## BsmtQualTA           -2.087e+04  4.924e+03  -4.238 2.41e-05 ***
## CentralAirY           8.858e+03  3.896e+03   2.274 0.023143 *  
## KitchenQualFa        -2.571e+04  7.213e+03  -3.565 0.000377 ***
## KitchenQualGd        -1.730e+04  4.256e+03  -4.065 5.08e-05 ***
## KitchenQualTA        -2.429e+04  4.709e+03  -5.159 2.85e-07 ***
## SaleConditionAdjLand  2.673e+04  1.576e+04   1.696 0.090084 .  
## SaleConditionAlloca   1.674e+04  9.755e+03   1.716 0.086338 .  
## SaleConditionFamily   2.329e+03  7.386e+03   0.315 0.752602    
## SaleConditionNormal   1.076e+04  3.149e+03   3.416 0.000653 ***
## SaleConditionPartial  2.035e+04  4.553e+03   4.470 8.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29410 on 1373 degrees of freedom
## Multiple R-squared:  0.871,  Adjusted R-squared:  0.8629 
## F-statistic: 107.8 on 86 and 1373 DF,  p-value: < 2.2e-16
predictions <- predict(step.model, test)
model_2_rmse <- rmse(log(test$SalePrice), log(predictions))

Full df models

  • model 3 full df
    • .122
  • model 4 full df with backwards selection
    • .13
# Model 3
smp_size <- floor(0.75 * nrow(full_df))
set.seed(123)
train_ind <- sample(seq_len(nrow(full_df)), size = smp_size)
train <- full_df[train_ind, ]
test <- full_df[-train_ind, ]
mod2 <- lm(data=full_df,SalePrice~.)
mod2_sum <- summary(mod2)
alias(mod2)
## Model :
## SalePrice ~ MSSubClass + MSZoning + LotFrontage + LotArea + LotShape + 
##     LandContour + LotConfig + LandSlope + Neighborhood + Condition1 + 
##     Condition2 + BldgType + HouseStyle + OverallQual + OverallCond + 
##     YearBuilt + YearRemodAdd + RoofStyle + RoofMatl + Exterior2nd + 
##     MasVnrType + MasVnrArea + ExterQual + ExterCond + Foundation + 
##     BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + BsmtFinSF1 + 
##     BsmtFinType2 + BsmtFinSF2 + BsmtUnfSF + HeatingQC + CentralAir + 
##     Electrical + X1stFlrSF + X2ndFlrSF + LowQualFinSF + BsmtFullBath + 
##     BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + 
##     KitchenQual + TotRmsAbvGrd + Functional + Fireplaces + GarageType + 
##     GarageYrBlt + GarageFinish + GarageCars + GarageArea + GarageQual + 
##     GarageCond + PavedDrive + WoodDeckSF + OpenPorchSF + EnclosedPorch + 
##     X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + 
##     YrSold + SaleType + SaleCondition
predictions <- predict(mod2, test)
model_3_rmse <- rmse(log(test$SalePrice), log(predictions))## .473676544
vif(mod2)
##                       GVIF Df GVIF^(1/(2*Df))
## MSSubClass    3.243034e+01  1        5.694765
## MSZoning      5.565414e+01  4        1.652674
## LotFrontage   3.363250e+00  1        1.833917
## LotArea       3.097547e+00  1        1.759985
## LotShape      3.042801e+00  3        1.203776
## LandContour   4.546508e+00  3        1.287102
## LotConfig     2.890435e+00  4        1.141880
## LandSlope     5.583604e+00  2        1.537194
## Neighborhood  8.476487e+05 24        1.328937
## Condition1    1.006241e+01  8        1.155231
## Condition2    2.787284e+01  7        1.268315
## BldgType      3.170863e+02  4        2.054221
## HouseStyle    5.846074e+02  7        1.576279
## OverallQual   5.313678e+00  1        2.305142
## OverallCond   2.600351e+00  1        1.612560
## YearBuilt     1.485690e+01  1        3.854466
## YearRemodAdd  3.571418e+00  1        1.889820
## RoofStyle     8.084095e+01  5        1.551541
## RoofMatl      9.484462e+01  7        1.384252
## Exterior2nd   2.391576e+02 15        1.200299
## MasVnrType    5.965833e+00  3        1.346724
## MasVnrArea    3.016140e+00  1        1.736704
## ExterQual     1.713939e+01  3        1.605705
## ExterCond     4.783348e+00  4        1.216092
## Foundation    2.792192e+01  5        1.395065
## BsmtQual      1.341061e+01  3        1.541374
## BsmtCond      6.163974e+00  3        1.354077
## BsmtExposure  3.554808e+00  3        1.235386
## BsmtFinType1  1.702031e+01  5        1.327690
## BsmtFinSF1    1.203975e+01  1        3.469835
## BsmtFinType2  1.099871e+01  5        1.270967
## BsmtFinSF2    5.211180e+00  1        2.282801
## BsmtUnfSF     8.062634e+00  1        2.839478
## HeatingQC     5.614177e+00  4        1.240683
## CentralAir    2.274110e+00  1        1.508015
## Electrical    1.298990e+01  4        1.377846
## X1stFlrSF     1.075863e+01  1        3.280035
## X2ndFlrSF     1.557348e+01  1        3.946325
## LowQualFinSF  2.160945e+00  1        1.470015
## BsmtFullBath  2.895168e+00  1        1.701519
## BsmtHalfBath  1.425557e+00  1        1.193967
## FullBath      4.016002e+00  1        2.003997
## HalfBath      3.034488e+00  1        1.741978
## BedroomAbvGr  3.385744e+00  1        1.840039
## KitchenAbvGr  4.016196e+00  1        2.004045
## KitchenQual   9.572783e+00  3        1.457157
## TotRmsAbvGrd  6.476568e+00  1        2.544910
## Functional    7.734544e+00  6        1.185868
## Fireplaces    2.043756e+00  1        1.429600
## GarageType    1.408216e+01  5        1.302768
## GarageYrBlt   6.605429e+00  1        2.570103
## GarageFinish  3.945972e+00  2        1.409414
## GarageCars    7.282711e+00  1        2.698650
## GarageArea    7.713437e+00  1        2.777308
## GarageQual    4.872002e+01  4        1.625412
## GarageCond    2.526991e+01  4        1.497357
## PavedDrive    2.336810e+00  2        1.236391
## WoodDeckSF    1.473304e+00  1        1.213797
## OpenPorchSF   1.580156e+00  1        1.257043
## EnclosedPorch 1.565488e+00  1        1.251194
## X3SsnPorch    1.204158e+00  1        1.097341
## ScreenPorch   1.288485e+00  1        1.135115
## PoolArea      1.498178e+00  1        1.224001
## MiscVal       1.383216e+00  1        1.176102
## MoSold        1.205865e+00  1        1.098119
## YrSold        1.289333e+00  1        1.135488
## SaleType      1.586747e+02  8        1.372561
## SaleCondition 1.702461e+02  5        1.671505
plot(mod2)
## Warning: not plotting observations with leverage one:
##   121, 186, 251, 272, 326, 376, 399, 584, 596, 667, 1004, 1231, 1271, 1276, 1299, 1371

## Warning: not plotting observations with leverage one:
##   121, 186, 251, 272, 326, 376, 399, 584, 596, 667, 1004, 1231, 1271, 1276, 1299, 1371

# Model 4 backwards select full 
# seems slightly better and reduces inputs from 117-83
fit2 <- lm(data=full_df,SalePrice~.)
step.model1 <- stepAIC(fit2, direction = "both", 
                      trace = FALSE)
predictions1 <- predict(step.model1, test)
model_4_rmse <- rmse(log(test$SalePrice), log(predictions1))#502544959

Score models

scores_df <- as.data.frame(rbind(model_1_rmse,model_2_rmse,model_3_rmse,model_4_rmse))
colnames(scores_df) <- c("log_rmse")
kable(scores_df)
log_rmse
model_1_rmse 0.1414235
model_2_rmse 0.1428636
model_3_rmse 0.1227174
model_4_rmse 0.1304504

Attempt elastic net

  • Elastic net on entire df
    • Best split ridge alpha=0 logrmse= .155
  • Elastic net on selected variables
    • Best split ridge alpha=0 logrmse= .181
# Model 5 try elastic net with lasso on entire df
## taken from https://www4.stat.ncsu.edu/~post/josh/LASSO_Ridge_Elastic_Net_-_Examples.html

pkgs <- list("glmnet", "doParallel", "foreach", "pROC")
lapply(pkgs, require, character.only = T)
## Loading required package: doParallel
## Warning: package 'doParallel' was built under R version 3.4.4
## Loading required package: parallel
## Loading required package: pROC
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following object is masked from 'package:Metrics':
## 
##     auc
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
registerDoParallel(cores = 4)
  
n <- nrow(full_df)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- full_df[sample,]


test <- full_df[-sample,]
mdlY <- as.matrix(train["SalePrice"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <-  model.matrix(~., data=train[,1:68])[,-1]


colnames(full_df)
##  [1] "MSSubClass"    "MSZoning"      "LotFrontage"   "LotArea"      
##  [5] "LotShape"      "LandContour"   "LotConfig"     "LandSlope"    
##  [9] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [13] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [17] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior2nd"  
## [21] "MasVnrType"    "MasVnrArea"    "ExterQual"     "ExterCond"    
## [25] "Foundation"    "BsmtQual"      "BsmtCond"      "BsmtExposure" 
## [29] "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2"  "BsmtFinSF2"   
## [33] "BsmtUnfSF"     "HeatingQC"     "CentralAir"    "Electrical"   
## [37] "X1stFlrSF"     "X2ndFlrSF"     "LowQualFinSF"  "BsmtFullBath" 
## [41] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [45] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [49] "Fireplaces"    "GarageType"    "GarageYrBlt"   "GarageFinish" 
## [53] "GarageCars"    "GarageArea"    "GarageQual"    "GarageCond"   
## [57] "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch"
## [61] "X3SsnPorch"    "ScreenPorch"   "PoolArea"      "MiscVal"      
## [65] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [69] "SalePrice"
y.test <- as.matrix(test["SalePrice"])
x.test <- model.matrix(~., data=test[,1:68])[,-1]


fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)

for (i in 0:10) {
    assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse", 
                                              alpha=i/10,family="gaussian"))
}

par(mfrow=c(3,2))
# For plotting options, type '?plot.glmnet' in R console
plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")

plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")

plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")

yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)

mse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
mse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
mse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
mse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
mse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))

as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), MSE=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))
##    alpha       MSE
## 1    0.0 0.1557263
## 2    0.1 0.1558410
## 3    0.2 0.1563490
## 4    0.3 0.1563748
## 5    0.4 0.1620576
## 6    0.5 0.1586945
## 7    0.6 0.1609745
## 8    0.7 0.1642626
## 9    0.8 0.1644280
## 10   0.9 0.1613305
## 11   1.0 0.1631247
#### Model 6 elastic net hand selected

pkgs <- list("glmnet", "doParallel", "foreach", "pROC")
lapply(pkgs, require, character.only = T)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
registerDoParallel(cores = 4)
n <- nrow(hand_selected__df)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- hand_selected__df[sample,]




test <- hand_selected__df[-sample,]
mdlY <- as.matrix(train["SalePrice1"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <-  model.matrix(~., data=train[,1:30])[,-1]



y.test <- as.matrix(test["SalePrice1"])
x.test <- model.matrix(~., data=test[,1:30])[,-1]


fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)

for (i in 0:10) {
    assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse", 
                                              alpha=i/10,family="gaussian"))
}

par(mfrow=c(3,2))

plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")

plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")

plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")

yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)

mymse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mymse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mymse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mymse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mymse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mymse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mymse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
mymse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
mymse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
mymse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
mymse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))

as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), myMSE=c(mymse0,mymse1,mymse2,mymse3,mymse4,mymse5,mymse6,mymse7,mymse8,mymse9,mymse10)))
##    alpha     myMSE
## 1    0.0 0.1818170
## 2    0.1 0.1991496
## 3    0.2 0.1972896
## 4    0.3 0.1918283
## 5    0.4 0.1926680
## 6    0.5 0.1948259
## 7    0.6 0.1934737
## 8    0.7 0.1926466
## 9    0.8 0.1958409
## 10   0.9 0.1858260
## 11   1.0 0.1914759
#as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), #MSE_whole_df=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))

Kaggle predictions

  • Model 3 was used for prediction
  • reformat Kaggle df to match training data col types as well as impute NA
kaggle_test$SalePrice <- NA
filter_names <- colnames(full_df)
kaggle_test <- kaggle_test[,filter_names]
mice_plot <- aggr(kaggle_test, col=c('navyblue','yellow'),
                    numbers=TRUE, sortVars=TRUE,
                    labels=names(kaggle_test), cex.axis=.7,
                    gap=3, ylab=c("Missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##       Variable       Count
##      SalePrice 1.000000000
##    LotFrontage 0.155586018
##    GarageYrBlt 0.053461275
##   GarageFinish 0.053461275
##     GarageQual 0.053461275
##     GarageCond 0.053461275
##     GarageType 0.052090473
##       BsmtCond 0.030843043
##       BsmtQual 0.030157642
##   BsmtExposure 0.030157642
##   BsmtFinType1 0.028786840
##   BsmtFinType2 0.028786840
##     MasVnrType 0.010966415
##     MasVnrArea 0.010281014
##       MSZoning 0.002741604
##   BsmtFullBath 0.001370802
##   BsmtHalfBath 0.001370802
##     Functional 0.001370802
##    Exterior2nd 0.000685401
##     BsmtFinSF1 0.000685401
##     BsmtFinSF2 0.000685401
##      BsmtUnfSF 0.000685401
##    KitchenQual 0.000685401
##     GarageCars 0.000685401
##     GarageArea 0.000685401
##       SaleType 0.000685401
##     MSSubClass 0.000000000
##        LotArea 0.000000000
##       LotShape 0.000000000
##    LandContour 0.000000000
##      LotConfig 0.000000000
##      LandSlope 0.000000000
##   Neighborhood 0.000000000
##     Condition1 0.000000000
##     Condition2 0.000000000
##       BldgType 0.000000000
##     HouseStyle 0.000000000
##    OverallQual 0.000000000
##    OverallCond 0.000000000
##      YearBuilt 0.000000000
##   YearRemodAdd 0.000000000
##      RoofStyle 0.000000000
##       RoofMatl 0.000000000
##      ExterQual 0.000000000
##      ExterCond 0.000000000
##     Foundation 0.000000000
##      HeatingQC 0.000000000
##     CentralAir 0.000000000
##     Electrical 0.000000000
##      X1stFlrSF 0.000000000
##      X2ndFlrSF 0.000000000
##   LowQualFinSF 0.000000000
##       FullBath 0.000000000
##       HalfBath 0.000000000
##   BedroomAbvGr 0.000000000
##   KitchenAbvGr 0.000000000
##   TotRmsAbvGrd 0.000000000
##     Fireplaces 0.000000000
##     PavedDrive 0.000000000
##     WoodDeckSF 0.000000000
##    OpenPorchSF 0.000000000
##  EnclosedPorch 0.000000000
##     X3SsnPorch 0.000000000
##    ScreenPorch 0.000000000
##       PoolArea 0.000000000
##        MiscVal 0.000000000
##         MoSold 0.000000000
##         YrSold 0.000000000
##  SaleCondition 0.000000000
my_names <- names(Filter(is.factor, full_df)) 
my_names2 <- names(Filter(is.numeric, full_df)) 


## get same types
kaggle_test[,my_names] <- lapply( kaggle_test[,my_names],factor)
#kaggle_test[,my_names2] <- lapply( kaggle_test[,my_names2],numeric)
# #impute missing values, using all parameters as default values
# kaggle_test<- missForest(kaggle_test)
# kaggle_test<- kaggle_test$ximp
# save(kaggle_test,file="Kaggle_ready_predict.Rda")

Make prediction

#loads as kaggle_test
load("Kaggle_ready_predict.Rda")
predictions <- predict(mod2, kaggle_test)
predictions_df <- as.data.frame(predictions)
predictions_df$Id_column <- Id_column
colnames(predictions_df) <- c("SalePrice","Id")
predictions_df2 <- predictions_df[,c("Id","SalePrice")]
write.csv(predictions_df2,'Jherman_submission.csv')
# mod2 <- lm(data=full_df,SalePrice~.)
# mod2_sum <- summary(mod2)
# alias(mod2)
# predictions <- predict(mod2, test)
# model_3_mse <- rmse(log(test$SalePrice), log(predictions))## .473676544
# vif(mod2)
# plot(mod2)

Extra models/exploration

#df_model <- subset(df_44, select = -c(Id,YearRemodAdd))
#fit2 <- lm(data=df_model,SalePrice1~.)
#summary(fit2)

# 
# #step.model <- stepAIC(fit2, direction = "both", 
#         #      trace = FALSE)
# #summary(step.model)
# #crPlots(step.model)
# 
# 
# 
# 
# 
# ## POssible changes 
# #yes
# summary(lm(data=hand_selected__df,SalePrice1~LotArea))
# summary(lm(data=hand_selected__df,SalePrice1~log(LotArea)))
# 
# #no
# summary(lm(data=hand_selected__df,SalePrice1~TotalBsmtSF))
# summary(lm(data=hand_selected__df,SalePrice1~log(TotalBsmtSF+.0001)))
# plot(data=hand_selected__df,SalePrice1~GrLivArea)
# plot(data=hand_selected__df,SalePrice1~I(GrLivArea^2))
# plot(data=hand_selected__df,SalePrice1~log(GrLivArea))
# plot(data=hand_selected__df,SalePrice1~BsmtFinSF1)
# plot(data=hand_selected__df,SalePrice1~I(BsmtFinSF1^2))
# plot(data=hand_selected__df,SalePrice1~log(BsmtFinSF1+.001))
# summary(lm(data=hand_selected__df,SalePrice1~GrLivArea+I(GrLivArea^2)))
# summary(lm(data=hand_selected__df,SalePrice1~log(GrLivArea+.001)))
# 
# ## TotalBsmtSF GrlivingArea Lot area
# #avPlots(step.model)
# 
# 

Attempt numeric only model

my_names2 <- names(Filter(is.numeric, full_df)) 
get_numeric <- sapply(full_df, is.numeric)
numeric_df <- full_df[,my_names2]
numeric_2 <- subset(numeric_df,select = -c(X1stFlrSF,GarageCars))

## Nan was produced so lets run simple alias check and vif check 
numeric_fit <- lm( SalePrice~., data=numeric_2)
alias(numeric_fit)
## Model :
## SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + 
##     OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + 
##     BsmtFinSF2 + BsmtUnfSF + X2ndFlrSF + LowQualFinSF + BsmtFullBath + 
##     BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + 
##     TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageArea + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + 
##     PoolArea + MiscVal + MoSold + YrSold
vif(numeric_fit)
##    MSSubClass   LotFrontage       LotArea   OverallQual   OverallCond 
##      1.722172      1.789459      1.275532      3.187443      1.566937 
##     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2 
##      5.519956      2.376252      1.390745      3.922340      1.367020 
##     BsmtUnfSF     X2ndFlrSF  LowQualFinSF  BsmtFullBath  BsmtHalfBath 
##      3.123205      4.155479      1.121306      2.211703      1.150635 
##      FullBath      HalfBath  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd 
##      2.739210      2.154137      2.323749      1.584425      4.073303 
##    Fireplaces   GarageYrBlt    GarageArea    WoodDeckSF   OpenPorchSF 
##      1.496814      4.653908      2.168518      1.206311      1.209374 
## EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea       MiscVal 
##      1.284538      1.022178      1.108475      1.097447      1.020406 
##        MoSold        YrSold 
##      1.049460      1.050457
plot(numeric_fit)

summary(numeric_fit)
## 
## Call:
## lm(formula = SalePrice ~ ., data = numeric_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -473869  -17657   -2977   13553  333242 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.806e+05  1.452e+06   0.538 0.590862    
## MSSubClass    -1.809e+02  2.900e+01  -6.239 5.80e-10 ***
## LotFrontage   -2.564e+01  5.458e+01  -0.470 0.638598    
## LotArea        4.893e-01  1.058e-01   4.626 4.07e-06 ***
## OverallQual    1.881e+04  1.207e+03  15.582  < 2e-16 ***
## OverallCond    3.627e+03  1.052e+03   3.449 0.000579 ***
## YearBuilt      2.700e+02  7.272e+01   3.713 0.000213 ***
## YearRemodAdd   2.259e+02  6.980e+01   3.236 0.001239 ** 
## MasVnrArea     3.666e+01  6.095e+00   6.015 2.28e-09 ***
## BsmtFinSF1     3.758e+01  4.059e+00   9.258  < 2e-16 ***
## BsmtFinSF2     2.730e+01  6.776e+00   4.029 5.89e-05 ***
## BsmtUnfSF      2.583e+01  3.739e+00   6.907 7.41e-12 ***
## X2ndFlrSF      2.633e+01  4.366e+00   6.032 2.06e-09 ***
## LowQualFinSF   1.736e+01  2.036e+01   0.853 0.393863    
## BsmtFullBath   8.360e+03  2.679e+03   3.120 0.001844 ** 
## BsmtHalfBath   1.542e+03  4.200e+03   0.367 0.713635    
## FullBath       1.040e+04  2.809e+03   3.704 0.000220 ***
## HalfBath      -6.135e+02  2.729e+03  -0.225 0.822118    
## BedroomAbvGr  -1.068e+04  1.747e+03  -6.115 1.24e-09 ***
## KitchenAbvGr  -8.565e+03  5.341e+03  -1.604 0.108983    
## TotRmsAbvGrd   9.593e+03  1.161e+03   8.264 3.18e-16 ***
## Fireplaces     7.407e+03  1.774e+03   4.175 3.17e-05 ***
## GarageYrBlt   -5.961e+01  8.113e+01  -0.735 0.462629    
## GarageArea     3.757e+01  6.439e+00   5.835 6.66e-09 ***
## WoodDeckSF     3.149e+01  8.192e+00   3.843 0.000127 ***
## OpenPorchSF    1.612e-01  1.552e+01   0.010 0.991711    
## EnclosedPorch  1.360e+01  1.734e+01   0.784 0.432952    
## X3SsnPorch     3.436e+01  3.224e+01   1.066 0.286711    
## ScreenPorch    6.015e+01  1.765e+01   3.408 0.000674 ***
## PoolArea      -1.677e+01  2.438e+01  -0.688 0.491546    
## MiscVal       -1.535e+00  1.903e+00  -0.806 0.420102    
## MoSold         4.335e+00  3.542e+02   0.012 0.990238    
## YrSold        -8.493e+02  7.215e+02  -1.177 0.239305    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35710 on 1427 degrees of freedom
## Multiple R-squared:  0.8024, Adjusted R-squared:  0.7979 
## F-statistic: 181.1 on 32 and 1427 DF,  p-value: < 2.2e-16
n <- nrow(numeric_2)
sample <- sample(seq(n), size = n * 0.7, replace = FALSE)
train <- numeric_2[sample,]



test <- numeric_2[-sample,]
mdlY <- as.matrix(train["SalePrice"])
#train <- subset(train,select = -c(SalePrice1))
mdlX <-  model.matrix(~., data=train[,1:32])[,-1]


colnames(numeric_2)
##  [1] "MSSubClass"    "LotFrontage"   "LotArea"       "OverallQual"  
##  [5] "OverallCond"   "YearBuilt"     "YearRemodAdd"  "MasVnrArea"   
##  [9] "BsmtFinSF1"    "BsmtFinSF2"    "BsmtUnfSF"     "X2ndFlrSF"    
## [13] "LowQualFinSF"  "BsmtFullBath"  "BsmtHalfBath"  "FullBath"     
## [17] "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr"  "TotRmsAbvGrd" 
## [21] "Fireplaces"    "GarageYrBlt"   "GarageArea"    "WoodDeckSF"   
## [25] "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"  
## [29] "PoolArea"      "MiscVal"       "MoSold"        "YrSold"       
## [33] "SalePrice"
y.test <- as.matrix(test["SalePrice"])
x.test <- model.matrix(~., data=test[,1:32])[,-1]


fit.lasso <- glmnet(mdlX, mdlY, family="gaussian", alpha=1)
fit.ridge <- glmnet(mdlX, mdlY, family="gaussian", alpha=0)
fit.elnet <- glmnet(mdlX, mdlY, family="gaussian", alpha=.5)

for (i in 0:10) {
    assign(paste("fit", i, sep=""), cv.glmnet(mdlX, mdlY, type.measure="mse", 
                                              alpha=i/10,family="gaussian"))
}

par(mfrow=c(3,2))
# For plotting options, type '?plot.glmnet' in R console
plot(fit.lasso, xvar="lambda")
plot(fit10, main="LASSO")

plot(fit.ridge, xvar="lambda")
plot(fit0, main="Ridge")

plot(fit.elnet, xvar="lambda")
plot(fit5, main="Elastic Net")

yhat0 <- predict(fit0, s=fit0$lambda.1se, newx=x.test)
yhat1 <- predict(fit1, s=fit1$lambda.1se, newx=x.test)
yhat2 <- predict(fit2, s=fit2$lambda.1se, newx=x.test)
yhat3 <- predict(fit3, s=fit3$lambda.1se, newx=x.test)
yhat4 <- predict(fit4, s=fit4$lambda.1se, newx=x.test)
yhat5 <- predict(fit5, s=fit5$lambda.1se, newx=x.test)
yhat6 <- predict(fit6, s=fit6$lambda.1se, newx=x.test)
yhat7 <- predict(fit7, s=fit7$lambda.1se, newx=x.test)
yhat8 <- predict(fit8, s=fit8$lambda.1se, newx=x.test)
yhat9 <- predict(fit9, s=fit9$lambda.1se, newx=x.test)
yhat10 <- predict(fit10, s=fit10$lambda.1se, newx=x.test)

mse0 <- sqrt(mean((log(y.test) - log(yhat0))^2))
mse1 <- sqrt(mean((log(y.test) - log(yhat1))^2))
mse2 <- sqrt(mean((log(y.test) - log(yhat2))^2))
mse3 <- sqrt(mean((log(y.test) - log(yhat3))^2))
mse4 <- sqrt(mean((log(y.test) - log(yhat4))^2))
mse5 <- sqrt(mean((log(y.test) - log(yhat5))^2))
mse6 <- sqrt(mean((log(y.test) - log(yhat6))^2))
## Warning in log(yhat6): NaNs produced
mse7 <- sqrt(mean((log(y.test) - log(yhat7))^2))
## Warning in log(yhat7): NaNs produced
mse8 <- sqrt(mean((log(y.test) - log(yhat8))^2))
## Warning in log(yhat8): NaNs produced
mse9 <- sqrt(mean((log(y.test) - log(yhat9))^2))
## Warning in log(yhat9): NaNs produced
mse10 <-sqrt( mean((log(y.test) - log(yhat10))^2))
## Warning in log(yhat10): NaNs produced
as.data.frame(cbind(alpha= c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1), MSE=c(mse0,mse1,mse2,mse3,mse4,mse5,mse6,mse7,mse8,mse9,mse10)))
##    alpha       MSE
## 1    0.0 0.1846513
## 2    0.1 0.1859044
## 3    0.2 0.1922189
## 4    0.3 0.1995716
## 5    0.4 0.2074744
## 6    0.5 0.2196102
## 7    0.6       NaN
## 8    0.7       NaN
## 9    0.8       NaN
## 10   0.9       NaN
## 11   1.0       NaN
# library(glmulti)
# my_names <- colnames(hand_selected__df)[1:30]
# my_fit <- glmulti('SalePrice1',my_names,hand_selected__df,level=1)
## drop multicollnear col
# df_22 <- subset(df_22, select = -c(TotalBsmtSF))
# df_22 <- subset(df_22, select = -c(GrLivArea))
# df_model <- subset(df_model, select = -c(Electrical,Exterior2nd,BldgType))

# model 2 on whole dataset
# library(MASS)
# ind <- sapply(full_df, is.numeric)
# full_df[ind] <- lapply(full_df[ind], scale)

# ind <- sapply(hand_selected__df, is.numeric)
# hand_selected__df[ind] <- lapply(hand_selected__df[ind], scale)