Data 605 Final Part 2

Steven Ellingson

5/20/2020

I was very surprised to find that the final was going to be based on a dataset I have some familiarity with. I used this house price data set for the Week 12 and 13 discussion. My wife and I are in the process of looking for our first house and that’s why housing prices came to mind when I was looking for a dataset.

Anyway, I will take what I did in those discussions and expand upon it.

Initial Data Exploration

Below I have a function I wrote to pull in some important features of the columns in a dataframe. This will tell me how many unique values we have for the categorical varaibles, and descriptive statistics (mean, standard deviation, minimum, maximum) for the numercial variables.

Because our target variable will be “sale price”, I am also pulling the R^2 of the variable with sale price. For categorical features, I built a model using just that categorical feature (as dummy variables) and reported the R^2 for that model.

library(MASS)
setwd("C:/Users/steven.ellingson/PycharmProjects/cuny/data 605")
df = read.csv('housing_data.csv')

getDFSummary = function(df,target=NULL) {
   #the dplyr library has a nice join function that works well here, tibble is needed to easily move the column nmes into it's own column
    library(dplyr)
    library(tibble)
  
    #Start with easy ones, class, levels, missing values
    s = rownames_to_column(as.data.frame(cbind(sapply(df, class),sapply(df, nlevels),colSums(is.na(df)))))
    
    #next we have some that we only want to calculate for numeric fields
    df.num = df[sapply(df, is.numeric)]
s.num =  rownames_to_column(as.data.frame(cbind(
            sapply(df.num, function(x) trimws(formatC(mean(x,na.rm=TRUE), drop0trailing = TRUE,format = "fg"))),
            sapply(df.num, function(x) trimws(formatC(median(x,na.rm=TRUE),, drop0trailing = TRUE,format = "fg"))),
            sapply(df.num, function(x) trimws(formatC(sd(x,na.rm=TRUE),, drop0trailing = TRUE,format = "fg"))),
            sapply(df.num, function(x) min(x,na.rm=TRUE)),
            sapply(df.num, function(x) max(x,na.rm=TRUE)))
            ,stringsAsFactors = FALSE)
    )
  
    #s = merge(x=s,y=s.num,by="row.names",all.x=TRUE)
    s2 = left_join(s,s.num, by='rowname')
    colnames(s2) = c('Column','Data Type', 'Factor Levels', 'Missing Values', 'Mean','Median', 'SD', 'Min', 'Max')
  
      #If we passed in a target variable, let's create a quick model and append the Adjusted R2
    if(!is.null(target)) {
        #Add the column
        R2col = paste0('R2-',target)
        if(target %in% colnames(df)) {
            s2[,R2col]=NA
            #loop through columns, create the model and store the Adjusted R2
            for(col in s2$Column) {
                #Create a model
                temp_model = lm(df[,target]~df[,col])
                temp_ar = summary(temp_model)$adj.r.squared
                #Make negative if coeffecients add up to a negative value. This is more meaninful for numeric fields
                temp_ar = temp_ar * sign(sum(temp_model$coefficients[-1]))
                s2[,R2col][s2$Column==col] = trimws(formatC(temp_ar, drop0trailing = TRUE,format = "fg"))
            }
            s2[,R2col] = as.numeric(as.character(s2[,R2col]))
        }
    }
  
  
  
  
    #Our columns are all factors.  Let's fix that.
    s2$'Factor Levels' = as.integer(as.character(s2$'Factor Levels'))
    s2$'Missing Values' = as.integer(as.character(s2$'Missing Values'))
    s2$'Mean' = as.numeric(as.character(s2$'Mean'))
    s2$'Median' = as.numeric(as.character(s2$'Median'))
    s2$'SD' = as.numeric(as.character(s2$'SD'))
    s2$'Min' = as.numeric(as.character(s2$'Min'))
    s2$'Max' = as.numeric(as.character(s2$'Max'))

    return(s2)
}

summ = getDFSummary(df,'SalePrice')
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
summ[order(-summ$'Missing Values'),]
##           Column Data Type Factor Levels Missing Values        Mean   Median
## 73        PoolQC    factor             3           1453          NA       NA
## 75   MiscFeature    factor             4           1406          NA       NA
## 7          Alley    factor             2           1369          NA       NA
## 74         Fence    factor             4           1179          NA       NA
## 58   FireplaceQu    factor             5            690          NA       NA
## 4    LotFrontage   integer             0            259 7.00500e+01     69.0
## 59    GarageType    factor             6             81          NA       NA
## 60   GarageYrBlt   integer             0             81 1.97900e+03   1980.0
## 61  GarageFinish    factor             3             81          NA       NA
## 64    GarageQual    factor             5             81          NA       NA
## 65    GarageCond    factor             5             81          NA       NA
## 33  BsmtExposure    factor             4             38          NA       NA
## 36  BsmtFinType2    factor             6             38          NA       NA
## 31      BsmtQual    factor             4             37          NA       NA
## 32      BsmtCond    factor             4             37          NA       NA
## 34  BsmtFinType1    factor             6             37          NA       NA
## 26    MasVnrType    factor             4              8          NA       NA
## 27    MasVnrArea   integer             0              8 1.03700e+02      0.0
## 43    Electrical    factor             5              1          NA       NA
## 1             Id   integer             0              0 7.30500e+02    730.5
## 2     MSSubClass   integer             0              0 5.69000e+01     50.0
## 3       MSZoning    factor             5              0          NA       NA
## 5        LotArea   integer             0              0 1.05170e+04   9478.0
## 6         Street    factor             2              0          NA       NA
## 8       LotShape    factor             4              0          NA       NA
## 9    LandContour    factor             4              0          NA       NA
## 10     Utilities    factor             2              0          NA       NA
## 11     LotConfig    factor             5              0          NA       NA
## 12     LandSlope    factor             3              0          NA       NA
## 13  Neighborhood    factor            25              0          NA       NA
## 14    Condition1    factor             9              0          NA       NA
## 15    Condition2    factor             8              0          NA       NA
## 16      BldgType    factor             5              0          NA       NA
## 17    HouseStyle    factor             8              0          NA       NA
## 18   OverallQual   integer             0              0 6.09900e+00      6.0
## 19   OverallCond   integer             0              0 5.57500e+00      5.0
## 20     YearBuilt   integer             0              0 1.97100e+03   1973.0
## 21  YearRemodAdd   integer             0              0 1.98500e+03   1994.0
## 22     RoofStyle    factor             6              0          NA       NA
## 23      RoofMatl    factor             8              0          NA       NA
## 24   Exterior1st    factor            15              0          NA       NA
## 25   Exterior2nd    factor            16              0          NA       NA
## 28     ExterQual    factor             4              0          NA       NA
## 29     ExterCond    factor             5              0          NA       NA
## 30    Foundation    factor             6              0          NA       NA
## 35    BsmtFinSF1   integer             0              0 4.43600e+02    383.5
## 37    BsmtFinSF2   integer             0              0 4.65500e+01      0.0
## 38     BsmtUnfSF   integer             0              0 5.67200e+02    477.5
## 39   TotalBsmtSF   integer             0              0 1.05700e+03    991.5
## 40       Heating    factor             6              0          NA       NA
## 41     HeatingQC    factor             5              0          NA       NA
## 42    CentralAir    factor             2              0          NA       NA
## 44     X1stFlrSF   integer             0              0 1.16300e+03   1087.0
## 45     X2ndFlrSF   integer             0              0 3.47000e+02      0.0
## 46  LowQualFinSF   integer             0              0 5.84500e+00      0.0
## 47     GrLivArea   integer             0              0 1.51500e+03   1464.0
## 48  BsmtFullBath   integer             0              0 4.25300e-01      0.0
## 49  BsmtHalfBath   integer             0              0 5.75300e-02      0.0
## 50      FullBath   integer             0              0 1.56500e+00      2.0
## 51      HalfBath   integer             0              0 3.82900e-01      0.0
## 52  BedroomAbvGr   integer             0              0 2.86600e+00      3.0
## 53  KitchenAbvGr   integer             0              0 1.04700e+00      1.0
## 54   KitchenQual    factor             4              0          NA       NA
## 55  TotRmsAbvGrd   integer             0              0 6.51800e+00      6.0
## 56    Functional    factor             7              0          NA       NA
## 57    Fireplaces   integer             0              0 6.13000e-01      1.0
## 62    GarageCars   integer             0              0 1.76700e+00      2.0
## 63    GarageArea   integer             0              0 4.73000e+02    480.0
## 66    PavedDrive    factor             3              0          NA       NA
## 67    WoodDeckSF   integer             0              0 9.42400e+01      0.0
## 68   OpenPorchSF   integer             0              0 4.66600e+01     25.0
## 69 EnclosedPorch   integer             0              0 2.19500e+01      0.0
## 70    X3SsnPorch   integer             0              0 3.41000e+00      0.0
## 71   ScreenPorch   integer             0              0 1.50600e+01      0.0
## 72      PoolArea   integer             0              0 2.75900e+00      0.0
## 76       MiscVal   integer             0              0 4.34900e+01      0.0
## 77        MoSold   integer             0              0 6.32200e+00      6.0
## 78        YrSold   integer             0              0 2.00800e+03   2008.0
## 79      SaleType    factor             9              0          NA       NA
## 80 SaleCondition    factor             6              0          NA       NA
## 81     SalePrice   integer             0              0 1.80921e+05 163000.0
##            SD   Min    Max R2-SalePrice
## 73         NA    NA     NA   -1.730e-01
## 75         NA    NA     NA   -6.148e-02
## 7          NA    NA     NA    2.775e-01
## 74         NA    NA     NA   -4.058e-02
## 58         NA    NA     NA   -1.085e-01
## 4     24.2800    21    313    1.230e-01
## 59         NA    NA     NA    2.037e-01
## 60    24.6900  1900   2010    2.360e-01
## 61         NA    NA     NA   -2.662e-01
## 64         NA    NA     NA   -2.427e-02
## 65         NA    NA     NA    2.419e-02
## 33         NA    NA     NA   -1.286e-01
## 36         NA    NA     NA   -5.955e-03
## 31         NA    NA     NA   -4.526e-01
## 32         NA    NA     NA    2.676e-02
## 34         NA    NA     NA    1.897e-01
## 26         NA    NA     NA    1.862e-01
## 27   181.1000     0   1600    2.275e-01
## 43         NA    NA     NA   -5.709e-02
## 1    421.6000     1   1460    2.052e-04
## 2     42.3000    20    190   -6.423e-03
## 3          NA    NA     NA    1.051e-01
## 5   9981.0000  1300 215245    6.898e-02
## 6          NA    NA     NA    9.992e-04
## 8          NA    NA     NA    7.447e-02
## 9          NA    NA     NA    2.379e-02
## 10         NA    NA     NA    4.808e-04
## 11         NA    NA     NA    1.833e-02
## 12         NA    NA     NA    1.313e-03
## 13         NA    NA     NA   -5.380e-01
## 14         NA    NA     NA    2.730e-02
## 15         NA    NA     NA    5.126e-03
## 16         NA    NA     NA   -3.188e-02
## 17         NA    NA     NA    8.191e-02
## 18     1.3830     1     10    6.254e-01
## 19     1.1130     1      9   -5.380e-03
## 20    30.2000  1872   2010    2.729e-01
## 21    20.6500  1950   2010    2.566e-01
## 22         NA    NA     NA   -5.446e-02
## 23         NA    NA     NA    2.674e-02
## 24         NA    NA     NA    1.446e-01
## 25         NA    NA     NA    1.450e-01
## 28         NA    NA     NA   -4.763e-01
## 29         NA    NA     NA   -2.093e-02
## 30         NA    NA     NA    2.538e-01
## 35   456.1000     0   5644    1.487e-01
## 37   161.3000     0   1474    5.563e-04
## 38   441.9000     0   2336    4.535e-02
## 39   438.7000     0   6110    3.761e-01
## 40         NA    NA     NA    1.105e-02
## 41         NA    NA     NA   -1.933e-01
## 42         NA    NA     NA    6.252e-02
## 44   386.6000   334   4692    3.666e-01
## 45   436.5000     0   2065    1.014e-01
## 46    48.6200     0    572    2.975e-05
## 47   525.5000   334   5642    5.018e-01
## 48     0.5189     0      3    5.093e-02
## 49     0.2388     0      2    4.020e-04
## 50     0.5509     0      3    3.139e-01
## 51     0.5029     0      2    8.009e-02
## 52     0.8158     0      8    2.763e-02
## 53     0.2203     0      3   -1.780e-02
## 54         NA    NA     NA   -4.555e-01
## 55     1.6250     2     14    2.844e-01
## 56         NA    NA     NA   -1.242e-02
## 57     0.6447     0      3    2.175e-01
## 62     0.7473     0      4    4.097e-01
## 63   213.8000     0   1418    3.882e-01
## 66         NA    NA     NA    5.324e-02
## 67   125.3000     0    857    1.046e-01
## 68    66.2600     0    547    9.915e-02
## 69    61.1200     0    552   -1.586e-02
## 70    29.3200     0    508    1.303e-03
## 71    55.7600     0    480    1.174e-02
## 72    40.1800     0    738    7.858e-03
## 76   496.1000     0  15500    2.366e-04
## 77     2.7040     1     12    1.472e-03
## 78     1.3280  2006   2010   -1.512e-04
## 79         NA    NA     NA    1.325e-01
## 80         NA    NA     NA    1.325e-01
## 81 79443.0000 34900 755000    1.000e+00

First, there are quite a few columns with missing values. Luckily, most are factors. We can just think of missing as another level to these factors.

#Get Factor columns.
factors_w_missing = summ$Column[summ$'Data Type'=='factor' & summ$'Missing Values' > 0]

#loop through and add the NA level, then replace 
for(col in factors_w_missing) {
    levels(df[,col]) = c(levels(df[,col]),'NA') 
    df[,col][is.na(df[,col])] = 'NA'
}

#Let's take another look
summ = getDFSummary(df,'SalePrice')
summ[order(-summ$'Missing Values',-summ$'Factor Levels'),]
##           Column Data Type Factor Levels Missing Values        Mean   Median
## 4    LotFrontage   integer             0            259 7.00500e+01     69.0
## 60   GarageYrBlt   integer             0             81 1.97900e+03   1980.0
## 27    MasVnrArea   integer             0              8 1.03700e+02      0.0
## 13  Neighborhood    factor            25              0          NA       NA
## 25   Exterior2nd    factor            16              0          NA       NA
## 24   Exterior1st    factor            15              0          NA       NA
## 14    Condition1    factor             9              0          NA       NA
## 79      SaleType    factor             9              0          NA       NA
## 15    Condition2    factor             8              0          NA       NA
## 17    HouseStyle    factor             8              0          NA       NA
## 23      RoofMatl    factor             8              0          NA       NA
## 34  BsmtFinType1    factor             7              0          NA       NA
## 36  BsmtFinType2    factor             7              0          NA       NA
## 56    Functional    factor             7              0          NA       NA
## 59    GarageType    factor             7              0          NA       NA
## 22     RoofStyle    factor             6              0          NA       NA
## 30    Foundation    factor             6              0          NA       NA
## 40       Heating    factor             6              0          NA       NA
## 43    Electrical    factor             6              0          NA       NA
## 58   FireplaceQu    factor             6              0          NA       NA
## 64    GarageQual    factor             6              0          NA       NA
## 65    GarageCond    factor             6              0          NA       NA
## 80 SaleCondition    factor             6              0          NA       NA
## 3       MSZoning    factor             5              0          NA       NA
## 11     LotConfig    factor             5              0          NA       NA
## 16      BldgType    factor             5              0          NA       NA
## 26    MasVnrType    factor             5              0          NA       NA
## 29     ExterCond    factor             5              0          NA       NA
## 31      BsmtQual    factor             5              0          NA       NA
## 32      BsmtCond    factor             5              0          NA       NA
## 33  BsmtExposure    factor             5              0          NA       NA
## 41     HeatingQC    factor             5              0          NA       NA
## 74         Fence    factor             5              0          NA       NA
## 75   MiscFeature    factor             5              0          NA       NA
## 8       LotShape    factor             4              0          NA       NA
## 9    LandContour    factor             4              0          NA       NA
## 28     ExterQual    factor             4              0          NA       NA
## 54   KitchenQual    factor             4              0          NA       NA
## 61  GarageFinish    factor             4              0          NA       NA
## 73        PoolQC    factor             4              0          NA       NA
## 7          Alley    factor             3              0          NA       NA
## 12     LandSlope    factor             3              0          NA       NA
## 66    PavedDrive    factor             3              0          NA       NA
## 6         Street    factor             2              0          NA       NA
## 10     Utilities    factor             2              0          NA       NA
## 42    CentralAir    factor             2              0          NA       NA
## 1             Id   integer             0              0 7.30500e+02    730.5
## 2     MSSubClass   integer             0              0 5.69000e+01     50.0
## 5        LotArea   integer             0              0 1.05170e+04   9478.0
## 18   OverallQual   integer             0              0 6.09900e+00      6.0
## 19   OverallCond   integer             0              0 5.57500e+00      5.0
## 20     YearBuilt   integer             0              0 1.97100e+03   1973.0
## 21  YearRemodAdd   integer             0              0 1.98500e+03   1994.0
## 35    BsmtFinSF1   integer             0              0 4.43600e+02    383.5
## 37    BsmtFinSF2   integer             0              0 4.65500e+01      0.0
## 38     BsmtUnfSF   integer             0              0 5.67200e+02    477.5
## 39   TotalBsmtSF   integer             0              0 1.05700e+03    991.5
## 44     X1stFlrSF   integer             0              0 1.16300e+03   1087.0
## 45     X2ndFlrSF   integer             0              0 3.47000e+02      0.0
## 46  LowQualFinSF   integer             0              0 5.84500e+00      0.0
## 47     GrLivArea   integer             0              0 1.51500e+03   1464.0
## 48  BsmtFullBath   integer             0              0 4.25300e-01      0.0
## 49  BsmtHalfBath   integer             0              0 5.75300e-02      0.0
## 50      FullBath   integer             0              0 1.56500e+00      2.0
## 51      HalfBath   integer             0              0 3.82900e-01      0.0
## 52  BedroomAbvGr   integer             0              0 2.86600e+00      3.0
## 53  KitchenAbvGr   integer             0              0 1.04700e+00      1.0
## 55  TotRmsAbvGrd   integer             0              0 6.51800e+00      6.0
## 57    Fireplaces   integer             0              0 6.13000e-01      1.0
## 62    GarageCars   integer             0              0 1.76700e+00      2.0
## 63    GarageArea   integer             0              0 4.73000e+02    480.0
## 67    WoodDeckSF   integer             0              0 9.42400e+01      0.0
## 68   OpenPorchSF   integer             0              0 4.66600e+01     25.0
## 69 EnclosedPorch   integer             0              0 2.19500e+01      0.0
## 70    X3SsnPorch   integer             0              0 3.41000e+00      0.0
## 71   ScreenPorch   integer             0              0 1.50600e+01      0.0
## 72      PoolArea   integer             0              0 2.75900e+00      0.0
## 76       MiscVal   integer             0              0 4.34900e+01      0.0
## 77        MoSold   integer             0              0 6.32200e+00      6.0
## 78        YrSold   integer             0              0 2.00800e+03   2008.0
## 81     SalePrice   integer             0              0 1.80921e+05 163000.0
##            SD   Min    Max R2-SalePrice
## 4     24.2800    21    313    1.230e-01
## 60    24.6900  1900   2010    2.360e-01
## 27   181.1000     0   1600    2.275e-01
## 13         NA    NA     NA   -5.380e-01
## 25         NA    NA     NA    1.450e-01
## 24         NA    NA     NA    1.446e-01
## 14         NA    NA     NA    2.730e-02
## 79         NA    NA     NA    1.325e-01
## 15         NA    NA     NA    5.126e-03
## 17         NA    NA     NA    8.191e-02
## 23         NA    NA     NA    2.674e-02
## 34         NA    NA     NA   -2.076e-01
## 36         NA    NA     NA   -2.629e-02
## 56         NA    NA     NA   -1.242e-02
## 59         NA    NA     NA    2.461e-01
## 22         NA    NA     NA   -5.446e-02
## 30         NA    NA     NA    2.538e-01
## 40         NA    NA     NA    1.105e-02
## 43         NA    NA     NA    5.646e-02
## 58         NA    NA     NA   -2.915e-01
## 64         NA    NA     NA   -7.826e-02
## 65         NA    NA     NA    7.819e-02
## 80         NA    NA     NA    1.325e-01
## 3          NA    NA     NA    1.051e-01
## 11         NA    NA     NA    1.833e-02
## 16         NA    NA     NA   -3.188e-02
## 26         NA    NA     NA    1.866e-01
## 29         NA    NA     NA   -2.093e-02
## 31         NA    NA     NA   -4.635e-01
## 32         NA    NA     NA    4.879e-02
## 33         NA    NA     NA   -1.472e-01
## 41         NA    NA     NA   -1.933e-01
## 74         NA    NA     NA   -3.296e-02
## 75         NA    NA     NA   -4.350e-03
## 8          NA    NA     NA    7.447e-02
## 9          NA    NA     NA    2.379e-02
## 28         NA    NA     NA   -4.763e-01
## 54         NA    NA     NA   -4.555e-01
## 61         NA    NA     NA   -3.044e-01
## 73         NA    NA     NA   -1.918e-02
## 7          NA    NA     NA    1.906e-02
## 12         NA    NA     NA    1.313e-03
## 66         NA    NA     NA    5.324e-02
## 6          NA    NA     NA    9.992e-04
## 10         NA    NA     NA    4.808e-04
## 42         NA    NA     NA    6.252e-02
## 1    421.6000     1   1460    2.052e-04
## 2     42.3000    20    190   -6.423e-03
## 5   9981.0000  1300 215245    6.898e-02
## 18     1.3830     1     10    6.254e-01
## 19     1.1130     1      9   -5.380e-03
## 20    30.2000  1872   2010    2.729e-01
## 21    20.6500  1950   2010    2.566e-01
## 35   456.1000     0   5644    1.487e-01
## 37   161.3000     0   1474    5.563e-04
## 38   441.9000     0   2336    4.535e-02
## 39   438.7000     0   6110    3.761e-01
## 44   386.6000   334   4692    3.666e-01
## 45   436.5000     0   2065    1.014e-01
## 46    48.6200     0    572    2.975e-05
## 47   525.5000   334   5642    5.018e-01
## 48     0.5189     0      3    5.093e-02
## 49     0.2388     0      2    4.020e-04
## 50     0.5509     0      3    3.139e-01
## 51     0.5029     0      2    8.009e-02
## 52     0.8158     0      8    2.763e-02
## 53     0.2203     0      3   -1.780e-02
## 55     1.6250     2     14    2.844e-01
## 57     0.6447     0      3    2.175e-01
## 62     0.7473     0      4    4.097e-01
## 63   213.8000     0   1418    3.882e-01
## 67   125.3000     0    857    1.046e-01
## 68    66.2600     0    547    9.915e-02
## 69    61.1200     0    552   -1.586e-02
## 70    29.3200     0    508    1.303e-03
## 71    55.7600     0    480    1.174e-02
## 72    40.1800     0    738    7.858e-03
## 76   496.1000     0  15500    2.366e-04
## 77     2.7040     1     12    1.472e-03
## 78     1.3280  2006   2010   -1.512e-04
## 81 79443.0000 34900 755000    1.000e+00

For two of these, the missing values align with a categorical feature.

df[is.na(df['MasVnrArea']),'MasVnrType']
## [1] NA NA NA NA NA NA NA NA
## Levels: BrkCmn BrkFace None Stone NA
df[is.na(df['GarageYrBlt']),'GarageCond']
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [76] NA NA NA NA NA NA
## Levels: Ex Fa Gd Po TA NA

I’m not a big fan of either just imputing based on row data or removing good information. Ideally I’d talk to the subject matter expert, but in this case I’ll just have to use my intuition.

For garage year built, I would guess that most are the same year the house was built:

mean(df$GarageYrBlt==df$YearBuilt, na.rm = TRUE)
## [1] 0.7897027

79% of this time it is true. I will just impute the YearBuilt. This is basically nonsense data as these houses don’t have garages, but it should keep it from affecting the model too much.

df$GarageYrBlt[is.na(df$GarageYrBlt)] =df$YearBuilt[is.na(df$GarageYrBlt)]

For masonry Veneer area, I’m going to guess these houses don’t have any. So we’ll impute 0.

df$MasVnrArea[is.na(df$MasVnrArea)] = 0

Lot frontage is tough. There is no categorical feature that seems to split these NAs out. So I will add a binary feature that tells us whether this was blank. That way the model can separate them if there is good information there. I will impute the mean after doing that.

df$LotFrontage_NA=0
df$LotFrontage_NA[is.na(df$LotFrontage)] = 1
sum(df$LotFrontage_NA)
## [1] 259
df$LotFrontage[is.na(df$LotFrontage)] = 70.05
summ = getDFSummary(df,'SalePrice')
summ[order(-summ$'Missing Values',-summ$'Factor Levels'),]
##            Column Data Type Factor Levels Missing Values        Mean    Median
## 13   Neighborhood    factor            25              0          NA        NA
## 25    Exterior2nd    factor            16              0          NA        NA
## 24    Exterior1st    factor            15              0          NA        NA
## 14     Condition1    factor             9              0          NA        NA
## 79       SaleType    factor             9              0          NA        NA
## 15     Condition2    factor             8              0          NA        NA
## 17     HouseStyle    factor             8              0          NA        NA
## 23       RoofMatl    factor             8              0          NA        NA
## 34   BsmtFinType1    factor             7              0          NA        NA
## 36   BsmtFinType2    factor             7              0          NA        NA
## 56     Functional    factor             7              0          NA        NA
## 59     GarageType    factor             7              0          NA        NA
## 22      RoofStyle    factor             6              0          NA        NA
## 30     Foundation    factor             6              0          NA        NA
## 40        Heating    factor             6              0          NA        NA
## 43     Electrical    factor             6              0          NA        NA
## 58    FireplaceQu    factor             6              0          NA        NA
## 64     GarageQual    factor             6              0          NA        NA
## 65     GarageCond    factor             6              0          NA        NA
## 80  SaleCondition    factor             6              0          NA        NA
## 3        MSZoning    factor             5              0          NA        NA
## 11      LotConfig    factor             5              0          NA        NA
## 16       BldgType    factor             5              0          NA        NA
## 26     MasVnrType    factor             5              0          NA        NA
## 29      ExterCond    factor             5              0          NA        NA
## 31       BsmtQual    factor             5              0          NA        NA
## 32       BsmtCond    factor             5              0          NA        NA
## 33   BsmtExposure    factor             5              0          NA        NA
## 41      HeatingQC    factor             5              0          NA        NA
## 74          Fence    factor             5              0          NA        NA
## 75    MiscFeature    factor             5              0          NA        NA
## 8        LotShape    factor             4              0          NA        NA
## 9     LandContour    factor             4              0          NA        NA
## 28      ExterQual    factor             4              0          NA        NA
## 54    KitchenQual    factor             4              0          NA        NA
## 61   GarageFinish    factor             4              0          NA        NA
## 73         PoolQC    factor             4              0          NA        NA
## 7           Alley    factor             3              0          NA        NA
## 12      LandSlope    factor             3              0          NA        NA
## 66     PavedDrive    factor             3              0          NA        NA
## 6          Street    factor             2              0          NA        NA
## 10      Utilities    factor             2              0          NA        NA
## 42     CentralAir    factor             2              0          NA        NA
## 1              Id   integer             0              0 7.30500e+02    730.50
## 2      MSSubClass   integer             0              0 5.69000e+01     50.00
## 4     LotFrontage   numeric             0              0 7.00500e+01     70.05
## 5         LotArea   integer             0              0 1.05170e+04   9478.00
## 18    OverallQual   integer             0              0 6.09900e+00      6.00
## 19    OverallCond   integer             0              0 5.57500e+00      5.00
## 20      YearBuilt   integer             0              0 1.97100e+03   1973.00
## 21   YearRemodAdd   integer             0              0 1.98500e+03   1994.00
## 27     MasVnrArea   numeric             0              0 1.03100e+02      0.00
## 35     BsmtFinSF1   integer             0              0 4.43600e+02    383.50
## 37     BsmtFinSF2   integer             0              0 4.65500e+01      0.00
## 38      BsmtUnfSF   integer             0              0 5.67200e+02    477.50
## 39    TotalBsmtSF   integer             0              0 1.05700e+03    991.50
## 44      X1stFlrSF   integer             0              0 1.16300e+03   1087.00
## 45      X2ndFlrSF   integer             0              0 3.47000e+02      0.00
## 46   LowQualFinSF   integer             0              0 5.84500e+00      0.00
## 47      GrLivArea   integer             0              0 1.51500e+03   1464.00
## 48   BsmtFullBath   integer             0              0 4.25300e-01      0.00
## 49   BsmtHalfBath   integer             0              0 5.75300e-02      0.00
## 50       FullBath   integer             0              0 1.56500e+00      2.00
## 51       HalfBath   integer             0              0 3.82900e-01      0.00
## 52   BedroomAbvGr   integer             0              0 2.86600e+00      3.00
## 53   KitchenAbvGr   integer             0              0 1.04700e+00      1.00
## 55   TotRmsAbvGrd   integer             0              0 6.51800e+00      6.00
## 57     Fireplaces   integer             0              0 6.13000e-01      1.00
## 60    GarageYrBlt   integer             0              0 1.97700e+03   1978.00
## 62     GarageCars   integer             0              0 1.76700e+00      2.00
## 63     GarageArea   integer             0              0 4.73000e+02    480.00
## 67     WoodDeckSF   integer             0              0 9.42400e+01      0.00
## 68    OpenPorchSF   integer             0              0 4.66600e+01     25.00
## 69  EnclosedPorch   integer             0              0 2.19500e+01      0.00
## 70     X3SsnPorch   integer             0              0 3.41000e+00      0.00
## 71    ScreenPorch   integer             0              0 1.50600e+01      0.00
## 72       PoolArea   integer             0              0 2.75900e+00      0.00
## 76        MiscVal   integer             0              0 4.34900e+01      0.00
## 77         MoSold   integer             0              0 6.32200e+00      6.00
## 78         YrSold   integer             0              0 2.00800e+03   2008.00
## 81      SalePrice   integer             0              0 1.80921e+05 163000.00
## 82 LotFrontage_NA   numeric             0              0 1.77400e-01      0.00
##            SD   Min    Max R2-SalePrice
## 13         NA    NA     NA   -5.380e-01
## 25         NA    NA     NA    1.450e-01
## 24         NA    NA     NA    1.446e-01
## 14         NA    NA     NA    2.730e-02
## 79         NA    NA     NA    1.325e-01
## 15         NA    NA     NA    5.126e-03
## 17         NA    NA     NA    8.191e-02
## 23         NA    NA     NA    2.674e-02
## 34         NA    NA     NA   -2.076e-01
## 36         NA    NA     NA   -2.629e-02
## 56         NA    NA     NA   -1.242e-02
## 59         NA    NA     NA    2.461e-01
## 22         NA    NA     NA   -5.446e-02
## 30         NA    NA     NA    2.538e-01
## 40         NA    NA     NA    1.105e-02
## 43         NA    NA     NA    5.646e-02
## 58         NA    NA     NA   -2.915e-01
## 64         NA    NA     NA   -7.826e-02
## 65         NA    NA     NA    7.819e-02
## 80         NA    NA     NA    1.325e-01
## 3          NA    NA     NA    1.051e-01
## 11         NA    NA     NA    1.833e-02
## 16         NA    NA     NA   -3.188e-02
## 26         NA    NA     NA    1.866e-01
## 29         NA    NA     NA   -2.093e-02
## 31         NA    NA     NA   -4.635e-01
## 32         NA    NA     NA    4.879e-02
## 33         NA    NA     NA   -1.472e-01
## 41         NA    NA     NA   -1.933e-01
## 74         NA    NA     NA   -3.296e-02
## 75         NA    NA     NA   -4.350e-03
## 8          NA    NA     NA    7.447e-02
## 9          NA    NA     NA    2.379e-02
## 28         NA    NA     NA   -4.763e-01
## 54         NA    NA     NA   -4.555e-01
## 61         NA    NA     NA   -3.044e-01
## 73         NA    NA     NA   -1.918e-02
## 7          NA    NA     NA    1.906e-02
## 12         NA    NA     NA    1.313e-03
## 66         NA    NA     NA    5.324e-02
## 6          NA    NA     NA    9.992e-04
## 10         NA    NA     NA    4.808e-04
## 42         NA    NA     NA    6.252e-02
## 1    421.6000     1   1460    2.052e-04
## 2     42.3000    20    190   -6.423e-03
## 4     22.0200    21    313    1.115e-01
## 5   9981.0000  1300 215245    6.898e-02
## 18     1.3830     1     10    6.254e-01
## 19     1.1130     1      9   -5.380e-03
## 20    30.2000  1872   2010    2.729e-01
## 21    20.6500  1950   2010    2.566e-01
## 27   180.7000     0   1600    2.228e-01
## 35   456.1000     0   5644    1.487e-01
## 37   161.3000     0   1474    5.563e-04
## 38   441.9000     0   2336    4.535e-02
## 39   438.7000     0   6110    3.761e-01
## 44   386.6000   334   4692    3.666e-01
## 45   436.5000     0   2065    1.014e-01
## 46    48.6200     0    572    2.975e-05
## 47   525.5000   334   5642    5.018e-01
## 48     0.5189     0      3    5.093e-02
## 49     0.2388     0      2    4.020e-04
## 50     0.5509     0      3    3.139e-01
## 51     0.5029     0      2    8.009e-02
## 52     0.8158     0      8    2.763e-02
## 53     0.2203     0      3   -1.780e-02
## 55     1.6250     2     14    2.844e-01
## 57     0.6447     0      3    2.175e-01
## 60    26.3100  1872   2010    2.576e-01
## 62     0.7473     0      4    4.097e-01
## 63   213.8000     0   1418    3.882e-01
## 67   125.3000     0    857    1.046e-01
## 68    66.2600     0    547    9.915e-02
## 69    61.1200     0    552   -1.586e-02
## 70    29.3200     0    508    1.303e-03
## 71    55.7600     0    480    1.174e-02
## 72    40.1800     0    738    7.858e-03
## 76   496.1000     0  15500    2.366e-04
## 77     2.7040     1     12    1.472e-03
## 78     1.3280  2006   2010   -1.512e-04
## 81 79443.0000 34900 755000    1.000e+00
## 82     0.3821     0      1   -6.692e-04

OK, we have no missing values anymore, let’s look at the correlations

summ = getDFSummary(df,'SalePrice')
summ[order(-abs(summ$'R2-SalePrice'),-summ$'Factor Levels'),]
##            Column Data Type Factor Levels Missing Values        Mean    Median
## 81      SalePrice   integer             0              0 1.80921e+05 163000.00
## 18    OverallQual   integer             0              0 6.09900e+00      6.00
## 13   Neighborhood    factor            25              0          NA        NA
## 47      GrLivArea   integer             0              0 1.51500e+03   1464.00
## 28      ExterQual    factor             4              0          NA        NA
## 31       BsmtQual    factor             5              0          NA        NA
## 54    KitchenQual    factor             4              0          NA        NA
## 62     GarageCars   integer             0              0 1.76700e+00      2.00
## 63     GarageArea   integer             0              0 4.73000e+02    480.00
## 39    TotalBsmtSF   integer             0              0 1.05700e+03    991.50
## 44      X1stFlrSF   integer             0              0 1.16300e+03   1087.00
## 50       FullBath   integer             0              0 1.56500e+00      2.00
## 61   GarageFinish    factor             4              0          NA        NA
## 58    FireplaceQu    factor             6              0          NA        NA
## 55   TotRmsAbvGrd   integer             0              0 6.51800e+00      6.00
## 20      YearBuilt   integer             0              0 1.97100e+03   1973.00
## 60    GarageYrBlt   integer             0              0 1.97700e+03   1978.00
## 21   YearRemodAdd   integer             0              0 1.98500e+03   1994.00
## 30     Foundation    factor             6              0          NA        NA
## 59     GarageType    factor             7              0          NA        NA
## 27     MasVnrArea   numeric             0              0 1.03100e+02      0.00
## 57     Fireplaces   integer             0              0 6.13000e-01      1.00
## 34   BsmtFinType1    factor             7              0          NA        NA
## 41      HeatingQC    factor             5              0          NA        NA
## 26     MasVnrType    factor             5              0          NA        NA
## 35     BsmtFinSF1   integer             0              0 4.43600e+02    383.50
## 33   BsmtExposure    factor             5              0          NA        NA
## 25    Exterior2nd    factor            16              0          NA        NA
## 24    Exterior1st    factor            15              0          NA        NA
## 79       SaleType    factor             9              0          NA        NA
## 80  SaleCondition    factor             6              0          NA        NA
## 4     LotFrontage   numeric             0              0 7.00500e+01     70.05
## 3        MSZoning    factor             5              0          NA        NA
## 67     WoodDeckSF   integer             0              0 9.42400e+01      0.00
## 45      X2ndFlrSF   integer             0              0 3.47000e+02      0.00
## 68    OpenPorchSF   integer             0              0 4.66600e+01     25.00
## 17     HouseStyle    factor             8              0          NA        NA
## 51       HalfBath   integer             0              0 3.82900e-01      0.00
## 64     GarageQual    factor             6              0          NA        NA
## 65     GarageCond    factor             6              0          NA        NA
## 8        LotShape    factor             4              0          NA        NA
## 5         LotArea   integer             0              0 1.05170e+04   9478.00
## 42     CentralAir    factor             2              0          NA        NA
## 43     Electrical    factor             6              0          NA        NA
## 22      RoofStyle    factor             6              0          NA        NA
## 66     PavedDrive    factor             3              0          NA        NA
## 48   BsmtFullBath   integer             0              0 4.25300e-01      0.00
## 32       BsmtCond    factor             5              0          NA        NA
## 38      BsmtUnfSF   integer             0              0 5.67200e+02    477.50
## 74          Fence    factor             5              0          NA        NA
## 16       BldgType    factor             5              0          NA        NA
## 52   BedroomAbvGr   integer             0              0 2.86600e+00      3.00
## 14     Condition1    factor             9              0          NA        NA
## 23       RoofMatl    factor             8              0          NA        NA
## 36   BsmtFinType2    factor             7              0          NA        NA
## 9     LandContour    factor             4              0          NA        NA
## 29      ExterCond    factor             5              0          NA        NA
## 73         PoolQC    factor             4              0          NA        NA
## 7           Alley    factor             3              0          NA        NA
## 11      LotConfig    factor             5              0          NA        NA
## 53   KitchenAbvGr   integer             0              0 1.04700e+00      1.00
## 69  EnclosedPorch   integer             0              0 2.19500e+01      0.00
## 56     Functional    factor             7              0          NA        NA
## 71    ScreenPorch   integer             0              0 1.50600e+01      0.00
## 40        Heating    factor             6              0          NA        NA
## 72       PoolArea   integer             0              0 2.75900e+00      0.00
## 2      MSSubClass   integer             0              0 5.69000e+01     50.00
## 19    OverallCond   integer             0              0 5.57500e+00      5.00
## 15     Condition2    factor             8              0          NA        NA
## 75    MiscFeature    factor             5              0          NA        NA
## 77         MoSold   integer             0              0 6.32200e+00      6.00
## 12      LandSlope    factor             3              0          NA        NA
## 70     X3SsnPorch   integer             0              0 3.41000e+00      0.00
## 6          Street    factor             2              0          NA        NA
## 82 LotFrontage_NA   numeric             0              0 1.77400e-01      0.00
## 37     BsmtFinSF2   integer             0              0 4.65500e+01      0.00
## 10      Utilities    factor             2              0          NA        NA
## 49   BsmtHalfBath   integer             0              0 5.75300e-02      0.00
## 76        MiscVal   integer             0              0 4.34900e+01      0.00
## 1              Id   integer             0              0 7.30500e+02    730.50
## 78         YrSold   integer             0              0 2.00800e+03   2008.00
## 46   LowQualFinSF   integer             0              0 5.84500e+00      0.00
##            SD   Min    Max R2-SalePrice
## 81 79443.0000 34900 755000    1.000e+00
## 18     1.3830     1     10    6.254e-01
## 13         NA    NA     NA   -5.380e-01
## 47   525.5000   334   5642    5.018e-01
## 28         NA    NA     NA   -4.763e-01
## 31         NA    NA     NA   -4.635e-01
## 54         NA    NA     NA   -4.555e-01
## 62     0.7473     0      4    4.097e-01
## 63   213.8000     0   1418    3.882e-01
## 39   438.7000     0   6110    3.761e-01
## 44   386.6000   334   4692    3.666e-01
## 50     0.5509     0      3    3.139e-01
## 61         NA    NA     NA   -3.044e-01
## 58         NA    NA     NA   -2.915e-01
## 55     1.6250     2     14    2.844e-01
## 20    30.2000  1872   2010    2.729e-01
## 60    26.3100  1872   2010    2.576e-01
## 21    20.6500  1950   2010    2.566e-01
## 30         NA    NA     NA    2.538e-01
## 59         NA    NA     NA    2.461e-01
## 27   180.7000     0   1600    2.228e-01
## 57     0.6447     0      3    2.175e-01
## 34         NA    NA     NA   -2.076e-01
## 41         NA    NA     NA   -1.933e-01
## 26         NA    NA     NA    1.866e-01
## 35   456.1000     0   5644    1.487e-01
## 33         NA    NA     NA   -1.472e-01
## 25         NA    NA     NA    1.450e-01
## 24         NA    NA     NA    1.446e-01
## 79         NA    NA     NA    1.325e-01
## 80         NA    NA     NA    1.325e-01
## 4     22.0200    21    313    1.115e-01
## 3          NA    NA     NA    1.051e-01
## 67   125.3000     0    857    1.046e-01
## 45   436.5000     0   2065    1.014e-01
## 68    66.2600     0    547    9.915e-02
## 17         NA    NA     NA    8.191e-02
## 51     0.5029     0      2    8.009e-02
## 64         NA    NA     NA   -7.826e-02
## 65         NA    NA     NA    7.819e-02
## 8          NA    NA     NA    7.447e-02
## 5   9981.0000  1300 215245    6.898e-02
## 42         NA    NA     NA    6.252e-02
## 43         NA    NA     NA    5.646e-02
## 22         NA    NA     NA   -5.446e-02
## 66         NA    NA     NA    5.324e-02
## 48     0.5189     0      3    5.093e-02
## 32         NA    NA     NA    4.879e-02
## 38   441.9000     0   2336    4.535e-02
## 74         NA    NA     NA   -3.296e-02
## 16         NA    NA     NA   -3.188e-02
## 52     0.8158     0      8    2.763e-02
## 14         NA    NA     NA    2.730e-02
## 23         NA    NA     NA    2.674e-02
## 36         NA    NA     NA   -2.629e-02
## 9          NA    NA     NA    2.379e-02
## 29         NA    NA     NA   -2.093e-02
## 73         NA    NA     NA   -1.918e-02
## 7          NA    NA     NA    1.906e-02
## 11         NA    NA     NA    1.833e-02
## 53     0.2203     0      3   -1.780e-02
## 69    61.1200     0    552   -1.586e-02
## 56         NA    NA     NA   -1.242e-02
## 71    55.7600     0    480    1.174e-02
## 40         NA    NA     NA    1.105e-02
## 72    40.1800     0    738    7.858e-03
## 2     42.3000    20    190   -6.423e-03
## 19     1.1130     1      9   -5.380e-03
## 15         NA    NA     NA    5.126e-03
## 75         NA    NA     NA   -4.350e-03
## 77     2.7040     1     12    1.472e-03
## 12         NA    NA     NA    1.313e-03
## 70    29.3200     0    508    1.303e-03
## 6          NA    NA     NA    9.992e-04
## 82     0.3821     0      1   -6.692e-04
## 37   161.3000     0   1474    5.563e-04
## 10         NA    NA     NA    4.808e-04
## 49     0.2388     0      2    4.020e-04
## 76   496.1000     0  15500    2.366e-04
## 1    421.6000     1   1460    2.052e-04
## 78     1.3280  2006   2010   -1.512e-04
## 46    48.6200     0    572    2.975e-05

Matrix Manipulation

Let’s pull out three quantitative variables that are highly correlated to the predictor for our correlation matrix

X = df[c('OverallQual', 'GrLivArea','GarageCars')]
Xc = cor(X)
Xc
##             OverallQual GrLivArea GarageCars
## OverallQual   1.0000000 0.5930074  0.6006707
## GrLivArea     0.5930074 1.0000000  0.4672474
## GarageCars    0.6006707 0.4672474  1.0000000

Not a huge surprise to see some high correlations here, both because they all correlate well with SalePrice, and also just knowing what these are. It makes sense that larger homes have larger garages. Quality of the house correlating with size isn’t quite as obvious but isn’t a surprise either.

det(Xc)
## [1] 0.4020858

The matrix is non-singular, we can invert. I’ll use a function I wrote earlier in the semester

Xi = solve(Xc)
Xi
##             OverallQual  GrLivArea GarageCars
## OverallQual   1.9440621 -0.7768132 -0.8047772
## GrLivArea    -0.7768132  1.5896971 -0.2761729
## GarageCars   -0.8047772 -0.2761729  1.6124472
Xc%*%Xi
##               OverallQual     GrLivArea    GarageCars
## OverallQual  1.000000e+00  0.000000e+00  0.000000e+00
## GrLivArea   -1.110223e-16  1.000000e+00 -1.110223e-16
## GarageCars   0.000000e+00 -5.551115e-17  1.000000e+00
Xi%*%Xc
##              OverallQual     GrLivArea    GarageCars
## OverallQual 1.000000e+00 -1.665335e-16 -1.110223e-16
## GrLivArea   0.000000e+00  1.000000e+00 -5.551115e-17
## GarageCars  1.110223e-16 -1.110223e-16  1.000000e+00

Here I have a function I wrote earlier in the semester that will do the LU decomposition for me.

matrix_functions =function(A.list){
  n = sqrt(length(A.list))
  A = matrix(A.list,byrow=TRUE,nrow=n)
  #For U, just start with a copy of A
  U = A
  #create L as an identity matrix
  L = matrix(0,nrow=n,ncol=n)
  for (i in 1:n) {
    L[i,i]=1
  }
  
  R = A

  #p will be the pivot row
  for (p in 1:(n)) {
    #set "done" to false, we'll use this later
    done = FALSE

    #check for zero pivot, switch column if necessary
    if (U[p,p]==0) {
      #our pivot is zero, we need to loop through the remaining rows to find a non-zero
      k = p
      found = FALSE
      while (k < n & found == FALSE) {
        k = k+1

        if (U[k,p]!=0) {
          found = TRUE
        }
      }
      #If we found a non-zero entry to pivot on,switch with current row
      if (found) {
        temprow = U[p,]
        U[p,] = U[k,]
        U[k,] = temprow
      } else {
        #if everything is zero, I think we're ok, just leave it.
        done = TRUE
      }
    }
    
    #Same thing for R 
    if (R[p,p]==0) {
      #our pivot is zero, we need to loop through the remaining rows to find a non-zero
      k = 0
      found = FALSE
      while (k < n & found == FALSE) {
        k = k+1

        if (R[k,p]!=0) {
          found = TRUE
        }
      }
      #If we found a non-zero entry to pivot on,switch with current row
      if (found) {
        temprow = R[p,]
        R[p,] = R[k,]
        R[k,] = temprow
      } else {
        #if everything is zero, I think we're ok, just leave it.
        done = TRUE
      }
    }
    
    
    
    if (!done) {
      for (i in 1:n){
        #find the multiplier
        if (i>p & p<n) {
          r = U[i,p]/U[p,p]*(-1)
          #for factorization, we are only looking for the values further south than p (lower left of matrix)
        #multiply by row p and add to row i
          U[i,] = round((U[p,]*r)+U[i,],8)
          L[i,p] = round(r*(-1), 8)
        }
        if (i!=p) {
          r = R[i,p]/R[p,p]*(-1)
          R[i,] = round((R[p,]*r)+R[i,],8)
        }
      }
    }
  }
  
  #For R, we want to divide by the pivot to get all 1s as pivots.
  #we can also determine the rank
  
  rank = 0
  for (i in 1:n) {
    p=0
    for (j in 1:n) {
      if(p==0) {
          if(R[i,j] != 0) {
            p = R[i,j]      
          }
      }
    }
    if (p!=0) {
      R[i,] = R[i,]/p
      rank = rank + 1
    }
  }

  #Let's do some checks before assuming we got this right
  #Is L a lower trangular matrix?
  L.good = TRUE
  for (i in 1:n) {
    for (j in 1:n) {
      val = L[i,j]
      if(i==j) {
        if (val!=1) {
          # For L, the diagonal should be 1s
          L.good = FALSE
            print ("WARNING: L does not have all 1s for diagonals!")
        } else if (j>i) {
          #j>i is the upper right of the matrix, should be all 0's for L
          if (val!=0) {
            L.good = FALSE
            print ("WARNING: L is not a lower triangular matrix!")
          }
        }
      }
    }
  }
  #Is U an upper trangular matrix?
  U.good = TRUE
  for (i in 1:n) {
    for (j in 1:n) {
      val = U[i,j]
      if (j<i) {
        #j<i is the lower left of the matrix, should be all 0's for U
        if (val!=0) {
          U.good = FALSE
          print ("WARNING: U is not an lower triangular matrix!")
        }
      }
    }
  }

  #Multiply them together to see if we got a successful factorization.
  A.new = L%*%U
  A.good = TRUE
  for (i in 1:n) {
    for (j in 1:n) {
      dif  = abs(A[i,j]-A.new[i,j])
      if (dif > 0.001) {
        A.good = FALSE
        print(paste0("WARNING: A!=UL, A[",i,",",j,"]=",A[i,j],",Anew[",i,",",j,"]=",A.new[i,j],", dif = ",dif))
      }
    }
  }

  return(list("L"=L,"U"=U,"L is LTM"=L.good,"U is UTM" = U.good,"UL = A" = A.good, "R"=R, "rank"=rank))
#  return(c(L,U))
}
matrix_functions(Xc)
## $L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.5930074 1.0000000    0
## [3,] 0.6006707 0.1712756    1
## 
## $U
##      [,1]      [,2]      [,3]
## [1,]    1 0.5930074 0.6006707
## [2,]    0 0.6483422 0.1110452
## [3,]    0 0.0000000 0.6201754
## 
## $`L is LTM`
## [1] TRUE
## 
## $`U is UTM`
## [1] TRUE
## 
## $`UL = A`
## [1] TRUE
## 
## $R
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1
## 
## $rank
## [1] 3

Exponential Fit

We should be able to find a right-skewed variable easily with our summary table. Let’s look for variables where the mean is much higher than the median. We’ll create

summ$MeanMedian = round((summ$Mean-summ$Median)/summ$SD, 2)

summ[order(-summ$'MeanMedian'),]
##            Column Data Type Factor Levels Missing Values        Mean    Median
## 48   BsmtFullBath   integer             0              0 4.25300e-01      0.00
## 45      X2ndFlrSF   integer             0              0 3.47000e+02      0.00
## 51       HalfBath   integer             0              0 3.82900e-01      0.00
## 67     WoodDeckSF   integer             0              0 9.42400e+01      0.00
## 27     MasVnrArea   numeric             0              0 1.03100e+02      0.00
## 19    OverallCond   integer             0              0 5.57500e+00      5.00
## 82 LotFrontage_NA   numeric             0              0 1.77400e-01      0.00
## 69  EnclosedPorch   integer             0              0 2.19500e+01      0.00
## 68    OpenPorchSF   integer             0              0 4.66600e+01     25.00
## 55   TotRmsAbvGrd   integer             0              0 6.51800e+00      6.00
## 37     BsmtFinSF2   integer             0              0 4.65500e+01      0.00
## 71    ScreenPorch   integer             0              0 1.50600e+01      0.00
## 49   BsmtHalfBath   integer             0              0 5.75300e-02      0.00
## 81      SalePrice   integer             0              0 1.80921e+05 163000.00
## 53   KitchenAbvGr   integer             0              0 1.04700e+00      1.00
## 38      BsmtUnfSF   integer             0              0 5.67200e+02    477.50
## 44      X1stFlrSF   integer             0              0 1.16300e+03   1087.00
## 2      MSSubClass   integer             0              0 5.69000e+01     50.00
## 39    TotalBsmtSF   integer             0              0 1.05700e+03    991.50
## 35     BsmtFinSF1   integer             0              0 4.43600e+02    383.50
## 46   LowQualFinSF   integer             0              0 5.84500e+00      0.00
## 70     X3SsnPorch   integer             0              0 3.41000e+00      0.00
## 77         MoSold   integer             0              0 6.32200e+00      6.00
## 5         LotArea   integer             0              0 1.05170e+04   9478.00
## 47      GrLivArea   integer             0              0 1.51500e+03   1464.00
## 76        MiscVal   integer             0              0 4.34900e+01      0.00
## 18    OverallQual   integer             0              0 6.09900e+00      6.00
## 72       PoolArea   integer             0              0 2.75900e+00      0.00
## 1              Id   integer             0              0 7.30500e+02    730.50
## 4     LotFrontage   numeric             0              0 7.00500e+01     70.05
## 78         YrSold   integer             0              0 2.00800e+03   2008.00
## 63     GarageArea   integer             0              0 4.73000e+02    480.00
## 60    GarageYrBlt   integer             0              0 1.97700e+03   1978.00
## 20      YearBuilt   integer             0              0 1.97100e+03   1973.00
## 52   BedroomAbvGr   integer             0              0 2.86600e+00      3.00
## 62     GarageCars   integer             0              0 1.76700e+00      2.00
## 21   YearRemodAdd   integer             0              0 1.98500e+03   1994.00
## 57     Fireplaces   integer             0              0 6.13000e-01      1.00
## 50       FullBath   integer             0              0 1.56500e+00      2.00
## 3        MSZoning    factor             5              0          NA        NA
## 6          Street    factor             2              0          NA        NA
## 7           Alley    factor             3              0          NA        NA
## 8        LotShape    factor             4              0          NA        NA
## 9     LandContour    factor             4              0          NA        NA
## 10      Utilities    factor             2              0          NA        NA
## 11      LotConfig    factor             5              0          NA        NA
## 12      LandSlope    factor             3              0          NA        NA
## 13   Neighborhood    factor            25              0          NA        NA
## 14     Condition1    factor             9              0          NA        NA
## 15     Condition2    factor             8              0          NA        NA
## 16       BldgType    factor             5              0          NA        NA
## 17     HouseStyle    factor             8              0          NA        NA
## 22      RoofStyle    factor             6              0          NA        NA
## 23       RoofMatl    factor             8              0          NA        NA
## 24    Exterior1st    factor            15              0          NA        NA
## 25    Exterior2nd    factor            16              0          NA        NA
## 26     MasVnrType    factor             5              0          NA        NA
## 28      ExterQual    factor             4              0          NA        NA
## 29      ExterCond    factor             5              0          NA        NA
## 30     Foundation    factor             6              0          NA        NA
## 31       BsmtQual    factor             5              0          NA        NA
## 32       BsmtCond    factor             5              0          NA        NA
## 33   BsmtExposure    factor             5              0          NA        NA
## 34   BsmtFinType1    factor             7              0          NA        NA
## 36   BsmtFinType2    factor             7              0          NA        NA
## 40        Heating    factor             6              0          NA        NA
## 41      HeatingQC    factor             5              0          NA        NA
## 42     CentralAir    factor             2              0          NA        NA
## 43     Electrical    factor             6              0          NA        NA
## 54    KitchenQual    factor             4              0          NA        NA
## 56     Functional    factor             7              0          NA        NA
## 58    FireplaceQu    factor             6              0          NA        NA
## 59     GarageType    factor             7              0          NA        NA
## 61   GarageFinish    factor             4              0          NA        NA
## 64     GarageQual    factor             6              0          NA        NA
## 65     GarageCond    factor             6              0          NA        NA
## 66     PavedDrive    factor             3              0          NA        NA
## 73         PoolQC    factor             4              0          NA        NA
## 74          Fence    factor             5              0          NA        NA
## 75    MiscFeature    factor             5              0          NA        NA
## 79       SaleType    factor             9              0          NA        NA
## 80  SaleCondition    factor             6              0          NA        NA
##            SD   Min    Max R2-SalePrice MeanMedian
## 48     0.5189     0      3    5.093e-02       0.82
## 45   436.5000     0   2065    1.014e-01       0.79
## 51     0.5029     0      2    8.009e-02       0.76
## 67   125.3000     0    857    1.046e-01       0.75
## 27   180.7000     0   1600    2.228e-01       0.57
## 19     1.1130     1      9   -5.380e-03       0.52
## 82     0.3821     0      1   -6.692e-04       0.46
## 69    61.1200     0    552   -1.586e-02       0.36
## 68    66.2600     0    547    9.915e-02       0.33
## 55     1.6250     2     14    2.844e-01       0.32
## 37   161.3000     0   1474    5.563e-04       0.29
## 71    55.7600     0    480    1.174e-02       0.27
## 49     0.2388     0      2    4.020e-04       0.24
## 81 79443.0000 34900 755000    1.000e+00       0.23
## 53     0.2203     0      3   -1.780e-02       0.21
## 38   441.9000     0   2336    4.535e-02       0.20
## 44   386.6000   334   4692    3.666e-01       0.20
## 2     42.3000    20    190   -6.423e-03       0.16
## 39   438.7000     0   6110    3.761e-01       0.15
## 35   456.1000     0   5644    1.487e-01       0.13
## 46    48.6200     0    572    2.975e-05       0.12
## 70    29.3200     0    508    1.303e-03       0.12
## 77     2.7040     1     12    1.472e-03       0.12
## 5   9981.0000  1300 215245    6.898e-02       0.10
## 47   525.5000   334   5642    5.018e-01       0.10
## 76   496.1000     0  15500    2.366e-04       0.09
## 18     1.3830     1     10    6.254e-01       0.07
## 72    40.1800     0    738    7.858e-03       0.07
## 1    421.6000     1   1460    2.052e-04       0.00
## 4     22.0200    21    313    1.115e-01       0.00
## 78     1.3280  2006   2010   -1.512e-04       0.00
## 63   213.8000     0   1418    3.882e-01      -0.03
## 60    26.3100  1872   2010    2.576e-01      -0.04
## 20    30.2000  1872   2010    2.729e-01      -0.07
## 52     0.8158     0      8    2.763e-02      -0.16
## 62     0.7473     0      4    4.097e-01      -0.31
## 21    20.6500  1950   2010    2.566e-01      -0.44
## 57     0.6447     0      3    2.175e-01      -0.60
## 50     0.5509     0      3    3.139e-01      -0.79
## 3          NA    NA     NA    1.051e-01         NA
## 6          NA    NA     NA    9.992e-04         NA
## 7          NA    NA     NA    1.906e-02         NA
## 8          NA    NA     NA    7.447e-02         NA
## 9          NA    NA     NA    2.379e-02         NA
## 10         NA    NA     NA    4.808e-04         NA
## 11         NA    NA     NA    1.833e-02         NA
## 12         NA    NA     NA    1.313e-03         NA
## 13         NA    NA     NA   -5.380e-01         NA
## 14         NA    NA     NA    2.730e-02         NA
## 15         NA    NA     NA    5.126e-03         NA
## 16         NA    NA     NA   -3.188e-02         NA
## 17         NA    NA     NA    8.191e-02         NA
## 22         NA    NA     NA   -5.446e-02         NA
## 23         NA    NA     NA    2.674e-02         NA
## 24         NA    NA     NA    1.446e-01         NA
## 25         NA    NA     NA    1.450e-01         NA
## 26         NA    NA     NA    1.866e-01         NA
## 28         NA    NA     NA   -4.763e-01         NA
## 29         NA    NA     NA   -2.093e-02         NA
## 30         NA    NA     NA    2.538e-01         NA
## 31         NA    NA     NA   -4.635e-01         NA
## 32         NA    NA     NA    4.879e-02         NA
## 33         NA    NA     NA   -1.472e-01         NA
## 34         NA    NA     NA   -2.076e-01         NA
## 36         NA    NA     NA   -2.629e-02         NA
## 40         NA    NA     NA    1.105e-02         NA
## 41         NA    NA     NA   -1.933e-01         NA
## 42         NA    NA     NA    6.252e-02         NA
## 43         NA    NA     NA    5.646e-02         NA
## 54         NA    NA     NA   -4.555e-01         NA
## 56         NA    NA     NA   -1.242e-02         NA
## 58         NA    NA     NA   -2.915e-01         NA
## 59         NA    NA     NA    2.461e-01         NA
## 61         NA    NA     NA   -3.044e-01         NA
## 64         NA    NA     NA   -7.826e-02         NA
## 65         NA    NA     NA    7.819e-02         NA
## 66         NA    NA     NA    5.324e-02         NA
## 73         NA    NA     NA   -1.918e-02         NA
## 74         NA    NA     NA   -3.296e-02         NA
## 75         NA    NA     NA   -4.350e-03         NA
## 79         NA    NA     NA    1.325e-01         NA
## 80         NA    NA     NA    1.325e-01         NA

Any of these at the top should work, many of have a minimum and a median of 0, so at least half the values of the field are 0. Let’s go with MasVnrArea, since part of the reason this is right skewed is because I imputed 0 for the missing values.

hist(df$MasVnrArea)

library(MASS)
expfit = fitdistr(df$MasVnrArea, 'exponential')
l = unname(expfit$estimate)
l
## [1] 0.00969771
n = 1000
exprand = rexp(n,l)

hist(exprand,breaks = c((0:20)*50,5000),xlim=c(0,1000))

hist(df$MasVnrArea,breaks = c((0:20)*50,5000),xlim=c(0,1000))

This doesn’t look great, MVA doesn’t seem to fit an exponential distribution well. There are too many zeros an then it drops off.

PDF \(\lambda*e^{-\lambda x}\) CDF \(1-e^{-\lambda x}\)

\(p = 1-e^{-\lambda x}\)
\(1-p = e^{-\lambda x}\)
\(e^{-\lambda x}=(1-p)\)
\(-\lambda x = log(1-p)\)
\(x = -log(1-p)/\lambda\)

percents = c(0.05,0.95)
for (p in percents) {
  print(c(p,-log(1-p)/l,quantile(exprand,p)[[1]],quantile(df$MasVnrArea,p)[[1]]))
}
## [1] 0.050000 5.289217 4.633443 0.000000
## [1]   0.9500 308.9113 320.7942 456.0000

The estimates from our random sample look pretty close (5.3 vs 4.5 and 309 vs 330). But the original data is way off. This is heavy at the tails, and has too many zeroes and too many over 400+.

error <- qnorm(0.975)*sd(df$MasVnrArea)/sqrt(nrow(df))
print(c(mean(df$MasVnrArea)-error,mean(df$MasVnrArea)+error))
## [1]  93.84658 112.38767

Normal distribution definitely doesn’t make sense here, as you can see. So even the relatively poorly fit exponential curve does look like an improvement.

More data Exploration and Model Building

Let’s take a look at our target variable, along with the three highly correlated variables we identified above. I’ll use a package called GGally to give me a correlation matrix along with some useful plots.

# Quick display of two cabapilities of GGally, to assess the distribution and correlation of variables 
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
# From the help page:
ggpairs(df, columns = c('SalePrice','OverallQual', 'GrLivArea','GarageCars')) 

Sale Price is continuous, and right skewed a bit. Overal Quality is discrete from 1-9. It looks a bit left skewed. GrLivArea is continuous and right skewed. GarageCars is discrete and left skewed I guess, with a lot at 2. These all have strong relationships to SalePrice. GrLivArea looks pretty linear, while the other two don’t look quite linear and may benefit from a transformation.

The categorical variable that showed up at the top of the list was “Neighborhood”. That being said, it has 25 factors, which is quite a lot when we only have 1400 rows. Let’s take a look at th

nh = cbind(aggregate(df['SalePrice'], list(df$Neighborhood), mean),aggregate(df$Neighborhood, list(df$Neighborhood), length))
names(nh) = c('Neighborhood','AvgPrice','remove','count')
rownames(nh) = nh$Neighborhood
nh$remove <- NULL
nh$AvgPrice = round(nh$AvgPrice/100000,2)

nh[order(nh$count),]
##         Neighborhood AvgPrice count
## Blueste      Blueste     1.38     2
## NPkVill      NPkVill     1.43     9
## Veenker      Veenker     2.39    11
## BrDale        BrDale     1.04    16
## Blmngtn      Blmngtn     1.95    17
## MeadowV      MeadowV     0.99    17
## StoneBr      StoneBr     3.10    25
## SWISU          SWISU     1.43    25
## ClearCr      ClearCr     2.13    28
## IDOTRR        IDOTRR     1.00    37
## Timber        Timber     2.42    38
## NoRidge      NoRidge     3.35    41
## Mitchel      Mitchel     1.56    49
## Crawfor      Crawfor     2.11    51
## BrkSide      BrkSide     1.25    58
## SawyerW      SawyerW     1.87    59
## NWAmes        NWAmes     1.89    73
## Sawyer        Sawyer     1.37    74
## NridgHt      NridgHt     3.16    77
## Gilbert      Gilbert     1.93    79
## Somerst      Somerst     2.25    86
## Edwards      Edwards     1.28   100
## OldTown      OldTown     1.28   113
## CollgCr      CollgCr     1.98   150
## NAmes          NAmes     1.46   225

Because there is some good information here, but it is fairly granular for our sample size, I’m going to add a column that is the average price of homes in that neighborhood. This has it’s own warts, as I’m giving a neighborhood with 2 houses an average and not treating it differently than a neighborhood with 100 houses. I will also add the neighborhood count and an interaction between the two, to hopefully help the model sort out the data here. This may lead to overfitting, but I think it will be better than 25 dummy variables.

for (nhval in nh$Neighborhood ){
    print(nhval)
    df$NBCount[df$Neighborhood==nhval] = nh[nhval,'count']
    df$NBPrice[df$Neighborhood==nhval] = nh[nhval,'AvgPrice']
}
## [1] "Blmngtn"
## [1] "Blueste"
## [1] "BrDale"
## [1] "BrkSide"
## [1] "ClearCr"
## [1] "CollgCr"
## [1] "Crawfor"
## [1] "Edwards"
## [1] "Gilbert"
## [1] "IDOTRR"
## [1] "MeadowV"
## [1] "Mitchel"
## [1] "NAmes"
## [1] "NoRidge"
## [1] "NPkVill"
## [1] "NridgHt"
## [1] "NWAmes"
## [1] "OldTown"
## [1] "Sawyer"
## [1] "SawyerW"
## [1] "Somerst"
## [1] "StoneBr"
## [1] "SWISU"
## [1] "Timber"
## [1] "Veenker"
summ = getDFSummary(df,'SalePrice')
summ[order(-abs(summ$'R2-SalePrice'),-summ$'Factor Levels'),]
##            Column Data Type Factor Levels Missing Values        Mean    Median
## 81      SalePrice   integer             0              0 1.80921e+05 163000.00
## 18    OverallQual   integer             0              0 6.09900e+00      6.00
## 84        NBPrice   numeric             0              0 1.80900e+00      1.87
## 13   Neighborhood    factor            25              0          NA        NA
## 47      GrLivArea   integer             0              0 1.51500e+03   1464.00
## 28      ExterQual    factor             4              0          NA        NA
## 31       BsmtQual    factor             5              0          NA        NA
## 54    KitchenQual    factor             4              0          NA        NA
## 62     GarageCars   integer             0              0 1.76700e+00      2.00
## 63     GarageArea   integer             0              0 4.73000e+02    480.00
## 39    TotalBsmtSF   integer             0              0 1.05700e+03    991.50
## 44      X1stFlrSF   integer             0              0 1.16300e+03   1087.00
## 50       FullBath   integer             0              0 1.56500e+00      2.00
## 61   GarageFinish    factor             4              0          NA        NA
## 58    FireplaceQu    factor             6              0          NA        NA
## 55   TotRmsAbvGrd   integer             0              0 6.51800e+00      6.00
## 20      YearBuilt   integer             0              0 1.97100e+03   1973.00
## 60    GarageYrBlt   integer             0              0 1.97700e+03   1978.00
## 21   YearRemodAdd   integer             0              0 1.98500e+03   1994.00
## 30     Foundation    factor             6              0          NA        NA
## 59     GarageType    factor             7              0          NA        NA
## 27     MasVnrArea   numeric             0              0 1.03100e+02      0.00
## 57     Fireplaces   integer             0              0 6.13000e-01      1.00
## 34   BsmtFinType1    factor             7              0          NA        NA
## 41      HeatingQC    factor             5              0          NA        NA
## 26     MasVnrType    factor             5              0          NA        NA
## 35     BsmtFinSF1   integer             0              0 4.43600e+02    383.50
## 33   BsmtExposure    factor             5              0          NA        NA
## 25    Exterior2nd    factor            16              0          NA        NA
## 24    Exterior1st    factor            15              0          NA        NA
## 79       SaleType    factor             9              0          NA        NA
## 80  SaleCondition    factor             6              0          NA        NA
## 4     LotFrontage   numeric             0              0 7.00500e+01     70.05
## 3        MSZoning    factor             5              0          NA        NA
## 67     WoodDeckSF   integer             0              0 9.42400e+01      0.00
## 45      X2ndFlrSF   integer             0              0 3.47000e+02      0.00
## 68    OpenPorchSF   integer             0              0 4.66600e+01     25.00
## 17     HouseStyle    factor             8              0          NA        NA
## 51       HalfBath   integer             0              0 3.82900e-01      0.00
## 64     GarageQual    factor             6              0          NA        NA
## 65     GarageCond    factor             6              0          NA        NA
## 8        LotShape    factor             4              0          NA        NA
## 5         LotArea   integer             0              0 1.05170e+04   9478.00
## 42     CentralAir    factor             2              0          NA        NA
## 43     Electrical    factor             6              0          NA        NA
## 22      RoofStyle    factor             6              0          NA        NA
## 66     PavedDrive    factor             3              0          NA        NA
## 48   BsmtFullBath   integer             0              0 4.25300e-01      0.00
## 32       BsmtCond    factor             5              0          NA        NA
## 38      BsmtUnfSF   integer             0              0 5.67200e+02    477.50
## 83        NBCount   integer             0              0 9.97800e+01     79.00
## 74          Fence    factor             5              0          NA        NA
## 16       BldgType    factor             5              0          NA        NA
## 52   BedroomAbvGr   integer             0              0 2.86600e+00      3.00
## 14     Condition1    factor             9              0          NA        NA
## 23       RoofMatl    factor             8              0          NA        NA
## 36   BsmtFinType2    factor             7              0          NA        NA
## 9     LandContour    factor             4              0          NA        NA
## 29      ExterCond    factor             5              0          NA        NA
## 73         PoolQC    factor             4              0          NA        NA
## 7           Alley    factor             3              0          NA        NA
## 11      LotConfig    factor             5              0          NA        NA
## 53   KitchenAbvGr   integer             0              0 1.04700e+00      1.00
## 69  EnclosedPorch   integer             0              0 2.19500e+01      0.00
## 56     Functional    factor             7              0          NA        NA
## 71    ScreenPorch   integer             0              0 1.50600e+01      0.00
## 40        Heating    factor             6              0          NA        NA
## 72       PoolArea   integer             0              0 2.75900e+00      0.00
## 2      MSSubClass   integer             0              0 5.69000e+01     50.00
## 19    OverallCond   integer             0              0 5.57500e+00      5.00
## 15     Condition2    factor             8              0          NA        NA
## 75    MiscFeature    factor             5              0          NA        NA
## 77         MoSold   integer             0              0 6.32200e+00      6.00
## 12      LandSlope    factor             3              0          NA        NA
## 70     X3SsnPorch   integer             0              0 3.41000e+00      0.00
## 6          Street    factor             2              0          NA        NA
## 82 LotFrontage_NA   numeric             0              0 1.77400e-01      0.00
## 37     BsmtFinSF2   integer             0              0 4.65500e+01      0.00
## 10      Utilities    factor             2              0          NA        NA
## 49   BsmtHalfBath   integer             0              0 5.75300e-02      0.00
## 76        MiscVal   integer             0              0 4.34900e+01      0.00
## 1              Id   integer             0              0 7.30500e+02    730.50
## 78         YrSold   integer             0              0 2.00800e+03   2008.00
## 46   LowQualFinSF   integer             0              0 5.84500e+00      0.00
##            SD      Min       Max R2-SalePrice
## 81 79443.0000 34900.00 755000.00    1.000e+00
## 18     1.3830     1.00     10.00    6.254e-01
## 84     0.5860     0.99      3.35    5.453e-01
## 13         NA       NA        NA   -5.380e-01
## 47   525.5000   334.00   5642.00    5.018e-01
## 28         NA       NA        NA   -4.763e-01
## 31         NA       NA        NA   -4.635e-01
## 54         NA       NA        NA   -4.555e-01
## 62     0.7473     0.00      4.00    4.097e-01
## 63   213.8000     0.00   1418.00    3.882e-01
## 39   438.7000     0.00   6110.00    3.761e-01
## 44   386.6000   334.00   4692.00    3.666e-01
## 50     0.5509     0.00      3.00    3.139e-01
## 61         NA       NA        NA   -3.044e-01
## 58         NA       NA        NA   -2.915e-01
## 55     1.6250     2.00     14.00    2.844e-01
## 20    30.2000  1872.00   2010.00    2.729e-01
## 60    26.3100  1872.00   2010.00    2.576e-01
## 21    20.6500  1950.00   2010.00    2.566e-01
## 30         NA       NA        NA    2.538e-01
## 59         NA       NA        NA    2.461e-01
## 27   180.7000     0.00   1600.00    2.228e-01
## 57     0.6447     0.00      3.00    2.175e-01
## 34         NA       NA        NA   -2.076e-01
## 41         NA       NA        NA   -1.933e-01
## 26         NA       NA        NA    1.866e-01
## 35   456.1000     0.00   5644.00    1.487e-01
## 33         NA       NA        NA   -1.472e-01
## 25         NA       NA        NA    1.450e-01
## 24         NA       NA        NA    1.446e-01
## 79         NA       NA        NA    1.325e-01
## 80         NA       NA        NA    1.325e-01
## 4     22.0200    21.00    313.00    1.115e-01
## 3          NA       NA        NA    1.051e-01
## 67   125.3000     0.00    857.00    1.046e-01
## 45   436.5000     0.00   2065.00    1.014e-01
## 68    66.2600     0.00    547.00    9.915e-02
## 17         NA       NA        NA    8.191e-02
## 51     0.5029     0.00      2.00    8.009e-02
## 64         NA       NA        NA   -7.826e-02
## 65         NA       NA        NA    7.819e-02
## 8          NA       NA        NA    7.447e-02
## 5   9981.0000  1300.00 215245.00    6.898e-02
## 42         NA       NA        NA    6.252e-02
## 43         NA       NA        NA    5.646e-02
## 22         NA       NA        NA   -5.446e-02
## 66         NA       NA        NA    5.324e-02
## 48     0.5189     0.00      3.00    5.093e-02
## 32         NA       NA        NA    4.879e-02
## 38   441.9000     0.00   2336.00    4.535e-02
## 83    63.8400     2.00    225.00   -3.784e-02
## 74         NA       NA        NA   -3.296e-02
## 16         NA       NA        NA   -3.188e-02
## 52     0.8158     0.00      8.00    2.763e-02
## 14         NA       NA        NA    2.730e-02
## 23         NA       NA        NA    2.674e-02
## 36         NA       NA        NA   -2.629e-02
## 9          NA       NA        NA    2.379e-02
## 29         NA       NA        NA   -2.093e-02
## 73         NA       NA        NA   -1.918e-02
## 7          NA       NA        NA    1.906e-02
## 11         NA       NA        NA    1.833e-02
## 53     0.2203     0.00      3.00   -1.780e-02
## 69    61.1200     0.00    552.00   -1.586e-02
## 56         NA       NA        NA   -1.242e-02
## 71    55.7600     0.00    480.00    1.174e-02
## 40         NA       NA        NA    1.105e-02
## 72    40.1800     0.00    738.00    7.858e-03
## 2     42.3000    20.00    190.00   -6.423e-03
## 19     1.1130     1.00      9.00   -5.380e-03
## 15         NA       NA        NA    5.126e-03
## 75         NA       NA        NA   -4.350e-03
## 77     2.7040     1.00     12.00    1.472e-03
## 12         NA       NA        NA    1.313e-03
## 70    29.3200     0.00    508.00    1.303e-03
## 6          NA       NA        NA    9.992e-04
## 82     0.3821     0.00      1.00   -6.692e-04
## 37   161.3000     0.00   1474.00    5.563e-04
## 10         NA       NA        NA    4.808e-04
## 49     0.2388     0.00      2.00    4.020e-04
## 76   496.1000     0.00  15500.00    2.366e-04
## 1    421.6000     1.00   1460.00    2.052e-04
## 78     1.3280  2006.00   2010.00   -1.512e-04
## 46    48.6200     0.00    572.00    2.975e-05

OK, so NBPrice is showing as a similar score to Neighborhood. This is expected since we basically took that model and jammed it into 1 variable. I’m going to take the most correlated variables (I put my arbitrary cutoff at 0.05). I added interactions for the most correlated temrs as well as some other I thought could be helpful.

model = lm(SalePrice ~ (OverallQual + NBPrice + GrLivArea + GarageCars + GarageArea + TotalBsmtSF + X1stFlrSF + FullBath)^2 +
  NBPrice*NBCount +   + ExterQual + BsmtQual + KitchenQual + GarageCars + GarageArea + TotalBsmtSF + X1stFlrSF +
  FullBath + GarageFinish + FireplaceQu + TotRmsAbvGrd +  (YearBuilt*GarageYrBlt*YearRemodAdd) + Foundation +
  GarageType + MasVnrArea + Fireplaces + BsmtFinType1 +  HeatingQC + MasVnrType + BsmtFinSF1 + BsmtExposure +
  Exterior2nd + Exterior1st + SaleType + SaleCondition + LotFrontage*LotFrontage_NA + MSZoning + WoodDeckSF +
  X2ndFlrSF + OpenPorchSF + HouseStyle + HalfBath +  GarageQual + GarageCond + LotShape + LotArea + CentralAir +
  Electrical + RoofStyle + PavedDrive + BsmtFullBath , data = df)
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ (OverallQual + NBPrice + GrLivArea + 
##     GarageCars + GarageArea + TotalBsmtSF + X1stFlrSF + FullBath)^2 + 
##     NBPrice * NBCount + +ExterQual + BsmtQual + KitchenQual + 
##     GarageCars + GarageArea + TotalBsmtSF + X1stFlrSF + FullBath + 
##     GarageFinish + FireplaceQu + TotRmsAbvGrd + (YearBuilt * 
##     GarageYrBlt * YearRemodAdd) + Foundation + GarageType + MasVnrArea + 
##     Fireplaces + BsmtFinType1 + HeatingQC + MasVnrType + BsmtFinSF1 + 
##     BsmtExposure + Exterior2nd + Exterior1st + SaleType + SaleCondition + 
##     LotFrontage * LotFrontage_NA + MSZoning + WoodDeckSF + X2ndFlrSF + 
##     OpenPorchSF + HouseStyle + HalfBath + GarageQual + GarageCond + 
##     LotShape + LotArea + CentralAir + Electrical + RoofStyle + 
##     PavedDrive + BsmtFullBath, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172792  -10478      85   10581  154665 
## 
## Coefficients: (6 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -9.292e+08  5.230e+08  -1.776 0.075888 .  
## OverallQual                        -2.297e+03  3.299e+03  -0.696 0.486298    
## NBPrice                            -5.743e+04  1.016e+04  -5.654 1.93e-08 ***
## GrLivArea                          -6.329e+01  2.056e+01  -3.078 0.002129 ** 
## GarageCars                          1.998e+03  9.372e+03   0.213 0.831238    
## GarageArea                         -1.476e+01  3.086e+01  -0.478 0.632582    
## TotalBsmtSF                        -6.627e-01  1.764e+01  -0.038 0.970044    
## X1stFlrSF                           1.711e+01  2.674e+01   0.640 0.522292    
## FullBath                           -4.140e+02  9.202e+03  -0.045 0.964118    
## NBCount                            -1.384e+02  6.256e+01  -2.212 0.027110 *  
## ExterQualFa                        -1.004e+04  9.533e+03  -1.053 0.292647    
## ExterQualGd                        -6.997e+03  4.656e+03  -1.503 0.133104    
## ExterQualTA                        -8.432e+03  5.163e+03  -1.633 0.102673    
## BsmtQualFa                         -1.341e+04  6.016e+03  -2.228 0.026035 *  
## BsmtQualGd                         -8.094e+03  3.273e+03  -2.473 0.013535 *  
## BsmtQualTA                         -1.020e+04  3.979e+03  -2.562 0.010516 *  
## BsmtQualNA                         -7.381e+03  2.506e+04  -0.295 0.768393    
## KitchenQualFa                      -1.730e+04  5.788e+03  -2.989 0.002849 ** 
## KitchenQualGd                      -1.663e+04  3.439e+03  -4.837 1.48e-06 ***
## KitchenQualTA                      -1.977e+04  3.829e+03  -5.163 2.82e-07 ***
## GarageFinishRFn                    -2.050e+03  1.872e+03  -1.095 0.273795    
## GarageFinishUnf                    -2.161e+03  2.294e+03  -0.942 0.346235    
## GarageFinishNA                     -1.290e+03  2.101e+04  -0.061 0.951052    
## FireplaceQuFa                      -2.295e+02  6.807e+03  -0.034 0.973109    
## FireplaceQuGd                       3.766e+03  5.324e+03   0.707 0.479459    
## FireplaceQuPo                      -3.051e+03  7.598e+03  -0.402 0.688036    
## FireplaceQuTA                       1.426e+03  5.540e+03   0.257 0.796928    
## FireplaceQuNA                       6.124e+03  6.171e+03   0.992 0.321180    
## TotRmsAbvGrd                        2.814e+02  7.919e+02   0.355 0.722361    
## YearBuilt                           4.812e+05  2.707e+05   1.778 0.075687 .  
## GarageYrBlt                         4.734e+05  2.660e+05   1.780 0.075354 .  
## YearRemodAdd                        4.653e+05  2.623e+05   1.774 0.076373 .  
## FoundationCBlock                    5.479e+03  3.049e+03   1.797 0.072630 .  
## FoundationPConc                     5.147e+03  3.316e+03   1.552 0.120878    
## FoundationSlab                     -4.096e+02  9.284e+03  -0.044 0.964818    
## FoundationStone                     1.459e+04  1.078e+04   1.353 0.176158    
## FoundationWood                     -2.544e+04  1.411e+04  -1.804 0.071501 .  
## GarageTypeAttchd                    1.917e+04  1.101e+04   1.741 0.081891 .  
## GarageTypeBasment                   1.360e+04  1.249e+04   1.089 0.276567    
## GarageTypeBuiltIn                   1.631e+04  1.149e+04   1.420 0.155927    
## GarageTypeCarPort                   6.530e+03  1.402e+04   0.466 0.641337    
## GarageTypeDetchd                    2.043e+04  1.095e+04   1.865 0.062429 .  
## GarageTypeNA                               NA         NA      NA       NA    
## MasVnrArea                         -1.097e+00  5.526e+00  -0.199 0.842652    
## Fireplaces                          7.507e+03  2.474e+03   3.034 0.002459 ** 
## BsmtFinType1BLQ                    -2.383e+03  2.620e+03  -0.909 0.363328    
## BsmtFinType1GLQ                     1.995e+02  2.356e+03   0.085 0.932541    
## BsmtFinType1LwQ                    -4.119e+03  3.359e+03  -1.226 0.220376    
## BsmtFinType1Rec                    -3.209e+03  2.797e+03  -1.147 0.251447    
## BsmtFinType1Unf                     1.833e+02  2.674e+03   0.069 0.945358    
## BsmtFinType1NA                             NA         NA      NA       NA    
## HeatingQCFa                        -5.724e+03  4.078e+03  -1.404 0.160655    
## HeatingQCGd                        -2.911e+03  1.964e+03  -1.482 0.138640    
## HeatingQCPo                        -2.454e+04  2.447e+04  -1.003 0.316068    
## HeatingQCTA                        -4.567e+03  1.949e+03  -2.343 0.019268 *  
## MasVnrTypeBrkFace                   4.727e+03  6.390e+03   0.740 0.459604    
## MasVnrTypeNone                      7.392e+03  6.457e+03   1.145 0.252512    
## MasVnrTypeStone                     1.056e+04  6.838e+03   1.544 0.122782    
## MasVnrTypeNA                       -1.954e+03  1.057e+04  -0.185 0.853388    
## BsmtFinSF1                          1.996e+01  2.704e+00   7.383 2.78e-13 ***
## BsmtExposureGd                      1.232e+04  2.836e+03   4.345 1.50e-05 ***
## BsmtExposureMn                      7.181e+01  2.881e+03   0.025 0.980119    
## BsmtExposureNo                     -3.774e+03  2.052e+03  -1.839 0.066136 .  
## BsmtExposureNA                     -1.034e+04  2.272e+04  -0.455 0.649195    
## Exterior2ndAsphShn                  1.191e+04  2.105e+04   0.566 0.571528    
## Exterior2ndBrk Cmn                  7.503e+03  1.584e+04   0.474 0.635886    
## Exterior2ndBrkFace                 -7.348e+03  1.257e+04  -0.584 0.559041    
## Exterior2ndCBlock                  -2.121e+04  2.533e+04  -0.837 0.402481    
## Exterior2ndCmentBd                 -3.469e+02  1.779e+04  -0.020 0.984445    
## Exterior2ndHdBoard                  6.843e+03  1.167e+04   0.586 0.557888    
## Exterior2ndImStucc                  1.195e+04  1.349e+04   0.886 0.375955    
## Exterior2ndMetalSd                  1.329e+04  1.344e+04   0.989 0.322894    
## Exterior2ndOther                    1.555e+03  2.618e+04   0.059 0.952651    
## Exterior2ndPlywood                  6.388e+03  1.134e+04   0.563 0.573363    
## Exterior2ndStone                   -4.573e+03  1.629e+04  -0.281 0.778974    
## Exterior2ndStucco                   9.915e+03  1.280e+04   0.775 0.438730    
## Exterior2ndVinylSd                  1.042e+04  1.225e+04   0.851 0.395136    
## Exterior2ndWd Sdng                  1.024e+04  1.125e+04   0.910 0.363198    
## Exterior2ndWd Shng                  6.488e+03  1.177e+04   0.551 0.581558    
## Exterior1stAsphShn                 -4.715e+03  3.527e+04  -0.134 0.893665    
## Exterior1stBrkComm                 -3.578e+04  2.350e+04  -1.523 0.128061    
## Exterior1stBrkFace                  1.296e+04  1.196e+04   1.084 0.278574    
## Exterior1stCBlock                          NA         NA      NA       NA    
## Exterior1stCemntBd                 -7.846e+03  1.794e+04  -0.437 0.661964    
## Exterior1stHdBoard                 -1.205e+04  1.203e+04  -1.001 0.316819    
## Exterior1stImStucc                 -1.931e+04  2.728e+04  -0.708 0.479080    
## Exterior1stMetalSd                 -1.675e+04  1.371e+04  -1.222 0.221770    
## Exterior1stPlywood                 -1.150e+04  1.185e+04  -0.970 0.332064    
## Exterior1stStone                    7.728e+03  2.260e+04   0.342 0.732476    
## Exterior1stStucco                  -6.517e+03  1.342e+04  -0.485 0.627470    
## Exterior1stVinylSd                 -1.540e+04  1.256e+04  -1.226 0.220286    
## Exterior1stWd Sdng                 -1.513e+04  1.158e+04  -1.307 0.191528    
## Exterior1stWdShing                 -1.114e+04  1.249e+04  -0.892 0.372610    
## SaleTypeCon                         2.180e+04  1.699e+04   1.283 0.199814    
## SaleTypeConLD                       9.245e+03  9.267e+03   0.998 0.318602    
## SaleTypeConLI                      -5.298e+03  1.115e+04  -0.475 0.634910    
## SaleTypeConLw                       2.030e+03  1.146e+04   0.177 0.859389    
## SaleTypeCWD                         1.926e+04  1.253e+04   1.538 0.124341    
## SaleTypeNew                         4.201e+04  1.454e+04   2.890 0.003915 ** 
## SaleTypeOth                         7.498e+03  1.418e+04   0.529 0.597133    
## SaleTypeWD                         -1.137e+03  3.994e+03  -0.285 0.775900    
## SaleConditionAdjLand                5.036e+03  1.369e+04   0.368 0.713097    
## SaleConditionAlloca                 9.425e+03  8.170e+03   1.154 0.248873    
## SaleConditionFamily                 1.038e+03  5.948e+03   0.175 0.861482    
## SaleConditionNormal                 9.182e+03  2.763e+03   3.324 0.000913 ***
## SaleConditionPartial               -2.101e+04  1.397e+04  -1.504 0.132716    
## LotFrontage                         5.464e+01  3.705e+01   1.475 0.140523    
## LotFrontage_NA                      8.266e+02  1.795e+03   0.461 0.645143    
## MSZoningFV                          4.982e+04  9.533e+03   5.226 2.02e-07 ***
## MSZoningRH                          4.426e+04  1.052e+04   4.205 2.79e-05 ***
## MSZoningRL                          3.849e+04  8.819e+03   4.364 1.38e-05 ***
## MSZoningRM                          2.926e+04  8.633e+03   3.389 0.000722 ***
## WoodDeckSF                          8.990e+00  5.488e+00   1.638 0.101615    
## X2ndFlrSF                           3.438e+01  1.699e+01   2.024 0.043200 *  
## OpenPorchSF                         2.541e+01  1.069e+01   2.377 0.017598 *  
## HouseStyle1.5Unf                    5.224e+03  7.808e+03   0.669 0.503593    
## HouseStyle1Story                    1.840e+03  4.097e+03   0.449 0.653505    
## HouseStyle2.5Fin                    8.452e+03  1.165e+04   0.725 0.468297    
## HouseStyle2.5Unf                   -2.885e+03  8.168e+03  -0.353 0.724008    
## HouseStyle2Story                   -3.465e+03  3.112e+03  -1.114 0.265696    
## HouseStyleSFoyer                   -6.561e+03  5.878e+03  -1.116 0.264552    
## HouseStyleSLvl                      4.703e+03  4.874e+03   0.965 0.334769    
## HalfBath                            4.443e+03  1.997e+03   2.224 0.026290 *  
## GarageQualFa                       -1.177e+05  3.111e+04  -3.783 0.000162 ***
## GarageQualGd                       -1.083e+05  3.144e+04  -3.446 0.000588 ***
## GarageQualPo                       -1.001e+05  3.803e+04  -2.632 0.008587 ** 
## GarageQualTA                       -1.154e+05  3.086e+04  -3.738 0.000193 ***
## GarageQualNA                               NA         NA      NA       NA    
## GarageCondFa                        1.064e+05  3.598e+04   2.958 0.003150 ** 
## GarageCondGd                        1.126e+05  3.673e+04   3.065 0.002222 ** 
## GarageCondPo                        1.063e+05  3.761e+04   2.826 0.004790 ** 
## GarageCondTA                        1.116e+05  3.576e+04   3.120 0.001848 ** 
## GarageCondNA                               NA         NA      NA       NA    
## LotShapeIR2                         6.049e+03  4.039e+03   1.498 0.134447    
## LotShapeIR3                         3.853e+03  8.376e+03   0.460 0.645556    
## LotShapeReg                        -2.330e+03  1.502e+03  -1.552 0.121014    
## LotArea                             3.233e-01  7.723e-02   4.186 3.03e-05 ***
## CentralAirY                         5.959e+03  3.562e+03   1.673 0.094570 .  
## ElectricalFuseF                    -2.417e+03  5.633e+03  -0.429 0.667899    
## ElectricalFuseP                    -1.660e+04  1.781e+04  -0.932 0.351382    
## ElectricalMix                      -3.035e+04  3.080e+04  -0.985 0.324612    
## ElectricalSBrkr                     5.433e+02  2.815e+03   0.193 0.846953    
## ElectricalNA                        2.672e+04  2.392e+04   1.117 0.264183    
## RoofStyleGable                     -9.631e+03  7.424e+03  -1.297 0.194765    
## RoofStyleGambrel                   -1.113e+03  1.055e+04  -0.106 0.915948    
## RoofStyleHip                       -9.617e+03  7.607e+03  -1.264 0.206389    
## RoofStyleMansard                   -2.574e+03  1.208e+04  -0.213 0.831279    
## RoofStyleShed                       4.805e+03  1.833e+04   0.262 0.793302    
## PavedDriveP                         8.301e+02  5.311e+03   0.156 0.875827    
## PavedDriveY                         1.916e+03  3.302e+03   0.580 0.561795    
## BsmtFullBath                        6.806e+02  1.712e+03   0.398 0.691058    
## OverallQual:NBPrice                -2.409e+03  1.591e+03  -1.514 0.130361    
## OverallQual:GrLivArea               5.091e+00  2.242e+00   2.271 0.023300 *  
## OverallQual:GarageCars             -4.672e+02  2.225e+03  -0.210 0.833682    
## OverallQual:GarageArea              1.041e+01  7.619e+00   1.367 0.171886    
## OverallQual:TotalBsmtSF             2.808e-01  3.226e+00   0.087 0.930637    
## OverallQual:X1stFlrSF               6.381e+00  3.883e+00   1.643 0.100536    
## OverallQual:FullBath               -1.971e+03  1.851e+03  -1.065 0.287147    
## NBPrice:GrLivArea                   3.822e+01  4.074e+00   9.382  < 2e-16 ***
## NBPrice:GarageCars                 -5.098e+03  4.616e+03  -1.105 0.269568    
## NBPrice:GarageArea                 -2.873e+01  1.547e+01  -1.857 0.063533 .  
## NBPrice:TotalBsmtSF                 1.673e+01  8.435e+00   1.984 0.047518 *  
## NBPrice:X1stFlrSF                   1.784e+01  9.518e+00   1.875 0.061066 .  
## NBPrice:FullBath                    6.233e+02  4.143e+03   0.150 0.880427    
## GrLivArea:GarageCars               -4.450e+00  5.340e+00  -0.833 0.404858    
## GrLivArea:GarageArea                2.354e-02  1.913e-02   1.231 0.218669    
## GrLivArea:TotalBsmtSF              -5.388e-03  8.360e-03  -0.644 0.519367    
## GrLivArea:X1stFlrSF                -3.662e-02  8.362e-03  -4.379 1.29e-05 ***
## GrLivArea:FullBath                  1.306e+01  4.102e+00   3.184 0.001487 ** 
## GarageCars:GarageArea               1.067e+01  6.560e+00   1.626 0.104173    
## GarageCars:TotalBsmtSF             -1.054e+01  8.712e+00  -1.210 0.226433    
## GarageCars:X1stFlrSF                2.450e+01  1.056e+01   2.319 0.020548 *  
## GarageCars:FullBath                -4.587e+02  4.971e+03  -0.092 0.926497    
## GarageArea:TotalBsmtSF              3.409e-02  2.837e-02   1.201 0.229785    
## GarageArea:X1stFlrSF               -7.324e-02  3.508e-02  -2.088 0.036995 *  
## GarageArea:FullBath                -3.798e-01  1.823e+01  -0.021 0.983380    
## TotalBsmtSF:X1stFlrSF              -2.051e-03  6.305e-03  -0.325 0.745001    
## TotalBsmtSF:FullBath               -3.827e+00  7.222e+00  -0.530 0.596256    
## X1stFlrSF:FullBath                 -4.151e-01  8.566e+00  -0.048 0.961360    
## NBPrice:NBCount                     8.792e+01  3.918e+01   2.244 0.025012 *  
## YearBuilt:GarageYrBlt              -2.453e+02  1.376e+02  -1.782 0.075004 .  
## YearBuilt:YearRemodAdd             -2.409e+02  1.358e+02  -1.774 0.076222 .  
## GarageYrBlt:YearRemodAdd           -2.370e+02  1.334e+02  -1.777 0.075871 .  
## LotFrontage:LotFrontage_NA                 NA         NA      NA       NA    
## YearBuilt:GarageYrBlt:YearRemodAdd  1.228e-01  6.903e-02   1.779 0.075529 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22420 on 1281 degrees of freedom
## Multiple R-squared:  0.9301, Adjusted R-squared:  0.9203 
## F-statistic:  95.7 on 178 and 1281 DF,  p-value: < 2.2e-16

Going to take a shortcut to feature selection by using the “step” function to trim down based on AIC

stepmodel = step(model, direction = 'backward')
summary(stepmodel)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + NBPrice + GrLivArea + 
##     GarageCars + GarageArea + TotalBsmtSF + X1stFlrSF + FullBath + 
##     NBCount + BsmtQual + KitchenQual + YearBuilt + GarageYrBlt + 
##     YearRemodAdd + Foundation + Fireplaces + MasVnrType + BsmtFinSF1 + 
##     BsmtExposure + Exterior1st + SaleCondition + LotFrontage + 
##     MSZoning + WoodDeckSF + X2ndFlrSF + OpenPorchSF + HalfBath + 
##     GarageQual + GarageCond + LotShape + LotArea + CentralAir + 
##     OverallQual:NBPrice + OverallQual:GrLivArea + OverallQual:GarageArea + 
##     OverallQual:X1stFlrSF + NBPrice:GrLivArea + NBPrice:GarageCars + 
##     NBPrice:GarageArea + NBPrice:TotalBsmtSF + NBPrice:X1stFlrSF + 
##     GrLivArea:X1stFlrSF + GrLivArea:FullBath + GarageCars:GarageArea + 
##     GarageCars:TotalBsmtSF + GarageCars:X1stFlrSF + GarageArea:TotalBsmtSF + 
##     GarageArea:X1stFlrSF + TotalBsmtSF:X1stFlrSF + TotalBsmtSF:FullBath + 
##     NBPrice:NBCount + YearBuilt:GarageYrBlt + YearBuilt:YearRemodAdd + 
##     GarageYrBlt:YearRemodAdd + YearBuilt:GarageYrBlt:YearRemodAdd, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -187801  -10829      63   10993  153705 
## 
## Coefficients: (1 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -1.141e+09  4.808e+08  -2.372 0.017826 *  
## OverallQual                        -3.853e+03  2.992e+03  -1.288 0.197997    
## NBPrice                            -5.101e+04  9.222e+03  -5.532 3.80e-08 ***
## GrLivArea                          -5.250e+01  1.513e+01  -3.471 0.000536 ***
## GarageCars                         -1.324e+03  7.594e+03  -0.174 0.861603    
## GarageArea                         -7.654e+00  2.579e+01  -0.297 0.766624    
## TotalBsmtSF                         5.213e+00  1.319e+01   0.395 0.692745    
## X1stFlrSF                           8.461e+00  2.021e+01   0.419 0.675481    
## FullBath                           -6.764e+03  5.896e+03  -1.147 0.251516    
## NBCount                            -1.274e+02  5.890e+01  -2.162 0.030754 *  
## BsmtQualFa                         -1.563e+04  5.655e+03  -2.765 0.005773 ** 
## BsmtQualGd                         -1.098e+04  3.056e+03  -3.595 0.000336 ***
## BsmtQualTA                         -1.210e+04  3.782e+03  -3.200 0.001406 ** 
## BsmtQualNA                         -5.907e+03  2.427e+04  -0.243 0.807711    
## KitchenQualFa                      -1.911e+04  5.366e+03  -3.560 0.000383 ***
## KitchenQualGd                      -1.609e+04  3.128e+03  -5.143 3.10e-07 ***
## KitchenQualTA                      -2.025e+04  3.529e+03  -5.738 1.18e-08 ***
## YearBuilt                           5.912e+05  2.488e+05   2.376 0.017640 *  
## GarageYrBlt                         5.826e+05  2.444e+05   2.384 0.017261 *  
## YearRemodAdd                        5.714e+05  2.411e+05   2.370 0.017934 *  
## FoundationCBlock                    2.948e+03  2.878e+03   1.024 0.305865    
## FoundationPConc                     4.288e+03  3.184e+03   1.347 0.178336    
## FoundationSlab                     -1.815e+03  8.729e+03  -0.208 0.835365    
## FoundationStone                     1.633e+04  9.956e+03   1.640 0.101163    
## FoundationWood                     -2.695e+04  1.364e+04  -1.976 0.048317 *  
## Fireplaces                          4.934e+03  1.198e+03   4.117 4.07e-05 ***
## MasVnrTypeBrkFace                   8.450e+03  6.109e+03   1.383 0.166835    
## MasVnrTypeNone                      1.102e+04  6.036e+03   1.826 0.068053 .  
## MasVnrTypeStone                     1.317e+04  6.522e+03   2.019 0.043720 *  
## MasVnrTypeNA                        1.346e+03  1.011e+04   0.133 0.894118    
## BsmtFinSF1                          2.033e+01  1.766e+00  11.506  < 2e-16 ***
## BsmtExposureGd                      1.284e+04  2.687e+03   4.777 1.97e-06 ***
## BsmtExposureMn                      1.135e+02  2.714e+03   0.042 0.966638    
## BsmtExposureNo                     -3.517e+03  1.852e+03  -1.900 0.057693 .  
## BsmtExposureNA                     -1.345e+04  2.254e+04  -0.597 0.550731    
## Exterior1stAsphShn                  7.363e+03  2.727e+04   0.270 0.787215    
## Exterior1stBrkComm                 -2.184e+04  1.737e+04  -1.257 0.208959    
## Exterior1stBrkFace                  1.646e+04  6.476e+03   2.541 0.011158 *  
## Exterior1stCBlock                  -1.661e+04  2.339e+04  -0.710 0.477879    
## Exterior1stCemntBd                 -5.065e+03  6.383e+03  -0.794 0.427602    
## Exterior1stHdBoard                 -3.549e+03  5.855e+03  -0.606 0.544513    
## Exterior1stImStucc                 -9.189e+03  2.365e+04  -0.388 0.697732    
## Exterior1stMetalSd                 -1.417e+03  5.680e+03  -0.249 0.803045    
## Exterior1stPlywood                 -2.712e+03  6.173e+03  -0.439 0.660501    
## Exterior1stStone                    9.223e+03  1.806e+04   0.511 0.609674    
## Exterior1stStucco                   3.599e+03  7.271e+03   0.495 0.620697    
## Exterior1stVinylSd                 -2.975e+03  5.755e+03  -0.517 0.605286    
## Exterior1stWd Sdng                 -3.510e+03  5.674e+03  -0.619 0.536208    
## Exterior1stWdShing                 -2.319e+03  7.065e+03  -0.328 0.742759    
## SaleConditionAdjLand                4.237e+03  1.202e+04   0.352 0.724565    
## SaleConditionAlloca                 5.678e+03  7.518e+03   0.755 0.450216    
## SaleConditionFamily                 2.857e+02  5.680e+03   0.050 0.959888    
## SaleConditionNormal                 8.625e+03  2.482e+03   3.476 0.000526 ***
## SaleConditionPartial                1.993e+04  3.646e+03   5.467 5.45e-08 ***
## LotFrontage                         6.409e+01  3.486e+01   1.839 0.066192 .  
## MSZoningFV                          5.100e+04  8.796e+03   5.798 8.34e-09 ***
## MSZoningRH                          4.079e+04  9.784e+03   4.169 3.25e-05 ***
## MSZoningRL                          3.918e+04  8.136e+03   4.815 1.63e-06 ***
## MSZoningRM                          2.946e+04  8.041e+03   3.664 0.000258 ***
## WoodDeckSF                          1.000e+01  5.314e+00   1.882 0.060093 .  
## X2ndFlrSF                           2.671e+01  1.396e+01   1.913 0.055958 .  
## OpenPorchSF                         2.274e+01  1.020e+01   2.231 0.025867 *  
## HalfBath                            3.446e+03  1.854e+03   1.858 0.063387 .  
## GarageQualFa                       -1.336e+05  2.836e+04  -4.711 2.72e-06 ***
## GarageQualGd                       -1.228e+05  2.832e+04  -4.336 1.56e-05 ***
## GarageQualPo                       -1.265e+05  3.337e+04  -3.793 0.000156 ***
## GarageQualTA                       -1.304e+05  2.805e+04  -4.651 3.62e-06 ***
## GarageQualNA                       -2.416e+04  1.781e+04  -1.357 0.175134    
## GarageCondFa                        1.177e+05  3.331e+04   3.534 0.000423 ***
## GarageCondGd                        1.242e+05  3.380e+04   3.673 0.000249 ***
## GarageCondPo                        1.148e+05  3.490e+04   3.290 0.001027 ** 
## GarageCondTA                        1.214e+05  3.302e+04   3.676 0.000246 ***
## GarageCondNA                               NA         NA      NA       NA    
## LotShapeIR2                         6.027e+03  3.901e+03   1.545 0.122535    
## LotShapeIR3                         6.975e+02  7.928e+03   0.088 0.929904    
## LotShapeReg                        -2.375e+03  1.389e+03  -1.709 0.087599 .  
## LotArea                             3.346e-01  7.331e-02   4.564 5.48e-06 ***
## CentralAirY                         7.739e+03  3.126e+03   2.476 0.013410 *  
## OverallQual:NBPrice                -3.034e+03  1.443e+03  -2.103 0.035681 *  
## OverallQual:GrLivArea               3.854e+00  1.688e+00   2.283 0.022589 *  
## OverallQual:GarageArea              1.037e+01  3.441e+00   3.013 0.002634 ** 
## OverallQual:X1stFlrSF               7.828e+00  2.458e+00   3.185 0.001479 ** 
## NBPrice:GrLivArea                   3.680e+01  3.476e+00  10.585  < 2e-16 ***
## NBPrice:GarageCars                 -6.247e+03  3.738e+03  -1.671 0.094886 .  
## NBPrice:GarageArea                 -2.212e+01  1.287e+01  -1.718 0.086027 .  
## NBPrice:TotalBsmtSF                 1.882e+01  7.186e+00   2.619 0.008924 ** 
## NBPrice:X1stFlrSF                   1.633e+01  8.371e+00   1.951 0.051260 .  
## GrLivArea:X1stFlrSF                -3.341e-02  5.631e-03  -5.934 3.75e-09 ***
## GrLivArea:FullBath                  1.219e+01  3.239e+00   3.762 0.000176 ***
## GarageCars:GarageArea               1.052e+01  5.999e+00   1.753 0.079809 .  
## GarageCars:TotalBsmtSF             -1.201e+01  7.692e+00  -1.562 0.118611    
## GarageCars:X1stFlrSF                2.160e+01  9.449e+00   2.286 0.022413 *  
## GarageArea:TotalBsmtSF              3.572e-02  2.416e-02   1.479 0.139502    
## GarageArea:X1stFlrSF               -6.250e-02  3.082e-02  -2.028 0.042761 *  
## TotalBsmtSF:X1stFlrSF              -7.412e-03  4.898e-03  -1.513 0.130409    
## TotalBsmtSF:FullBath               -8.733e+00  3.873e+00  -2.255 0.024283 *  
## NBPrice:NBCount                     8.160e+01  3.692e+01   2.210 0.027247 *  
## YearBuilt:GarageYrBlt              -3.021e+02  1.264e+02  -2.389 0.017026 *  
## YearBuilt:YearRemodAdd             -2.961e+02  1.248e+02  -2.373 0.017759 *  
## GarageYrBlt:YearRemodAdd           -2.918e+02  1.225e+02  -2.382 0.017376 *  
## YearBuilt:GarageYrBlt:YearRemodAdd  1.513e-01  6.339e-02   2.387 0.017138 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22360 on 1360 degrees of freedom
## Multiple R-squared:  0.9261, Adjusted R-squared:  0.9208 
## F-statistic: 172.3 on 99 and 1360 DF,  p-value: < 2.2e-16

Alright. our model has an adjusted R-squared of 0.9208, which is a bit of an improvement with a much simpler model (185 down to 101). This is also a huge improvement from my week 14 model, which was 0.7394. There is definitely some concern that I introduced overfitting with my data engineering though.

Let’s take a look at the residuals

plot(stepmodel$fitted.values,stepmodel$residuals)
abline(h=0, col="blue")

It still seems to spread out just a bit toward the end, but the obvious pattern from my previous model is gone.

qqnorm(resid(model))
qqline(resid(model))

Looks like we are still having trouble at the tails. It is likely that I needed to do some transformations in order to fine the linear relationships.

Prediction Test Data and Submitting

Let’s pull in the test data to make predictions.

df2 = read.csv('housing_data_test.csv')
summ = getDFSummary(df2,'SalePrice')

#Get Factor columns with missing data
#Note - I'm not checking the new data frame for missing data, I'm just filling in the columns that had missing data in the train.
#The reason for this is that we don't want to add a new level (a new dummy variable) if a column has missing data that didn't previously

#loop through and add the NA level, then replace 
for(col in factors_w_missing) {
    levels(df2[,col]) = c(levels(df2[,col]),'NA') 
    df2[,col][is.na(df2[,col])] = 'NA'
}
summ = getDFSummary(df2,'SalePrice')
#OK, if we DO have missing data in categorical columns that didn't previously (don't worry, I looked into the future and found this to be true), we need to do something besides moving to NA. I've decided to move it to the most common value

factors_w_missing_2 = summ$Column[summ$'Data Type'=='factor' & summ$'Missing Values' > 0]

#loop through and add the NA level, then replace with most common value
for(col in factors_w_missing_2) {
    df2[,col][is.na(df2[,col])] = tail(names(sort(table(df[,col]))), 1)
}

#impute year built for garage year built
df2$GarageYrBlt[is.na(df2$GarageYrBlt)] =df2$YearBuilt[is.na(df2$GarageYrBlt)]
##For masonry Veneer area impute 0.
df2$MasVnrArea[is.na(df2$MasVnrArea)] = 0

#Add Lot frontage NA dummy and impute 70.05
df2$LotFrontage_NA=0
df2$LotFrontage_NA[is.na(df2$LotFrontage)] = 1
sum(df2$LotFrontage_NA)
## [1] 227
df2$LotFrontage[is.na(df2$LotFrontage)] = 70.05
summ = getDFSummary(df2,'SalePrice')
summ[order(-summ$'Missing Values',-summ$'Factor Levels'),]
##            Column Data Type Factor Levels Missing Values      Mean  Median
## 48   BsmtFullBath   integer             0              2    0.4345    0.00
## 49   BsmtHalfBath   integer             0              2    0.0652    0.00
## 35     BsmtFinSF1   integer             0              1  439.2000  350.50
## 37     BsmtFinSF2   integer             0              1   52.6200    0.00
## 38      BsmtUnfSF   integer             0              1  554.3000  460.00
## 39    TotalBsmtSF   integer             0              1 1046.0000  988.00
## 62     GarageCars   integer             0              1    1.7660    2.00
## 63     GarageArea   integer             0              1  472.8000  480.00
## 13   Neighborhood    factor            25              0        NA      NA
## 25    Exterior2nd    factor            15              0        NA      NA
## 24    Exterior1st    factor            13              0        NA      NA
## 14     Condition1    factor             9              0        NA      NA
## 79       SaleType    factor             9              0        NA      NA
## 17     HouseStyle    factor             7              0        NA      NA
## 34   BsmtFinType1    factor             7              0        NA      NA
## 36   BsmtFinType2    factor             7              0        NA      NA
## 56     Functional    factor             7              0        NA      NA
## 59     GarageType    factor             7              0        NA      NA
## 22      RoofStyle    factor             6              0        NA      NA
## 30     Foundation    factor             6              0        NA      NA
## 58    FireplaceQu    factor             6              0        NA      NA
## 65     GarageCond    factor             6              0        NA      NA
## 80  SaleCondition    factor             6              0        NA      NA
## 3        MSZoning    factor             5              0        NA      NA
## 11      LotConfig    factor             5              0        NA      NA
## 15     Condition2    factor             5              0        NA      NA
## 16       BldgType    factor             5              0        NA      NA
## 26     MasVnrType    factor             5              0        NA      NA
## 29      ExterCond    factor             5              0        NA      NA
## 31       BsmtQual    factor             5              0        NA      NA
## 32       BsmtCond    factor             5              0        NA      NA
## 33   BsmtExposure    factor             5              0        NA      NA
## 41      HeatingQC    factor             5              0        NA      NA
## 43     Electrical    factor             5              0        NA      NA
## 64     GarageQual    factor             5              0        NA      NA
## 74          Fence    factor             5              0        NA      NA
## 8        LotShape    factor             4              0        NA      NA
## 9     LandContour    factor             4              0        NA      NA
## 23       RoofMatl    factor             4              0        NA      NA
## 28      ExterQual    factor             4              0        NA      NA
## 40        Heating    factor             4              0        NA      NA
## 54    KitchenQual    factor             4              0        NA      NA
## 61   GarageFinish    factor             4              0        NA      NA
## 75    MiscFeature    factor             4              0        NA      NA
## 7           Alley    factor             3              0        NA      NA
## 12      LandSlope    factor             3              0        NA      NA
## 66     PavedDrive    factor             3              0        NA      NA
## 73         PoolQC    factor             3              0        NA      NA
## 6          Street    factor             2              0        NA      NA
## 42     CentralAir    factor             2              0        NA      NA
## 10      Utilities    factor             1              0        NA      NA
## 1              Id   integer             0              0 2190.0000 2190.00
## 2      MSSubClass   integer             0              0   57.3800   50.00
## 4     LotFrontage   numeric             0              0   68.8100   70.05
## 5         LotArea   integer             0              0 9819.0000 9399.00
## 18    OverallQual   integer             0              0    6.0790    6.00
## 19    OverallCond   integer             0              0    5.5540    5.00
## 20      YearBuilt   integer             0              0 1971.0000 1973.00
## 21   YearRemodAdd   integer             0              0 1984.0000 1992.00
## 27     MasVnrArea   numeric             0              0   99.6700    0.00
## 44      X1stFlrSF   integer             0              0 1157.0000 1079.00
## 45      X2ndFlrSF   integer             0              0  326.0000    0.00
## 46   LowQualFinSF   integer             0              0    3.5440    0.00
## 47      GrLivArea   integer             0              0 1486.0000 1432.00
## 50       FullBath   integer             0              0    1.5710    2.00
## 51       HalfBath   integer             0              0    0.3777    0.00
## 52   BedroomAbvGr   integer             0              0    2.8540    3.00
## 53   KitchenAbvGr   integer             0              0    1.0420    1.00
## 55   TotRmsAbvGrd   integer             0              0    6.3850    6.00
## 57     Fireplaces   integer             0              0    0.5812    0.00
## 60    GarageYrBlt   integer             0              0 1976.0000 1978.00
## 67     WoodDeckSF   integer             0              0   93.1700    0.00
## 68    OpenPorchSF   integer             0              0   48.3100   28.00
## 69  EnclosedPorch   integer             0              0   24.2400    0.00
## 70     X3SsnPorch   integer             0              0    1.7940    0.00
## 71    ScreenPorch   integer             0              0   17.0600    0.00
## 72       PoolArea   integer             0              0    1.7440    0.00
## 76        MiscVal   integer             0              0   58.1700    0.00
## 77         MoSold   integer             0              0    6.1040    6.00
## 78         YrSold   integer             0              0 2008.0000 2008.00
## 81 LotFrontage_NA   numeric             0              0    0.1556    0.00
##           SD  Min   Max
## 48    0.5306    0     3
## 49    0.2525    0     2
## 35  455.3000    0  4010
## 37  176.8000    0  1526
## 38  437.3000    0  2140
## 39  442.9000    0  5095
## 62    0.7759    0     5
## 63  217.0000    0  1488
## 13        NA   NA    NA
## 25        NA   NA    NA
## 24        NA   NA    NA
## 14        NA   NA    NA
## 79        NA   NA    NA
## 17        NA   NA    NA
## 34        NA   NA    NA
## 36        NA   NA    NA
## 56        NA   NA    NA
## 59        NA   NA    NA
## 22        NA   NA    NA
## 30        NA   NA    NA
## 58        NA   NA    NA
## 65        NA   NA    NA
## 80        NA   NA    NA
## 3         NA   NA    NA
## 11        NA   NA    NA
## 15        NA   NA    NA
## 16        NA   NA    NA
## 26        NA   NA    NA
## 29        NA   NA    NA
## 31        NA   NA    NA
## 32        NA   NA    NA
## 33        NA   NA    NA
## 41        NA   NA    NA
## 43        NA   NA    NA
## 64        NA   NA    NA
## 74        NA   NA    NA
## 8         NA   NA    NA
## 9         NA   NA    NA
## 23        NA   NA    NA
## 28        NA   NA    NA
## 40        NA   NA    NA
## 54        NA   NA    NA
## 61        NA   NA    NA
## 75        NA   NA    NA
## 7         NA   NA    NA
## 12        NA   NA    NA
## 66        NA   NA    NA
## 73        NA   NA    NA
## 6         NA   NA    NA
## 42        NA   NA    NA
## 10        NA   NA    NA
## 1   421.3000 1461  2919
## 2    42.7500   20   190
## 4    20.5700   21   200
## 5  4956.0000 1470 56600
## 18    1.4370    1    10
## 19    1.1140    1     9
## 20   30.3900 1879  2010
## 21   21.1300 1950  2010
## 27  177.0000    0  1290
## 44  398.2000  407  5095
## 45  420.6000    0  1862
## 46   44.0400    0  1064
## 47  485.6000  407  5095
## 50    0.5552    0     4
## 51    0.5030    0     2
## 52    0.8298    0     6
## 53    0.2085    0     2
## 55    1.5090    3    15
## 57    0.6474    0     4
## 60   27.7600 1890  2207
## 67  127.7000    0  1424
## 68   68.8800    0   742
## 69   67.2300    0  1012
## 70   20.2100    0   360
## 71   56.6100    0   576
## 72   30.4900    0   800
## 76  630.8000    0 17000
## 77    2.7220    1    12
## 78    1.3020 2006  2010
## 81    0.3626    0     1

We have some missing data for numeric columns that weren’t missing in the original, so we don’t have a rule. Because of this I will just impute the mean and move on. Not ideal but it’s just a couple of rows.

numeric_w_missing = summ[summ$'Data Type'!='factor' & summ$'Missing Values' > 0,c('Column','Mean')]

#loop through and replace with mean
for(row in 1:nrow(numeric_w_missing)){
    col = numeric_w_missing[row,'Column']
    mean = numeric_w_missing[row,'Mean']
    df2[,col][is.na(df2[,col])] = mean
}

Now we just need the Engineered neighborhood columns. We don’t know the Sale Price for this data set because we are trying to predict it. So we need to use the stored averages from the test set

for (nhval in nh$Neighborhood ){

    df2$NBCount[df2$Neighborhood==nhval] = nh[nhval,'count']
    df2$NBPrice[df2$Neighborhood==nhval] = nh[nhval,'AvgPrice']
}

summ = getDFSummary(df2,'SalePrice')
summ[order(-summ$'Missing Values',-summ$'Factor Levels'),]
##            Column Data Type Factor Levels Missing Values      Mean  Median
## 13   Neighborhood    factor            25              0        NA      NA
## 25    Exterior2nd    factor            15              0        NA      NA
## 24    Exterior1st    factor            13              0        NA      NA
## 14     Condition1    factor             9              0        NA      NA
## 79       SaleType    factor             9              0        NA      NA
## 17     HouseStyle    factor             7              0        NA      NA
## 34   BsmtFinType1    factor             7              0        NA      NA
## 36   BsmtFinType2    factor             7              0        NA      NA
## 56     Functional    factor             7              0        NA      NA
## 59     GarageType    factor             7              0        NA      NA
## 22      RoofStyle    factor             6              0        NA      NA
## 30     Foundation    factor             6              0        NA      NA
## 58    FireplaceQu    factor             6              0        NA      NA
## 65     GarageCond    factor             6              0        NA      NA
## 80  SaleCondition    factor             6              0        NA      NA
## 3        MSZoning    factor             5              0        NA      NA
## 11      LotConfig    factor             5              0        NA      NA
## 15     Condition2    factor             5              0        NA      NA
## 16       BldgType    factor             5              0        NA      NA
## 26     MasVnrType    factor             5              0        NA      NA
## 29      ExterCond    factor             5              0        NA      NA
## 31       BsmtQual    factor             5              0        NA      NA
## 32       BsmtCond    factor             5              0        NA      NA
## 33   BsmtExposure    factor             5              0        NA      NA
## 41      HeatingQC    factor             5              0        NA      NA
## 43     Electrical    factor             5              0        NA      NA
## 64     GarageQual    factor             5              0        NA      NA
## 74          Fence    factor             5              0        NA      NA
## 8        LotShape    factor             4              0        NA      NA
## 9     LandContour    factor             4              0        NA      NA
## 23       RoofMatl    factor             4              0        NA      NA
## 28      ExterQual    factor             4              0        NA      NA
## 40        Heating    factor             4              0        NA      NA
## 54    KitchenQual    factor             4              0        NA      NA
## 61   GarageFinish    factor             4              0        NA      NA
## 75    MiscFeature    factor             4              0        NA      NA
## 7           Alley    factor             3              0        NA      NA
## 12      LandSlope    factor             3              0        NA      NA
## 66     PavedDrive    factor             3              0        NA      NA
## 73         PoolQC    factor             3              0        NA      NA
## 6          Street    factor             2              0        NA      NA
## 42     CentralAir    factor             2              0        NA      NA
## 10      Utilities    factor             1              0        NA      NA
## 1              Id   integer             0              0 2190.0000 2190.00
## 2      MSSubClass   integer             0              0   57.3800   50.00
## 4     LotFrontage   numeric             0              0   68.8100   70.05
## 5         LotArea   integer             0              0 9819.0000 9399.00
## 18    OverallQual   integer             0              0    6.0790    6.00
## 19    OverallCond   integer             0              0    5.5540    5.00
## 20      YearBuilt   integer             0              0 1971.0000 1973.00
## 21   YearRemodAdd   integer             0              0 1984.0000 1992.00
## 27     MasVnrArea   numeric             0              0   99.6700    0.00
## 35     BsmtFinSF1   numeric             0              0  439.2000  351.00
## 37     BsmtFinSF2   numeric             0              0   52.6200    0.00
## 38      BsmtUnfSF   numeric             0              0  554.3000  460.00
## 39    TotalBsmtSF   numeric             0              0 1046.0000  988.00
## 44      X1stFlrSF   integer             0              0 1157.0000 1079.00
## 45      X2ndFlrSF   integer             0              0  326.0000    0.00
## 46   LowQualFinSF   integer             0              0    3.5440    0.00
## 47      GrLivArea   integer             0              0 1486.0000 1432.00
## 48   BsmtFullBath   numeric             0              0    0.4345    0.00
## 49   BsmtHalfBath   numeric             0              0    0.0652    0.00
## 50       FullBath   integer             0              0    1.5710    2.00
## 51       HalfBath   integer             0              0    0.3777    0.00
## 52   BedroomAbvGr   integer             0              0    2.8540    3.00
## 53   KitchenAbvGr   integer             0              0    1.0420    1.00
## 55   TotRmsAbvGrd   integer             0              0    6.3850    6.00
## 57     Fireplaces   integer             0              0    0.5812    0.00
## 60    GarageYrBlt   integer             0              0 1976.0000 1978.00
## 62     GarageCars   numeric             0              0    1.7660    2.00
## 63     GarageArea   numeric             0              0  472.8000  480.00
## 67     WoodDeckSF   integer             0              0   93.1700    0.00
## 68    OpenPorchSF   integer             0              0   48.3100   28.00
## 69  EnclosedPorch   integer             0              0   24.2400    0.00
## 70     X3SsnPorch   integer             0              0    1.7940    0.00
## 71    ScreenPorch   integer             0              0   17.0600    0.00
## 72       PoolArea   integer             0              0    1.7440    0.00
## 76        MiscVal   integer             0              0   58.1700    0.00
## 77         MoSold   integer             0              0    6.1040    6.00
## 78         YrSold   integer             0              0 2008.0000 2008.00
## 81 LotFrontage_NA   numeric             0              0    0.1556    0.00
## 82        NBCount   integer             0              0   97.3300   79.00
## 83        NBPrice   numeric             0              0    1.7900    1.56
##           SD     Min      Max
## 13        NA      NA       NA
## 25        NA      NA       NA
## 24        NA      NA       NA
## 14        NA      NA       NA
## 79        NA      NA       NA
## 17        NA      NA       NA
## 34        NA      NA       NA
## 36        NA      NA       NA
## 56        NA      NA       NA
## 59        NA      NA       NA
## 22        NA      NA       NA
## 30        NA      NA       NA
## 58        NA      NA       NA
## 65        NA      NA       NA
## 80        NA      NA       NA
## 3         NA      NA       NA
## 11        NA      NA       NA
## 15        NA      NA       NA
## 16        NA      NA       NA
## 26        NA      NA       NA
## 29        NA      NA       NA
## 31        NA      NA       NA
## 32        NA      NA       NA
## 33        NA      NA       NA
## 41        NA      NA       NA
## 43        NA      NA       NA
## 64        NA      NA       NA
## 74        NA      NA       NA
## 8         NA      NA       NA
## 9         NA      NA       NA
## 23        NA      NA       NA
## 28        NA      NA       NA
## 40        NA      NA       NA
## 54        NA      NA       NA
## 61        NA      NA       NA
## 75        NA      NA       NA
## 7         NA      NA       NA
## 12        NA      NA       NA
## 66        NA      NA       NA
## 73        NA      NA       NA
## 6         NA      NA       NA
## 42        NA      NA       NA
## 10        NA      NA       NA
## 1   421.3000 1461.00  2919.00
## 2    42.7500   20.00   190.00
## 4    20.5700   21.00   200.00
## 5  4956.0000 1470.00 56600.00
## 18    1.4370    1.00    10.00
## 19    1.1140    1.00     9.00
## 20   30.3900 1879.00  2010.00
## 21   21.1300 1950.00  2010.00
## 27  177.0000    0.00  1290.00
## 35  455.1000    0.00  4010.00
## 37  176.7000    0.00  1526.00
## 38  437.1000    0.00  2140.00
## 39  442.7000    0.00  5095.00
## 44  398.2000  407.00  5095.00
## 45  420.6000    0.00  1862.00
## 46   44.0400    0.00  1064.00
## 47  485.6000  407.00  5095.00
## 48    0.5303    0.00     3.00
## 49    0.2523    0.00     2.00
## 50    0.5552    0.00     4.00
## 51    0.5030    0.00     2.00
## 52    0.8298    0.00     6.00
## 53    0.2085    0.00     2.00
## 55    1.5090    3.00    15.00
## 57    0.6474    0.00     4.00
## 60   27.7600 1890.00  2207.00
## 62    0.7757    0.00     5.00
## 63  217.0000    0.00  1488.00
## 67  127.7000    0.00  1424.00
## 68   68.8800    0.00   742.00
## 69   67.2300    0.00  1012.00
## 70   20.2100    0.00   360.00
## 71   56.6100    0.00   576.00
## 72   30.4900    0.00   800.00
## 76  630.8000    0.00 17000.00
## 77    2.7220    1.00    12.00
## 78    1.3020 2006.00  2010.00
## 81    0.3626    0.00     1.00
## 82   63.2100    2.00   225.00
## 83    0.5925    0.99     3.35
pred = predict(stepmodel,newdata = df2)
sum(is.na(pred))
## [1] 0
mean(pred)
## [1] 178244.6

Well, we don’t have the sale price for these, so there’s not much else I can do besides submit and see what happens!

pred_df = as.data.frame(cbind(df2$Id,pred))
names(pred_df) = c('Id','SalePrice') 
write.csv(pred_df,'housing_predictions_steven_ellingson.csv', row.names=FALSE)

I got a score of 0.15499, which is apparently the RMSLE.
I’m curious what the RMSLE is for my training set.

library(Metrics)
## Warning: package 'Metrics' was built under R version 3.6.3
rmsle(df$SalePrice, stepmodel$fitted.values)
## [1] 0.1240596

OK, so we did have a bit of overfitting - the RMSLE went from 0.124 in train to 0.155 in test.