## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: Matrix
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

Problem 1

Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below and define that as Y.

# Using Y1 and X1

Y <- c(20.3, 19.1, 19.3, 20.9, 22.0, 23.5, 13.8, 18.8, 20.9, 18.6,
       22.3, 17.6, 20.8, 28.7, 15.2, 20.9, 18.4, 10.3, 26.3, 28.1)

X <- c(9.3, 4.1, 22.4, 9.1, 15.8, 7.1, 15.9, 6.9, 16.0, 6.7, 8.2,
       16.0, 6.4, 11.8, 3.5, 21.7, 12.2, 9.3, 8.0, 6.2)

df <- data.frame(X, Y)

Calculate as a minimum the below probabilities a - c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

a. P(X>x|Y>y)

We need to calculate the probability that a random number in X is greater than the 3rd quartile, given that we have a random number in Y that is greater than the 1st quartile. So first, we’ll need to define the 3rd quartile of X and the 1st quartile of Y, using the built in R function quantile.

XQ3 <- quantile(df$X, 0.75)
YQ1 <- quantile(df$Y, 0.25)

We can test these values very easily, given that each set of numbers is 20. If XQ3 is the 3rd quartile, then 25% of the numbers in X will be greater than it. Likewise, if YQ1 is the 1st quartile, then 75% of the numbers in Y will be greater than it.

length(df[df$X>XQ3, 1]) / length(df$X)
## [1] 0.25
length(df[df$Y>YQ1, 2]) / length(df$Y)
## [1] 0.75

So what is \(P(X>x|Y>y)\)? We can easily do this just by filtering the dataset for the values that only follow that conditional, then dividing by the total number of entries.

filter(df, X > XQ3 & Y > YQ1)
##      X    Y
## 1 22.4 19.3
## 2 16.0 20.9
## 3 21.7 20.9

We see that only three values of X meet this requirement, so \(P(X>x|Y>y) = \frac{3}{20} = 0.15\)

b. P(X>x, Y>y)

This one isn’t conditional, so we’re simply looking for all values that satisfy \(P(X>x)\) and all values that satisfy \(P(Y>y)\). Given that they’re quantiles, however, we know that this is simply \(0.75 * 0.25\), but we can doublecheck against our data.

p.xg3 <- filter(df, X > XQ3) %>%
            tally()/nrow(df)

p.yg1 <- filter(df, Y > YQ1) %>%
            tally()/nrow(df)

p.xg3
##      n
## 1 0.25
p.yg1
##      n
## 1 0.75
p.xg3 * p.yg1
##        n
## 1 0.1875

And indeed, \(P(X>x, Y>y) = 0.1875\)

ans.b <- p.xg3 * p.yg1
ans.b
##        n
## 1 0.1875

c. P(X<x | Y>y)

We know \(P(Y>y)\), but now we must calculate \(P(X<x)\). This is actually simple, as it should almost be \(1-P(X>x)\) It might not be exact, if the 3rd quartile value of X happens to also exist in the dataset.

filter(df, X < XQ3) %>%
  tally()/nrow(df)
##      n
## 1 0.75

So as with our first problem, we can simply filter the dataset and find the values that satisfy our conditional. Normally we could use math to figure this out, like \(P(X < x|Y>y) = \frac{P(X < x \text{ and } Y >y)}{P(Y>y)}\), but that usually requires knowing the other conditional.

filter(df, X < XQ3 & Y > YQ1)
##       X    Y
## 1   9.3 20.3
## 2   4.1 19.1
## 3   9.1 20.9
## 4  15.8 22.0
## 5   7.1 23.5
## 6   6.9 18.8
## 7   6.7 18.6
## 8   8.2 22.3
## 9   6.4 20.8
## 10 11.8 28.7
## 11  8.0 26.3
## 12  6.2 28.1
filter(df, X < XQ3 & Y > YQ1) %>%
  tally()/nrow(df)
##     n
## 1 0.6

\(P(X < X | Y > y) = 0.6\)

In addition, make a table of counts:

a1 <- filter(df, X <= XQ3 & Y <= YQ1) %>%
        tally()
a2 <- filter(df, X <= XQ3 & Y > YQ1) %>%
        tally()
b1 <- filter(df, X > XQ3 & Y <= YQ1) %>%
        tally()
b2 <- filter(df, X > XQ3 & Y > YQ1) %>%
        tally()

count.tab <- matrix(c(a1, a2, a1+a2, b1, b2, b1+b2, a1+b1, a2+b2, a1+a2+b1+b2),
                    ncol = 3, nrow = 3)
colnames(count.tab) <- c("X <= 3rd Quartile", "X >3rd Quartile", "Total")
rownames(count.tab) <- c("Y <= 1st Quartile", "Y > 1st Quartile", "Total")
count.tab
##                   X <= 3rd Quartile X >3rd Quartile Total
## Y <= 1st Quartile 3                 2               5    
## Y > 1st Quartile  12                3               15   
## Total             15                5               20

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 3rd Quartile for X, and let B be the new variable counting those observation about the 1st Quartile for Y. Does P(AB) = P(A)P(B)?

Well, looking at our table \(A = 5/20 = 0.25\) and \(B = 15/20 = 0.75\). That makes \(P(A)P(B) = 0.1875\). What about \(P(AB)\)? According to the table that’s \(3/20 = 0.15\), which we calculated earlier. \(0.1875 \neq 0.15\), so they’re not independent.

Verifying with Chi Squared Test

chisq.test(df)
## 
##  Pearson's Chi-squared test
## 
## data:  df
## X-squared = 40.234, df = 19, p-value = 0.003048

The p-value is less than the typical significance level of 0.05, so we reject the null hypothesis that the data is independent.

Problem 2

Register for Kaggle.com and compete in the House Prices: Advanced Regression Techniques competition.

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set.

##        Id           MSSubClass     MSZoning     LotFrontage    
##  Min.   :   1.0   20     :536   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   60     :299   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   50     :144   RH     :  16   Median : 69.00  
##  Mean   : 730.5   120    : 87   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   30     : 69   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   160    : 63                  Max.   :313.00  
##                   (Other):262                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual   OverallCond 
##  Norm   :1445   1Fam  :1220   1Story :726   5      :397   5      :821  
##  Feedr  :   6   2fmCon:  31   2Story :445   6      :374   6      :252  
##  Artery :   2   Duplex:  52   1.5Fin :154   7      :319   7      :205  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   8      :168   8      : 72  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   4      :116   4      : 57  
##  PosA   :   1                 1.5Unf : 14   9      : 43   3      : 25  
##  (Other):   2                 (Other): 19   (Other): 43   (Other): 28  
##    YearBuilt     YearRemodAdd   RoofStyle       RoofMatl     Exterior1st 
##  2006   :  67   1950   :178   Flat   :  13   CompShg:1434   VinylSd:515  
##  2005   :  64   2006   : 97   Gable  :1141   Tar&Grv:  11   HdBoard:222  
##  2004   :  54   2007   : 76   Gambrel:  11   WdShngl:   6   MetalSd:220  
##  2007   :  49   2005   : 73   Hip    : 286   WdShake:   5   Wd Sdng:206  
##  2003   :  45   2004   : 62   Mansard:   7   ClyTile:   1   Plywood:108  
##  1976   :  33   2000   : 55   Shed   :   2   Membran:   1   CemntBd: 61  
##  (Other):1148   (Other):919                  (Other):   2   (Other):128  
##   Exterior2nd    MasVnrType    MasVnrArea     ExterQual ExterCond
##  VinylSd:504   BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3  
##  MetalSd:214   BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28  
##  HdBoard:207   None   :864   Median :   0.0   Gd:488    Gd: 146  
##  Wd Sdng:197   Stone  :128   Mean   : 103.7   TA:906    Po:   1  
##  Plywood:142   NA's   :  8   3rd Qu.: 166.0             TA:1282  
##  CmentBd: 60                 Max.   :1600.0                      
##  (Other):136                 NA's   :8                           
##   Foundation  BsmtQual   BsmtCond    BsmtExposure BsmtFinType1
##  BrkTil:146   Ex  :121   Fa  :  45   Av  :221     ALQ :220    
##  CBlock:634   Fa  : 35   Gd  :  65   Gd  :134     BLQ :148    
##  PConc :647   Gd  :618   Po  :   2   Mn  :114     GLQ :418    
##  Slab  : 24   TA  :649   TA  :1311   No  :953     LwQ : 74    
##  Stone :  6   NA's: 37   NA's:  37   NA's: 38     Rec :133    
##  Wood  :  3                                       Unf :430    
##                                                   NA's: 37    
##    BsmtFinSF1     BsmtFinType2   BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   ALQ :  19    Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00   1st Qu.: 223.0  
##  Median : 383.5   GLQ :  14    Median :   0.00   Median : 477.5  
##  Mean   : 443.6   LwQ :  46    Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :5644.0   Unf :1256    Max.   :1474.00   Max.   :2336.0  
##                   NA's:  38                                      
##   TotalBsmtSF      Heating     HeatingQC CentralAir Electrical  
##  Min.   :   0.0   Floor:   1   Ex:741    N:  95     FuseA:  94  
##  1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365     FuseF:  27  
##  Median : 991.5   GasW :  18   Gd:241               FuseP:   3  
##  Mean   :1057.4   Grav :   7   Po:  1               Mix  :   1  
##  3rd Qu.:1298.2   OthW :   2   TA:428               SBrkr:1334  
##  Max.   :6110.0   Wall :   4                        NA's :   1  
##                                                                 
##    X1stFlrSF      X2ndFlrSF     LowQualFinSF       GrLivArea   
##  Min.   : 334   Min.   :   0   Min.   :  0.000   Min.   : 334  
##  1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130  
##  Median :1087   Median :   0   Median :  0.000   Median :1464  
##  Mean   :1163   Mean   : 347   Mean   :  5.845   Mean   :1515  
##  3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777  
##  Max.   :4692   Max.   :2065   Max.   :572.000   Max.   :5642  
##                                                                
##  BsmtFullBath BsmtHalfBath FullBath HalfBath  BedroomAbvGr KitchenAbvGr
##  0:856        0:1378       0:  9    0:913    3      :804   0:   1      
##  1:588        1:  80       1:650    1:535    2      :358   1:1392      
##  2: 15        2:   2       2:768    2: 12    4      :213   2:  65      
##  3:  1                     3: 33             1      : 50   3:   2      
##                                              5      : 21               
##                                              6      :  7               
##                                              (Other):  7               
##  KitchenQual  TotRmsAbvGrd Functional  Fireplaces FireplaceQu
##  Ex:100      6      :402   Maj1:  14   0:690      Ex  : 24   
##  Fa: 39      7      :329   Maj2:   5   1:650      Fa  : 33   
##  Gd:586      5      :275   Min1:  31   2:115      Gd  :380   
##  TA:735      8      :187   Min2:  34   3:  5      Po  : 20   
##              4      : 97   Mod :  15              TA  :313   
##              9      : 75   Sev :   1              NA's:690   
##              (Other): 95   Typ :1360                         
##    GarageType   GarageYrBlt   GarageFinish   GarageCars   
##  2Types :  6   2005   :  65   Fin :352     Min.   :0.000  
##  Attchd :870   2006   :  59   RFn :422     1st Qu.:1.000  
##  Basment: 19   2004   :  53   Unf :605     Median :2.000  
##  BuiltIn: 88   2003   :  50   NA's: 81     Mean   :1.767  
##  CarPort:  9   2007   :  49                3rd Qu.:2.000  
##  Detchd :387   (Other):1103                Max.   :4.000  
##  NA's   : 81   NA's   :  81                               
##    GarageArea     GarageQual  GarageCond  PavedDrive   WoodDeckSF    
##  Min.   :   0.0   Ex  :   3   Ex  :   2   N:  90     Min.   :  0.00  
##  1st Qu.: 334.5   Fa  :  48   Fa  :  35   P:  30     1st Qu.:  0.00  
##  Median : 480.0   Gd  :  14   Gd  :   9   Y:1340     Median :  0.00  
##  Mean   : 473.0   Po  :   3   Po  :   7              Mean   : 94.24  
##  3rd Qu.: 576.0   TA  :1311   TA  :1326              3rd Qu.:168.00  
##  Max.   :1418.0   NA's:  81   NA's:  81              Max.   :857.00  
##                                                                      
##   OpenPorchSF     EnclosedPorch      X3SsnPorch      ScreenPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median : 25.00   Median :  0.00   Median :  0.00   Median :  0.00  
##  Mean   : 46.66   Mean   : 21.95   Mean   :  3.41   Mean   : 15.06  
##  3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :547.00   Max.   :552.00   Max.   :508.00   Max.   :480.00  
##                                                                     
##     PoolArea        PoolQC       Fence      MiscFeature    MiscVal        
##  Min.   :  0.000   Ex  :   2   GdPrv:  59   Gar2:   2   Min.   :    0.00  
##  1st Qu.:  0.000   Fa  :   2   GdWo :  54   Othr:   2   1st Qu.:    0.00  
##  Median :  0.000   Gd  :   3   MnPrv: 157   Shed:  49   Median :    0.00  
##  Mean   :  2.759   NA's:1453   MnWw :  11   TenC:   1   Mean   :   43.49  
##  3rd Qu.:  0.000               NA's :1179   NA's:1406   3rd Qu.:    0.00  
##  Max.   :738.000                                        Max.   :15500.00  
##                                                                           
##      MoSold     YrSold       SaleType    SaleCondition    SalePrice     
##  6      :253   2006:314   WD     :1267   Abnorml: 101   Min.   : 34900  
##  7      :234   2007:329   New    : 122   AdjLand:   4   1st Qu.:129975  
##  5      :204   2008:304   COD    :  43   Alloca :  12   Median :163000  
##  4      :141   2009:338   ConLD  :   9   Family :  20   Mean   :180921  
##  8      :122   2010:175   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  3      :106              ConLw  :   5   Partial: 125   Max.   :755000  
##  (Other):400              (Other):   9

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

plot(house.train$SalePrice~house.train$X1stFlrSF, main = "1st Floor Square Feet vs Sale Price",
     xlab = "Square Feet", ylab = "Sale Price", col = "purple")

plot(house.train$SalePrice~house.train$Fireplaces, main = "Num of Fireplaces vs Sale Price",
     xlab = "Fireplaces", ylab = "Sale Price", col = "orange")

Derive a correlation matrix for any THREE variables in the dataset. Test the hypothesis that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

cor.mat <- cor(as.matrix(house.train[, c("LotArea", "TotalBsmtSF", "WoodDeckSF")]))
round(cor.mat, 2)
##             LotArea TotalBsmtSF WoodDeckSF
## LotArea        1.00        0.26       0.17
## TotalBsmtSF    0.26        1.00       0.23
## WoodDeckSF     0.17        0.23       1.00
cor.test(house.train$LotArea, house.train$TotalBsmtSF)
## 
##  Pearson's product-moment correlation
## 
## data:  house.train$LotArea and house.train$TotalBsmtSF
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2123727 0.3080138
## sample estimates:
##       cor 
## 0.2608331
cor.test(house.train$LotArea, house.train$WoodDeckSF)
## 
##  Pearson's product-moment correlation
## 
## data:  house.train$LotArea and house.train$WoodDeckSF
## t = 6.6549, df = 1458, p-value = 4.001e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1214653 0.2210529
## sample estimates:
##       cor 
## 0.1716977
cor.test(house.train$TotalBsmtSF, house.train$WoodDeckSF)
## 
##  Pearson's product-moment correlation
## 
## data:  house.train$TotalBsmtSF and house.train$WoodDeckSF
## t = 9.1079, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1828933 0.2799882
## sample estimates:
##       cor 
## 0.2320186

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Since the correlation between all three variables are 0, I wouldn’t be worried about the familywise error. Even with a Bonferroni correction for those 3, the p-value is still beneath it as it only becomes \(\frac{0.5}{3} = 0.167\).

Linear Algebra and Correlation

Invert your 3x3 correlation matrix from above. Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

inv.cor <- solve(cor.mat)
inv.cor
##                LotArea TotalBsmtSF WoodDeckSF
## LotArea      1.0882554  -0.2541836 -0.1278756
## TotalBsmtSF -0.2541836   1.1162651 -0.2153515
## WoodDeckSF  -0.1278756  -0.2153515  1.0719215

Correlation Matrix multiplied by Precision Matrix

a <- cor.mat %*% inv.cor
a
##                  LotArea   TotalBsmtSF WoodDeckSF
## LotArea     1.000000e+00  1.387779e-17          0
## TotalBsmtSF 2.081668e-17  1.000000e+00          0
## WoodDeckSF  2.775558e-17 -8.326673e-17          1

Precision Matrix multiplied by Correlation Matrix

b <- inv.cor %*% cor.mat
b
##                   LotArea  TotalBsmtSF    WoodDeckSF
## LotArea      1.000000e+00 7.632783e-17  2.775558e-17
## TotalBsmtSF -4.163336e-17 1.000000e+00 -8.326673e-17
## WoodDeckSF   0.000000e+00 0.000000e+00  1.000000e+00

LU Decomposition of both

a.lu <- expand(lu(t(a)))
a.l <- a.lu$U
a.l[lower.tri(a.lu$U)] <- a.lu$L[lower.tri(a.lu$L)]
a.l
## 3 x 3 Matrix of class "dgeMatrix"
##              [,1]         [,2]          [,3]
## [1,] 1.000000e+00 2.081668e-17  2.775558e-17
## [2,] 1.387779e-17 1.000000e+00 -8.326673e-17
## [3,] 0.000000e+00 0.000000e+00  1.000000e+00
b.lu <- expand(lu(t(b)))
b.l <- b.lu$U
b.l[lower.tri(b.lu$U)] <- b.lu$L[lower.tri(b.lu$L)]
b.l
## 3 x 3 Matrix of class "dgeMatrix"
##              [,1]          [,2] [,3]
## [1,] 1.000000e+00 -4.163336e-17    0
## [2,] 7.632783e-17  1.000000e+00    0
## [3,] 2.775558e-17 -8.326673e-17    1

Calculus-Based Probability and Statistics

Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

hist(house.train$X1stFlrSF)

summary(house.train$X1stFlrSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334     882    1087    1163    1391    4692

The mean value is 1163, which is larger than the median value of 1087. That indicates a right-skewness to this variable, and the min is already above zero.

Load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of lambda for this distribution.

x <- fitdistr(house.train$X1stFlrSF, densfun = "exponential")
lambda <- x$estimate
lambda
##         rate 
## 0.0008601213

Take 1000 samples from this exponential distribution using this value (e.g. rexp(1000, lambda)). Plot a histogram and compare it with a histogram of your original variable.

lam.dist <- rexp(1000, lambda)

par(mfrow = c(1, 2))
hist(lam.dist, main = "Lambda Hist")
hist(house.train$X1stFlrSF, main = "Original Hist")

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality.

per5 <- qexp(0.05, rate = lambda, lower.tail = TRUE, log.p = FALSE)
per95 <- qexp(0.95, rate = lambda, lower.tail = TRUE, log.p = FALSE)
conf.int95 <- qnorm(0.95, mean(house.train$X1stFlrSF), sd(house.train$X1stFlrSF))
per5
## [1] 59.63495
per95
## [1] 3482.918
conf.int95
## [1] 1798.507

Provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(house.train$X1stFlrSF, c(0.05, 0.95))
##      5%     95% 
##  672.95 1831.25

We can see the difference between a simulated PDF and the actual data. The simulated data is a little more perfectly right skewed when compared to the actual data, which is somewhat messier. It’s nice to use to talk about in a general, academic sense, or to perhaps identify trends in the data, but that might be about it.

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

Since there is a lot of variables, we’ll simply utilize a backward stepwise regression, eliminating all variables that are deemed insignificant. There are too many variables and too short a time to do a lot of transformations on the data.

clean <- function(df){
  # Split factor variables from non-factor variables
  x <- sapply(df, function(x) is.factor(x))
  dat <- df[, x]
  non <- df[, !x]
  
  # Replace NA with 0 for numeric variables
  n <- sapply(non, function(x) anyNA(x))
  final <- non[, !n]
  non <- non[, n]
  non[is.na(non)] <- 0
  final <- cbind(final, non)
  
  
  # Split factor variables with NA's
  y <- sapply(dat, function(x) anyNA(x))
  z <- dat[, !y]
  for(i in 1:length(z)){
    z[, i] <- factor(z[, i], levels = c(levels(z[, i]), "Z"))
  }
  final <- cbind(final, z)
  dat <- dat[, y]
  
  # Force NA to be a factor
  # Rename it to 'Z'
  for(i in 1:length(dat)){
    dat[, i] <- factor(dat[, i], levels = levels(addNA(dat[, i])),
                       labels = c(levels(dat[, i]), "Z"),
                       exclude = NULL)
  }
  
  final <- cbind(final, dat)
  return(final)
}

house.train <- clean(house.train)
mod <- lm(SalePrice~., data = house.train)
final.model <- step(mod, direction = "backward", trace = 0)
summary(final.model)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + BsmtFinSF1 + BsmtFinSF2 + 
##     BsmtUnfSF + X1stFlrSF + X2ndFlrSF + GarageCars + GarageArea + 
##     X3SsnPorch + ScreenPorch + PoolArea + MasVnrArea + MSZoning + 
##     Street + LandContour + Utilities + LotConfig + LandSlope + 
##     Neighborhood + Condition1 + Condition2 + BldgType + OverallQual + 
##     OverallCond + RoofMatl + Exterior1st + Foundation + HeatingQC + 
##     FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + KitchenQual + 
##     TotRmsAbvGrd + Functional + Fireplaces + SaleCondition + 
##     BsmtQual + BsmtExposure + BsmtFinType1 + GarageType + PoolQC, 
##     data = house.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172663   -9364      15    9825  172663 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.987e+05  8.621e+04 -10.424  < 2e-16 ***
## LotArea               5.307e-01  9.205e-02   5.765 1.02e-08 ***
## BsmtFinSF1            3.871e+01  4.423e+00   8.753  < 2e-16 ***
## BsmtFinSF2            3.349e+01  5.501e+00   6.088 1.51e-09 ***
## BsmtUnfSF             1.985e+01  4.118e+00   4.821 1.60e-06 ***
## X1stFlrSF             4.184e+01  4.880e+00   8.574  < 2e-16 ***
## X2ndFlrSF             4.203e+01  3.766e+00  11.160  < 2e-16 ***
## GarageCars            3.595e+03  2.117e+03   1.698 0.089681 .  
## GarageArea            2.366e+01  7.019e+00   3.371 0.000771 ***
## X3SsnPorch            5.279e+01  2.111e+01   2.501 0.012523 *  
## ScreenPorch           3.684e+01  1.146e+01   3.214 0.001342 ** 
## PoolArea              6.852e+02  1.597e+02   4.290 1.92e-05 ***
## MasVnrArea            1.467e+01  4.305e+00   3.408 0.000675 ***
## MSZoningFV            3.296e+04  1.113e+04   2.962 0.003115 ** 
## MSZoningRH            3.083e+04  1.134e+04   2.719 0.006646 ** 
## MSZoningRL            2.996e+04  9.554e+03   3.136 0.001754 ** 
## MSZoningRM            2.885e+04  8.905e+03   3.240 0.001225 ** 
## StreetPave            1.487e+04  1.083e+04   1.374 0.169813    
## LandContourHLS        7.973e+03  4.780e+03   1.668 0.095572 .  
## LandContourLow       -2.421e+03  5.913e+03  -0.409 0.682345    
## LandContourLvl        6.755e+03  3.395e+03   1.990 0.046814 *  
## UtilitiesNoSeWa      -3.816e+04  2.385e+04  -1.600 0.109905    
## LotConfigCulDSac      7.439e+03  2.914e+03   2.553 0.010793 *  
## LotConfigFR2         -6.564e+03  3.702e+03  -1.773 0.076414 .  
## LotConfigFR3         -1.643e+04  1.186e+04  -1.385 0.166191    
## LotConfigInside      -9.882e+02  1.632e+03  -0.606 0.544917    
## LandSlopeMod          8.082e+03  3.673e+03   2.200 0.027977 *  
## LandSlopeSev         -2.394e+04  1.009e+04  -2.372 0.017838 *  
## NeighborhoodBlueste  -3.668e+03  1.761e+04  -0.208 0.834989    
## NeighborhoodBrDale   -6.100e+03  1.017e+04  -0.600 0.548636    
## NeighborhoodBrkSide  -1.047e+04  8.493e+03  -1.232 0.218099    
## NeighborhoodClearCr  -9.589e+03  8.488e+03  -1.130 0.258793    
## NeighborhoodCollgCr  -7.058e+03  6.781e+03  -1.041 0.298089    
## NeighborhoodCrawfor   6.238e+03  7.838e+03   0.796 0.426289    
## NeighborhoodEdwards  -2.244e+04  7.378e+03  -3.042 0.002397 ** 
## NeighborhoodGilbert  -5.543e+03  7.164e+03  -0.774 0.439220    
## NeighborhoodIDOTRR   -1.995e+04  9.554e+03  -2.089 0.036939 *  
## NeighborhoodMeadowV  -2.244e+04  1.029e+04  -2.181 0.029371 *  
## NeighborhoodMitchel  -1.733e+04  7.574e+03  -2.288 0.022302 *  
## NeighborhoodNAmes    -1.746e+04  7.240e+03  -2.412 0.016015 *  
## NeighborhoodNoRidge   2.359e+04  7.910e+03   2.982 0.002919 ** 
## NeighborhoodNPkVill   4.619e+03  1.023e+04   0.451 0.651710    
## NeighborhoodNridgHt   1.403e+04  6.839e+03   2.052 0.040375 *  
## NeighborhoodNWAmes   -1.556e+04  7.470e+03  -2.083 0.037424 *  
## NeighborhoodOldTown  -2.252e+04  8.589e+03  -2.622 0.008855 ** 
## NeighborhoodSawyer   -1.127e+04  7.542e+03  -1.494 0.135369    
## NeighborhoodSawyerW  -2.423e+03  7.261e+03  -0.334 0.738641    
## NeighborhoodSomerst   3.319e+03  8.193e+03   0.405 0.685473    
## NeighborhoodStoneBr   2.719e+04  7.748e+03   3.510 0.000464 ***
## NeighborhoodSWISU    -1.750e+04  8.727e+03  -2.005 0.045176 *  
## NeighborhoodTimber   -8.915e+03  7.654e+03  -1.165 0.244378    
## NeighborhoodVeenker   2.182e+03  9.656e+03   0.226 0.821284    
## Condition1Feedr       2.490e+03  4.533e+03   0.549 0.582816    
## Condition1Norm        1.349e+04  3.747e+03   3.601 0.000329 ***
## Condition1PosA        1.323e+04  9.177e+03   1.442 0.149585    
## Condition1PosN        1.294e+04  6.744e+03   1.919 0.055238 .  
## Condition1RRAe       -1.139e+04  8.459e+03  -1.347 0.178251    
## Condition1RRAn        1.229e+04  6.270e+03   1.960 0.050224 .  
## Condition1RRNe       -5.547e+03  1.661e+04  -0.334 0.738418    
## Condition1RRNn        1.096e+04  1.176e+04   0.932 0.351681    
## Condition2Feedr       1.842e+03  2.109e+04   0.087 0.930403    
## Condition2Norm       -7.315e+02  1.828e+04  -0.040 0.968088    
## Condition2PosA       -1.268e+04  3.065e+04  -0.414 0.679072    
## Condition2PosN       -2.512e+05  2.608e+04  -9.635  < 2e-16 ***
## Condition2RRAe       -4.807e+04  3.113e+04  -1.544 0.122753    
## Condition2RRAn       -3.120e+03  2.937e+04  -0.106 0.915401    
## Condition2RRNn        8.658e+03  2.496e+04   0.347 0.728713    
## BldgType2fmCon       -4.712e+03  5.390e+03  -0.874 0.382185    
## BldgTypeDuplex       -4.425e+03  6.588e+03  -0.672 0.501844    
## BldgTypeTwnhs        -1.922e+04  5.097e+03  -3.771 0.000170 ***
## BldgTypeTwnhsE       -1.561e+04  3.612e+03  -4.321 1.67e-05 ***
## OverallQual2          6.779e+04  3.027e+04   2.239 0.025311 *  
## OverallQual3          4.521e+04  2.750e+04   1.644 0.100402    
## OverallQual4          4.816e+04  2.742e+04   1.757 0.079231 .  
## OverallQual5          4.795e+04  2.756e+04   1.740 0.082152 .  
## OverallQual6          5.167e+04  2.764e+04   1.869 0.061819 .  
## OverallQual7          6.002e+04  2.770e+04   2.167 0.030438 *  
## OverallQual8          7.434e+04  2.779e+04   2.675 0.007572 ** 
## OverallQual9          1.105e+05  2.814e+04   3.926 9.11e-05 ***
## OverallQual10         1.502e+05  2.882e+04   5.213 2.17e-07 ***
## OverallCond2         -4.640e+04  4.105e+04  -1.130 0.258595    
## OverallCond3         -6.996e+04  3.904e+04  -1.792 0.073384 .  
## OverallCond4         -5.625e+04  3.967e+04  -1.418 0.156467    
## OverallCond5         -4.810e+04  3.964e+04  -1.213 0.225248    
## OverallCond6         -4.362e+04  3.963e+04  -1.101 0.271262    
## OverallCond7         -3.596e+04  3.964e+04  -0.907 0.364474    
## OverallCond8         -3.451e+04  3.967e+04  -0.870 0.384406    
## OverallCond9         -2.768e+04  3.997e+04  -0.693 0.488742    
## RoofMatlCompShg       5.863e+05  4.505e+04  13.014  < 2e-16 ***
## RoofMatlMembran       6.410e+05  5.272e+04  12.160  < 2e-16 ***
## RoofMatlMetal         6.108e+05  5.268e+04  11.595  < 2e-16 ***
## RoofMatlRoll          5.860e+05  5.097e+04  11.497  < 2e-16 ***
## RoofMatlTar&Grv       5.719e+05  4.660e+04  12.273  < 2e-16 ***
## RoofMatlWdShake       6.008e+05  4.688e+04  12.816  < 2e-16 ***
## RoofMatlWdShngl       6.292e+05  4.539e+04  13.862  < 2e-16 ***
## Exterior1stAsphShn    6.832e+03  2.399e+04   0.285 0.775836    
## Exterior1stBrkComm    1.597e+04  1.875e+04   0.851 0.394734    
## Exterior1stBrkFace    2.146e+04  6.689e+03   3.208 0.001371 ** 
## Exterior1stCBlock    -8.039e+03  2.354e+04  -0.341 0.732786    
## Exterior1stCemntBd    1.191e+04  7.037e+03   1.693 0.090749 .  
## Exterior1stHdBoard    3.564e+03  6.133e+03   0.581 0.561320    
## Exterior1stImStucc   -1.311e+04  2.314e+04  -0.566 0.571234    
## Exterior1stMetalSd    7.448e+03  5.981e+03   1.245 0.213252    
## Exterior1stPlywood    3.322e+03  6.440e+03   0.516 0.606063    
## Exterior1stStone     -4.021e+03  1.851e+04  -0.217 0.828035    
## Exterior1stStucco     6.734e+03  7.490e+03   0.899 0.368792    
## Exterior1stVinylSd    8.953e+03  6.021e+03   1.487 0.137286    
## Exterior1stWd Sdng    4.714e+03  5.937e+03   0.794 0.427373    
## Exterior1stWdShing    3.496e+03  7.375e+03   0.474 0.635616    
## FoundationCBlock      5.628e+03  2.749e+03   2.047 0.040815 *  
## FoundationPConc       9.647e+03  2.979e+03   3.238 0.001233 ** 
## FoundationSlab        6.302e+03  8.665e+03   0.727 0.467191    
## FoundationStone       1.253e+04  9.691e+03   1.293 0.196222    
## FoundationWood       -1.900e+04  1.387e+04  -1.370 0.170805    
## HeatingQCFa          -5.899e+03  3.859e+03  -1.529 0.126585    
## HeatingQCGd          -4.522e+03  1.906e+03  -2.373 0.017777 *  
## HeatingQCPo           5.753e+03  2.391e+04   0.241 0.809880    
## HeatingQCTA          -4.639e+03  1.884e+03  -2.463 0.013923 *  
## FullBath1            -8.574e+03  1.335e+04  -0.642 0.520706    
## FullBath2            -7.105e+03  1.351e+04  -0.526 0.599048    
## FullBath3             1.955e+04  1.447e+04   1.352 0.176728    
## HalfBath1             5.103e+03  1.959e+03   2.605 0.009285 ** 
## HalfBath2            -1.139e+03  8.055e+03  -0.141 0.887601    
## BedroomAbvGr1         1.578e+02  1.477e+04   0.011 0.991481    
## BedroomAbvGr2         3.489e+03  1.455e+04   0.240 0.810585    
## BedroomAbvGr3        -2.058e+02  1.469e+04  -0.014 0.988823    
## BedroomAbvGr4         2.711e+03  1.490e+04   0.182 0.855654    
## BedroomAbvGr5        -1.442e+04  1.598e+04  -0.903 0.366916    
## BedroomAbvGr6        -1.401e+04  1.816e+04  -0.771 0.440725    
## BedroomAbvGr8        -4.896e+03  3.013e+04  -0.162 0.870973    
## KitchenAbvGr1         2.546e+04  2.847e+04   0.895 0.371207    
## KitchenAbvGr2         8.824e+03  2.820e+04   0.313 0.754438    
## KitchenAbvGr3        -8.287e+02  3.316e+04  -0.025 0.980066    
## KitchenQualFa        -1.956e+04  5.622e+03  -3.479 0.000520 ***
## KitchenQualGd        -1.531e+04  3.376e+03  -4.536 6.28e-06 ***
## KitchenQualTA        -1.753e+04  3.701e+03  -4.735 2.43e-06 ***
## TotRmsAbvGrd3        -4.297e+04  1.278e+04  -3.363 0.000794 ***
## TotRmsAbvGrd4        -4.125e+04  1.058e+04  -3.898 0.000102 ***
## TotRmsAbvGrd5        -4.247e+04  1.010e+04  -4.206 2.78e-05 ***
## TotRmsAbvGrd6        -4.085e+04  9.849e+03  -4.148 3.58e-05 ***
## TotRmsAbvGrd7        -3.947e+04  9.581e+03  -4.120 4.04e-05 ***
## TotRmsAbvGrd8        -3.988e+04  9.349e+03  -4.266 2.14e-05 ***
## TotRmsAbvGrd9        -3.923e+04  9.447e+03  -4.152 3.51e-05 ***
## TotRmsAbvGrd10       -2.660e+04  9.128e+03  -2.914 0.003635 ** 
## TotRmsAbvGrd11       -3.874e+04  1.044e+04  -3.711 0.000215 ***
## TotRmsAbvGrd12               NA         NA      NA       NA    
## TotRmsAbvGrd14               NA         NA      NA       NA    
## FunctionalMaj2       -1.962e+03  1.251e+04  -0.157 0.875380    
## FunctionalMin1        1.136e+04  7.946e+03   1.429 0.153240    
## FunctionalMin2        7.826e+03  7.889e+03   0.992 0.321328    
## FunctionalMod         1.869e+03  9.282e+03   0.201 0.840464    
## FunctionalSev        -2.791e+04  2.657e+04  -1.050 0.293695    
## FunctionalTyp         1.806e+04  6.910e+03   2.614 0.009050 ** 
## Fireplaces1           2.417e+03  1.606e+03   1.505 0.132501    
## Fireplaces2           9.627e+03  2.826e+03   3.406 0.000678 ***
## Fireplaces3           8.913e+03  1.149e+04   0.776 0.437866    
## SaleConditionAdjLand  3.033e+04  1.385e+04   2.189 0.028754 *  
## SaleConditionAlloca  -1.138e+03  8.136e+03  -0.140 0.888791    
## SaleConditionFamily   3.604e+03  5.671e+03   0.636 0.525215    
## SaleConditionNormal   8.571e+03  2.535e+03   3.381 0.000744 ***
## SaleConditionPartial  2.639e+04  3.526e+03   7.484 1.34e-13 ***
## BsmtQualFa           -1.656e+04  5.671e+03  -2.920 0.003562 ** 
## BsmtQualGd           -1.484e+04  3.149e+03  -4.713 2.70e-06 ***
## BsmtQualTA           -1.718e+04  3.782e+03  -4.543 6.08e-06 ***
## BsmtQualZ             9.376e+02  2.367e+04   0.040 0.968405    
## BsmtExposureGd        1.200e+04  2.838e+03   4.228 2.53e-05 ***
## BsmtExposureMn       -2.297e+03  2.730e+03  -0.841 0.400278    
## BsmtExposureNo       -4.206e+03  1.940e+03  -2.168 0.030322 *  
## BsmtExposureZ        -8.382e+03  2.198e+04  -0.381 0.702991    
## BsmtFinType1BLQ       9.884e+02  2.528e+03   0.391 0.695896    
## BsmtFinType1GLQ       5.536e+03  2.324e+03   2.382 0.017375 *  
## BsmtFinType1LwQ      -3.254e+03  3.310e+03  -0.983 0.325688    
## BsmtFinType1Rec      -1.898e+03  2.665e+03  -0.712 0.476493    
## BsmtFinType1Unf       1.655e+03  2.670e+03   0.620 0.535344    
## BsmtFinType1Z                NA         NA      NA       NA    
## GarageTypeAttchd      3.086e+04  1.038e+04   2.973 0.003002 ** 
## GarageTypeBasment     3.251e+04  1.201e+04   2.707 0.006870 ** 
## GarageTypeBuiltIn     3.214e+04  1.075e+04   2.990 0.002848 ** 
## GarageTypeCarPort     2.718e+04  1.331e+04   2.041 0.041417 *  
## GarageTypeDetchd      2.932e+04  1.034e+04   2.834 0.004673 ** 
## GarageTypeZ           3.694e+04  1.124e+04   3.285 0.001047 ** 
## PoolQCFa             -1.352e+05  2.597e+04  -5.205 2.26e-07 ***
## PoolQCGd             -8.826e+04  3.017e+04  -2.925 0.003502 ** 
## PoolQCZ               2.902e+05  8.684e+04   3.342 0.000856 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21710 on 1279 degrees of freedom
## Multiple R-squared:  0.9345, Adjusted R-squared:  0.9253 
## F-statistic: 101.4 on 180 and 1279 DF,  p-value: < 2.2e-16
qqnorm(final.model$residuals)
qqline(final.model$residuals)

I would have liked to use this model, but due to factor differences between the test and train that I couldn’t smooth out, I was forced to focus on just the numerical data instead.

alt.mod <- lm(SalePrice~LotFrontage + LotArea + MasVnrArea + BsmtFinSF1 +
                BsmtFinSF2 + TotalBsmtSF + X1stFlrSF + X2ndFlrSF +
                LowQualFinSF + GrLivArea + GarageCars + GarageArea +
                WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch +
                ScreenPorch + PoolArea + MiscVal, data = house.train)
alt.fin <- step(alt.mod, direction = "backward", trace = 0)
summary(alt.fin)
## 
## Call:
## lm(formula = SalePrice ~ MasVnrArea + BsmtFinSF1 + TotalBsmtSF + 
##     X1stFlrSF + X2ndFlrSF + GarageCars + WoodDeckSF + OpenPorchSF + 
##     EnclosedPorch + ScreenPorch + PoolArea, data = house.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -566694  -18955    -777   17424  299450 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -17870.659   4277.394  -4.178 3.12e-05 ***
## MasVnrArea        41.897      7.131   5.875 5.22e-09 ***
## BsmtFinSF1        15.463      2.949   5.244 1.81e-07 ***
## TotalBsmtSF       38.763      4.821   8.041 1.84e-15 ***
## X1stFlrSF         57.281      5.288  10.833  < 2e-16 ***
## X2ndFlrSF         62.770      3.017  20.805  < 2e-16 ***
## GarageCars     29113.767   1837.438  15.845  < 2e-16 ***
## WoodDeckSF        50.445      9.589   5.261 1.65e-07 ***
## OpenPorchSF       52.489     18.316   2.866  0.00422 ** 
## EnclosedPorch    -41.824     19.092  -2.191  0.02864 *  
## ScreenPorch       43.765     20.543   2.130  0.03330 *  
## PoolArea         -53.431     28.810  -1.855  0.06386 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42960 on 1448 degrees of freedom
## Multiple R-squared:  0.7098, Adjusted R-squared:  0.7076 
## F-statistic:   322 on 11 and 1448 DF,  p-value: < 2.2e-16
qqnorm(alt.fin$residuals)
qqline(alt.fin$residuals)

house.test <- read.csv("test.csv")

house.test$MSSubClass <- as.factor(house.test$MSSubClass)
house.test$OverallQual <- as.factor(house.test$OverallQual)
house.test$OverallCond <- as.factor(house.test$OverallCond)
house.test$YearBuilt <- as.factor(house.test$YearBuilt)
house.test$YearRemodAdd <- as.factor(house.test$YearRemodAdd)
house.test$BsmtFullBath <- as.factor(house.test$BsmtFullBath)
house.test$BsmtHalfBath <- as.factor(house.test$BsmtHalfBath)
house.test$FullBath <- as.factor(house.test$FullBath)
house.test$HalfBath <- as.factor(house.test$HalfBath)
house.test$BedroomAbvGr <- as.factor(house.test$BedroomAbvGr)
house.test$KitchenAbvGr <- as.factor(house.test$KitchenAbvGr)
house.test$TotRmsAbvGrd <- as.factor(house.test$TotRmsAbvGrd)
house.test$Fireplaces <- as.factor(house.test$Fireplaces)
house.test$MoSold <- as.factor(house.test$MoSold)
house.test$YrSold <- as.factor(house.test$YrSold)
house.test$GarageYrBlt <- as.factor(house.test$GarageYrBlt)

house.test <- clean(house.test)
eval <- predict(alt.fin, newdata = house.test)
eval[eval < 0] <- 0
result <- data.frame('Id' = house.test$Id, 'SalePrice' = eval)
head(result)
##     Id SalePrice
## 1 1461  116307.2
## 2 1462  179398.0
## 3 1463  198198.2
## 4 1464  202048.7
## 5 1465  177967.3
## 6 1466  181959.0
# write.csv(result, file = "result.csv", row.names = FALSE)

Kaggle name: Iden Watanabe Score of Model: 0.22668