Part 1 - Pagerank

Step 1

Form the A matrix. Then, introduce decay and form the B Matrix as we did in the course notes.

# Link matrix, one row per page: entry [i, j] is the probability of moving
# from page i to page j, so every row sums to 1.
A <- rbind(
  c(0, 1/2, 1/2, 0, 0, 0),
  rep(1/6, 6), # replaced row of 0s with uniform probability
  c(1/3, 1/3, 0, 0, 1/3, 0),
  c(0, 0, 0, 0, 1/2, 1/2),
  c(0, 0, 0, 1/2, 0, 1/2),
  c(0, 0, 0, 1, 0, 0)
)

Creating the A matrix as it was done in the lecture will leave a “dangling” row (a row of all zeros) in row 2. This will prevent our power iteration method from converging. To remedy this, we can simply use 1/6 for each element in the row, suggesting that there is an equally likely probability for each position to be selected in the first iteration.

# Damped transition matrix: every entry of A is scaled by the decay factor
# and the remaining (1 - decay) mass is spread evenly over the n pages.
decay <- 0.85
n <- 6
B <- decay * A + (1 - decay) / n
print(B)
##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## [1,] 0.0250000 0.4500000 0.4500000 0.0250000 0.0250000 0.0250000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3083333 0.3083333 0.0250000 0.0250000 0.3083333 0.0250000
## [4,] 0.0250000 0.0250000 0.0250000 0.0250000 0.4500000 0.4500000
## [5,] 0.0250000 0.0250000 0.0250000 0.4500000 0.0250000 0.4500000
## [6,] 0.0250000 0.0250000 0.0250000 0.8750000 0.0250000 0.0250000

Step 2

Start with a uniform rank vector r and perform power iterations on B till convergence. That is, compute the solution $r = B^n \times r$. Attempt this for a sufficiently large n so that r actually converges. (5 Points)

paste("A", seq_len(6), sep = ",")
## [1] "A,1" "A,2" "A,3" "A,4" "A,5" "A,6"
# Uniform starting rank vector, stored as a 1 x 6 row matrix
r <- matrix(rep(1/6, times = 6), nrow = 1, ncol = 6)
# Power iteration with a per-element convergence report.
#
# Multiplies the starting row vector `r` by `decayMat` repeatedly, so that
# after `iter` iterations the result is r %*% decayMat^iter. For each element
# of `r` it then prints the number of iterations after which one further
# multiplication changes that element by less than `tol` (relative change),
# or a notice that the threshold was never met.
#
# Arguments:
#   decayMat  square numeric matrix (the damped transition matrix)
#   r         starting rank vector: numeric vector or 1 x n row matrix with
#             one entry per row of `decayMat`
#   iter      number of iterations to run (non-negative whole number)
#   tol       relative-change threshold for the convergence report
#
# Returns a list of two elements: the rank vector after `iter` iterations
# and the matrix power decayMat^iter.
power_iteration <- function(decayMat, r, iter, tol = 0.0001) {
  stopifnot(
    is.matrix(decayMat),
    nrow(decayMat) == ncol(decayMat),
    length(r) == nrow(decayMat),
    length(iter) == 1,
    iter >= 0,
    iter == round(iter)
  )
  n_elem <- length(r)

  # Row k + 1 holds the rank vector after k iterations. Preallocated so that
  # nothing is grown inside the loop, and sized from `r` instead of assuming
  # exactly six elements.
  history <- matrix(NA_real_, nrow = iter + 1, ncol = n_elem)
  history[1, ] <- as.numeric(r)

  current_r <- r
  # Start from the identity so that exactly `iter` multiplications produce
  # decayMat^iter; starting from decayMat itself would give decayMat^(iter + 1)
  # and skip r %*% decayMat in the recorded history.
  newDecayMat <- diag(n_elem)

  for (k in seq_len(iter)) {
    newDecayMat <- newDecayMat %*% decayMat
    current_r <- r %*% newDecayMat
    history[k + 1, ] <- as.numeric(current_r)
  }

  for (j in seq_len(n_elem)) {
    before <- history[-nrow(history), j]
    after <- history[-1, j]
    # Relative change between consecutive iterates; where the current value
    # is exactly zero, fall back to the absolute change to avoid 0/0.
    denom <- ifelse(before == 0, 1, abs(before))
    rel_change <- abs(after - before) / denom
    first_hit <- which(rel_change < tol)[1]

    if (is.na(first_hit)) {
      print(sprintf("r%d does not meet difference threshold", j))
    } else {
      # history row `first_hit` is the vector after (first_hit - 1) iterations
      print(sprintf(
        "element %d of r converges after %d iterations",
        j, first_hit - 1L
      ))
    }
  }

  list(current_r, newDecayMat)
}
# Run 30 power iterations on B from the uniform start vector
response <- power_iteration(decayMat = B, r = r, iter = 30)
## [1] "element 1 of r converges after 17 iterations"
## [1] "element 2 of r converges after 17 iterations"
## [1] "element 3 of r converges after 17 iterations"
## [1] "element 4 of r converges after 14 iterations"
## [1] "element 5 of r converges after 12 iterations"
## [1] "element 6 of r converges after 14 iterations"

Based on the output above, we can see that convergence occurs roughly around 15 iterations (between 12 and 17, depending on the element).

# Rank vector after the final power iteration (first element of the result)
power_iteration_r <- response[[1]]
power_iteration_r
##            [,1]       [,2]       [,3]      [,4]      [,5]      [,6]
## [1,] 0.05170475 0.07367927 0.05741242 0.3487037 0.1999038 0.2685961
# Final power of the decay matrix. `[[2]]` extracts the matrix itself;
# `[2]` would return it wrapped in a one-element list.
newDecayMat <- response[[2]]
newDecayMat
## [[1]]
##            [,1]       [,2]       [,3]      [,4]      [,5]      [,6]
## [1,] 0.05170476 0.07367928 0.05741243 0.3487037 0.1999038 0.2685961
## [2,] 0.05170475 0.07367927 0.05741242 0.3487037 0.1999038 0.2685961
## [3,] 0.05170475 0.07367928 0.05741242 0.3487037 0.1999038 0.2685961
## [4,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
## [5,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
## [6,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961

Step 3

Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.(10 points)

# Eigen-decomposition of t(B): its eigenvectors are the left eigenvectors of
# B, which is the form the power iteration r %*% B converges to.
ev <- eigen(t(B))
# The decomposition comes back as complex numbers here, so take the real
# parts explicitly instead of relying on as.numeric() to discard the
# imaginary parts by coercion.
Re(ev$values)
## [1]  1.00000000  0.57619235 -0.42500000 -0.42500000 -0.34991524 -0.08461044

We can see from the above that the largest eigenvalue is 1

# Sum of the entries of the eigenvector paired with the first eigenvalue
sum(ev$vectors[, 1])
## [1] 2.019902+0i

However, the sum of the associated eigenvector is greater than 1. This is corrected by rescaling the vector so that its entries sum to 1 (divide each element by the sum of the vector).

# Rescale the first eigenvector so that its entries sum to 1, keeping the
# real part of the result
eigen_r <- Re(1 / sum(ev$vectors[, 1]) * ev$vectors[, 1])
sum(eigen_r)
## [1] 1
eigen_r
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
# Element-wise gap between the power-iteration result and the eigenvector
power_iteration_r - eigen_r
##             [,1]         [,2]         [,3]          [,4]          [,5]
## [1,] 3.29466e-09 5.724804e-09 3.837685e-09 -6.620217e-09 -1.589199e-09
##               [,6]
## [1,] -4.647732e-09

The “power_iteration_r” vector we received after running our power_iteration function is approximately the same as the “eigen_r” vector we received after performing eigenvalue decomposition on the transpose of our transition matrix B; the two differ only on the order of 1e-9.

Step 4

Use the igraph package in R and its page.rank method to compute the Page Rank of the graph as given in A. Note that you don’t need to apply decay. The package starts with a connected graph and applies decay internally. Verify that you do get the same PageRank vector as the two approaches above. (10 points)

# Convert the link matrix A into a weighted graph (entries become edge weights)
webGraph <- igraph::graph_from_adjacency_matrix(A, weighted = TRUE)

# Plot the graph
plot(webGraph)

# Resultant PageRank vector; called through the igraph namespace, like the
# constructor above, so it does not depend on igraph being attached
pr <- igraph::page_rank(webGraph)$vector
pr
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
# Element-wise agreement with the eigenvector solution. A tolerance check is
# used because comparing rounded doubles with == can flip at a rounding
# boundary even when the values agree.
abs(eigen_r - pr) < 1e-5
## [1] TRUE TRUE TRUE TRUE TRUE TRUE

Part 2 - Kaggle MNIST

Step 1-2

Go to Kaggle.com and build an account if you do not already have one. It is free

Go to https://www.kaggle.com/c/digit-recognizer/overview, accept the rules of the competition, and download the data. You will not be required to submit work to Kaggle, but you do need the data.

Step 3

Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)

# First column holds the digit label; the remaining columns are pixel values,
# divided by 255 to bring them into [0, 1]
labels <- train_data[, 1]
data <- train_data[, -1] / 255
# Draw one 28 x 28 image from a flat vector of 784 values.
#
# `array` may be a numeric vector, a matrix column, or a single data-frame
# row; it is flattened to a plain numeric vector first. The values are laid
# out column-wise into a 28 x 28 matrix and the columns are then reversed,
# because image() draws the first matrix column at the bottom of the plot and
# would otherwise render the picture upside-down.
#
# Returns (invisibly) the matrix that was passed to image().
plot_array <- function(array) {
  pixels <- as.numeric(unlist(array))
  stopifnot(length(pixels) == 28 * 28)
  arrayMat <- matrix(pixels, nrow = 28, ncol = 28)
  arrayMat <- arrayMat[, 28:1]
  image(arrayMat)
  invisible(arrayMat)
}
# Plot the first ten images of the dataset, one row at a time
for (img in seq_len(10)) {
  plot_array(data[img, ])
}

Step 4

What is the frequency distribution of the numbers in the dataset? (5 points)

# Relative frequency of each digit; prop.table() divides by the actual number
# of observations instead of a hard-coded row count
prop.table(table(labels))
## labels
##          0          1          2          3          4          5          6 
## 0.09838095 0.11152381 0.09945238 0.10359524 0.09695238 0.09035714 0.09850000 
##          7          8          9 
## 0.10478571 0.09673810 0.09971429

While there is some variation, each label roughly corresponds to 10% of the entire training dataset.

Step 5

For each number, provide the mean pixel intensity. What does this tell you? (5 points)

# Mean pixel intensity over all images whose label equals `target`:
# the per-image (row) means are computed first and then averaged.
get_number_intensity <- function(target, labels, data) {
  is_target <- labels == target
  per_image <- rowMeans(data[is_target, ])
  mean(per_image)
}
# Report the mean pixel intensity for every digit, 0 through 9
# (starting the loop at 1 would silently leave out the digit 0)
for (i in 0:9) {
  mean_intensity <- get_number_intensity(i, labels, data)

  ret_string <- str_interp("Pixel intensity for number ${i} is ${mean_intensity}")

  print(ret_string)
}
## [1] "Pixel intensity for number 1 is 0.075972720428906"
## [1] "Pixel intensity for number 2 is 0.149415262873165"
## [1] "Pixel intensity for number 3 is 0.141657603055012"
## [1] "Pixel intensity for number 4 is 0.121212097314368"
## [1] "Pixel intensity for number 5 is 0.129231294625887"
## [1] "Pixel intensity for number 6 is 0.138730078688473"
## [1] "Pixel intensity for number 7 is 0.1147021021542"
## [1] "Pixel intensity for number 8 is 0.150981134516322"
## [1] "Pixel intensity for number 9 is 0.12281787715086"

My intuition for the mean pixel intensity is that more complex numbers which take up a larger portion of the 28x28 matrix will have a greater mean pixel intensity. This intuition is supported by the fact that 1, a straight line, has the smallest mean pixel intensity. Likewise the number 7 is the second smallest mean pixel intensity. Both of these numbers are comprised of simple straight lines. Alternatively, numbers like 8, 3 and 6, which are typically much curvier, have the highest mean pixel intensities.

Step 6

Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)

# PCA on the scaled pixel columns
train_pca <- prcomp(data)
train_pca_std <- train_pca[["sdev"]]
# Cumulative share of the total variance explained by the leading components
train_pca_cum_var <- cumsum(train_pca_std^2) / sum(train_pca_std^2)
plot(train_pca_cum_var)

# Position of the first component at which the cumulative share exceeds 95%
which.max(train_pca_cum_var > 0.95)
## [1] 154
# Total number of components produced
length(train_pca_cum_var)
## [1] 784
## [1] 784

With 154 components we are able to account for 95% variance. There are 784 total components possible, 1 for each column in the original dataset.

Step 7

Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)

# Each column of the rotation matrix is one principal component; plot the
# first ten as images
train_pca_rot <- train_pca[["rotation"]]
for (pc in seq_len(10)) {
  plot_array(train_pca_rot[, pc])
}

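Above are the first 10 images generated by PCA. The images appear to be noisy, but this is simply due to the fact that they were generated via PCA. Essentially, each PCA image is a principal component: a weighted combination of all 784 pixels that captures a direction of high variance across every digit in the dataset at once, rather than a picture of any single digit.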

Step 8

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)

# Keep only the images labelled 8 and rerun PCA on that subset
x <- data[labels == 8, ]
pca_8 <- prcomp(x)
pca_8_std <- pca_8[["sdev"]]
# Cumulative share of variance explained within the 8s
pca_8_cum_var <- cumsum(pca_8_std^2) / sum(pca_8_std^2)
plot(pca_8_cum_var)

# Plot the first ten principal components of the 8s as images
pca_8_rot <- pca_8[["rotation"]]
for (pc in seq_len(10)) {
  plot_array(pca_8_rot[, pc])
}

As expected, we see multiple representations of the digit 8, blurred due to PCA.

Step 9

An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)

# Hold out 20% of the rows as a test set; the seed makes the split repeatable
test_size <- floor(0.2 * nrow(data))
set.seed(42)
test_index <- sample(seq_len(nrow(data)), size = test_size)

# Training rows: everything not drawn for the test set
train_df <- data[-test_index, ]
train_labels <- c(labels[-test_index, ])[[1]]

# Test rows
test_df <- data[test_index, ]
test_labels <- c(labels[test_index, ])[[1]]
# Multinomial model on all pixel columns; MaxNWts is raised to allow the
# 7860 weights reported in the fit log.
# NOTE(review): the log ends with "stopped after 100 iterations", i.e. the
# optimiser hit its iteration cap rather than converging - consider raising
# the cap via `maxit`.
model <- multinom(train_labels ~ ., train_df, MaxNWts = 100000)
## # weights:  7860 (7065 variable)
## initial  value 77366.859125 
## iter  10 value 22233.410474
## iter  20 value 18330.173557
## iter  30 value 17277.223639
## iter  40 value 16671.149301
## iter  50 value 16142.222215
## iter  60 value 15530.180131
## iter  70 value 14349.926185
## iter  80 value 12674.327346
## iter  90 value 11166.744868
## iter 100 value 10266.648048
## final  value 10266.648048 
## stopped after 100 iterations
# Predict the held-out rows and compare against the true labels
predictions <- predict(model, newdata = test_df)
test_labels <- factor(test_labels)
confusionMatrix(data = predictions, reference = test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 791   0  10   5   1  12   4   1   9   7
##          1   1 906  12   7   3  16   7  20  24   7
##          2   0   4 699  23   4   4   7   7   6   2
##          3   0   9  15 777   1  27   0   3  32  21
##          4   2   1  18   4 763  17  11  13   7  36
##          5   5   2   4  26   0 644  14   4  23   7
##          6  12   2  14   9   4  16 796   2   6   0
##          7   2   1  13  13   3   4   3 816   3  21
##          8   5  10  28  16   5  13   5   2 658  11
##          9   2   0   4  14  23  11   0  47  11 710
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8934, 0.9063)
##     No Information Rate : 0.1113          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8888          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.96463   0.9690  0.85557   0.8691  0.94548  0.84293
## Specificity           0.99354   0.9870  0.99248   0.9856  0.98564  0.98887
## Pos Pred Value        0.94167   0.9033  0.92460   0.8780  0.87500  0.88340
## Neg Pred Value        0.99616   0.9961  0.98456   0.9844  0.99416  0.98436
## Prevalence            0.09762   0.1113  0.09726   0.1064  0.09607  0.09095
## Detection Rate        0.09417   0.1079  0.08321   0.0925  0.09083  0.07667
## Detection Prevalence  0.10000   0.1194  0.09000   0.1054  0.10381  0.08679
## Balanced Accuracy     0.97908   0.9780  0.92403   0.9274  0.96556  0.91590
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.93979  0.89180  0.84467  0.86375
## Specificity           0.99139  0.99158  0.98753  0.98522
## Pos Pred Value        0.92451  0.92833  0.87384  0.86375
## Neg Pred Value        0.99324  0.98684  0.98418  0.98522
## Prevalence            0.10083  0.10893  0.09274  0.09786
## Detection Rate        0.09476  0.09714  0.07833  0.08452
## Detection Prevalence  0.10250  0.10464  0.08964  0.09786
## Balanced Accuracy     0.96559  0.94169  0.91610  0.92448

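Overall accuracy of the model was very strong, at 90%. Reviewing the per-class statistics shows us that digits 1 and 0 have the highest sensitivity (about 0.97 and 0.96), while 0 and 7 have the highest precision (positive predictive value, about 0.94 and 0.93). The more complex digits, 8 and 5 in particular, score the lowest sensitivity (about 0.84).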

Part 3 - House Prices

Step 1

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Univariate descriptive statistics: summary() prints one summary block per
# column of train_data
summary(train_data)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

There are a lot of features here to consider, with a roughly even split between numeric and categorical data. We will need to think about ways to attack missing values later.

# Scatter plot of 1stFlrSF (first floor square feet) against SalePrice.
#
# We see a positive correlation between the independent variable '1stFlrSF'
# and the dependent variable 'SalePrice': as the first floor square footage
# increases, so does the price.

ggplot(train_data, aes(x = `1stFlrSF`, y = SalePrice)) +
  geom_point(position = "jitter") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Scatter plot of TotalBsmtSF (basement square feet) against SalePrice.
#
# We see a positive correlation between the independent variable
# 'TotalBsmtSF' and the dependent variable 'SalePrice': as the basement
# square footage increases, so does the price.

ggplot(train_data, aes(x = TotalBsmtSF, y = SalePrice)) +
  geom_point(position = "jitter") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Correlation matrix of three predictors and the response (SalePrice)
correlation <- cor(train_data[,c("LotArea","TotalBsmtSF","1stFlrSF","SalePrice")])
correlation
##               LotArea TotalBsmtSF  1stFlrSF SalePrice
## LotArea     1.0000000   0.2608331 0.2994746 0.2638434
## TotalBsmtSF 0.2608331   1.0000000 0.8195300 0.6135806
## 1stFlrSF    0.2994746   0.8195300 1.0000000 0.6058522
## SalePrice   0.2638434   0.6135806 0.6058522 1.0000000
# Test H0: correlation between LotArea and TotalBsmtSF is 0, with an 80% confidence interval
cor.test(train_data$LotArea, train_data$TotalBsmtSF, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_data$LotArea and train_data$TotalBsmtSF
## t = 10.317, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2292786 0.2918400
## sample estimates:
##       cor 
## 0.2608331
# Test H0: correlation between TotalBsmtSF and 1stFlrSF is 0, with an 80% confidence interval
cor.test(train_data$TotalBsmtSF, train_data$`1stFlrSF`, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_data$TotalBsmtSF and train_data$`1stFlrSF`
## t = 54.609, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.8081976 0.8302556
## sample estimates:
##     cor 
## 0.81953
# Test H0: correlation between LotArea and 1stFlrSF is 0, with an 80% confidence interval
cor.test(train_data$LotArea, train_data$`1stFlrSF`, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_data$LotArea and train_data$`1stFlrSF`
## t = 11.985, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2686127 0.3297222
## sample estimates:
##       cor 
## 0.2994746

All of the above tests indicate that we can reject the null hypothesis that the correlation is zero between the features. As we are dealing with many features in this dataset, we should be worried about familywise error.

Part 2

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points

# Invert the correlation matrix; the result is the precision matrix
inverseMat <- solve(correlation)
inverseMat
##                   LotArea   TotalBsmtSF   1stFlrSF  SalePrice
## LotArea      1.1116224106 -0.0005917129 -0.2448005 -0.1446182
## TotalBsmtSF -0.0005917129  3.2603189969 -2.3064606 -0.6029380
## 1stFlrSF    -0.2448004553 -2.3064606461  3.2656838 -0.4987333
## SalePrice   -0.1446182309 -0.6029379875 -0.4987333  1.7102662
# Precision matrix times correlation matrix (the reverse order the exercise
# also asks for); like the product below it should be the identity matrix up
# to floating-point rounding
mult_rev <- inverseMat %*% correlation
mult_rev
# Correlation matrix times precision matrix
mult <- correlation %*% inverseMat
mult
##                  LotArea   TotalBsmtSF      1stFlrSF    SalePrice
## LotArea     1.000000e+00  2.775558e-17  5.551115e-17 5.551115e-17
## TotalBsmtSF 1.387779e-17  1.000000e+00  1.665335e-16 2.220446e-16
## 1stFlrSF    6.938894e-17 -5.551115e-17  1.000000e+00 2.220446e-16
## SalePrice   2.775558e-17 -3.330669e-16 -5.551115e-17 1.000000e+00
# LU decomposition of the correlation matrix; expand() lists the factors
# (shown below as $L, $U and the permutation $P)
lu_decomp <- lu(correlation)
expand(lu_decomp)
## $L
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]      [,2]      [,3]      [,4]     
## [1,] 1.0000000         .         .         .
## [2,] 0.2608331 1.0000000         .         .
## [3,] 0.2994746 0.7955408 1.0000000         .
## [4,] 0.2638434 0.5845293 0.2916115 1.0000000
## 
## $U
## 4 x 4 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]       [,4]      
## [1,] 1.00000000 0.26083313 0.29947458 0.26384335
## [2,]          . 0.93196608 0.74141708 0.54476146
## [3,]          .          . 0.32048740 0.09345781
## [4,]          .          .          . 0.58470428
## 
## $P
## 4 x 4 sparse Matrix of class "pMatrix"
##             
## [1,] | . . .
## [2,] . | . .
## [3,] . . | .
## [4,] . . . |

Part 3

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points

MasVnrArea is heavily right skewed. Additionally, there are some missing values that will cause us problems when attempting the fitdistr() method.

# Number of missing MasVnrArea observations.
sum(is.na(train_data[["MasVnrArea"]]))
## [1] 8
# Replace missing values with the mean of the column so that fitdistr() gets a
# complete vector.
masvnr_mean <- mean(train_data$MasVnrArea, na.rm = TRUE)
train_data$MasVnrArea[is.na(train_data$MasVnrArea)] <- masvnr_mean

# Fit an exponential distribution; the first element of the fit holds the
# estimated rate (lambda).
lambda <- fitdistr(train_data$MasVnrArea, densfun = "exponential")
lambda[[1]]
##        rate 
## 0.009644572
# Draw 1000 values from the fitted exponential distribution and plot them.
samples <- rexp(1000, lambda[[1]])
hist(samples, breaks = 50)

# Histogram of the original variable, for comparison with the simulated draws.
hist(train_data$MasVnrArea, breaks = 50)

# 5th and 95th percentiles of the fitted exponential distribution.
exp_bounds <- qexp(c(0.05, 0.95), rate = lambda[[1]])
lower_conf <- exp_bounds[1]
upper_conf <- exp_bounds[2]

print(c(lower_conf, upper_conf))
## [1]   5.318359 310.613285
# Empirical 5th and 95th percentiles of the observed MasVnrArea values.
# NOTE: this previously read from `data` rather than `train_data`, which is why
# the rendered output shows NA for both percentiles; re-knit to refresh it.
lower_conf <- quantile(train_data$MasVnrArea, 0.05)
upper_conf <- quantile(train_data$MasVnrArea, 0.95)

print(c(lower_conf, upper_conf))
##  5% 95% 
##  NA  NA

Part 4

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 points

First let’s check to see which columns have the most NAs

# Count the NA entries in every column of the training data.
vapply(train_data, function(col) sum(is.na(col)), integer(1))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             0             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical      1stFlrSF      2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch     3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0

We can immediately remove the columns that have over 1000 NA values (or are the ID)

And for the rest we can impute the missing values:

  • Using mean imputation for numeric columns.
  • For categorical columns, we will replace missing values with “missing”
# Mean imputation: fill every NA in `x` with the mean of the non-missing values.
replace_numeric <- function(x) {
  missing_idx <- is.na(x)
  x[missing_idx] <- mean(x, na.rm = TRUE)
  x
}

# Fill every NA in a categorical vector with the literal label "missing".
#
# Character vectors are handled directly. Factors need the "missing" level
# added first; otherwise the assignment warns about an invalid factor level
# and leaves the NA in place.
replace_category <- function(x) {
  missing_idx <- is.na(x)
  if (is.factor(x)) {
    if (!any(missing_idx)) {
      return(x)
    }
    levels(x) <- union(levels(x), "missing")
  }
  replace(x, missing_idx, "missing")
}
# Prepare a housing data frame for modelling.
#
# Drops the Id column and the four columns that are almost entirely NA, then
# imputes what is left: numeric columns via replace_numeric(), all other
# columns via replace_category(). The result has the numeric columns first,
# followed by the categorical ones.
#
# Predicates are wrapped in where(), as the tidyselect deprecation warning
# shown in the rendered output for this chunk asks; with this change that
# warning is no longer emitted.
clean_df <- function(dataframe) {

  dataframe <- dataframe %>%
    dplyr::select(-Id, -Alley, -PoolQC, -Fence, -MiscFeature)

  train_numeric <- dataframe %>%
    dplyr::select(where(is.numeric))

  train_category <- dataframe %>%
    dplyr::select(!where(is.numeric))

  # Impute column by column; replace(df, TRUE, ...) overwrites every column.
  train_numeric <- replace(train_numeric, TRUE, lapply(train_numeric, replace_numeric))
  train_category <- replace(train_category, TRUE, lapply(train_category, replace_category))

  train_data_clean <- cbind(train_numeric, train_category)

  return(train_data_clean)
}
train_data_clean <- clean_df(train_data)
## Warning: Predicate functions must be wrapped in `where()`.
## 
##   # Bad
##   data %>% select(is.numeric)
## 
##   # Good
##   data %>% select(where(is.numeric))
## 
## ℹ Please update your code.
## This message is displayed once per session.
# Baseline model: regress SalePrice on every other column of the cleaned data.
model <- lm(
  SalePrice ~ .,
  data = train_data_clean
)
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -180209   -9385     386    9615  180209 
## 
## Coefficients: (8 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.017e+05  1.055e+06  -0.570 0.568617    
## MSSubClass           -3.211e+01  8.282e+01  -0.388 0.698306    
## LotFrontage           5.448e+01  4.353e+01   1.252 0.210930    
## LotArea               7.382e-01  1.094e-01   6.745 2.35e-11 ***
## OverallQual           6.639e+03  1.013e+03   6.551 8.43e-11 ***
## OverallCond           5.752e+03  8.730e+02   6.589 6.58e-11 ***
## YearBuilt             3.267e+02  7.692e+01   4.247 2.33e-05 ***
## YearRemodAdd          9.663e+01  5.539e+01   1.745 0.081308 .  
## MasVnrArea            1.941e+01  5.783e+00   3.357 0.000811 ***
## BsmtFinSF1            3.916e+01  5.301e+00   7.388 2.76e-13 ***
## BsmtFinSF2            3.036e+01  9.076e+00   3.345 0.000848 ***
## BsmtUnfSF             2.059e+01  4.856e+00   4.241 2.39e-05 ***
## TotalBsmtSF                  NA         NA      NA       NA    
## `1stFlrSF`            4.671e+01  5.624e+00   8.307 2.59e-16 ***
## `2ndFlrSF`            6.612e+01  5.606e+00  11.795  < 2e-16 ***
## LowQualFinSF          1.032e+01  1.846e+01   0.559 0.576211    
## GrLivArea                    NA         NA      NA       NA    
## BsmtFullBath          8.231e+02  1.982e+03   0.415 0.677991    
## BsmtHalfBath         -2.303e+02  3.029e+03  -0.076 0.939403    
## FullBath              3.651e+03  2.209e+03   1.652 0.098697 .  
## HalfBath              1.223e+03  2.099e+03   0.583 0.560232    
## BedroomAbvGr         -3.427e+03  1.363e+03  -2.514 0.012070 *  
## KitchenAbvGr         -1.303e+04  5.678e+03  -2.295 0.021929 *  
## TotRmsAbvGrd          1.072e+03  9.499e+02   1.129 0.259204    
## Fireplaces            6.386e+03  2.560e+03   2.494 0.012763 *  
## GarageYrBlt          -2.068e+01  6.124e+01  -0.338 0.735678    
## GarageCars            3.976e+03  2.280e+03   1.744 0.081374 .  
## GarageArea            1.820e+01  7.920e+00   2.298 0.021746 *  
## WoodDeckSF            1.448e+01  5.853e+00   2.474 0.013497 *  
## OpenPorchSF           2.805e+00  1.154e+01   0.243 0.807929    
## EnclosedPorch         4.774e+00  1.242e+01   0.384 0.700803    
## `3SsnPorch`           3.213e+01  2.251e+01   1.428 0.153686    
## ScreenPorch           2.861e+01  1.231e+01   2.324 0.020313 *  
## PoolArea              8.283e+01  1.848e+01   4.482 8.08e-06 ***
## MiscVal              -4.912e-02  1.419e+00  -0.035 0.972398    
## MoSold               -4.403e+02  2.453e+02  -1.795 0.072834 .  
## YrSold               -4.798e+02  5.167e+02  -0.929 0.353275    
## MSZoningFV            3.350e+04  1.196e+04   2.800 0.005186 ** 
## MSZoningRH            2.443e+04  1.191e+04   2.050 0.040533 *  
## MSZoningRL            2.645e+04  1.019e+04   2.596 0.009551 ** 
## MSZoningRM            2.322e+04  9.527e+03   2.438 0.014922 *  
## StreetPave            3.223e+04  1.209e+04   2.665 0.007790 ** 
## LotShapeIR2           3.710e+03  4.238e+03   0.875 0.381526    
## LotShapeIR3           4.766e+03  8.909e+03   0.535 0.592758    
## LotShapeReg           1.684e+03  1.611e+03   1.045 0.296155    
## LandContourHLS        7.642e+03  5.148e+03   1.484 0.137972    
## LandContourLow       -1.128e+04  6.419e+03  -1.758 0.079082 .  
## LandContourLvl        5.419e+03  3.705e+03   1.463 0.143813    
## UtilitiesNoSeWa      -2.924e+04  2.650e+04  -1.103 0.270212    
## LotConfigCulDSac      9.094e+03  3.317e+03   2.742 0.006201 ** 
## LotConfigFR2         -7.513e+03  4.034e+03  -1.862 0.062797 .  
## LotConfigFR3         -1.595e+04  1.262e+04  -1.263 0.206677    
## LotConfigInside      -1.211e+03  1.794e+03  -0.675 0.499839    
## LandSlopeMod          7.361e+03  3.987e+03   1.846 0.065116 .  
## LandSlopeSev         -4.331e+04  1.146e+04  -3.780 0.000164 ***
## NeighborhoodBlueste  -6.013e+02  1.904e+04  -0.032 0.974810    
## NeighborhoodBrDale   -9.287e+02  1.101e+04  -0.084 0.932813    
## NeighborhoodBrkSide  -4.637e+03  9.461e+03  -0.490 0.624128    
## NeighborhoodClearCr  -1.516e+04  9.243e+03  -1.640 0.101285    
## NeighborhoodCollgCr  -1.041e+04  7.277e+03  -1.431 0.152770    
## NeighborhoodCrawfor   1.136e+04  8.587e+03   1.323 0.185949    
## NeighborhoodEdwards  -2.068e+04  8.021e+03  -2.578 0.010043 *  
## NeighborhoodGilbert  -1.096e+04  7.699e+03  -1.424 0.154776    
## NeighborhoodIDOTRR   -1.070e+04  1.075e+04  -0.996 0.319684    
## NeighborhoodMeadowV  -7.015e+03  1.123e+04  -0.624 0.532436    
## NeighborhoodMitchel  -2.277e+04  8.187e+03  -2.781 0.005498 ** 
## NeighborhoodNAmes    -1.645e+04  7.846e+03  -2.096 0.036260 *  
## NeighborhoodNoRidge   2.522e+04  8.446e+03   2.986 0.002882 ** 
## NeighborhoodNPkVill   1.362e+04  1.412e+04   0.965 0.334672    
## NeighborhoodNridgHt   1.754e+04  7.544e+03   2.324 0.020264 *  
## NeighborhoodNWAmes   -1.806e+04  8.033e+03  -2.248 0.024784 *  
## NeighborhoodOldTown  -1.434e+04  9.610e+03  -1.492 0.136004    
## NeighborhoodSawyer   -1.072e+04  8.152e+03  -1.315 0.188904    
## NeighborhoodSawyerW  -4.150e+03  7.810e+03  -0.531 0.595224    
## NeighborhoodSomerst  -2.158e+03  9.038e+03  -0.239 0.811330    
## NeighborhoodStoneBr   3.775e+04  8.334e+03   4.530 6.48e-06 ***
## NeighborhoodSWISU    -8.225e+03  9.726e+03  -0.846 0.397867    
## NeighborhoodTimber   -1.065e+04  8.170e+03  -1.304 0.192578    
## NeighborhoodVeenker   9.633e+02  1.053e+04   0.091 0.927156    
## Condition1Feedr       5.573e+03  5.001e+03   1.114 0.265336    
## Condition1Norm        1.476e+04  4.166e+03   3.543 0.000410 ***
## Condition1PosA        7.112e+03  1.002e+04   0.710 0.477796    
## Condition1PosN        1.323e+04  7.442e+03   1.778 0.075600 .  
## Condition1RRAe       -1.654e+04  9.090e+03  -1.820 0.069070 .  
## Condition1RRAn        8.259e+03  6.865e+03   1.203 0.229200    
## Condition1RRNe       -3.861e+03  1.758e+04  -0.220 0.826183    
## Condition1RRNn        6.212e+03  1.284e+04   0.484 0.628569    
## Condition2Feedr      -2.408e+03  2.345e+04  -0.103 0.918233    
## Condition2Norm       -4.021e+03  2.035e+04  -0.198 0.843411    
## Condition2PosA        4.107e+04  3.721e+04   1.104 0.269886    
## Condition2PosN       -2.375e+05  2.779e+04  -8.545  < 2e-16 ***
## Condition2RRAe       -1.221e+05  4.634e+04  -2.635 0.008530 ** 
## Condition2RRAn       -1.585e+04  3.165e+04  -0.501 0.616626    
## Condition2RRNn        3.872e+03  2.718e+04   0.142 0.886734    
## BldgType2fmCon       -4.976e+03  1.250e+04  -0.398 0.690572    
## BldgTypeDuplex       -9.392e+03  7.374e+03  -1.274 0.203030    
## BldgTypeTwnhs        -1.983e+04  9.999e+03  -1.983 0.047595 *  
## BldgTypeTwnhsE       -1.626e+04  8.993e+03  -1.808 0.070860 .  
## HouseStyle1.5Unf      1.206e+04  7.924e+03   1.522 0.128228    
## HouseStyle1Story      7.201e+03  4.353e+03   1.654 0.098347 .  
## HouseStyle2.5Fin     -2.313e+04  1.229e+04  -1.881 0.060173 .  
## HouseStyle2.5Unf     -8.943e+03  9.271e+03  -0.965 0.334890    
## HouseStyle2Story     -6.048e+03  3.511e+03  -1.723 0.085228 .  
## HouseStyleSFoyer      1.370e+03  6.279e+03   0.218 0.827273    
## HouseStyleSLvl        2.925e+03  5.563e+03   0.526 0.599164    
## RoofStyleGable        6.147e+03  1.847e+04   0.333 0.739366    
## RoofStyleGambrel      9.688e+03  2.018e+04   0.480 0.631341    
## RoofStyleHip          6.020e+03  1.853e+04   0.325 0.745388    
## RoofStyleMansard      1.843e+04  2.147e+04   0.859 0.390753    
## RoofStyleShed         9.839e+04  3.473e+04   2.833 0.004687 ** 
## RoofMatlCompShg       6.837e+05  3.376e+04  20.252  < 2e-16 ***
## RoofMatlMembran       7.756e+05  4.811e+04  16.119  < 2e-16 ***
## RoofMatlMetal         7.469e+05  4.711e+04  15.856  < 2e-16 ***
## RoofMatlRoll          6.718e+05  4.217e+04  15.932  < 2e-16 ***
## RoofMatlTar&Grv       6.879e+05  3.848e+04  17.875  < 2e-16 ***
## RoofMatlWdShake       6.758e+05  3.731e+04  18.115  < 2e-16 ***
## RoofMatlWdShngl       7.361e+05  3.500e+04  21.034  < 2e-16 ***
## Exterior1stAsphShn   -2.528e+04  3.317e+04  -0.762 0.446087    
## Exterior1stBrkComm   -1.215e+04  2.778e+04  -0.438 0.661763    
## Exterior1stBrkFace    4.886e+03  1.277e+04   0.383 0.702128    
## Exterior1stCBlock    -1.889e+04  2.727e+04  -0.693 0.488632    
## Exterior1stCemntBd   -1.367e+04  1.910e+04  -0.716 0.474428    
## Exterior1stHdBoard   -1.574e+04  1.296e+04  -1.214 0.224840    
## Exterior1stImStucc   -4.463e+04  2.788e+04  -1.600 0.109765    
## Exterior1stMetalSd   -7.602e+03  1.464e+04  -0.519 0.603574    
## Exterior1stPlywood   -1.729e+04  1.279e+04  -1.352 0.176603    
## Exterior1stStone     -5.368e+03  2.438e+04  -0.220 0.825810    
## Exterior1stStucco    -8.274e+03  1.413e+04  -0.585 0.558327    
## Exterior1stVinylSd   -1.647e+04  1.328e+04  -1.240 0.215212    
## Exterior1stWd Sdng   -1.571e+04  1.237e+04  -1.270 0.204312    
## Exterior1stWdShing   -1.183e+04  1.335e+04  -0.886 0.375896    
## Exterior2ndAsphShn    1.074e+04  2.229e+04   0.482 0.630021    
## Exterior2ndBrk Cmn    8.616e+03  2.018e+04   0.427 0.669423    
## Exterior2ndBrkFace    5.453e+03  1.325e+04   0.411 0.680870    
## Exterior2ndCBlock            NA         NA      NA       NA    
## Exterior2ndCmentBd    1.338e+04  1.879e+04   0.712 0.476486    
## Exterior2ndHdBoard    1.031e+04  1.246e+04   0.827 0.408290    
## Exterior2ndImStucc    2.798e+04  1.427e+04   1.960 0.050169 .  
## Exterior2ndMetalSd    6.564e+03  1.426e+04   0.460 0.645457    
## Exterior2ndOther     -1.419e+04  2.725e+04  -0.521 0.602739    
## Exterior2ndPlywood    8.287e+03  1.210e+04   0.685 0.493666    
## Exterior2ndStone     -9.985e+03  1.729e+04  -0.578 0.563687    
## Exterior2ndStucco     8.188e+03  1.361e+04   0.602 0.547503    
## Exterior2ndVinylSd    1.474e+04  1.284e+04   1.148 0.251178    
## Exterior2ndWd Sdng    1.250e+04  1.198e+04   1.044 0.296821    
## Exterior2ndWd Shng    7.176e+03  1.246e+04   0.576 0.564781    
## MasVnrTypeBrkFace     6.170e+03  6.811e+03   0.906 0.365212    
## MasVnrTypemissing     1.812e+03  1.085e+04   0.167 0.867316    
## MasVnrTypeNone        9.548e+03  6.875e+03   1.389 0.165130    
## MasVnrTypeStone       1.140e+04  7.211e+03   1.581 0.114192    
## ExterQualFa          -7.029e+03  1.098e+04  -0.640 0.522140    
## ExterQualGd          -2.021e+04  4.800e+03  -4.210 2.74e-05 ***
## ExterQualTA          -2.087e+04  5.306e+03  -3.933 8.85e-05 ***
## ExterCondFa          -3.610e+03  1.810e+04  -0.199 0.841933    
## ExterCondGd          -7.600e+03  1.729e+04  -0.440 0.660322    
## ExterCondPo           4.216e+03  3.140e+04   0.134 0.893203    
## ExterCondTA          -4.976e+03  1.725e+04  -0.289 0.772974    
## FoundationCBlock      2.634e+03  3.184e+03   0.827 0.408248    
## FoundationPConc       3.814e+03  3.431e+03   1.112 0.266489    
## FoundationSlab       -8.872e+03  1.007e+04  -0.881 0.378294    
## FoundationStone       7.636e+03  1.106e+04   0.691 0.489980    
## FoundationWood       -2.995e+04  1.479e+04  -2.025 0.043067 *  
## BsmtQualFa           -1.274e+04  6.348e+03  -2.006 0.045058 *  
## BsmtQualGd           -1.864e+04  3.354e+03  -5.556 3.39e-08 ***
## BsmtQualmissing       3.512e+04  3.683e+04   0.954 0.340457    
## BsmtQualTA           -1.551e+04  4.150e+03  -3.738 0.000194 ***
## BsmtCondGd           -4.895e+02  5.287e+03  -0.093 0.926244    
## BsmtCondmissing              NA         NA      NA       NA    
## BsmtCondPo            7.159e+04  2.996e+04   2.389 0.017039 *  
## BsmtCondTA            2.839e+03  4.253e+03   0.667 0.504604    
## BsmtExposureGd        1.305e+04  3.005e+03   4.343 1.52e-05 ***
## BsmtExposuremissing  -1.079e+04  2.316e+04  -0.466 0.641440    
## BsmtExposureMn       -4.383e+03  3.035e+03  -1.444 0.148959    
## BsmtExposureNo       -5.703e+03  2.191e+03  -2.602 0.009373 ** 
## BsmtFinType1BLQ       2.015e+03  2.797e+03   0.720 0.471382    
## BsmtFinType1GLQ       5.635e+03  2.522e+03   2.235 0.025617 *  
## BsmtFinType1LwQ      -3.455e+03  3.755e+03  -0.920 0.357780    
## BsmtFinType1missing          NA         NA      NA       NA    
## BsmtFinType1Rec      -3.913e+02  2.993e+03  -0.131 0.896005    
## BsmtFinType1Unf       2.877e+03  2.918e+03   0.986 0.324330    
## BsmtFinType2BLQ      -1.307e+04  7.577e+03  -1.725 0.084801 .  
## BsmtFinType2GLQ      -2.653e+03  9.367e+03  -0.283 0.777079    
## BsmtFinType2LwQ      -1.502e+04  7.393e+03  -2.032 0.042411 *  
## BsmtFinType2missing  -2.811e+04  2.513e+04  -1.119 0.263392    
## BsmtFinType2Rec      -1.029e+04  7.118e+03  -1.446 0.148414    
## BsmtFinType2Unf      -8.904e+03  7.576e+03  -1.175 0.240105    
## HeatingGasA           1.163e+04  2.572e+04   0.452 0.651218    
## HeatingGasW           8.187e+03  2.652e+04   0.309 0.757552    
## HeatingGrav           4.763e+03  2.816e+04   0.169 0.865725    
## HeatingOthW          -1.102e+04  3.154e+04  -0.349 0.726963    
## HeatingWall           2.601e+04  2.985e+04   0.871 0.383724    
## HeatingQCFa           6.932e+02  4.680e+03   0.148 0.882275    
## HeatingQCGd          -3.600e+03  2.072e+03  -1.737 0.082553 .  
## HeatingQCPo           3.434e+03  2.672e+04   0.129 0.897770    
## HeatingQCTA          -3.535e+03  2.067e+03  -1.710 0.087513 .  
## CentralAirY           2.484e+02  3.886e+03   0.064 0.949043    
## ElectricalFuseF      -2.531e+02  5.774e+03  -0.044 0.965036    
## ElectricalFuseP      -8.006e+03  1.859e+04  -0.431 0.666873    
## Electricalmissing     1.373e+04  2.423e+04   0.566 0.571265    
## ElectricalMix        -4.751e+04  4.467e+04  -1.064 0.287710    
## ElectricalSBrkr      -1.333e+03  2.959e+03  -0.451 0.652417    
## KitchenQualFa        -1.923e+04  6.217e+03  -3.093 0.002026 ** 
## KitchenQualGd        -2.421e+04  3.494e+03  -6.929 6.84e-12 ***
## KitchenQualTA        -2.198e+04  3.936e+03  -5.584 2.89e-08 ***
## FunctionalMaj2       -1.315e+03  1.447e+04  -0.091 0.927601    
## FunctionalMin1        9.318e+03  8.646e+03   1.078 0.281342    
## FunctionalMin2        1.100e+04  8.656e+03   1.271 0.203869    
## FunctionalMod        -7.700e+02  1.045e+04  -0.074 0.941269    
## FunctionalSev        -4.439e+04  2.949e+04  -1.505 0.132548    
## FunctionalTyp         2.047e+04  7.501e+03   2.729 0.006443 ** 
## FireplaceQuFa        -1.988e+03  6.871e+03  -0.289 0.772400    
## FireplaceQuGd         2.167e+03  5.326e+03   0.407 0.684125    
## FireplaceQumissing    8.347e+03  6.248e+03   1.336 0.181830    
## FireplaceQuPo         8.983e+03  7.850e+03   1.144 0.252702    
## FireplaceQuTA         3.260e+03  5.541e+03   0.588 0.556380    
## GarageTypeAttchd      1.960e+04  1.098e+04   1.785 0.074576 .  
## GarageTypeBasment     2.271e+04  1.273e+04   1.784 0.074608 .  
## GarageTypeBuiltIn     1.886e+04  1.143e+04   1.650 0.099237 .  
## GarageTypeCarPort     2.569e+04  1.464e+04   1.755 0.079466 .  
## GarageTypeDetchd      2.271e+04  1.099e+04   2.066 0.039038 *  
## GarageTypemissing     2.384e+04  2.079e+04   1.147 0.251736    
## GarageFinishmissing          NA         NA      NA       NA    
## GarageFinishRFn      -2.732e+03  1.968e+03  -1.388 0.165301    
## GarageFinishUnf      -1.223e+01  2.424e+03  -0.005 0.995974    
## GarageQualFa         -1.164e+05  3.013e+04  -3.862 0.000118 ***
## GarageQualGd         -1.085e+05  3.089e+04  -3.511 0.000463 ***
## GarageQualmissing            NA         NA      NA       NA    
## GarageQualPo         -1.320e+05  3.847e+04  -3.431 0.000621 ***
## GarageQualTA         -1.106e+05  2.983e+04  -3.707 0.000219 ***
## GarageCondFa          1.042e+05  3.475e+04   3.000 0.002756 ** 
## GarageCondGd          1.037e+05  3.586e+04   2.891 0.003912 ** 
## GarageCondmissing            NA         NA      NA       NA    
## GarageCondPo          1.092e+05  3.729e+04   2.929 0.003462 ** 
## GarageCondTA          1.055e+05  3.442e+04   3.067 0.002213 ** 
## PavedDriveP          -2.750e+03  5.552e+03  -0.495 0.620487    
## PavedDriveY           2.161e+01  3.469e+03   0.006 0.995031    
## SaleTypeCon           2.626e+04  1.764e+04   1.489 0.136876    
## SaleTypeConLD         1.649e+04  9.706e+03   1.699 0.089532 .  
## SaleTypeConLI         4.348e+03  1.157e+04   0.376 0.707083    
## SaleTypeConLw         1.261e+03  1.220e+04   0.103 0.917688    
## SaleTypeCWD           1.566e+04  1.291e+04   1.213 0.225406    
## SaleTypeNew           2.281e+04  1.551e+04   1.471 0.141587    
## SaleTypeOth           8.426e+03  1.444e+04   0.584 0.559586    
## SaleTypeWD            8.561e+01  4.195e+03   0.020 0.983722    
## SaleConditionAdjLand  7.512e+03  1.454e+04   0.517 0.605571    
## SaleConditionAlloca   6.021e+03  8.633e+03   0.697 0.485646    
## SaleConditionFamily  -7.042e+02  6.101e+03  -0.115 0.908127    
## SaleConditionNormal   5.535e+03  2.895e+03   1.912 0.056130 .  
## SaleConditionPartial -2.294e+03  1.492e+04  -0.154 0.877826    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22770 on 1219 degrees of freedom
## Multiple R-squared:  0.9314, Adjusted R-squared:  0.9178 
## F-statistic: 68.92 on 240 and 1219 DF,  p-value: < 2.2e-16
# Second model: refit on the subset of predictors picked out from the first
# model's summary.
model2 <- lm(
  SalePrice ~ LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd +
    MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + `1stFlrSF` +
    `2ndFlrSF` + FullBath + BedroomAbvGr + KitchenAbvGr + Fireplaces +
    GarageCars + GarageArea + WoodDeckSF + ScreenPorch + PoolArea + MoSold +
    MSZoning + LandContour + Street + LotConfig + LandSlope + Neighborhood +
    Condition1 + Condition2 + BldgType + RoofStyle + RoofMatl + ExterQual +
    Foundation + BsmtQual + BsmtCond + BsmtExposure,
  data = train_data_clean
)
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond + 
##     YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + 
##     BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + FullBath + BedroomAbvGr + 
##     KitchenAbvGr + Fireplaces + GarageCars + GarageArea + WoodDeckSF + 
##     ScreenPorch + PoolArea + MoSold + MSZoning + LandContour + 
##     Street + LotConfig + LandSlope + Neighborhood + Condition1 + 
##     Condition2 + BldgType + RoofStyle + RoofMatl + ExterQual + 
##     Foundation + BsmtQual + BsmtCond + BsmtExposure, data = train_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179169  -10422     563   10262  179169 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.635e+06  1.523e+05 -10.737  < 2e-16 ***
## LotArea              7.391e-01  1.000e-01   7.390 2.56e-13 ***
## OverallQual          7.394e+03  9.648e+02   7.664 3.43e-14 ***
## OverallCond          6.309e+03  7.913e+02   7.973 3.27e-15 ***
## YearBuilt            3.608e+02  6.471e+01   5.575 2.99e-08 ***
## YearRemodAdd         1.276e+02  5.019e+01   2.543 0.011112 *  
## MasVnrArea           9.129e+00  4.657e+00   1.960 0.050157 .  
## BsmtFinSF1           4.367e+01  4.278e+00  10.208  < 2e-16 ***
## BsmtFinSF2           2.760e+01  5.747e+00   4.802 1.74e-06 ***
## BsmtUnfSF            2.550e+01  4.199e+00   6.072 1.64e-09 ***
## `1stFlrSF`           5.045e+01  4.528e+00  11.142  < 2e-16 ***
## `2ndFlrSF`           5.949e+01  2.835e+00  20.988  < 2e-16 ***
## FullBath             2.105e+03  1.995e+03   1.055 0.291720    
## BedroomAbvGr        -4.945e+03  1.201e+03  -4.119 4.04e-05 ***
## KitchenAbvGr        -1.635e+04  5.215e+03  -3.135 0.001753 ** 
## Fireplaces           3.110e+03  1.335e+03   2.330 0.019946 *  
## GarageCars           3.368e+03  2.160e+03   1.559 0.119186    
## GarageArea           1.159e+01  7.391e+00   1.568 0.117136    
## WoodDeckSF           1.663e+00  5.793e+00   0.287 0.774112    
## ScreenPorch          3.328e+01  1.214e+01   2.741 0.006205 ** 
## PoolArea             8.988e+01  1.742e+01   5.161 2.82e-07 ***
## MoSold              -3.021e+02  2.436e+02  -1.240 0.215112    
## MSZoningFV           2.376e+04  1.179e+04   2.016 0.044038 *  
## MSZoningRH           2.070e+04  1.190e+04   1.740 0.082169 .  
## MSZoningRL           2.260e+04  1.007e+04   2.245 0.024919 *  
## MSZoningRM           1.773e+04  9.431e+03   1.880 0.060260 .  
## LandContourHLS       1.046e+04  5.129e+03   2.039 0.041691 *  
## LandContourLow      -1.098e+04  6.287e+03  -1.747 0.080824 .  
## LandContourLvl       5.060e+03  3.606e+03   1.403 0.160792    
## StreetPave           3.563e+04  1.191e+04   2.992 0.002823 ** 
## LotConfigCulDSac     4.710e+03  3.137e+03   1.501 0.133466    
## LotConfigFR2        -7.271e+03  4.046e+03  -1.797 0.072533 .  
## LotConfigFR3        -1.474e+04  1.309e+04  -1.126 0.260500    
## LotConfigInside     -1.205e+03  1.754e+03  -0.687 0.492083    
## LandSlopeMod         5.842e+03  3.929e+03   1.487 0.137253    
## LandSlopeSev        -3.587e+04  1.077e+04  -3.329 0.000895 ***
## NeighborhoodBlueste  4.480e+03  1.907e+04   0.235 0.814321    
## NeighborhoodBrDale  -1.054e+03  1.075e+04  -0.098 0.921925    
## NeighborhoodBrkSide -7.603e+03  8.912e+03  -0.853 0.393741    
## NeighborhoodClearCr -1.922e+04  8.958e+03  -2.145 0.032125 *  
## NeighborhoodCollgCr -1.160e+04  7.109e+03  -1.631 0.103074    
## NeighborhoodCrawfor  4.847e+03  8.248e+03   0.588 0.556882    
## NeighborhoodEdwards -2.132e+04  7.721e+03  -2.761 0.005840 ** 
## NeighborhoodGilbert -1.594e+04  7.484e+03  -2.130 0.033325 *  
## NeighborhoodIDOTRR  -9.493e+03  1.029e+04  -0.922 0.356447    
## NeighborhoodMeadowV  3.259e+02  1.007e+04   0.032 0.974186    
## NeighborhoodMitchel -2.892e+04  7.918e+03  -3.653 0.000269 ***
## NeighborhoodNAmes   -2.021e+04  7.556e+03  -2.675 0.007566 ** 
## NeighborhoodNoRidge  2.311e+04  8.121e+03   2.845 0.004505 ** 
## NeighborhoodNPkVill  4.509e+03  1.072e+04   0.421 0.673940    
## NeighborhoodNridgHt  2.130e+04  7.302e+03   2.917 0.003598 ** 
## NeighborhoodNWAmes  -2.604e+04  7.791e+03  -3.343 0.000853 ***
## NeighborhoodOldTown -1.692e+04  9.225e+03  -1.834 0.066837 .  
## NeighborhoodSawyer  -1.778e+04  7.917e+03  -2.246 0.024869 *  
## NeighborhoodSawyerW -1.064e+04  7.551e+03  -1.409 0.159171    
## NeighborhoodSomerst  5.829e+03  8.771e+03   0.665 0.506455    
## NeighborhoodStoneBr  3.717e+04  8.087e+03   4.596 4.71e-06 ***
## NeighborhoodSWISU   -1.200e+04  9.321e+03  -1.287 0.198267    
## NeighborhoodTimber  -1.624e+04  8.068e+03  -2.012 0.044371 *  
## NeighborhoodVeenker  1.679e+03  1.011e+04   0.166 0.868153    
## Condition1Feedr      2.418e+03  4.870e+03   0.497 0.619592    
## Condition1Norm       9.363e+03  3.998e+03   2.342 0.019320 *  
## Condition1PosA       6.640e+03  9.913e+03   0.670 0.503100    
## Condition1PosN       1.196e+04  7.314e+03   1.635 0.102271    
## Condition1RRAe      -1.511e+04  8.727e+03  -1.732 0.083520 .  
## Condition1RRAn       4.366e+03  6.769e+03   0.645 0.519089    
## Condition1RRNe      -8.146e+03  1.808e+04  -0.450 0.652455    
## Condition1RRNn       1.070e+03  1.274e+04   0.084 0.933104    
## Condition2Feedr      2.812e+03  2.218e+04   0.127 0.899135    
## Condition2Norm       4.295e+02  1.894e+04   0.023 0.981910    
## Condition2PosA       2.679e+04  3.175e+04   0.844 0.398908    
## Condition2PosN      -2.243e+05  2.698e+04  -8.316  < 2e-16 ***
## Condition2RRAe      -9.187e+04  4.366e+04  -2.104 0.035555 *  
## Condition2RRAn      -4.805e+03  3.118e+04  -0.154 0.877546    
## Condition2RRNn       9.026e+03  2.611e+04   0.346 0.729596    
## BldgType2fmCon      -5.498e+03  5.602e+03  -0.981 0.326561    
## BldgTypeDuplex      -2.350e+03  5.850e+03  -0.402 0.687952    
## BldgTypeTwnhs       -3.225e+04  5.267e+03  -6.122 1.21e-09 ***
## BldgTypeTwnhsE      -2.491e+04  3.569e+03  -6.979 4.66e-12 ***
## RoofStyleGable      -9.380e+03  1.829e+04  -0.513 0.608210    
## RoofStyleGambrel    -5.773e+03  1.984e+04  -0.291 0.771165    
## RoofStyleHip        -7.922e+03  1.838e+04  -0.431 0.666607    
## RoofStyleMansard     7.644e+03  2.111e+04   0.362 0.717255    
## RoofStyleShed        6.715e+04  3.474e+04   1.933 0.053434 .  
## RoofMatlCompShg      6.721e+05  3.081e+04  21.813  < 2e-16 ***
## RoofMatlMembran      7.302e+05  4.504e+04  16.214  < 2e-16 ***
## RoofMatlMetal        7.110e+05  4.531e+04  15.692  < 2e-16 ***
## RoofMatlRoll         6.732e+05  3.957e+04  17.011  < 2e-16 ***
## RoofMatlTar&Grv      6.520e+05  3.579e+04  18.217  < 2e-16 ***
## RoofMatlWdShake      6.603e+05  3.397e+04  19.440  < 2e-16 ***
## RoofMatlWdShngl      7.436e+05  3.190e+04  23.314  < 2e-16 ***
## ExterQualFa         -1.656e+04  9.527e+03  -1.739 0.082304 .  
## ExterQualGd         -3.497e+04  4.543e+03  -7.696 2.69e-14 ***
## ExterQualTA         -3.674e+04  5.066e+03  -7.252 6.91e-13 ***
## FoundationCBlock    -2.037e+03  3.082e+03  -0.661 0.508676    
## FoundationPConc      3.670e+03  3.389e+03   1.083 0.279074    
## FoundationSlab      -5.772e+03  9.031e+03  -0.639 0.522798    
## FoundationStone     -1.915e+03  1.059e+04  -0.181 0.856553    
## FoundationWood      -1.948e+04  1.473e+04  -1.323 0.186021    
## BsmtQualFa          -2.043e+04  6.242e+03  -3.273 0.001092 ** 
## BsmtQualGd          -2.901e+04  3.255e+03  -8.912  < 2e-16 ***
## BsmtQualmissing      2.358e+04  2.635e+04   0.895 0.370851    
## BsmtQualTA          -2.366e+04  4.056e+03  -5.834 6.75e-09 ***
## BsmtCondGd           4.332e+03  5.215e+03   0.831 0.406278    
## BsmtCondmissing             NA         NA      NA       NA    
## BsmtCondPo           1.885e+04  1.957e+04   0.964 0.335466    
## BsmtCondTA           6.360e+03  4.090e+03   1.555 0.120186    
## BsmtExposureGd       1.243e+04  3.038e+03   4.091 4.54e-05 ***
## BsmtExposuremissing -2.309e+04  2.441e+04  -0.946 0.344482    
## BsmtExposureMn      -3.751e+03  2.971e+03  -1.263 0.206910    
## BsmtExposureNo      -7.238e+03  2.053e+03  -3.526 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24200 on 1350 degrees of freedom
## Multiple R-squared:  0.9141, Adjusted R-squared:  0.9072 
## F-statistic: 131.9 on 109 and 1350 DF,  p-value: < 2.2e-16

We can see that the overall Adjusted R² has decreased by roughly 1%. That said, we’ve cut the number of independent features by 50%, which makes for a much simpler model.

# Third model: refit using only the predictors that were most significant in
# the second model's summary. The formula is unchanged; it is only wrapped
# across lines for readability.
model3 <- lm(
  SalePrice ~ LotArea + OverallQual + OverallCond + YearBuilt +
    YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
    `1stFlrSF` + `2ndFlrSF` + BedroomAbvGr + KitchenAbvGr + Fireplaces +
    ScreenPorch + PoolArea + MSZoning + LandContour + Street + LotConfig +
    LandSlope + Neighborhood + Condition1 + Condition2 + BldgType +
    RoofStyle + RoofMatl + ExterQual + Foundation + BsmtQual + BsmtCond +
    BsmtExposure,
  data = train_data_clean
)
summary(model3)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond + 
##     YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + 
##     BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + BedroomAbvGr + KitchenAbvGr + 
##     Fireplaces + ScreenPorch + PoolArea + MSZoning + LandContour + 
##     Street + LotConfig + LandSlope + Neighborhood + Condition1 + 
##     Condition2 + BldgType + RoofStyle + RoofMatl + ExterQual + 
##     Foundation + BsmtQual + BsmtCond + BsmtExposure, data = train_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -184232  -10808     431   10655  184232 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.740e+06  1.503e+05 -11.577  < 2e-16 ***
## LotArea              7.729e-01  1.004e-01   7.696 2.68e-14 ***
## OverallQual          7.631e+03  9.689e+02   7.876 6.89e-15 ***
## OverallCond          6.348e+03  7.955e+02   7.980 3.08e-15 ***
## YearBuilt            4.203e+02  6.388e+01   6.580 6.69e-11 ***
## YearRemodAdd         1.263e+02  5.049e+01   2.501 0.012485 *  
## MasVnrArea           1.124e+01  4.678e+00   2.402 0.016438 *  
## BsmtFinSF1           4.379e+01  4.313e+00  10.153  < 2e-16 ***
## BsmtFinSF2           2.695e+01  5.789e+00   4.655 3.55e-06 ***
## BsmtUnfSF            2.528e+01  4.228e+00   5.979 2.86e-09 ***
## `1stFlrSF`           5.574e+01  4.350e+00  12.813  < 2e-16 ***
## `2ndFlrSF`           6.297e+01  2.678e+00  23.513  < 2e-16 ***
## BedroomAbvGr        -5.254e+03  1.188e+03  -4.423 1.05e-05 ***
## KitchenAbvGr        -1.579e+04  5.218e+03  -3.026 0.002525 ** 
## Fireplaces           2.889e+03  1.339e+03   2.157 0.031217 *  
## ScreenPorch          3.450e+01  1.212e+01   2.847 0.004474 ** 
## PoolArea             8.898e+01  1.753e+01   5.077 4.37e-07 ***
## MSZoningFV           2.015e+04  1.183e+04   1.703 0.088876 .  
## MSZoningRH           1.772e+04  1.196e+04   1.481 0.138782    
## MSZoningRL           2.067e+04  1.011e+04   2.044 0.041128 *  
## MSZoningRM           1.616e+04  9.486e+03   1.703 0.088760 .  
## LandContourHLS       1.121e+04  5.150e+03   2.177 0.029687 *  
## LandContourLow      -9.488e+03  6.309e+03  -1.504 0.132849    
## LandContourLvl       5.281e+03  3.621e+03   1.458 0.144990    
## StreetPave           3.054e+04  1.194e+04   2.558 0.010650 *  
## LotConfigCulDSac     4.199e+03  3.162e+03   1.328 0.184447    
## LotConfigFR2        -7.314e+03  4.067e+03  -1.798 0.072379 .  
## LotConfigFR3        -1.449e+04  1.319e+04  -1.098 0.272277    
## LotConfigInside     -1.557e+03  1.766e+03  -0.882 0.378202    
## LandSlopeMod         6.061e+03  3.950e+03   1.535 0.125139    
## LandSlopeSev        -3.956e+04  1.084e+04  -3.650 0.000272 ***
## NeighborhoodBlueste  7.331e+03  1.921e+04   0.382 0.702766    
## NeighborhoodBrDale  -3.245e+03  1.077e+04  -0.301 0.763150    
## NeighborhoodBrkSide -1.077e+04  8.939e+03  -1.205 0.228521    
## NeighborhoodClearCr -2.237e+04  8.997e+03  -2.487 0.013019 *  
## NeighborhoodCollgCr -1.300e+04  7.112e+03  -1.828 0.067726 .  
## NeighborhoodCrawfor  2.033e+03  8.273e+03   0.246 0.805944    
## NeighborhoodEdwards -2.489e+04  7.726e+03  -3.222 0.001304 ** 
## NeighborhoodGilbert -1.794e+04  7.534e+03  -2.381 0.017421 *  
## NeighborhoodIDOTRR  -1.272e+04  1.029e+04  -1.237 0.216309    
## NeighborhoodMeadowV -2.840e+03  1.000e+04  -0.284 0.776441    
## NeighborhoodMitchel -3.035e+04  7.951e+03  -3.817 0.000141 ***
## NeighborhoodNAmes   -2.199e+04  7.569e+03  -2.906 0.003722 ** 
## NeighborhoodNoRidge  2.135e+04  8.147e+03   2.621 0.008863 ** 
## NeighborhoodNPkVill  7.835e+03  1.075e+04   0.729 0.466189    
## NeighborhoodNridgHt  2.153e+04  7.298e+03   2.950 0.003237 ** 
## NeighborhoodNWAmes  -2.611e+04  7.837e+03  -3.331 0.000888 ***
## NeighborhoodOldTown -1.868e+04  9.264e+03  -2.016 0.043979 *  
## NeighborhoodSawyer  -1.962e+04  7.953e+03  -2.466 0.013768 *  
## NeighborhoodSawyerW -1.240e+04  7.589e+03  -1.634 0.102432    
## NeighborhoodSomerst  7.214e+03  8.786e+03   0.821 0.411775    
## NeighborhoodStoneBr  3.538e+04  8.120e+03   4.357 1.42e-05 ***
## NeighborhoodSWISU   -1.514e+04  9.342e+03  -1.620 0.105360    
## NeighborhoodTimber  -1.680e+04  8.120e+03  -2.068 0.038786 *  
## NeighborhoodVeenker  2.749e+02  1.016e+04   0.027 0.978419    
## Condition1Feedr      1.762e+03  4.908e+03   0.359 0.719585    
## Condition1Norm       8.788e+03  4.028e+03   2.182 0.029297 *  
## Condition1PosA       5.707e+03  9.989e+03   0.571 0.567832    
## Condition1PosN       1.097e+04  7.355e+03   1.491 0.136163    
## Condition1RRAe      -1.484e+04  8.791e+03  -1.688 0.091558 .  
## Condition1RRAn       4.167e+03  6.827e+03   0.610 0.541664    
## Condition1RRNe      -8.005e+03  1.821e+04  -0.440 0.660274    
## Condition1RRNn      -2.002e+02  1.284e+04  -0.016 0.987558    
## Condition2Feedr      4.468e+03  2.231e+04   0.200 0.841302    
## Condition2Norm       4.217e+02  1.907e+04   0.022 0.982358    
## Condition2PosA       3.270e+04  3.175e+04   1.030 0.303266    
## Condition2PosN      -2.259e+05  2.710e+04  -8.334  < 2e-16 ***
## Condition2RRAe      -9.498e+04  4.394e+04  -2.162 0.030813 *  
## Condition2RRAn      -4.909e+03  3.143e+04  -0.156 0.875905    
## Condition2RRNn       1.342e+04  2.627e+04   0.511 0.609510    
## BldgType2fmCon      -5.642e+03  5.636e+03  -1.001 0.317013    
## BldgTypeDuplex      -2.857e+03  5.894e+03  -0.485 0.628032    
## BldgTypeTwnhs       -3.458e+04  5.254e+03  -6.582 6.61e-11 ***
## BldgTypeTwnhsE      -2.615e+04  3.564e+03  -7.337 3.76e-13 ***
## RoofStyleGable      -1.595e+04  1.839e+04  -0.868 0.385668    
## RoofStyleGambrel    -1.380e+04  1.993e+04  -0.692 0.488843    
## RoofStyleHip        -1.465e+04  1.848e+04  -0.793 0.427990    
## RoofStyleMansard     9.252e+02  2.123e+04   0.044 0.965252    
## RoofStyleShed        6.383e+04  3.495e+04   1.826 0.068046 .  
## RoofMatlCompShg      6.827e+05  3.035e+04  22.493  < 2e-16 ***
## RoofMatlMembran      7.398e+05  4.488e+04  16.484  < 2e-16 ***
## RoofMatlMetal        7.205e+05  4.511e+04  15.973  < 2e-16 ***
## RoofMatlRoll         6.847e+05  3.915e+04  17.491  < 2e-16 ***
## RoofMatlTar&Grv      6.562e+05  3.552e+04  18.474  < 2e-16 ***
## RoofMatlWdShake      6.685e+05  3.368e+04  19.852  < 2e-16 ***
## RoofMatlWdShngl      7.533e+05  3.147e+04  23.940  < 2e-16 ***
## ExterQualFa         -1.808e+04  9.599e+03  -1.884 0.059800 .  
## ExterQualGd         -3.439e+04  4.579e+03  -7.509 1.07e-13 ***
## ExterQualTA         -3.658e+04  5.108e+03  -7.162 1.30e-12 ***
## FoundationCBlock    -3.568e+03  3.088e+03  -1.155 0.248135    
## FoundationPConc      3.044e+03  3.414e+03   0.892 0.372775    
## FoundationSlab      -6.187e+03  9.102e+03  -0.680 0.496793    
## FoundationStone     -1.415e+03  1.068e+04  -0.133 0.894547    
## FoundationWood      -2.129e+04  1.483e+04  -1.436 0.151293    
## BsmtQualFa          -1.869e+04  6.273e+03  -2.980 0.002934 ** 
## BsmtQualGd          -2.935e+04  3.276e+03  -8.960  < 2e-16 ***
## BsmtQualmissing      2.405e+04  2.653e+04   0.907 0.364820    
## BsmtQualTA          -2.384e+04  4.078e+03  -5.846 6.29e-09 ***
## BsmtCondGd           4.951e+03  5.255e+03   0.942 0.346339    
## BsmtCondmissing             NA         NA      NA       NA    
## BsmtCondPo           1.509e+04  1.970e+04   0.766 0.444041    
## BsmtCondTA           6.583e+03  4.124e+03   1.596 0.110652    
## BsmtExposureGd       1.228e+04  3.058e+03   4.016 6.26e-05 ***
## BsmtExposuremissing -2.337e+04  2.459e+04  -0.950 0.342064    
## BsmtExposureMn      -3.308e+03  2.991e+03  -1.106 0.268877    
## BsmtExposureNo      -7.531e+03  2.061e+03  -3.654 0.000268 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24410 on 1355 degrees of freedom
## Multiple R-squared:  0.9123, Adjusted R-squared:  0.9056 
## F-statistic: 135.6 on 104 and 1355 DF,  p-value: < 2.2e-16

Once again we see a small decrease in the overall Adjusted R² value (from 0.9072 to 0.9056) after removing 4 more independent features from the model.

# Fourth model: a full three-way interaction among LotArea, OverallQual and
# OverallCond, plus a reduced set of additive predictors.
model4 <- lm(
  SalePrice ~ LotArea * OverallQual * OverallCond +
    YearBuilt + BsmtFinSF1 + BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` +
    BedroomAbvGr + PoolArea + LandSlope + Neighborhood + LotConfig +
    Condition1 + BldgType + RoofStyle + RoofMatl + ExterQual,
  data = train_data_clean
)
summary(model4)
## 
## Call:
## lm(formula = SalePrice ~ LotArea * OverallQual * OverallCond + 
##     YearBuilt + BsmtFinSF1 + BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + 
##     BedroomAbvGr + PoolArea + LandSlope + Neighborhood + LotConfig + 
##     Condition1 + BldgType + RoofStyle + RoofMatl + ExterQual, 
##     data = train_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -363957  -11843    -852   11396  179813 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -1.730e+06  1.271e+05 -13.606  < 2e-16 ***
## LotArea                          9.164e+00  3.043e+00   3.012 0.002642 ** 
## OverallQual                      2.196e+04  5.949e+03   3.692 0.000231 ***
## OverallCond                      2.042e+04  6.256e+03   3.264 0.001126 ** 
## YearBuilt                        5.446e+02  5.864e+01   9.288  < 2e-16 ***
## BsmtFinSF1                       3.087e+01  3.087e+00  10.001  < 2e-16 ***
## BsmtUnfSF                        8.960e+00  2.995e+00   2.992 0.002824 ** 
## `1stFlrSF`                       6.980e+01  3.662e+00  19.064  < 2e-16 ***
## `2ndFlrSF`                       6.024e+01  2.732e+00  22.045  < 2e-16 ***
## BedroomAbvGr                    -5.768e+03  1.288e+03  -4.477 8.19e-06 ***
## PoolArea                         9.557e+01  1.941e+01   4.923 9.56e-07 ***
## LandSlopeMod                     6.765e+03  3.763e+03   1.798 0.072432 .  
## LandSlopeSev                    -1.902e+04  1.115e+04  -1.706 0.088299 .  
## NeighborhoodBlueste             -1.154e+03  2.081e+04  -0.055 0.955796    
## NeighborhoodBrDale              -9.156e+03  1.087e+04  -0.842 0.399915    
## NeighborhoodBrkSide             -1.411e+04  9.135e+03  -1.544 0.122702    
## NeighborhoodClearCr             -1.983e+04  9.780e+03  -2.028 0.042770 *  
## NeighborhoodCollgCr             -1.864e+04  7.738e+03  -2.409 0.016108 *  
## NeighborhoodCrawfor             -1.007e+03  8.918e+03  -0.113 0.910113    
## NeighborhoodEdwards             -3.007e+04  8.381e+03  -3.587 0.000346 ***
## NeighborhoodGilbert             -2.412e+04  8.247e+03  -2.925 0.003497 ** 
## NeighborhoodIDOTRR              -2.536e+04  9.629e+03  -2.634 0.008543 ** 
## NeighborhoodMeadowV             -1.077e+04  1.036e+04  -1.040 0.298446    
## NeighborhoodMitchel             -3.268e+04  8.659e+03  -3.774 0.000167 ***
## NeighborhoodNAmes               -2.616e+04  8.120e+03  -3.221 0.001305 ** 
## NeighborhoodNoRidge              1.776e+04  8.795e+03   2.019 0.043655 *  
## NeighborhoodNPkVill              6.360e+02  1.179e+04   0.054 0.956996    
## NeighborhoodNridgHt              3.011e+04  7.962e+03   3.782 0.000162 ***
## NeighborhoodNWAmes              -3.385e+04  8.416e+03  -4.022 6.09e-05 ***
## NeighborhoodOldTown             -2.712e+04  8.870e+03  -3.057 0.002276 ** 
## NeighborhoodSawyer              -2.561e+04  8.594e+03  -2.980 0.002933 ** 
## NeighborhoodSawyerW             -2.237e+04  8.275e+03  -2.703 0.006962 ** 
## NeighborhoodSomerst              2.299e+03  7.748e+03   0.297 0.766708    
## NeighborhoodStoneBr              3.943e+04  8.905e+03   4.427 1.03e-05 ***
## NeighborhoodSWISU               -2.038e+04  1.010e+04  -2.019 0.043686 *  
## NeighborhoodTimber              -1.197e+04  8.838e+03  -1.354 0.176009    
## NeighborhoodVeenker              3.247e+03  1.120e+04   0.290 0.771940    
## LotConfigCulDSac                 5.201e+03  3.502e+03   1.485 0.137786    
## LotConfigFR2                    -6.519e+03  4.484e+03  -1.454 0.146219    
## LotConfigFR3                    -1.880e+04  1.434e+04  -1.311 0.189978    
## LotConfigInside                 -2.512e+03  1.945e+03  -1.292 0.196718    
## Condition1Feedr                  1.882e+03  5.281e+03   0.356 0.721571    
## Condition1Norm                   9.771e+03  4.313e+03   2.265 0.023655 *  
## Condition1PosA                   3.390e+03  1.103e+04   0.307 0.758579    
## Condition1PosN                  -1.428e+04  7.806e+03  -1.829 0.067570 .  
## Condition1RRAe                  -1.426e+04  9.657e+03  -1.477 0.139979    
## Condition1RRAn                   7.885e+03  7.076e+03   1.114 0.265317    
## Condition1RRNe                  -1.025e+04  2.021e+04  -0.507 0.612066    
## Condition1RRNn                   1.045e+04  1.356e+04   0.771 0.440976    
## BldgType2fmCon                  -7.997e+03  5.396e+03  -1.482 0.138590    
## BldgTypeDuplex                  -1.780e+04  4.313e+03  -4.127 3.89e-05 ***
## BldgTypeTwnhs                   -4.463e+04  5.653e+03  -7.896 5.83e-15 ***
## BldgTypeTwnhsE                  -3.142e+04  3.664e+03  -8.576  < 2e-16 ***
## RoofStyleGable                  -2.723e+04  2.025e+04  -1.345 0.178851    
## RoofStyleGambrel                -2.224e+04  2.194e+04  -1.013 0.311015    
## RoofStyleHip                    -2.398e+04  2.034e+04  -1.179 0.238570    
## RoofStyleMansard                -5.820e+03  2.331e+04  -0.250 0.802845    
## RoofStyleShed                   -7.698e+03  2.919e+04  -0.264 0.792055    
## RoofMatlCompShg                  6.519e+05  3.484e+04  18.712  < 2e-16 ***
## RoofMatlMembran                  6.569e+05  5.008e+04  13.118  < 2e-16 ***
## RoofMatlMetal                    6.503e+05  4.994e+04  13.022  < 2e-16 ***
## RoofMatlRoll                     6.573e+05  4.439e+04  14.809  < 2e-16 ***
## RoofMatlTar&Grv                  6.223e+05  4.006e+04  15.534  < 2e-16 ***
## RoofMatlWdShake                  6.266e+05  3.763e+04  16.653  < 2e-16 ***
## RoofMatlWdShngl                  7.186e+05  3.578e+04  20.086  < 2e-16 ***
## ExterQualFa                     -4.419e+04  9.850e+03  -4.486 7.86e-06 ***
## ExterQualGd                     -4.457e+04  4.721e+03  -9.441  < 2e-16 ***
## ExterQualTA                     -4.793e+04  5.330e+03  -8.993  < 2e-16 ***
## LotArea:OverallQual             -1.550e+00  4.810e-01  -3.222 0.001302 ** 
## LotArea:OverallCond             -1.881e+00  5.434e-01  -3.462 0.000552 ***
## OverallQual:OverallCond         -2.602e+03  1.076e+03  -2.418 0.015737 *  
## LotArea:OverallQual:OverallCond  3.352e-01  8.773e-02   3.821 0.000139 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27240 on 1388 degrees of freedom
## Multiple R-squared:  0.8882, Adjusted R-squared:  0.8825 
## F-statistic: 155.3 on 71 and 1388 DF,  p-value: < 2.2e-16

By introducing some logical feature interactions and removing many features, the Adjusted R² drops only slightly, from 0.9056 for model3 to 0.8825 for model4. This is favorable, as we now have a much simpler model (71 model degrees of freedom instead of 104).

# Draw the default diagnostic plots for the fitted lm object model4.
plot(model4)
## Warning: not plotting observations with leverage one:
##   121, 272, 1276, 1299

Now we can test our final model on the test set. We first need to perform the same types of imputation that we did for the training set.

# Apply clean_df() to the raw test data so it can be scored by the model.
test_set_clean <- clean_df(test_data)
# Print row 28 of the cleaned test set for inspection.
test_set_clean[28, ]

# Score the cleaned test set with model4.
predictions <- predict(model4, newdata = test_set_clean)

# Build the submission table with sequential Ids starting at 1461.
# NOTE(review): assumes the test rows are in Id order beginning at 1461 —
# confirm against test_data.
df <- data.frame(
  Id = 1461:(length(predictions) + 1460),
  SalePrice = predictions
)

write.csv(df, file = "submission.csv", row.names = FALSE)

Finally, I want to add a few more interactions to see if I can lift the score even further.

# Sixth model: adds further pairwise interactions and a quadratic LotArea term.
#
# The squared term must be wrapped in I(). Inside a model formula `^` means
# "cross terms to this degree", and the parser turns `**` into `^`, so the
# previous `LotArea**2` collapsed to plain `LotArea` and added no quadratic
# term at all.
#
# NOTE: the summary printed below was generated before this fix (its Call
# shows `LotArea^2` and the table has no squared coefficient); re-knit to
# refresh the output and regenerate the submission.
model6 <- lm(
  SalePrice ~ LotArea * ExterQual + I(LotArea^2) +
    OverallQual * OverallCond + YearBuilt + BsmtFinSF1 * BsmtUnfSF +
    `1stFlrSF` * `2ndFlrSF` + BedroomAbvGr + PoolArea +
    Neighborhood * BldgType + RoofMatl,
  data = train_data_clean
)
summary(model6)
## 
## Call:
## lm(formula = SalePrice ~ LotArea * ExterQual + LotArea^2 + OverallQual * 
##     OverallCond + YearBuilt + BsmtFinSF1 * BsmtUnfSF + `1stFlrSF` * 
##     `2ndFlrSF` + BedroomAbvGr + PoolArea + Neighborhood * BldgType + 
##     RoofMatl, data = train_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -325559  -11307       0   11374  165846 
## 
## Coefficients: (61 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -1.442e+06  1.302e+05 -11.072  < 2e-16 ***
## LotArea                            -2.746e+00  5.661e-01  -4.851 1.37e-06 ***
## ExterQualFa                        -8.482e+04  2.241e+04  -3.785 0.000160 ***
## ExterQualGd                        -8.526e+04  8.812e+03  -9.675  < 2e-16 ***
## ExterQualTA                        -8.918e+04  9.140e+03  -9.757  < 2e-16 ***
## OverallQual                         4.089e+03  2.867e+03   1.426 0.154001    
## OverallCond                         4.979e+02  2.814e+03   0.177 0.859617    
## YearBuilt                           5.134e+02  6.008e+01   8.545  < 2e-16 ***
## BsmtFinSF1                          3.367e+01  3.651e+00   9.222  < 2e-16 ***
## BsmtUnfSF                           1.019e+01  3.123e+00   3.264 0.001125 ** 
## `1stFlrSF`                          6.601e+01  4.058e+00  16.266  < 2e-16 ***
## `2ndFlrSF`                          3.360e+01  6.784e+00   4.953 8.22e-07 ***
## BedroomAbvGr                       -6.088e+03  1.288e+03  -4.727 2.51e-06 ***
## PoolArea                            7.681e+01  1.920e+01   4.001 6.64e-05 ***
## NeighborhoodBlueste                -1.631e+04  2.769e+04  -0.589 0.555880    
## NeighborhoodBrDale                 -1.411e+04  1.537e+04  -0.918 0.358539    
## NeighborhoodBrkSide                 2.882e+04  2.726e+04   1.057 0.290532    
## NeighborhoodClearCr                 2.531e+04  2.746e+04   0.922 0.356812    
## NeighborhoodCollgCr                 2.924e+04  2.676e+04   1.093 0.274799    
## NeighborhoodCrawfor                 4.478e+04  2.729e+04   1.641 0.101082    
## NeighborhoodEdwards                 1.585e+04  2.707e+04   0.585 0.558390    
## NeighborhoodGilbert                 2.635e+04  2.688e+04   0.980 0.327232    
## NeighborhoodIDOTRR                  2.028e+04  2.746e+04   0.739 0.460249    
## NeighborhoodMeadowV                -1.608e+04  1.143e+04  -1.407 0.159628    
## NeighborhoodMitchel                 1.434e+04  2.713e+04   0.528 0.597278    
## NeighborhoodNAmes                   1.808e+04  2.693e+04   0.671 0.502068    
## NeighborhoodNoRidge                 6.667e+04  2.712e+04   2.458 0.014098 *  
## NeighborhoodNPkVill                -6.331e+03  1.316e+04  -0.481 0.630430    
## NeighborhoodNridgHt                 8.902e+04  2.699e+04   3.299 0.000997 ***
## NeighborhoodNWAmes                  1.276e+04  2.700e+04   0.473 0.636525    
## NeighborhoodOldTown                 1.697e+04  2.723e+04   0.623 0.533253    
## NeighborhoodSawyer                  1.974e+04  2.706e+04   0.730 0.465792    
## NeighborhoodSawyerW                 1.989e+04  2.699e+04   0.737 0.461246    
## NeighborhoodSomerst                 4.840e+04  2.687e+04   1.802 0.071833 .  
## NeighborhoodStoneBr                 1.343e+05  2.778e+04   4.835 1.48e-06 ***
## NeighborhoodSWISU                   2.088e+04  2.767e+04   0.755 0.450628    
## NeighborhoodTimber                  3.815e+04  2.708e+04   1.409 0.159190    
## NeighborhoodVeenker                 2.552e+04  2.845e+04   0.897 0.369841    
## BldgType2fmCon                     -8.489e+04  3.111e+04  -2.728 0.006450 ** 
## BldgTypeDuplex                     -1.851e+04  1.307e+04  -1.416 0.156970    
## BldgTypeTwnhs                      -3.619e+04  9.710e+03  -3.727 0.000201 ***
## BldgTypeTwnhsE                      1.892e+04  2.741e+04   0.690 0.490237    
## RoofMatlCompShg                     5.000e+05  3.951e+04  12.654  < 2e-16 ***
## RoofMatlMembran                     5.477e+05  4.833e+04  11.334  < 2e-16 ***
## RoofMatlMetal                       5.109e+05  4.794e+04  10.657  < 2e-16 ***
## RoofMatlRoll                        4.974e+05  4.812e+04  10.338  < 2e-16 ***
## RoofMatlTar&Grv                     4.939e+05  4.013e+04  12.308  < 2e-16 ***
## RoofMatlWdShake                     4.771e+05  4.140e+04  11.525  < 2e-16 ***
## RoofMatlWdShngl                     5.828e+05  4.046e+04  14.405  < 2e-16 ***
## LotArea:ExterQualFa                 2.675e+00  2.303e+00   1.162 0.245597    
## LotArea:ExterQualGd                 3.462e+00  5.808e-01   5.961 3.19e-09 ***
## LotArea:ExterQualTA                 3.347e+00  5.769e-01   5.801 8.17e-09 ***
## OverallQual:OverallCond             1.118e+03  4.956e+02   2.255 0.024263 *  
## BsmtFinSF1:BsmtUnfSF               -9.941e-03  5.144e-03  -1.933 0.053503 .  
## `1stFlrSF`:`2ndFlrSF`               2.302e-02  4.905e-03   4.692 2.97e-06 ***
## NeighborhoodBlueste:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodBrDale:BldgType2fmCon          NA         NA      NA       NA    
## NeighborhoodBrkSide:BldgType2fmCon  7.314e+04  3.648e+04   2.005 0.045191 *  
## NeighborhoodClearCr:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodCollgCr:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodCrawfor:BldgType2fmCon  6.040e+04  4.096e+04   1.475 0.140565    
## NeighborhoodEdwards:BldgType2fmCon  7.779e+04  3.395e+04   2.291 0.022089 *  
## NeighborhoodGilbert:BldgType2fmCon  7.239e+04  4.025e+04   1.799 0.072312 .  
## NeighborhoodIDOTRR:BldgType2fmCon   7.001e+04  3.704e+04   1.890 0.058947 .  
## NeighborhoodMeadowV:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodMitchel:BldgType2fmCon  7.957e+04  4.055e+04   1.962 0.049930 *  
## NeighborhoodNAmes:BldgType2fmCon    8.825e+04  4.104e+04   2.150 0.031698 *  
## NeighborhoodNoRidge:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodNPkVill:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodNridgHt:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodNWAmes:BldgType2fmCon          NA         NA      NA       NA    
## NeighborhoodOldTown:BldgType2fmCon  7.428e+04  3.206e+04   2.317 0.020657 *  
## NeighborhoodSawyer:BldgType2fmCon   8.827e+04  4.103e+04   2.151 0.031622 *  
## NeighborhoodSawyerW:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodSomerst:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodStoneBr:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodSWISU:BldgType2fmCon    9.594e+04  3.556e+04   2.698 0.007059 ** 
## NeighborhoodTimber:BldgType2fmCon          NA         NA      NA       NA    
## NeighborhoodVeenker:BldgType2fmCon         NA         NA      NA       NA    
## NeighborhoodBlueste:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodBrDale:BldgTypeDuplex          NA         NA      NA       NA    
## NeighborhoodBrkSide:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodClearCr:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodCollgCr:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodCrawfor:BldgTypeDuplex -3.000e+04  2.333e+04  -1.286 0.198639    
## NeighborhoodEdwards:BldgTypeDuplex  6.125e+03  1.636e+04   0.374 0.708228    
## NeighborhoodGilbert:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodIDOTRR:BldgTypeDuplex   3.184e+04  3.100e+04   1.027 0.304581    
## NeighborhoodMeadowV:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodMitchel:BldgTypeDuplex  1.952e+03  1.662e+04   0.117 0.906496    
## NeighborhoodNAmes:BldgTypeDuplex    8.939e+03  1.465e+04   0.610 0.541743    
## NeighborhoodNoRidge:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodNPkVill:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodNridgHt:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodNWAmes:BldgTypeDuplex  -3.682e+03  2.325e+04  -0.158 0.874216    
## NeighborhoodOldTown:BldgTypeDuplex  1.777e+03  2.070e+04   0.086 0.931596    
## NeighborhoodSawyer:BldgTypeDuplex  -2.441e+04  1.723e+04  -1.416 0.156866    
## NeighborhoodSawyerW:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodSomerst:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodStoneBr:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodSWISU:BldgTypeDuplex           NA         NA      NA       NA    
## NeighborhoodTimber:BldgTypeDuplex          NA         NA      NA       NA    
## NeighborhoodVeenker:BldgTypeDuplex         NA         NA      NA       NA    
## NeighborhoodBlueste:BldgTypeTwnhs   7.526e+04  4.757e+04   1.582 0.113851    
## NeighborhoodBrDale:BldgTypeTwnhs    4.996e+04  3.289e+04   1.519 0.128978    
## NeighborhoodBrkSide:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodClearCr:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodCollgCr:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodCrawfor:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodEdwards:BldgTypeTwnhs  -1.449e+04  1.878e+04  -0.772 0.440499    
## NeighborhoodGilbert:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodIDOTRR:BldgTypeTwnhs           NA         NA      NA       NA    
## NeighborhoodMeadowV:BldgTypeTwnhs   5.423e+04  3.194e+04   1.698 0.089792 .  
## NeighborhoodMitchel:BldgTypeTwnhs   2.935e+04  2.870e+04   1.023 0.306525    
## NeighborhoodNAmes:BldgTypeTwnhs            NA         NA      NA       NA    
## NeighborhoodNoRidge:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodNPkVill:BldgTypeTwnhs   5.325e+04  3.463e+04   1.538 0.124335    
## NeighborhoodNridgHt:BldgTypeTwnhs  -3.898e+04  1.445e+04  -2.698 0.007063 ** 
## NeighborhoodNWAmes:BldgTypeTwnhs           NA         NA      NA       NA    
## NeighborhoodOldTown:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodSawyer:BldgTypeTwnhs           NA         NA      NA       NA    
## NeighborhoodSawyerW:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodSomerst:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodStoneBr:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodSWISU:BldgTypeTwnhs            NA         NA      NA       NA    
## NeighborhoodTimber:BldgTypeTwnhs           NA         NA      NA       NA    
## NeighborhoodVeenker:BldgTypeTwnhs          NA         NA      NA       NA    
## NeighborhoodBlueste:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodBrDale:BldgTypeTwnhsE          NA         NA      NA       NA    
## NeighborhoodBrkSide:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodClearCr:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodCollgCr:BldgTypeTwnhsE -4.299e+04  2.889e+04  -1.488 0.136917    
## NeighborhoodCrawfor:BldgTypeTwnhsE -1.609e+04  3.210e+04  -0.501 0.616200    
## NeighborhoodEdwards:BldgTypeTwnhsE -3.724e+04  3.079e+04  -1.210 0.226676    
## NeighborhoodGilbert:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodIDOTRR:BldgTypeTwnhsE          NA         NA      NA       NA    
## NeighborhoodMeadowV:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodMitchel:BldgTypeTwnhsE -3.515e+04  3.175e+04  -1.107 0.268479    
## NeighborhoodNAmes:BldgTypeTwnhsE   -2.349e+04  3.342e+04  -0.703 0.482333    
## NeighborhoodNoRidge:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodNPkVill:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodNridgHt:BldgTypeTwnhsE -8.234e+04  2.854e+04  -2.885 0.003980 ** 
## NeighborhoodNWAmes:BldgTypeTwnhsE          NA         NA      NA       NA    
## NeighborhoodOldTown:BldgTypeTwnhsE         NA         NA      NA       NA    
## NeighborhoodSawyer:BldgTypeTwnhsE          NA         NA      NA       NA    
## NeighborhoodSawyerW:BldgTypeTwnhsE -2.866e+04  2.949e+04  -0.972 0.331198    
## NeighborhoodSomerst:BldgTypeTwnhsE -4.120e+04  2.837e+04  -1.452 0.146747    
## NeighborhoodStoneBr:BldgTypeTwnhsE -1.282e+05  2.954e+04  -4.338 1.54e-05 ***
## NeighborhoodSWISU:BldgTypeTwnhsE           NA         NA      NA       NA    
## NeighborhoodTimber:BldgTypeTwnhsE          NA         NA      NA       NA    
## NeighborhoodVeenker:BldgTypeTwnhsE  2.730e+04  3.293e+04   0.829 0.407343    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26580 on 1370 degrees of freedom
## Multiple R-squared:  0.8949, Adjusted R-squared:  0.8881 
## F-statistic:   131 on 89 and 1370 DF,  p-value: < 2.2e-16
# Score the cleaned test set with model6; the fit is rank-deficient, hence the
# warning recorded below.
predictions <- predict(model6, test_set_clean)
## Warning in predict.lm(model6, test_set_clean): prediction from a rank-deficient
## fit may be misleading
# Build the submission table with sequential Ids starting at 1461.
df <- data.frame(
  Id = 1461:(length(predictions) + 1460),
  SalePrice = predictions
)

write.csv(df, file = "submission7.csv", row.names = FALSE)

Kaggle results