Problem 1: PageRank

Problem Set

Solution 1

Form the \(A\) matrix. Then introduce decay and form the \(B\) matrix as we did in the course notes.

# form A matrix
A = matrix(c(0, 1/2, 1/2, 0, 0, 0,
             0, 0, 0, 0, 0, 0,
             1/3, 1/3, 0, 0, 1/3, 0,
             0, 0, 0, 0, 1/2, 1/2,
             0, 0, 0, 1/2, 0, 1/2,
             0, 0, 0, 1, 0, 0),
           nrow = 6,
           byrow = TRUE)

Since row 2 is filled with zeros (page 2 has no outbound links), we replace it with the probability of landing on any of the pages uniformly at random: 1/6.

# replace row 2 
A[2,] = rep(1/6, 6)
A
##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## [1,] 0.0000000 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3333333 0.3333333 0.0000000 0.0000000 0.3333333 0.0000000
## [4,] 0.0000000 0.0000000 0.0000000 0.0000000 0.5000000 0.5000000
## [5,] 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.5000000
## [6,] 0.0000000 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000

Introduce decay and form the \(B\) matrix.

d = 0.85 # decay
n = 6 # number of pages

B = d * A + ((1-d)/n)
B
##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## [1,] 0.0250000 0.4500000 0.4500000 0.0250000 0.0250000 0.0250000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3083333 0.3083333 0.0250000 0.0250000 0.3083333 0.0250000
## [4,] 0.0250000 0.0250000 0.0250000 0.0250000 0.4500000 0.4500000
## [5,] 0.0250000 0.0250000 0.0250000 0.4500000 0.0250000 0.4500000
## [6,] 0.0250000 0.0250000 0.0250000 0.8750000 0.0250000 0.0250000
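
Each row of \(B\) should still sum to 1: each row of \(A\) sums to 1, so a row of \(B\) sums to \(d \cdot 1 + 6 \cdot (1-d)/6 = 1\). A quick check:

rowSums(B) # every row sums to 1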

Solution 2

Start with a uniform rank vector \(r\) and perform power iterations on \(B\) until convergence. That is, compute the solution \(r = B^n \times r\). Attempt this for a sufficiently large \(n\) so that \(r\) actually converges.

library(matrixcalc)

# define uniform rank vector r (n is still 6, the number of pages)
r = rep(1/n, n)

# check convergence, reusing n as the number of matrix powers
n = 10
matrix.power(t(B), n) %*% r
##            [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
n = 20
matrix.power(t(B), n) %*% r
##            [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
n = 30
matrix.power(t(B), n) %*% r
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
n = 40
matrix.power(t(B), n) %*% r
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
n = 50
matrix.power(t(B), n) %*% r
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
all.equal(matrix.power(t(B),40) %*% r, matrix.power(t(B),50) %*% r)
## [1] TRUE
# store the convergence
convergence = matrix.power(t(B), 40) %*% r
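
Rather than guessing \(n\), a small loop can iterate until the rank vector stops changing. A sketch reusing B from above (the names r_i, r_next and the tolerance are illustrative choices):

# iterate r <- t(B) %*% r until successive vectors agree
r_i = rep(1/6, 6)                  # start from the uniform vector
repeat {
  r_next = t(B) %*% r_i            # one power-iteration step
  if (max(abs(r_next - r_i)) < 1e-12) break
  r_i = r_next
}
as.numeric(r_i)                    # matches the stored convergence vector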

Solution 3

Compute the eigen-decomposition of \(B\) and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.

eig_decomp = eigen(B)
eig_decomp
## eigen() decomposition
## $values
## [1]  1.00000000  0.57619235 -0.42500001 -0.42499999 -0.34991524 -0.08461044
## 
## $vectors
##            [,1]       [,2]          [,3]          [,4]         [,5]
## [1,] -0.4082483 -0.7278031 -5.345224e-01  5.345225e-01 -0.795670150
## [2,] -0.4082483 -0.3721164 -5.216180e-09 -5.216180e-09  0.059710287
## [3,] -0.4082483 -0.5389259  5.345225e-01 -5.345225e-01  0.602762996
## [4,] -0.4082483  0.1174605 -2.672613e-01  2.672612e-01  0.002611877
## [5,] -0.4082483  0.1174605 -2.672613e-01  2.672612e-01  0.002611877
## [6,] -0.4082483  0.1174605  5.345225e-01 -5.345224e-01  0.002611877
##              [,6]
## [1,]  0.486246420
## [2,] -0.673469294
## [3,]  0.556554233
## [4,] -0.009145393
## [5,] -0.009145393
## [6,] -0.009145393

We can see 1 is the largest eigenvalue.

Sum the corresponding (normalized) eigenvector:

sum(as.numeric(eig_decomp$vectors[,1]/sum(eig_decomp$vectors[,1])))
## [1] 1
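
Note that the constant leading eigenvector of \(B\) reflects that the rows of \(B\) sum to 1; the PageRank vector itself is the left eigenvector of \(B\), i.e., the leading eigenvector of the transpose. A quick check against the power-iteration result (a sketch; Re() drops any negligible imaginary parts):

v = Re(eigen(t(B))$vectors[,1])    # leading eigenvector of t(B)
v = v / sum(v)                     # normalize to sum to 1
all.equal(v, convergence[,1], check.attributes = FALSE)  # should be TRUE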

Solution 4

Use the igraph package in R and its page.rank method to compute the PageRank of the graph as given in \(A\). Note that you don’t need to apply decay; the package starts with a connected graph and applies decay internally. Verify that you get the same PageRank vector as with the two approaches above.

library(igraph)

# graph the matrix A
A_graph = graph_from_adjacency_matrix(A, weighted=TRUE)
plot(A_graph)

Verify equality with convergence vector.

page.rank(A_graph)
## $vector
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
## 
## $value
## [1] 1
## 
## $options
## NULL
convergence
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
all.equal(page.rank(A_graph)$vector, convergence[,1])
## [1] TRUE

Problem 2: Digit Recognizer

Problem Set

Step 1

Sign in to Kaggle.

Step 2

Download the data.

library(tidyverse) 

test_data = '/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/DIGIT/test.csv'
train_data = '/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/DIGIT/train.csv'

test = read.csv(test_data)
train = read.csv(train_data)

dim(test)
## [1] 28000   784
dim(train)
## [1] 42000   785

Step 3

Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.)

# plot single image 
m = matrix(unlist(train[10,-1])/255,nrow = 28,byrow = T)
image(m)

# plot first 10 images
par(mfrow=c(2,5))

for (i in 1:10) {
  m = matrix(unlist(train[i,-1])/255,nrow = 28,byrow = T)
  image(t(apply(m,2,rev)),axes=FALSE) # flip the matrix so the digit displays upright
}

# print the first 10 labels
train[1:10,1]
##  [1] 1 0 1 4 0 0 7 3 5 3

Step 4

What is the frequency distribution of the numbers in the dataset?

# frequency distribution table
freq = table((train[,1])) |>
  as.data.frame() |>
  rename(number = Var1, count = Freq) |>
  mutate(freq = round(prop.table(table((train[,1]))),4))

freq
number  count   freq
0        4132  0.0984
1        4684  0.1115
2        4177  0.0995
3        4351  0.1036
4        4072  0.0970
5        3795  0.0904
6        4137  0.0985
7        4401  0.1048
8        4063  0.0967
9        4188  0.0997
sum(freq$freq)
## [1] 1.0001

Thanks, rounding: each proportion was rounded to four decimal places, so the total slightly exceeds 1.

ggplot(train, aes(x=train[,1])) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=1) +
  scale_x_continuous(breaks = 0:9) +
  labs(x='number')

Step 5

For each number, provide the mean pixel intensity. What does this tell you?

train |>
  mutate(mean_pix = rowMeans(across(where(is.numeric)))) |>
  group_by(label) |>
  summarise(mean_pix = mean(mean_pix))
label  mean_pix
0      44.11772
1      19.34964
2      38.05490
3      36.08049
4      30.87481
5      32.91837
6      35.33875
7      29.22069
8      38.46134
9      31.29013

We can see 0 has the highest (brightest) average pixel intensity, while 1 has the lowest. This tracks with how much of the 28×28 grid each digit covers: a 0 is a wide loop of lit pixels, while a 1 is a single thin stroke.

Step 6

Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?

library(stats) 

train_pca = prcomp(train[,2:785])

# calculate cumulative variances
train_cv = (cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2))
# store components for 95% variance
train_cv_95 = min(which((train_cv >= .95)))
train_cv_95
## [1] 154

154 components account for 95% of the variance.
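
A cumulative-variance plot makes the cutoff visible; a quick sketch using train_cv and train_cv_95 from above:

plot(train_cv, type = "l", xlab = "number of components", ylab = "cumulative proportion of variance")
abline(h = 0.95, lty = 2)          # 95% threshold
abline(v = train_cv_95, lty = 2)   # 154 components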

# total possible components (train_cv computed above)
train_cv_100 = min(which((train_cv >= 1)))
train_cv_100
## [1] 704

704 components account for 100% of the variance. Only 704 of the 784 possible components (one per pixel) are needed because some pixel columns, mostly the always-blank image borders, are constant across every image; those zero-variance columns contribute nothing, so the data matrix has rank 704. It also means the last 550 components together carry only the final 5% of the variance.

Step 7

Plot the first 10 images generated by PCA. They will appear to be noise. Why?

# plot first 10 PCA images
par(mfrow=c(2,5))

# rotate the PCA matrix
train_pca_rot = train_pca$rotation

for (i in 1:10) {
  a = array(train_pca_rot[,i], dim = c(28, 28))
  image(1:28, 1:28, a, axes=FALSE) 
}

The component loadings look like noise because each principal component is a direction of maximal remaining variance pooled across all ten digits at once: the pixel weights mix positive and negative values, and no single component corresponds to a recognizable digit.
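
To see that the leading components still carry real signal, we can reconstruct an image from only the first 154 components. A sketch using train_pca and train_cv_95 from above, applied to the first training image (the names k, x, scores, recon are illustrative):

k = train_cv_95                                        # 154 components
x = as.numeric(train[1, -1])
scores = t(train_pca$rotation[, 1:k]) %*% (x - train_pca$center)  # project onto first k PCs
recon = train_pca$center + as.numeric(train_pca$rotation[, 1:k] %*% scores)
m = matrix(recon, nrow = 28, byrow = TRUE)
image(t(apply(m, 2, rev)), axes = FALSE)               # should resemble the original digit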

Step 8

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?

# select only 8s  
eights = train |>
  filter(label == 8)

# pca  
eights_pca = prcomp(eights[,2:785])

# calculate cumulative variances
eights_cv = (cumsum(eights_pca$sdev^2) / sum(eights_pca$sdev^2))
# store components for 100% variance
eights_cv_100 = min(which((eights_cv >= 1)))
# plot first 10 images
par(mfrow=c(2,5))

# rotate the eights PCA matrix
eights_pca_rot = eights_pca$rotation

for (i in 1:10) {
  a = array(eights_pca_rot[,i], dim = c(28, 28))
  image(1:28, 1:28, a, axes=FALSE)
}

The images are still noisy, but clearer than for the full set, and they appear to go from most to least structured. This is likely because only 248 components are needed to account for 100% of the variance among the 8s alone, so there is far less overall variation for the leading components to explain.

Step 9

An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)

library(nnet)  

# build multinomial model 
model = multinom(label ~ ., train, MaxNWts=50000)  
## # weights:  7860 (7065 variable)
## initial  value 96708.573906 
## iter  10 value 27932.238987
## iter  20 value 22901.936560
## iter  30 value 21677.526948
## iter  40 value 21254.278400
## iter  50 value 21077.662091
## iter  60 value 21002.266892
## iter  70 value 20928.779026
## iter  80 value 20866.883697
## iter  90 value 20792.605325
## iter 100 value 20727.274628
## final  value 20727.274628 
## stopped after 100 iterations
library(caret) 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# test accuracy  
predictions = predict(model, train)
mtab = table(predictions, train$label)
confusionMatrix(mtab)
## Confusion Matrix and Statistics
## 
##            
## predictions    0    1    2    3    4    5    6    7    8    9
##           0 3953    0   10    5   11   24   43    6   16   13
##           1   24 4605  146   96   83   77   54  113  397   83
##           2   17   12 3639   76   36   18   72   38   21   14
##           3    9   13   73 3762   11   88    3    7   66   51
##           4    8    4   26    3 3448   12   17   21   18   63
##           5   27   13   21  183   55 3283  101   13  178   46
##           6   10    2   13    6   24   44 3769    4   13    0
##           7   35   13  111   91   18   51   10 4053   35  179
##           8   38   22  111   91   47  141   41    7 3247   25
##           9   11    0   27   38  339   57   27  139   72 3714
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8922          
##                  95% CI : (0.8892, 0.8952)
##     No Information Rate : 0.1115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8802          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.95668   0.9831  0.87120  0.86463  0.84676  0.86509
## Specificity           0.99662   0.9712  0.99196  0.99147  0.99547  0.98333
## Pos Pred Value        0.96864   0.8110  0.92290  0.92138  0.95249  0.83750
## Neg Pred Value        0.99528   0.9978  0.98586  0.98447  0.98374  0.98655
## Prevalence            0.09838   0.1115  0.09945  0.10360  0.09695  0.09036
## Detection Rate        0.09412   0.1096  0.08664  0.08957  0.08210  0.07817
## Detection Prevalence  0.09717   0.1352  0.09388  0.09721  0.08619  0.09333
## Balanced Accuracy     0.97665   0.9772  0.93158  0.92805  0.92111  0.92421
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.91105   0.9209  0.79916  0.88682
## Specificity           0.99694   0.9856  0.98621  0.98122
## Pos Pred Value        0.97014   0.8819  0.86127  0.83951
## Neg Pred Value        0.99035   0.9907  0.97866  0.98739
## Prevalence            0.09850   0.1048  0.09674  0.09971
## Detection Rate        0.08974   0.0965  0.07731  0.08843
## Detection Prevalence  0.09250   0.1094  0.08976  0.10533
## Balanced Accuracy     0.95399   0.9532  0.89269  0.93402

The classification model is about 89% accurate on the training data.
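
Since this accuracy is measured on the same data the model was fit to, it is optimistic. A held-out estimate could be obtained with a split along these lines (a sketch, not run here; the seed and 80/20 split are arbitrary):

set.seed(605)
idx = sample(nrow(train), 0.8 * nrow(train))              # 80% training split
fit = multinom(label ~ ., train[idx, ], MaxNWts = 50000)
mean(predict(fit, train[-idx, ]) == train[-idx, ]$label)  # out-of-sample accuracy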

Problem 3: House Prices

Problem Set

Step 1

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

house_train = read.csv('/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/HOUSE/train.csv')
house_test = read.csv('/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/HOUSE/test.csv')

names(house_train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
dim(house_train)
## [1] 1460   81
summary(house_train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

We have 1460 rows and 81 columns of data.

ggplot(house_train, aes(x=SalePrice)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10000) +
  labs(x = 'Sale Price')

ggplot(house_train, aes(x=OverallCond)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=1) +
  labs(x = 'Overall Condition')

ggplot(house_train, aes(x=YearBuilt)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10) +
  labs(x = 'Year Built')

Scatterplot of Sale Price vs. Total Rooms Above Ground, colored by Overall Condition:

ggplot(house_train, aes(TotRmsAbvGrd, SalePrice)) +
  geom_jitter(aes(color=OverallCond), size=3) +
  labs(x='Rooms Above Ground', y='Sale Price') +
  scale_color_continuous(name='Overall Condition')

Derive a correlation matrix for any three quantitative variables in the dataset.

# three quantitative variables plus SalePrice
cols = c('YearBuilt','OverallCond','TotRmsAbvGrd','SalePrice')
cor_mat = cor(house_train[,cols])
cor_mat
##                YearBuilt OverallCond TotRmsAbvGrd   SalePrice
## YearBuilt     1.00000000 -0.37598320   0.09558913  0.52289733
## OverallCond  -0.37598320  1.00000000  -0.05758317 -0.07785589
## TotRmsAbvGrd  0.09558913 -0.05758317   1.00000000  0.53372316
## SalePrice     0.52289733 -0.07785589   0.53372316  1.00000000

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

\(H_0:\) The correlation between each pairwise set of variables is 0.
\(H_A:\) The correlation between each pairwise set of variables is not 0.
Each test is run with an 80% confidence interval.

cor.test(house_train$SalePrice, house_train$YearBuilt, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$SalePrice and house_train$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980766 0.5468619
## sample estimates:
##       cor 
## 0.5228973

We see a moderate positive correlation (r ≈ 0.52) between Sale Price and Year Built.

cor.test(house_train$SalePrice, house_train$OverallCond, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$SalePrice and house_train$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.1111272 -0.0444103
## sample estimates:
##         cor 
## -0.07785589
cor.test(house_train$SalePrice, house_train$TotRmsAbvGrd, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train$SalePrice and house_train$TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5092841 0.5573021
## sample estimates:
##       cor 
## 0.5337232

Sale Price has moderate positive correlations with Year Built and Total Rooms Above Ground, and a weak negative correlation with Overall Condition. In all three tests the p-value is far below any conventional threshold and the 80% confidence interval excludes 0, so we reject the null hypothesis that the correlations are 0.

Given the formula to estimate familywise error,

\(FWE = 1 - (1-α)^n\) where \(α\) is the significance level for a single hypothesis test, and \(n\) is the total number of tests, we have:

n = 3 # number of tests
a = 0.05 # alpha
FWE = function(n, a){
  1 - (1-a)^n
}

FWE(n,a)
## [1] 0.142625

The estimated familywise error rate is about 14%: across the three tests, there is a 14% chance of at least one false rejection, assuming each individual test uses α = 0.05. Given how small the p-values above are, this is not a serious worry here, but it would argue for a correction if many more tests were run.
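
If the familywise error were a concern, the simplest guard is a Bonferroni correction, which tests each pair at \(\alpha/n\). A sketch using the p-values reported above (R truncates the two smallest to 2.2e-16):

a / n                                                           # per-test level, ~0.0167
p.adjust(c(2.2e-16, 0.002912, 2.2e-16), method = "bonferroni")  # all remain below 0.05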

Step 2

Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

#invert the correlation matrix
prec_mat = solve(cor_mat)
prec_mat
##               YearBuilt OverallCond TotRmsAbvGrd  SalePrice
## YearBuilt     1.7744022   0.6060981    0.4688685 -1.1308879
## OverallCond   0.6060981   1.2134925    0.1827151 -0.3199688
## TotRmsAbvGrd  0.4688685   0.1827151    1.5227280 -1.0436598
## SalePrice    -1.1308879  -0.3199688   -1.0436598  2.1234522
# multiply the correlation matrix by the precision matrix
cor_mat %*% prec_mat
##                 YearBuilt   OverallCond TotRmsAbvGrd     SalePrice
## YearBuilt    1.000000e+00 -2.775558e-17 0.000000e+00  0.000000e+00
## OverallCond  1.249001e-16  1.000000e+00 5.551115e-17 -5.551115e-17
## TotRmsAbvGrd 1.110223e-16 -2.775558e-17 1.000000e+00  0.000000e+00
## SalePrice    2.220446e-16  0.000000e+00 0.000000e+00  1.000000e+00
# multiply the precision matrix by the correlation matrix
prec_mat %*% cor_mat
##                 YearBuilt   OverallCond  TotRmsAbvGrd    SalePrice
## YearBuilt    1.000000e+00  2.775558e-17 -1.110223e-16 0.000000e+00
## OverallCond  8.326673e-17  1.000000e+00  0.000000e+00 5.551115e-17
## TotRmsAbvGrd 1.110223e-16 -2.775558e-17  1.000000e+00 2.220446e-16
## SalePrice    0.000000e+00 -5.551115e-17  0.000000e+00 1.000000e+00
# conduct LU decomposition on the matrix
c_mat_decomp = lu.decomposition(cor_mat)

c_mat_decomp
## $L
##             [,1]        [,2]     [,3] [,4]
## [1,]  1.00000000  0.00000000 0.000000    0
## [2,] -0.37598320  1.00000000 0.000000    0
## [3,]  0.09558913 -0.02520654 1.000000    0
## [4,]  0.52289733  0.13829449 0.491492    1
## 
## $U
##      [,1]       [,2]        [,3]      [,4]
## [1,]    1 -0.3759832  0.09558913 0.5228973
## [2,]    0  0.8586366 -0.02164326 0.1187447
## [3,]    0  0.0000000  0.99031717 0.4867330
## [4,]    0  0.0000000  0.00000000 0.4709312
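
As a quick check, multiplying the factors back together should reproduce the correlation matrix:

all.equal(c_mat_decomp$L %*% c_mat_decomp$U, cor_mat, check.attributes = FALSE)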

Step 3

Recalling an earlier histogram, we can see that sale price is right-skewed.

ggplot(house_train, aes(x=SalePrice)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10000) +
  labs(x = 'Sale Price')

And this checks out, as the mean sale price is higher than its median.

mean(house_train$SalePrice) - median(house_train$SalePrice)
## [1] 17921.2
min(house_train$SalePrice)
## [1] 34900

The minimum value is above 0, so no shifting is necessary.

library(MASS)

# fit an exponential distribution (the fitted rate is the MLE)
lambda = fitdistr(house_train$SalePrice, densfun = 'exponential')
lambda
##        rate    
##   5.527268e-06 
##  (1.446552e-07)
# take 1000 samples using optimal value
samps = rexp(1000, lambda$estimate)
par(mfrow=c(1,2))
hist(house_train$SalePrice, breaks=40, xlab='Price', main='Sale Price')
hist(samps, breaks=40, xlab='Samples', main='Fitted Distribution')
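
As a sanity check on fitdistr, the MLE of the exponential rate is simply the reciprocal of the sample mean:

1 / mean(house_train$SalePrice)   # ~5.53e-06, matching lambda$estimate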

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data.

# 5th percentile, lambda
lambda_5p = qexp(.05, rate=lambda$estimate)
lambda_5p
## [1] 9280.044
# 95th percentile, lambda
lambda_95p = qexp(.95, rate=lambda$estimate)
lambda_95p
## [1] 541991.5
# 5th percentile, empirical 
house_5p = quantile(house_train$SalePrice, 0.05)
house_5p
##    5% 
## 88000
# 95th percentile, empirical
house_95p = quantile(house_train$SalePrice, 0.95)
house_95p
##    95% 
## 326100
summary(house_train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
summary(samps)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      11.1   52231.4  133148.7  176819.6  249091.4 1317234.3

We can see the exponential fit shifts mass toward a minimum near 0 and actually has a heavier right tail than the empirical data; the exponential is a poor model for sale prices.
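
The problem also asks for a 95% confidence interval from the empirical data, assuming normality. A minimal sketch for the mean sale price:

x = house_train$SalePrice
mean(x) + c(-1, 1) * qnorm(0.975) * sd(x) / sqrt(length(x))   # 95% CI for the mean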

Step 4

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

I’m going to see if the data satisfies the conditions for simple linear regression. To start, I will isolate the numeric variables and take a look at a pairs plot to see which variables appear to have linear relationships.

# copy the numeric columns to a new data frame: model.df
model.df = house_train |>
  dplyr::select(-Id) |>
  keep(is.numeric)

glimpse(model.df)
## Rows: 1,460
## Columns: 37
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

I see some columns with NAs. Let’s take a look.

colSums(is.na(model.df))
##    MSSubClass   LotFrontage       LotArea   OverallQual   OverallCond 
##             0           259             0             0             0 
##     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2 
##             0             0             8             0             0 
##     BsmtUnfSF   TotalBsmtSF     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##             0             0             0             0             0 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##             0             0             0             0             0 
##  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces   GarageYrBlt 
##             0             0             0             0            81 
##    GarageCars    GarageArea    WoodDeckSF   OpenPorchSF EnclosedPorch 
##             0             0             0             0             0 
##    X3SsnPorch   ScreenPorch      PoolArea       MiscVal        MoSold 
##             0             0             0             0             0 
##        YrSold     SalePrice 
##             0             0

Looks like LotFrontage, MasVnrArea, and GarageYrBlt have NAs, though not many.

LotFrontage = Linear feet of street connected to property
MasVnrArea = Masonry veneer area
GarageYrBlt = Garage year built

I think these columns could all be useful to the model, so rather than remove the columns or impute the NAs, I'm simply going to drop the rows containing them, since at most about 350 of the 1,460 observations are affected.

model.df = model.df |>
  na.omit()

glimpse(model.df)
## Rows: 1,121
## Columns: 37
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 50, 190, 20, 60, 20, 45, 90,…
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, 51, 50, 70, 85, 91, 51, 72, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 6120, 7420…
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 5, 5, 9, 7, 7, 4, 5, 5, 8, 7, 8,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 8, 5, 5, 6, 5, 7, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1931, 1939, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1950, 1950, 19…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 0, 0, 0, 286, 306, 0, 0, 0,…
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 0, 851, 906, 998, 0,…
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 952, 140, 134, 177, 14…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 952, 991, 1040, 11…
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1022, 1077, 1040, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 752, 0, 0, 1142, 0, 0, 0…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 1774, 1077, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3, 1, 2,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 2, 2, 3, 4, 3, 2, 2, 3, 3, 4, 3, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,…
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 8, 5, 5, 11, 7, 5, 6, 6, 6, 9, 6, 7…
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1,…
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1931, 1939, 19…
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 3, 1, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 468, 205, 384, 736, 8…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 90, 0, 0, 147, 160, 48, 0, 0…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 0, 4, 0, 21, 33, 112, 0, 102,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 205, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2…
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 4, 1, 2, 7, 8, 7, 10, 6, 5, 11, 6…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2008, 2008, 20…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

This removes 339 rows from our data. If the affected variables prove insignificant to the model, I will restore the rows. A pairs plot could help scan for linear relationships, but with 37 numeric variables it is unwieldy, so let's go straight to the full linear regression model.

model.full = lm(SalePrice ~ ., model.df)
summary(model.full)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442865  -16873   -2581   14998  318042 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.232e+05  1.701e+06  -0.190 0.849317    
## MSSubClass    -2.005e+02  3.449e+01  -5.814 8.03e-09 ***
## LotFrontage   -1.161e+02  6.124e+01  -1.896 0.058203 .  
## LotArea        5.454e-01  1.573e-01   3.466 0.000548 ***
## OverallQual    1.870e+04  1.478e+03  12.646  < 2e-16 ***
## OverallCond    5.227e+03  1.367e+03   3.824 0.000139 ***
## YearBuilt      3.170e+02  8.762e+01   3.617 0.000311 ***
## YearRemodAdd   1.206e+02  8.661e+01   1.392 0.164174    
## MasVnrArea     3.160e+01  7.006e+00   4.511 7.15e-06 ***
## BsmtFinSF1     1.739e+01  5.835e+00   2.980 0.002947 ** 
## BsmtFinSF2     8.362e+00  8.763e+00   0.954 0.340205    
## BsmtUnfSF      5.006e+00  5.275e+00   0.949 0.342890    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.591e+01  7.356e+00   6.241 6.21e-10 ***
## X2ndFlrSF      4.668e+01  6.099e+00   7.654 4.28e-14 ***
## LowQualFinSF   3.415e+01  2.788e+01   1.225 0.220788    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   8.980e+03  3.194e+03   2.812 0.005018 ** 
## BsmtHalfBath   2.490e+03  5.071e+03   0.491 0.623487    
## FullBath       5.390e+03  3.529e+03   1.527 0.126941    
## HalfBath      -1.119e+03  3.320e+03  -0.337 0.736244    
## BedroomAbvGr  -1.023e+04  2.154e+03  -4.750 2.30e-06 ***
## KitchenAbvGr  -2.193e+04  6.704e+03  -3.271 0.001105 ** 
## TotRmsAbvGrd   5.440e+03  1.486e+03   3.661 0.000263 ***
## Fireplaces     4.375e+03  2.188e+03   2.000 0.045793 *  
## GarageYrBlt   -4.914e+01  9.093e+01  -0.540 0.589011    
## GarageCars     1.679e+04  3.487e+03   4.815 1.68e-06 ***
## GarageArea     6.488e+00  1.211e+01   0.536 0.592338    
## WoodDeckSF     2.155e+01  1.002e+01   2.151 0.031713 *  
## OpenPorchSF   -2.315e+00  1.948e+01  -0.119 0.905404    
## EnclosedPorch  7.233e+00  2.061e+01   0.351 0.725733    
## X3SsnPorch     3.458e+01  3.749e+01   0.922 0.356593    
## ScreenPorch    5.797e+01  2.040e+01   2.842 0.004572 ** 
## PoolArea      -6.126e+01  2.984e+01  -2.053 0.040326 *  
## MiscVal       -3.850e+00  6.955e+00  -0.554 0.579980    
## MoSold        -2.240e+02  4.227e+02  -0.530 0.596213    
## YrSold        -2.536e+02  8.454e+02  -0.300 0.764216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36790 on 1086 degrees of freedom
## Multiple R-squared:  0.8095, Adjusted R-squared:  0.8036 
## F-statistic: 135.7 on 34 and 1086 DF,  p-value: < 2.2e-16
model.full.pvals = summary(model.full)$coefficients[,4] 

We see a number of variables that don't appear to be significant, but we do get an adjusted R-squared of about 0.80, which is a good start. GarageYrBlt doesn't look significant, so I am going to remove that column, which also lets us restore the observations that were dropped for its NAs. Masonry Veneer Area is highly significant, and Lot Frontage may be, so we keep those for now.

We can also remove TotalBsmtSF and GrLivArea, whose coefficients are undefined because of singularities (each is an exact sum of other predictors: the basement square-footage columns and the floor square-footage columns, respectively), as well as OpenPorchSF, YrSold, EnclosedPorch, HalfBath, BsmtHalfBath, MoSold, MiscVal, and GarageArea, which are not significant.

cols = c('Id', 'GarageYrBlt', 'TotalBsmtSF', 'GrLivArea', 'OpenPorchSF', 'YrSold', 'EnclosedPorch', 'HalfBath', 'BsmtHalfBath', 'MoSold', 'MiscVal', 'GarageArea')

model.df2 = house_train |>
  dplyr::select(-cols) |>
  keep(is.numeric) |>
  na.omit()
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(cols)
## 
##   # Now:
##   data %>% select(all_of(cols))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
model.v2 = lm(SalePrice ~., model.df2)
summary(model.v2)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -452259  -17560   -2381   14179  323849 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.791e+05  1.415e+05  -6.919 7.48e-12 ***
## MSSubClass   -1.888e+02  3.199e+01  -5.901 4.72e-09 ***
## LotFrontage  -8.906e+01  5.775e+01  -1.542 0.123282    
## LotArea       5.532e-01  1.546e-01   3.578 0.000360 ***
## OverallQual   1.754e+04  1.373e+03  12.778  < 2e-16 ***
## OverallCond   4.177e+03  1.215e+03   3.438 0.000607 ***
## YearBuilt     2.869e+02  6.165e+01   4.654 3.62e-06 ***
## YearRemodAdd  1.803e+02  7.548e+01   2.389 0.017069 *  
## MasVnrArea    3.543e+01  6.827e+00   5.190 2.47e-07 ***
## BsmtFinSF1    1.875e+01  5.425e+00   3.456 0.000567 ***
## BsmtFinSF2    8.159e+00  8.338e+00   0.979 0.328011    
## BsmtUnfSF     7.629e+00  4.862e+00   1.569 0.116910    
## X1stFlrSF     4.632e+01  6.905e+00   6.709 3.05e-11 ***
## X2ndFlrSF     4.618e+01  5.039e+00   9.164  < 2e-16 ***
## LowQualFinSF  3.543e+01  2.175e+01   1.629 0.103524    
## BsmtFullBath  9.805e+03  2.879e+03   3.406 0.000682 ***
## FullBath      7.124e+03  2.994e+03   2.380 0.017490 *  
## BedroomAbvGr -1.003e+04  1.938e+03  -5.176 2.66e-07 ***
## KitchenAbvGr -1.276e+04  5.765e+03  -2.213 0.027102 *  
## TotRmsAbvGrd  5.048e+03  1.415e+03   3.568 0.000374 ***
## Fireplaces    4.617e+03  2.086e+03   2.214 0.027054 *  
## GarageCars    1.100e+04  1.911e+03   5.754 1.11e-08 ***
## WoodDeckSF    2.413e+01  9.618e+00   2.509 0.012241 *  
## X3SsnPorch    3.043e+01  3.692e+01   0.824 0.409918    
## ScreenPorch   5.650e+01  1.965e+01   2.876 0.004101 ** 
## PoolArea     -6.394e+01  2.883e+01  -2.218 0.026755 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36460 on 1169 degrees of freedom
## Multiple R-squared:  0.8119, Adjusted R-squared:  0.8078 
## F-statistic: 201.8 on 25 and 1169 DF,  p-value: < 2.2e-16

The model seems to be improving: the adjusted R-squared holds at about 0.81, and the residual distribution is approaching symmetry. Let's remove X3SsnPorch and BsmtFinSF2 from the model and try again.

cols = c('X3SsnPorch','BsmtFinSF2')
model.df3 = model.df2 |>
  dplyr::select(-cols) 

model.v3 = lm(SalePrice ~., model.df3)
summary(model.v3)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -450535  -17624   -2527   14253  322671 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.796e+05  1.415e+05  -6.923 7.25e-12 ***
## MSSubClass   -1.894e+02  3.198e+01  -5.923 4.16e-09 ***
## LotFrontage  -8.679e+01  5.763e+01  -1.506 0.132333    
## LotArea       5.694e-01  1.536e-01   3.708 0.000219 ***
## OverallQual   1.760e+04  1.371e+03  12.843  < 2e-16 ***
## OverallCond   4.221e+03  1.214e+03   3.477 0.000526 ***
## YearBuilt     2.911e+02  6.149e+01   4.735 2.46e-06 ***
## YearRemodAdd  1.766e+02  7.535e+01   2.343 0.019293 *  
## MasVnrArea    3.550e+01  6.825e+00   5.201 2.33e-07 ***
## BsmtFinSF1    1.615e+01  4.690e+00   3.444 0.000593 ***
## BsmtUnfSF     5.367e+00  4.263e+00   1.259 0.208272    
## X1stFlrSF     4.874e+01  6.477e+00   7.526 1.04e-13 ***
## X2ndFlrSF     4.625e+01  5.038e+00   9.181  < 2e-16 ***
## LowQualFinSF  3.608e+01  2.173e+01   1.660 0.097197 .  
## BsmtFullBath  1.035e+04  2.815e+03   3.676 0.000247 ***
## FullBath      7.136e+03  2.990e+03   2.387 0.017163 *  
## BedroomAbvGr -9.991e+03  1.936e+03  -5.159 2.91e-07 ***
## KitchenAbvGr -1.335e+04  5.738e+03  -2.326 0.020182 *  
## TotRmsAbvGrd  4.923e+03  1.411e+03   3.489 0.000502 ***
## Fireplaces    4.536e+03  2.084e+03   2.177 0.029705 *  
## GarageCars    1.097e+04  1.910e+03   5.743 1.19e-08 ***
## WoodDeckSF    2.382e+01  9.592e+00   2.483 0.013166 *  
## ScreenPorch   5.700e+01  1.960e+01   2.908 0.003705 ** 
## PoolArea     -6.232e+01  2.873e+01  -2.169 0.030303 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36450 on 1171 degrees of freedom
## Multiple R-squared:  0.8116, Adjusted R-squared:  0.8079 
## F-statistic: 219.3 on 23 and 1171 DF,  p-value: < 2.2e-16

After removing LotFrontage, LowQualFinSF, and BsmtUnfSF, we should be good to go.

cols = c('LotFrontage','LowQualFinSF','BsmtUnfSF')  
model.df4 = model.df3 |>
  dplyr::select(-cols) 

model.v4 = lm(SalePrice ~., model.df4)
summary(model.v4)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -453173  -17058   -2301   14054  320504 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.656e+05  1.409e+05  -6.851 1.18e-11 ***
## MSSubClass   -1.722e+02  2.994e+01  -5.750 1.14e-08 ***
## LotArea       5.079e-01  1.488e-01   3.413 0.000664 ***
## OverallQual   1.813e+04  1.337e+03  13.560  < 2e-16 ***
## OverallCond   3.897e+03  1.208e+03   3.227 0.001285 ** 
## YearBuilt     2.708e+02  6.046e+01   4.479 8.23e-06 ***
## YearRemodAdd  1.881e+02  7.530e+01   2.499 0.012607 *  
## MasVnrArea    3.546e+01  6.813e+00   5.205 2.29e-07 ***
## BsmtFinSF1    1.197e+01  3.511e+00   3.409 0.000673 ***
## X1stFlrSF     5.054e+01  5.711e+00   8.848  < 2e-16 ***
## X2ndFlrSF     4.455e+01  4.989e+00   8.928  < 2e-16 ***
## BsmtFullBath  9.916e+03  2.789e+03   3.555 0.000393 ***
## FullBath      7.463e+03  2.989e+03   2.497 0.012672 *  
## BedroomAbvGr -1.013e+04  1.931e+03  -5.245 1.85e-07 ***
## KitchenAbvGr -1.526e+04  5.690e+03  -2.682 0.007411 ** 
## TotRmsAbvGrd  5.189e+03  1.395e+03   3.718 0.000210 ***
## Fireplaces    4.097e+03  2.077e+03   1.973 0.048747 *  
## GarageCars    1.061e+04  1.905e+03   5.569 3.18e-08 ***
## WoodDeckSF    2.443e+01  9.564e+00   2.555 0.010747 *  
## ScreenPorch   5.905e+01  1.959e+01   3.015 0.002625 ** 
## PoolArea     -6.474e+01  2.839e+01  -2.280 0.022760 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36510 on 1174 degrees of freedom
## Multiple R-squared:  0.8106, Adjusted R-squared:  0.8073 
## F-statistic: 251.2 on 20 and 1174 DF,  p-value: < 2.2e-16

Now we see that all variables in the model are significant at the 0.05 level. Let's take a look at the residuals to see whether they meet the conditions for linear regression.

hist(model.v4$residuals)

The residuals look approximately normally distributed, which is good.

plot(model.v4)

We can see the residuals are somewhat non-linear and right-skewed. They appear roughly homoscedastic, but there are outliers affecting the model. Let's try reducing the model to the most significant variables and see if this improves the fit.
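
One common remedy for right-skewed, heteroscedastic residuals (a sketch only; not pursued further here) is to model the log of the sale price with the same predictors:

model.log = lm(log(SalePrice) ~ ., model.df4)   # hypothetical log-price variant of model.v4
summary(model.log)$adj.r.squared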

pvals = summary(model.v4)$coefficients[-1,4]

pvals |>  
  as.data.frame() |>
  arrange(pvals)
                  pvals
OverallQual   0.0000000
X2ndFlrSF     0.0000000
X1stFlrSF     0.0000000
MSSubClass    0.0000000
GarageCars    0.0000000
BedroomAbvGr  0.0000002
MasVnrArea    0.0000002
YearBuilt     0.0000082
TotRmsAbvGrd  0.0002100
BsmtFullBath  0.0003933
LotArea       0.0006644
BsmtFinSF1    0.0006733
OverallCond   0.0012846
ScreenPorch   0.0026253
KitchenAbvGr  0.0074112
WoodDeckSF    0.0107471
YearRemodAdd  0.0126070
FullBath      0.0126725
PoolArea      0.0227601
Fireplaces    0.0487469

Let's keep the 12 most significant variables (those with the smallest p-values) and drop the bottom 8.

val_names = pvals |>  
  as.data.frame() |>
  arrange(pvals) |>
  tail(8) |>
  row.names()
model.df5 = model.df4 |>
  dplyr::select(-val_names)  
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(val_names)
## 
##   # Now:
##   data %>% select(all_of(val_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
model.v5 = lm(SalePrice ~., model.df5)
summary(model.v5)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -492766  -17508   -1858   15974  273713 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.454e+05  8.984e+04  -7.185 1.19e-12 ***
## MSSubClass   -2.078e+02  2.828e+01  -7.348 3.74e-13 ***
## LotArea       5.191e-01  1.520e-01   3.414 0.000662 ***
## OverallQual   2.188e+04  1.268e+03  17.260  < 2e-16 ***
## YearBuilt     2.987e+02  4.717e+01   6.332 3.44e-10 ***
## MasVnrArea    3.295e+01  6.914e+00   4.766 2.11e-06 ***
## BsmtFinSF1    1.217e+01  3.523e+00   3.455 0.000571 ***
## X1stFlrSF     5.400e+01  5.411e+00   9.979  < 2e-16 ***
## X2ndFlrSF     5.169e+01  4.767e+00  10.843  < 2e-16 ***
## BsmtFullBath  9.962e+03  2.839e+03   3.509 0.000466 ***
## BedroomAbvGr -1.041e+04  1.931e+03  -5.389 8.54e-08 ***
## TotRmsAbvGrd  4.244e+03  1.367e+03   3.105 0.001945 ** 
## GarageCars    1.097e+04  1.947e+03   5.632 2.22e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37560 on 1182 degrees of freedom
## Multiple R-squared:  0.7981, Adjusted R-squared:  0.7961 
## F-statistic: 389.5 on 12 and 1182 DF,  p-value: < 2.2e-16

So we lose a little bit of R-squared, and the residual distribution looks more asymmetrical. Let's take a look at the residuals.

hist(summary(model.v5)$residuals)

plot(model.v5)

We don't observe any noticeable improvement in the residuals with this model. Taking a closer look at the data, we learn that MSSubClass is actually a categorical variable describing the building class.

plot(SalePrice ~ MSSubClass, model.df5)
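An alternative to dropping it (not pursued here) would be recoding MSSubClass as a factor, so that lm() estimates a separate coefficient for each building class. A minimal sketch, with hypothetical names model.df5f and model.v5f:

# treat MSSubClass as a categorical predictor instead of numeric
model.df5f = model.df5 |>
  mutate(MSSubClass = as.factor(MSSubClass))
model.v5f = lm(SalePrice ~ ., model.df5f)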

Let’s remove this variable from the model and see if it improves the fit.

model.df6 = model.df5 |>
  dplyr::select(-MSSubClass)

model.v6 = lm(SalePrice ~., model.df6)
summary(model.v6)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -505085  -18286   -1076   16635  280446 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.201e+05  9.176e+04  -6.758 2.19e-11 ***
## LotArea       7.237e-01  1.528e-01   4.737 2.44e-06 ***
## OverallQual   2.197e+04  1.296e+03  16.956  < 2e-16 ***
## YearBuilt     2.757e+02  4.811e+01   5.731 1.26e-08 ***
## MasVnrArea    3.104e+01  7.062e+00   4.396 1.20e-05 ***
## BsmtFinSF1    1.286e+01  3.600e+00   3.571  0.00037 ***
## X1stFlrSF     5.633e+01  5.522e+00  10.202  < 2e-16 ***
## X2ndFlrSF     4.335e+01  4.733e+00   9.160  < 2e-16 ***
## BsmtFullBath  7.798e+03  2.886e+03   2.702  0.00699 ** 
## BedroomAbvGr -8.496e+03  1.956e+03  -4.344 1.52e-05 ***
## TotRmsAbvGrd  3.987e+03  1.397e+03   2.855  0.00438 ** 
## GarageCars    1.245e+04  1.980e+03   6.291 4.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38390 on 1183 degrees of freedom
## Multiple R-squared:  0.7889, Adjusted R-squared:  0.787 
## F-statistic:   402 on 11 and 1183 DF,  p-value: < 2.2e-16
hist(summary(model.v6)$residuals)

plot(model.v6)

Again, no improvement. Let’s try reducing to the variables with the highest t-values and see how the model fits.

t_vals = summary(model.v6)$coefficients[-1,3] 

t_vals |>
  as.data.frame() |>
  arrange(desc(t_vals))
##                 t_vals
## OverallQual  16.955977
## X1stFlrSF    10.202467
## X2ndFlrSF     9.160032
## GarageCars    6.291123
## YearBuilt     5.731199
## LotArea       4.736543
## MasVnrArea    4.395768
## BsmtFinSF1    3.570969
## TotRmsAbvGrd  2.854904
## BsmtFullBath  2.702133
## BedroomAbvGr -4.344397
vals = t_vals |>
  as.data.frame() |>
  arrange(desc(t_vals)) |>
  head(5) |>
  row.names()

# keep the 5 variables with the largest t-values, plus the response
model.df7 = model.df6 |>
  dplyr::select(all_of(vals), SalePrice)

model.v7 = lm(SalePrice ~ ., model.df7)
summary(model.v7)
## 
## Call:
## lm(formula = SalePrice ~ ., data = model.df7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -428994  -20841   -1773   18441  288174 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.379e+05  9.416e+04  -7.837 1.02e-14 ***
## OverallQual  2.297e+04  1.334e+03  17.220  < 2e-16 ***
## X1stFlrSF    7.783e+01  4.048e+00  19.227  < 2e-16 ***
## X2ndFlrSF    4.748e+01  3.226e+00  14.719  < 2e-16 ***
## GarageCars   1.361e+04  2.066e+03   6.585 6.82e-11 ***
## YearBuilt    3.286e+02  4.963e+01   6.620 5.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40450 on 1189 degrees of freedom
## Multiple R-squared:  0.7645, Adjusted R-squared:  0.7635 
## F-statistic:   772 on 5 and 1189 DF,  p-value: < 2.2e-16

We haven't lost much R-squared, and the symmetry of the residual distribution improves.

hist(summary(model.v7)$residuals)

plot(model.v7)

Still not good. Previously I bypassed producing pairs plots to look at the relationships between the variables because the number of variables ground my machine to a halt, but now we've subset far enough down to take a look:

library(GGally) 

ggpairs(model.df7)
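A numeric complement to the pairs plot would be a collinearity check. A sketch, assuming the car package is installed:

# variance inflation factors; values well above 5-10 suggest collinearity
car::vif(model.v7)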

We see OverallQual and GarageCars are discrete variables, and X2ndFlrSF has many 0s, as not every house has a second floor. What happens if we create a new column combining first- and second-floor square footage, include the interaction between YearBuilt and OverallQual, and fit SalePrice against that?

cols = c('X1stFlrSF','X2ndFlrSF','SalePrice','YearBuilt','OverallQual')

# combine first- and second-floor square footage; all_of() avoids the
# tidyselect deprecation warning for external vectors
model.df8 = model.df7 |>
  dplyr::select(all_of(cols)) |>
  mutate(CombineFtg = X1stFlrSF + X2ndFlrSF)
model.v8 = lm(SalePrice ~ OverallQual*YearBuilt + CombineFtg, model.df8)
summary(model.v8)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual * YearBuilt + CombineFtg, 
##     data = model.df8)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -421172  -21408   -1269   19324  274759 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.894e+06  3.560e+05   5.320 1.24e-07 ***
## OverallQual           -4.507e+05  5.615e+04  -8.026 2.39e-15 ***
## YearBuilt             -1.000e+03  1.813e+02  -5.517 4.22e-08 ***
## CombineFtg             6.114e+01  2.955e+00  20.687  < 2e-16 ***
## OverallQual:YearBuilt  2.419e+02  2.846e+01   8.498  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41160 on 1190 degrees of freedom
## Multiple R-squared:  0.7559, Adjusted R-squared:  0.7551 
## F-statistic: 921.3 on 4 and 1190 DF,  p-value: < 2.2e-16
hist(model.v8$residuals)

plot(model.v8)

We now observe all variables are significant, the adjusted R-squared of 75% is acceptable, and the distribution of residuals comes close enough to satisfying the conditions for linear regression. Now we can use this model to make predictions on the test data and observe our accuracy. First we have to transform the test data to match our training data.

test_tran = house_test |>
  mutate(CombineFtg = X1stFlrSF + X2ndFlrSF)

predictions = predict(model.v8, test_tran)
predictions = predictions |>
  as.data.frame() |>
  mutate(Id = test_tran$Id) |>
  rename(SalePrice = predictions) |>
  dplyr::select(c(2,1))

head(predictions)
##     Id SalePrice
## 1 1461  106016.3
## 2 1462  154797.8
## 3 1463  158367.1
## 4 1464  189661.0
## 5 1465  229468.3
## 6 1466  190522.7
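As a rough sanity check before writing out the predictions (a sketch; this is in-sample error, so it flatters the model), we can compute the training RMSE:

# in-sample root mean squared error of the final model
sqrt(mean(model.v8$residuals^2))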
write.csv(predictions,'/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/predictions.csv', row.names=FALSE)

My Kaggle profile:

Unfortunately these predictions did not provide accurate results. In retrospect, I think the data required further transformations in order to make more accurate predictions.
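One such transformation (a sketch, not something attempted above) would be modeling log(SalePrice), which often tames the right skew in housing prices; predictions are then back-transformed with exp(). The names model.log and log_predictions are hypothetical:

# refit the final model on the log of the response
model.log = lm(log(SalePrice) ~ OverallQual * YearBuilt + CombineFtg, model.df8)

# back-transform predictions to the dollar scale
log_predictions = exp(predict(model.log, test_tran))

Thanks for reading!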