DATA605_Final

David Simbandumwe

Problem 1 - Playing with PageRank (30 Points)

You’ll verify for yourself that PageRank works by performing calculations on a small universe of web pages. Let’s use the 6-page universe from the previous discussion. For this directed graph, perform the following calculations in R.

  • Form the A matrix. Then, introduce decay and form the B matrix as we did in the course notes. (5 Points)
# create transition matrix
A = matrix(c(0,0,1/3,0,0,0, 1/2,0,1/3,0,0,0, 1/2,0,0,0,0,0, 0,0,0,0,1/2,1, 0,0,1/3,1/2,0,0, 0,0,0,1/2,1/2,0),6)
print('A Transition Matrix')
## [1] "A Transition Matrix"
print(round(A,3))
##       [,1]  [,2] [,3] [,4]  [,5] [,6]
## [1,] 0.000 0.500  0.5  0.0 0.000  0.0
## [2,] 0.000 0.000  0.0  0.0 0.000  0.0
## [3,] 0.333 0.333  0.0  0.0 0.333  0.0
## [4,] 0.000 0.000  0.0  0.0 0.500  0.5
## [5,] 0.000 0.000  0.0  0.5 0.000  0.5
## [6,] 0.000 0.000  0.0  1.0 0.000  0.0
# page 2 has no outgoing links (dangling node): replace its row with a uniform 1/6
ri <- matrix(c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6),ncol=6)
A[2,] <- ri
A <- t(A)
print(round(A,3))
##      [,1]  [,2]  [,3] [,4] [,5] [,6]
## [1,]  0.0 0.167 0.333  0.0  0.0    0
## [2,]  0.5 0.167 0.333  0.0  0.0    0
## [3,]  0.5 0.167 0.000  0.0  0.0    0
## [4,]  0.0 0.167 0.000  0.0  0.5    1
## [5,]  0.0 0.167 0.333  0.5  0.0    0
## [6,]  0.0 0.167 0.000  0.5  0.5    0
# apply decay (damping factor d = 0.85): B = d*A + (1-d)/n
decay <- 0.85

n <- nrow(A)

B <- decay*A + (1-decay)/n
print('---')
## [1] "---"
print('B Matrix with Decay')
## [1] "B Matrix with Decay"
print(round(B,4))
##       [,1]   [,2]   [,3]  [,4]  [,5]  [,6]
## [1,] 0.025 0.1667 0.3083 0.025 0.025 0.025
## [2,] 0.450 0.1667 0.3083 0.025 0.025 0.025
## [3,] 0.450 0.1667 0.0250 0.025 0.025 0.025
## [4,] 0.025 0.1667 0.0250 0.025 0.450 0.875
## [5,] 0.025 0.1667 0.3083 0.450 0.025 0.025
## [6,] 0.025 0.1667 0.0250 0.450 0.450 0.025
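
As a quick sanity check (not part of the original output), B should remain column-stochastic after adding decay, since each column of A sums to 1 and the decay term contributes 6 × (1 − 0.85)/6 = 0.15 per column:

colSums(B)   # each column should sum to 1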
  • Start with a uniform rank vector r and perform power iterations on B until convergence. That is, compute the solution r = B^n × r. Attempt this for a sufficiently large n so that r actually converges. (5 Points)
ri = matrix(c(1/6,1/6,1/6,1/6,1/6,1/6),6)
ri
##           [,1]
## [1,] 0.1666667
## [2,] 0.1666667
## [3,] 0.1666667
## [4,] 0.1666667
## [5,] 0.1666667
## [6,] 0.1666667
n <- 10
r10 <- matrix.power(B, n) %*%  ri   # matrix.power() from the matrixcalc package
r10
##            [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
n <- 20
r20 <- matrix.power(B, n) %*%  ri
r20
##            [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
n <- 30
r30 <- matrix.power(B, n) %*%  ri
r30
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
# by n = 30 the rank vector has converged (r20 and r30 agree to about five decimal places)
print('---')
## [1] "---"
print('1.b) The rank vector converges by n=30')
## [1] "1.b) The rank vector converges by n=30"
rf <- r30
rf
##            [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
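
The iterations above use matrixcalc::matrix.power with a fixed n. As an alternative, here is a minimal power-iteration sketch with an explicit convergence test; the tolerance and iteration cap are illustrative assumptions, not values from the assignment:

# iterate r <- B %*% r until successive vectors differ by less than tol
power_iterate <- function(B, r, tol = 1e-10, max_iter = 1000) {
  for (i in seq_len(max_iter)) {
    r_new <- B %*% r
    if (max(abs(r_new - r)) < tol) return(list(r = r_new, iterations = i))
    r <- r_new
  }
  list(r = r, iterations = max_iter)
}
# power_iterate(B, ri)$r should match rf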
  • Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1. (10 points)
ev <- eigen(B)
ev
## eigen() decomposition
## $values
## [1]  1.00000000+0i  0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
## 
## $vectors
##              [,1]          [,2]                        [,3]
## [1,] 0.1044385+0i  0.2931457+0i  1.227106e-15+0.000000e+00i
## [2,] 0.1488249+0i  0.5093703+0i -1.431623e-17-0.000000e+00i
## [3,] 0.1159674+0i  0.3414619+0i -1.382539e-15-0.000000e+00i
## [4,] 0.7043472+0i -0.5890805+0i -7.071068e-01+0.000000e+00i
## [5,] 0.4037861+0i -0.1413606+0i  7.071068e-01+0.000000e+00i
## [6,] 0.5425377+0i -0.4135367+0i  0.000000e+00-4.057108e-09i
##                             [,4]           [,5]            [,6]
## [1,]  1.227106e-15-0.000000e+00i -0.06471710+0i -0.212296003+0i
## [2,] -1.431623e-17+0.000000e+00i  0.01388698+0i  0.854071294+0i
## [3,] -1.382539e-15+0.000000e+00i  0.07298180+0i -0.363638739+0i
## [4,] -7.071068e-01-0.000000e+00i -0.66058664+0i  0.018399984+0i
## [5,]  7.071068e-01+0.000000e+00i  0.73761812+0i -0.304719509+0i
## [6,]  0.000000e+00+4.057108e-09i -0.09918316+0i  0.008182973+0i
# max eigenvalue
print('---')
## [1] "---"
print('eigen values')
## [1] "eigen values"
ev$values
## [1]  1.00000000+0i  0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
print('max eigen value is 1')
## [1] "max eigen value is 1"
which.max(Re(ev$values))   # index of the largest eigenvalue (real parts)
## [1] 1
# normalize the dominant eigenvector to sum to 1; it should match the power-iteration result
e_vec <- Re(ev$vectors[,1] / sum(ev$vectors[,1]))
print('same vector')
## [1] "same vector"
e_vec
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
print('vector sums to 1')
## [1] "vector sums to 1"
sum(e_vec)
## [1] 1

Use the igraph package in R and its page.rank method to compute the PageRank of the graph as given in A. Note that you don’t need to apply decay. The package starts with a connected graph and applies decay internally. Verify that you do get the same PageRank vector as the two approaches above. (10 points)

A_2 <- matrix(c(0,1/2,1/2,0,0,0,0,0,0,0,0,0,1/3,1/3,0,0,1/3,0,0,0,0,0,1/2,1/2,0,0,0,1/2,0,1/2,0,0,0,1,0,0),nrow=6,ncol=6,byrow = T)

igraph_A <- graph_from_adjacency_matrix(A_2, weighted=TRUE,mode = "directed")
plot(igraph_A)

ipageRank <- as.matrix(page.rank(igraph_A)$vector)
round(ipageRank,4) == round(rf,4)
##      [,1]
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
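
For reference, igraph’s damping argument defaults to 0.85, the same decay used above; making it explicit documents why the three approaches agree:

ipageRank_d <- as.matrix(page.rank(igraph_A, damping = 0.85)$vector)
round(ipageRank_d,4) == round(rf,4)   # expected: all TRUE, as above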

Problem 2 - digit-recognizer

MNIST (“Modified National Institute of Standards and Technology”) is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.

Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)

train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/digit-recognizer/train.csv?raw=true", show_col_types = FALSE)

# print first 10
head(train_df, n = 10) %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = -c(label, id)) %>%
  mutate(
    pixel = as.numeric(str_extract(name, "\\d+")),
    x = pixel %% 28,
    y = 28 - pixel %/% 28
  ) %>%
  ggplot(aes(x, y, fill = value)) +
  geom_tile() +
  facet_wrap(~ label + id)

labels = train_df[,1]
train_data <- train_df[,-1]/255


# print first 10
head(cbind(labels,train_data), n = 10) %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = -c(label, id)) %>%
  mutate(
    pixel = as.numeric(str_extract(name, "\\d+")),
    x = pixel %% 28,
    y = 28 - pixel %/% 28
  ) %>%
  ggplot(aes(x, y, fill = value)) +
  geom_tile() +
  facet_wrap(~ label + id)

What is the frequency distribution of the numbers in the dataset? (5 points)

      The individual images are mostly blank background (zero-valued pixels), making this a sparse dataset; the histograms below show the distribution of pixel intensities.
train_vec=as.vector(unlist(train_data)) 
hist(train_vec, breaks = 100)

train_arr=array(train_vec, dim=c(nrow(train_data),28,28))
# note: train_data was already scaled to [0, 1] above, so no further division is needed


logInd<-train_arr>0
hist(train_arr[logInd], breaks = 100, freq = FALSE)
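
A one-line check of how sparse the data actually is (a sketch; the value is not shown in the original):

mean(train_vec == 0)   # fraction of exactly-zero pixels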

For each number, provide the mean pixel intensity. What does this tell you? (5 points)

      The table below actually gives the relative frequency of each digit (answering the previous question): the distribution is roughly uniform, ranging from about 0.090 (digit 5) to 0.112 (digit 1). The per-digit mean pixel intensity is computed in the sketch after the plot; low mean intensities indicate that most of each image is blank background.
train_freq <- as.data.frame(table(labels)/nrow(train_data))
train_freq
##    label       Freq
## 1      0 0.09838095
## 2      1 0.11152381
## 3      2 0.09945238
## 4      3 0.10359524
## 5      4 0.09695238
## 6      5 0.09035714
## 7      6 0.09850000
## 8      7 0.10478571
## 9      8 0.09673810
## 10     9 0.09971429
ggplot(train_freq, aes(x = label, y = Freq, fill = Freq)) +
    geom_bar(stat = 'identity') +
    scale_fill_gradient(low = "lightgray", high = "red")
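
The per-digit mean pixel intensity can be computed directly. This is a minimal sketch using the labels and scaled train_data defined above (its output is not part of the original report):

# average intensity of each image, then average those by digit label
mean_intensity <- tapply(rowMeans(train_data), labels$label, mean)
round(mean_intensity, 4)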

Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)

# create vector and array
train_vec=as.vector(unlist(train_data)) 
train_arr=array(train_vec, dim=c(nrow(train_data),28,28))

# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))


train_pca<-prcomp(train_flat_mat, scale = TRUE)
train_cumvar <- (cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2))


# 95% variance
cumvar_95 <- which.max(train_cumvar >= .95)   # first component at which cumulative variance reaches 95%
print(paste0("At 95% variance there were ", (cumvar_95), " components generated."))
## [1] "At 95% variance there were 124 components generated."
var_explained = train_pca$sdev^2 / sum(train_pca$sdev^2)

qplot(c(1:50), var_explained[1:50]) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot First 50 Only") 

Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points) | 704 components are generated before the cumulative variance reaches 100%. The number of possible components is bounded by the smaller dimension of the matrix (here, the 784 pixel rows); numerically, the cumulative proportion reaches 1 after 704 components because the remaining components carry essentially zero variance.

# 100% variance
cumvar_100 <- which.max(train_cumvar >= 1)
print(paste0("At 100% variance there were ", (cumvar_100), " components generated."))
## [1] "At 100% variance there were 704 components generated."

Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points) | The plotted images are principal components, not individual digits: each component is an orthogonal direction of maximal variance that blends pixel patterns from the entire dataset, so on its own it looks like structured noise rather than a recognizable digit.

par(mfrow = c (2, 5))

for (i in 1:10){
 image(1:28, 1:28, array(train_pca$x[,i], dim = c(28, 28)))
}

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)

# create array and matrix
eights<-which(labels == 8)
eights_arr<-train_arr[eights,,]
eights_flat_mat<-t(matrix(eights_arr, ncol=28*28, byrow = FALSE))

# pca
eights_pca <- prcomp(eights_flat_mat, scale = TRUE)
eights_cumvar <- (cumsum(eights_pca$sdev^2) / sum(eights_pca$sdev^2))

# 100% variance
eights_100 <- which.max(eights_cumvar >= 1)
print(paste0("At 100% variance there were ", (eights_100), " components generated."))
## [1] "At 100% variance there were 537 components generated."
var_explained = eights_pca$sdev^2 / sum(eights_pca$sdev^2)   # use the eights PCA here, not the full-data PCA

qplot(c(1:50), var_explained[1:50]) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot First 50 Only") 

par(mfrow = c (2, 5))

for (i in 1:10){
 image(1:28, 1:28, array(eights_pca$x[,i], dim = c(28, 28)))
}

An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)

# create model
train_labels_new <- as.factor(train_df$label)
train_flat_new<-as.data.frame(cbind(labels, t(train_flat_mat)  ))

# multinomial model
nnet_m <- nnet::multinom(label ~., data = train_flat_new, MaxNWts = 10000000)
## # weights:  7860 (7065 variable)
## initial  value 96708.573906 
## iter  10 value 25322.714106
## iter  20 value 20402.086316
## iter  30 value 19312.872829
## iter  40 value 18703.256586
## iter  50 value 18197.815143
## iter  60 value 17732.985798
## iter  70 value 16739.962157
## iter  80 value 14961.658448
## iter  90 value 13446.085942
## iter 100 value 12442.636014
## final  value 12442.636014 
## stopped after 100 iterations
# predict images
prediction_model <- predict(nnet_m, train_flat_new)
confusionMatrix(prediction_model, train_labels_new)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0 3994    0   19   11    4   35   17    5   19   12
##          1    3 4588   59   32   20   39   28   37  134   18
##          2   11   13 3753   88   17   19   17   37   21   10
##          3    8   12   65 3879    9   91    1    9   91   53
##          4   13    6   60   10 3852   55   35   44   38  136
##          5   33   12   20  162    5 3386   45   10  132   33
##          6   35    3   38   15   22   52 3973    2   19    3
##          7    7    8   54   35    7   28    2 4076   18   87
##          8   20   32   85   78   22   51   17    4 3519   25
##          9    8   10   24   41  114   39    2  177   72 3811
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9245         
##                  95% CI : (0.922, 0.9271)
##     No Information Rate : 0.1115         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9161         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.96660   0.9795  0.89849  0.89152  0.94597  0.89223
## Specificity           0.99678   0.9901  0.99384  0.99100  0.98953  0.98817
## Pos Pred Value        0.97036   0.9254  0.94155  0.91963  0.90657  0.88223
## Neg Pred Value        0.99636   0.9974  0.98885  0.98751  0.99417  0.98928
## Prevalence            0.09838   0.1115  0.09945  0.10360  0.09695  0.09036
## Detection Rate        0.09510   0.1092  0.08936  0.09236  0.09171  0.08062
## Detection Prevalence  0.09800   0.1180  0.09490  0.10043  0.10117  0.09138
## Balanced Accuracy     0.98169   0.9848  0.94617  0.94126  0.96775  0.94020
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity            0.9604  0.92615  0.86611  0.90998
## Specificity            0.9950  0.99346  0.99120  0.98712
## Pos Pred Value         0.9546  0.94308  0.91331  0.88669
## Neg Pred Value         0.9957  0.99137  0.98574  0.99000
## Prevalence             0.0985  0.10479  0.09674  0.09971
## Detection Rate         0.0946  0.09705  0.08379  0.09074
## Detection Prevalence   0.0991  0.10290  0.09174  0.10233
## Balanced Accuracy      0.9777  0.95981  0.92865  0.94855
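
For reference, the headline accuracy can also be pulled out without the full caret summary:

mean(prediction_model == train_labels_new)   # ≈ 0.9245, matching the Accuracy entry above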

Problem 3: Advanced Regression Techniques

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points

# read in training data
house_train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/train.csv?raw=true")
house_train_df %>% select_if(is.numeric) %>% summary()
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF        1stFlrSF       2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      3SsnPorch     
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
# replace NAs with 0 or a sensible default where applicable
house_train_df <- house_train_df %>% mutate(
                    LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
                    MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
                    GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
                    FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
                    PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
                  )

# print columns that still contain NAs
print('---- nulls ----')
## [1] "---- nulls ----"
all_nas <- colSums(sapply(house_train_df[names(house_train_df)], is.na))
all_nas[which(all_nas > 0)]
##        Alley   MasVnrType     BsmtQual     BsmtCond BsmtExposure BsmtFinType1 
##         1369            8           37           37           38           37 
## BsmtFinType2   Electrical   GarageType GarageFinish   GarageQual   GarageCond 
##           38            1           81           81           81           81 
##        Fence  MiscFeature 
##         1179         1406

Provide univariate descriptive statistics and appropriate plots for the training data set.

# SalePrice
ggplot(house_train_df, aes(x = SalePrice)) + 
  geom_histogram() +
  scale_x_continuous(labels = comma)

# GrLivArea
ggplot(house_train_df, aes(x = GrLivArea)) + 
  geom_histogram()

# Neighborhood
ggplot(house_train_df, aes(x = Neighborhood)) + 
  geom_bar() +
  coord_flip()

# BedroomAbvGr
ggplot(house_train_df, aes(x = BedroomAbvGr)) + 
  geom_bar()

# LotArea
ggplot(house_train_df, aes(x = LotArea)) + 
  geom_histogram()

Provide a scatter plot matrix for at least two of the independent variables and the dependent variable.

house_train_df %>% 
    dplyr::select('LotFrontage','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd'
        ,'FullBath','BedroomAbvGr','TotRmsAbvGrd','GrLivArea','YrSold','SalePrice') %>%
    ggpairs()

ggplot(house_train_df, aes(y=SalePrice, x=GrLivArea)) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=LotFrontage)) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=YearRemodAdd)) + geom_point()

Derive a correlation matrix for any three quantitative variables in the dataset.

n_df <- house_train_df %>% 
    dplyr::select('LotArea','GrLivArea','FullBath') %>% 
    drop_na()

mycol=colorRampPalette(c("red","orange","yellow","white","green", "dark green"))(20)
mycor=cor(n_df)
mycor
##             LotArea GrLivArea  FullBath
## LotArea   1.0000000 0.2631162 0.1260306
## GrLivArea 0.2631162 1.0000000 0.6300116
## FullBath  0.1260306 0.6300116 1.0000000
corrplot(mycor, method="ellipse", type="upper",
         addCoef.col=TRUE, tl.cex=.7, number.cex=.5, insig="blank",
         order="hclust", hclust.method="centroid", number.digits=2,
         col=mycol)

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points

      - LotArea and GrLivArea: p-value < 2.2e-16, far below α = 0.2, so we reject the null hypothesis of zero correlation; r = 0.263.
      - LotArea and FullBath: p-value = 1.36e-06, below α = 0.2, so we reject the null hypothesis of zero correlation; r = 0.126.
      - GrLivArea and FullBath: p-value < 2.2e-16, far below α = 0.2, so we reject the null hypothesis of zero correlation; r = 0.630.

Would you be worried about familywise error? Why or why not? 5 points

      - The familywise error rate reflects the increased probability of making at least one Type I error when running multiple hypothesis tests. Although each correlation test involves only two variables at a time, we are running three related tests on the same data, so the chance of at least one false positive accumulates; we should therefore be concerned about the familywise error rate (a correction such as Bonferroni, sketched below, addresses this).
cor.test(house_train_df$LotArea, house_train_df$GrLivArea, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train_df$LotArea and house_train_df$GrLivArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2315997 0.2940809
## sample estimates:
##       cor 
## 0.2631162
cor.test(house_train_df$LotArea, house_train_df$FullBath, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train_df$LotArea and house_train_df$FullBath
## t = 4.851, df = 1458, p-value = 1.36e-06
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.09286177 0.15892007
## sample estimates:
##       cor 
## 0.1260306
cor.test(house_train_df$GrLivArea, house_train_df$FullBath, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  house_train_df$GrLivArea and house_train_df$FullBath
## t = 30.977, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6093339 0.6498331
## sample estimates:
##       cor 
## 0.6300116
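
A minimal sketch of one familywise correction, using the three p-values reported above (Bonferroni simply multiplies each p-value by the number of tests):

p_vals <- c(2.2e-16, 1.36e-06, 2.2e-16)   # from the cor.test output above
p.adjust(p_vals, method = 'bonferroni')   # all remain far below alpha = 0.2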

Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points

mycor.inv = solve(mycor)
mycor.inv
##               LotArea GrLivArea    FullBath
## LotArea    1.07740995 -0.328207  0.07098756
## GrLivArea -0.32820697  1.758121 -1.06627226
## FullBath   0.07098756 -1.066272  1.66281734
# correlation × precision and precision × correlation should both give the identity
m_mi <- mycor %*% mycor.inv

mycor.inv  %*% mycor
##                LotArea     GrLivArea     FullBath
## LotArea   1.000000e+00 -1.300003e-17 2.775558e-17
## GrLivArea 9.626782e-18  1.000000e+00 0.000000e+00
## FullBath  6.761552e-18 -1.106864e-16 1.000000e+00
lu.decomposition(m_mi)   # matrixcalc::lu.decomposition; m_mi is numerically the identity
## $L
##               [,1]          [,2] [,3]
## [1,]  1.000000e+00  0.000000e+00    0
## [2,] -1.300003e-17  1.000000e+00    0
## [3,]  2.775558e-17 -2.671969e-34    1
## 
## $U
##      [,1]         [,2]          [,3]
## [1,]    1 9.626782e-18  6.761552e-18
## [2,]    0 1.000000e+00 -1.106864e-16
## [3,]    0 0.000000e+00  1.000000e+00
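
Because the product above is numerically the identity, its LU decomposition is trivially L = U = I. An LU decomposition of the correlation matrix itself is arguably more informative; a sketch using the same matrixcalc function:

lu_cor <- lu.decomposition(mycor)
lu_cor$L %*% lu_cor$U   # should reproduce mycor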

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points

ggplot(house_train_df, aes(x=GrLivArea)) + geom_histogram()

m <- min(house_train_df$GrLivArea)
m
## [1] 334
GrLivArea_shift <- house_train_df$GrLivArea - m
ggplot() + geom_histogram(aes(x=GrLivArea_shift))

exp_GrLivArea <- fitdistr(GrLivArea_shift,'exponential')

lamb_GrLivArea <- exp_GrLivArea$estimate
lamb_GrLivArea
##         rate 
## 0.0008464077
exp_sample <- rexp(1000, lamb_GrLivArea)
summary(exp_sample)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.74   313.46   776.80  1125.80  1562.82 10167.15
ggplot() + geom_histogram(aes(x=exp_sample))

l <- qexp(0.05, lamb_GrLivArea)
l
## [1] 60.60117
u <- qexp(0.95, lamb_GrLivArea)
u
## [1] 3539.349
quantile(exp_sample, probs = c(0.05,0.95))
##         5%        95% 
##   62.45815 3353.36821
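
The remaining parts of the prompt can be sketched as follows: a 95% confidence interval for the mean of the observed (shifted) data assuming normality, plus the empirical 5th and 95th percentiles of that data (values not shown in the original):

xbar <- mean(GrLivArea_shift); s <- sd(GrLivArea_shift); n <- length(GrLivArea_shift)
xbar + c(-1, 1) * qnorm(0.975) * s / sqrt(n)      # normal-theory 95% CI for the mean
quantile(GrLivArea_shift, probs = c(0.05, 0.95))  # empirical percentiles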

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 points

Load data

      The first step is to load the data and tidy it. The NAs in the dataset represent the absence of a particular feature rather than invalid data, so as part of the tidy process we transform NAs into values that can be processed.
# read in training data
house_train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/train.csv?raw=true")
house_train_df %>% select_if(is.numeric) %>% summary()
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF        1stFlrSF       2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      3SsnPorch     
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 
# replace NAs with 0 or a sensible default where applicable
house_train_df <- house_train_df %>% mutate(
                    LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
                    MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
                    GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
                    FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
                    BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
                    TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF),
                    PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
                    Street = ifelse(is.na(Street), "None", Street),
                    Alley = ifelse(is.na(Alley), "None", Alley),
                    GarageType = ifelse(is.na(GarageType), "None", GarageType),
                    GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
                    GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
                    GarageCond = ifelse(is.na(GarageCond), "None", GarageCond),
                    BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
                    KitchenQual = ifelse(is.na(KitchenQual), "None", KitchenQual),
                    GrLivArea = ifelse(is.na(GrLivArea), 0, GrLivArea)
                  )

# print columns that still contain NAs
print('---- nulls ----')
## [1] "---- nulls ----"
all_nas <- colSums(sapply(house_train_df[names(house_train_df)], is.na))
all_nas[which(all_nas > 0)]
##   MasVnrType     BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2   Electrical 
##            8           37           38           37           38            1 
##        Fence  MiscFeature 
##         1179         1406

Baseline - Full Numeric Model

      As a baseline, I developed a model that includes all numeric fields. Using stepwise optimization of this model we get the following baseline model.
# model with numeric variables only
houseNum_df <- house_train_df %>%  dplyr::select_if(is.numeric)
ml.num <- lm(SalePrice ~., houseNum_df)


# stepwise optimization (AIC, the default criterion)
ml.num.step <- stats::step(ml.num, trace=FALSE, direction='both')
summary(ml.num.step)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + 
##     BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + LowQualFinSF + BsmtFullBath + 
##     FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + 
##     GarageCars + WoodDeckSF + ScreenPorch, data = houseNum_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -486479  -16700   -2066   13824  287942 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.744e+05  1.245e+05  -7.826 9.70e-15 ***
## MSSubClass   -1.690e+02  2.596e+01  -6.512 1.03e-10 ***
## LotArea       4.142e-01  9.987e-02   4.147 3.56e-05 ***
## OverallQual   1.758e+04  1.172e+03  15.002  < 2e-16 ***
## OverallCond   4.359e+03  1.008e+03   4.326 1.62e-05 ***
## YearBuilt     2.970e+02  5.408e+01   5.492 4.68e-08 ***
## YearRemodAdd  1.668e+02  6.544e+01   2.548  0.01092 *  
## MasVnrArea    3.156e+01  5.885e+00   5.363 9.53e-08 ***
## BsmtFinSF1    1.609e+01  3.908e+00   4.117 4.06e-05 ***
## BsmtUnfSF     7.005e+00  3.628e+00   1.931  0.05370 .  
## `1stFlrSF`    4.917e+01  5.245e+00   9.375  < 2e-16 ***
## `2ndFlrSF`    4.574e+01  4.238e+00  10.794  < 2e-16 ***
## LowQualFinSF  2.863e+01  1.960e+01   1.461  0.14428    
## BsmtFullBath  9.670e+03  2.423e+03   3.991 6.91e-05 ***
## FullBath      4.843e+03  2.586e+03   1.873  0.06132 .  
## BedroomAbvGr -1.029e+04  1.675e+03  -6.143 1.05e-09 ***
## KitchenAbvGr -1.357e+04  5.143e+03  -2.639  0.00842 ** 
## TotRmsAbvGrd  5.088e+03  1.223e+03   4.159 3.38e-05 ***
## Fireplaces    3.448e+03  1.741e+03   1.981  0.04782 *  
## GarageCars    1.049e+04  1.691e+03   6.202 7.26e-10 ***
## WoodDeckSF    2.515e+01  7.869e+00   3.196  0.00143 ** 
## ScreenPorch   5.352e+01  1.686e+01   3.174  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34720 on 1438 degrees of freedom
## Multiple R-squared:  0.8118, Adjusted R-squared:  0.809 
## F-statistic: 295.3 on 21 and 1438 DF,  p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.num.step)

lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + LowQualFinSF + BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch, data = houseNum_df)

Analysis - Candidate Variables

      Starting with the output from the stepwise-optimized model, we analyze the correlation between the independent variables and SalePrice to identify a few candidate variables to focus on.
candidates <- c('SalePrice','MSSubClass','LotArea','LotFrontage','OverallQual','YearBuilt','YearRemodAdd',
                'BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath',
                'FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea',
                'WoodDeckSF','OpenPorchSF','FireplaceQu','SaleCondition','OverallCond','CentralAir','PoolQC',
                'ExterCond','MasVnrArea','ScreenPorch')

full_df <- house_train_df %>%
   dplyr::select(all_of(candidates)) %>%
   mutate(ExterCond=as.numeric(as.factor(ExterCond)),
            FullBath=as.numeric(as.factor(FullBath)),
            FireplaceQu=as.numeric(as.factor(FireplaceQu)), 
            SaleCondition=as.numeric(as.factor(SaleCondition)), 
            GarageCars=as.numeric(as.factor(GarageCars)),
            OverallCond=as.numeric(as.factor(OverallCond)), 
            CentralAir=as.numeric(as.factor(CentralAir)),
            PoolQC=as.numeric(as.factor(PoolQC))
   )

ames_corr <- full_df
ames_correlations <- cor(ames_corr, method = c("pearson"))
corrplot(ames_correlations, method="color")

      Filtering the candidate variables to those with a correlation of 0.3 or higher against SalePrice generates a smaller list. Note that the negative correlations across these variables are minimal, so we focus on positive correlations.
t <- ames_correlations
t[t < 0.3 ] <- 0
corrplot(t, method="color")

t <- as.data.frame(ames_correlations)
t <- t[t$SalePrice > 0.3]
t <- t %>% dplyr::filter(SalePrice > 0.3)


candidates <- names(t)
candidates
##  [1] "SalePrice"    "OverallQual"  "YearBuilt"    "YearRemodAdd" "BsmtFinSF1"  
##  [6] "TotalBsmtSF"  "1stFlrSF"     "2ndFlrSF"     "GrLivArea"    "FullBath"    
## [11] "TotRmsAbvGrd" "Fireplaces"   "GarageCars"   "GarageArea"   "WoodDeckSF"  
## [16] "OpenPorchSF"  "MasVnrArea"
ames_corr %>% 
    dplyr::select(all_of(candidates)) %>% 
    ggpairs()

corrplot(as.matrix(t), method="color")

      Analyzing the relationships between the candidate variables and SalePrice graphically:
ggplot(house_train_df, aes(y=SalePrice, x=OverallQual)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=YearRemodAdd)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=BsmtFinSF1)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=`1stFlrSF`)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=`2ndFlrSF`)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=GrLivArea)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=FullBath)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=TotRmsAbvGrd)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=Fireplaces)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=GarageCars)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=GarageArea)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=WoodDeckSF)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=OpenPorchSF)) + 
    scale_y_continuous(labels = comma) + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=MasVnrArea)) + 
    scale_y_continuous(labels = comma) + geom_point()

      Exploring the histograms for some key candidate variables:
ggplot(house_train_df, aes(x=log(SalePrice))) + geom_histogram()

ggplot(house_train_df, aes(x=YearBuilt-min(house_train_df$YearBuilt))) + geom_histogram()

ggplot(house_train_df, aes(x=YearRemodAdd-min(house_train_df$YearRemodAdd))) + geom_histogram()

ggplot(house_train_df, aes(x=log(TotalBsmtSF))) + geom_histogram()

ggplot(house_train_df, aes(x=`1stFlrSF`)) + geom_histogram()

ggplot(house_train_df, aes(x=GrLivArea)) + geom_histogram()

ggplot(house_train_df, aes(x=GarageArea)) + geom_histogram()

      Exploring the possibility of transforming variables for a better fit.
ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) + 
    scale_y_continuous(labels = comma) + geom_point() +
    facet_wrap(vars(Neighborhood))

ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) + 
    scale_x_log10() + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) + 
    scale_x_log10() + geom_point()

ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) + 
    scale_x_sqrt() + geom_point()


Full Model

      Adding key factor variables to the candidate numeric variables identified previously.
candidates <- c('SalePrice','OverallQual','YearBuilt','YearRemodAdd','BsmtFinSF1','BsmtUnfSF',
                'TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','FullBath','HalfBath','KitchenAbvGr',
                'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
                'FireplaceQu',
                'SaleCondition','OverallCond','CentralAir','PoolQC','ExterCond',
                'MasVnrArea',
                'MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','LotConfig','HouseStyle',
                'RoofStyle','RoofMatl','BsmtQual','BsmtCond','BsmtFinType1','KitchenQual','GarageType',
                'GarageQual')


full_df <- house_train_df %>%
   dplyr::select(all_of(candidates)) %>%
   mutate(ExterCond=(as.factor(ExterCond)),
            MSSubClass=(as.factor(MSSubClass)),
            MSZoning=(as.factor(MSZoning)),
            Street=(as.factor(Street)),
            Alley=(as.factor(Alley)),
            LotShape=(as.factor(LotShape)),
            LandContour=(as.factor(LandContour)),
            LotConfig=(as.factor(LotConfig)),
            HouseStyle=(as.factor(HouseStyle)),
            RoofStyle=(as.factor(RoofStyle)),
            RoofMatl=(as.factor(RoofMatl)),
            BsmtQual=(as.factor(BsmtQual)),
            BsmtCond=(as.factor(BsmtCond)),
            BsmtFinType1=(as.factor(BsmtFinType1)),
            KitchenQual=(as.factor(KitchenQual)),
            FireplaceQu=(as.factor(FireplaceQu)), 
            GarageType=(as.factor(GarageType)), 
            GarageQual=(as.factor(GarageQual)), 
            SaleCondition=(as.factor(SaleCondition)), 
            OverallCond=(as.factor(OverallCond)), 
            CentralAir=(as.factor(CentralAir)),
            PoolQC=(as.factor(PoolQC))
   )


#full_df <- ames_corr 
ml.full <- lm(formula = SalePrice ~ ., data = full_df)


# stepwise optimization (AIC, the default criterion)
ml.full.step <- stats::step(ml.full, trace=FALSE,direction='both')
summary(ml.full.step)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + BsmtUnfSF + 
##     TotalBsmtSF + `1stFlrSF` + `2ndFlrSF` + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageCars + GarageArea + WoodDeckSF + SaleCondition + 
##     OverallCond + PoolQC + MasVnrArea + MSSubClass + MSZoning + 
##     LotShape + LandContour + LotConfig + RoofMatl + BsmtQual + 
##     BsmtFinType1 + KitchenQual, data = full_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -395914  -12122     114   11539  187150 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.291e+06  1.568e+05  -8.231 4.33e-16 ***
## OverallQual           9.752e+03  1.089e+03   8.959  < 2e-16 ***
## YearBuilt             3.741e+02  7.345e+01   5.093 4.02e-07 ***
## BsmtUnfSF            -2.062e+01  2.960e+00  -6.968 5.03e-12 ***
## TotalBsmtSF           5.005e+01  5.335e+00   9.381  < 2e-16 ***
## `1stFlrSF`            4.129e+01  5.661e+00   7.293 5.14e-13 ***
## `2ndFlrSF`            5.928e+01  5.018e+00  11.813  < 2e-16 ***
## KitchenAbvGr         -2.355e+04  6.122e+03  -3.847 0.000125 ***
## TotRmsAbvGrd          1.870e+03  9.462e+02   1.976 0.048310 *  
## Fireplaces            5.752e+03  1.443e+03   3.987 7.04e-05 ***
## GarageCars            4.705e+03  2.406e+03   1.956 0.050703 .  
## GarageArea            1.192e+01  8.139e+00   1.465 0.143286    
## WoodDeckSF            1.249e+01  6.487e+00   1.926 0.054347 .  
## SaleConditionAdjLand  1.534e+04  1.738e+04   0.883 0.377615    
## SaleConditionAlloca   2.308e+04  1.061e+04   2.175 0.029767 *  
## SaleConditionFamily  -1.785e+03  6.992e+03  -0.255 0.798489    
## SaleConditionNormal   9.051e+03  3.063e+03   2.955 0.003186 ** 
## SaleConditionPartial  2.455e+04  4.257e+03   5.766 1.01e-08 ***
## OverallCond2         -6.698e+03  3.172e+04  -0.211 0.832778    
## OverallCond3         -2.448e+04  2.997e+04  -0.817 0.414156    
## OverallCond4         -1.467e+04  2.961e+04  -0.496 0.620300    
## OverallCond5         -5.122e+03  2.952e+04  -0.173 0.862293    
## OverallCond6          1.666e+03  2.954e+04   0.056 0.955027    
## OverallCond7          1.020e+04  2.951e+04   0.345 0.729784    
## OverallCond8          1.001e+04  2.965e+04   0.337 0.735846    
## OverallCond9          2.637e+04  3.030e+04   0.870 0.384342    
## PoolQCFa             -1.225e+05  2.885e+04  -4.244 2.34e-05 ***
## PoolQCGd             -5.575e+04  3.001e+04  -1.858 0.063419 .  
## PoolQCNone           -1.017e+05  2.116e+04  -4.807 1.70e-06 ***
## MasVnrArea            2.539e+01  4.968e+00   5.110 3.68e-07 ***
## MSSubClass30          1.500e+04  4.756e+03   3.155 0.001642 ** 
## MSSubClass40          1.484e+04  1.459e+04   1.017 0.309422    
## MSSubClass45          1.350e+04  8.693e+03   1.553 0.120763    
## MSSubClass50          3.569e+02  4.269e+03   0.084 0.933391    
## MSSubClass60         -1.016e+04  4.861e+03  -2.090 0.036811 *  
## MSSubClass70          1.104e+03  6.269e+03   0.176 0.860275    
## MSSubClass75         -5.633e+03  9.191e+03  -0.613 0.540022    
## MSSubClass80         -6.682e+03  4.289e+03  -1.558 0.119501    
## MSSubClass85         -6.026e+03  6.714e+03  -0.898 0.369591    
## MSSubClass90         -1.214e+04  6.646e+03  -1.827 0.067915 .  
## MSSubClass120        -8.254e+03  3.657e+03  -2.257 0.024191 *  
## MSSubClass160        -2.460e+04  5.564e+03  -4.422 1.06e-05 ***
## MSSubClass180         2.043e+03  9.719e+03   0.210 0.833539    
## MSSubClass190         3.840e+03  6.892e+03   0.557 0.577553    
## MSZoningFV            2.585e+04  1.025e+04   2.523 0.011764 *  
## MSZoningRH            1.825e+04  1.184e+04   1.541 0.123441    
## MSZoningRL            1.601e+04  9.467e+03   1.692 0.090954 .  
## MSZoningRM            8.414e+03  9.522e+03   0.884 0.377067    
## LotShapeIR2           1.525e+04  4.729e+03   3.225 0.001292 ** 
## LotShapeIR3           1.176e+04  9.581e+03   1.227 0.220048    
## LotShapeReg          -1.024e+03  1.796e+03  -0.570 0.568658    
## LandContourHLS        2.267e+04  5.542e+03   4.091 4.56e-05 ***
## LandContourLow        1.667e+04  6.680e+03   2.496 0.012694 *  
## LandContourLvl        8.705e+03  3.981e+03   2.187 0.028932 *  
## LotConfigCulDSac      8.311e+03  3.641e+03   2.283 0.022604 *  
## LotConfigFR2         -7.351e+03  4.574e+03  -1.607 0.108239    
## LotConfigFR3         -7.840e+03  1.418e+04  -0.553 0.580538    
## LotConfigInside      -9.145e+02  2.019e+03  -0.453 0.650667    
## RoofMatlCompShg       6.822e+05  4.084e+04  16.702  < 2e-16 ***
## RoofMatlMembran       7.073e+05  5.021e+04  14.088  < 2e-16 ***
## RoofMatlMetal         7.045e+05  5.041e+04  13.974  < 2e-16 ***
## RoofMatlRoll          6.746e+05  5.036e+04  13.396  < 2e-16 ***
## RoofMatlTar&Grv       6.611e+05  4.101e+04  16.120  < 2e-16 ***
## RoofMatlWdShake       6.761e+05  4.307e+04  15.697  < 2e-16 ***
## RoofMatlWdShngl       7.545e+05  4.228e+04  17.846  < 2e-16 ***
## BsmtQualFa           -2.957e+04  6.809e+03  -4.342 1.52e-05 ***
## BsmtQualGd           -2.861e+04  3.532e+03  -8.098 1.24e-15 ***
## BsmtQualTA           -2.893e+04  4.412e+03  -6.557 7.82e-11 ***
## BsmtFinType1BLQ       4.454e+02  3.052e+03   0.146 0.883982    
## BsmtFinType1GLQ       9.453e+03  2.739e+03   3.451 0.000575 ***
## BsmtFinType1LwQ      -5.333e+03  3.904e+03  -1.366 0.172204    
## BsmtFinType1Rec      -2.835e+03  3.235e+03  -0.876 0.381029    
## BsmtFinType1Unf       5.868e+03  3.178e+03   1.846 0.065093 .  
## KitchenQualFa        -3.251e+04  6.599e+03  -4.927 9.39e-07 ***
## KitchenQualGd        -3.120e+04  3.626e+03  -8.602  < 2e-16 ***
## KitchenQualTA        -3.525e+04  4.092e+03  -8.615  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27690 on 1347 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.8848, Adjusted R-squared:  0.8783 
## F-statistic: 137.9 on 75 and 1347 DF,  p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.full.step)

Model 1

Removed the following variables to generate the final model: GarageCars, WoodDeckSF, GarageArea, OverallCond, MSSubClass, MSZoning, BsmtFinType1, LotConfig.

candidates <- c('SalePrice','OverallQual','YearBuilt','BsmtUnfSF','TotalBsmtSF',
                '1stFlrSF','2ndFlrSF','KitchenAbvGr', 'TotRmsAbvGrd','Fireplaces',
                'SaleCondition',
                'PoolQC', 'MasVnrArea','LotShape', 'LandContour',
                'RoofMatl', 'BsmtQual','KitchenQual')

df1 <- full_df %>% dplyr::select(all_of(candidates))


# model v1
ml.v1 <- lm(formula = SalePrice ~ ., data = df1)


# stepwise optimization (AIC, the default criterion)
ml.v1.step <- stats::step(ml.v1, trace=FALSE,direction='both')
summary(ml.v1.step)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + BsmtUnfSF + 
##     TotalBsmtSF + `1stFlrSF` + `2ndFlrSF` + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + SaleCondition + PoolQC + MasVnrArea + LotShape + 
##     LandContour + RoofMatl + BsmtQual + KitchenQual, data = df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -401554  -14380     -87   13885  186274 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -9.204e+05  9.850e+04  -9.344  < 2e-16 ***
## OverallQual           1.192e+04  1.030e+03  11.567  < 2e-16 ***
## YearBuilt             1.859e+02  4.437e+01   4.191 2.95e-05 ***
## BsmtUnfSF            -2.103e+01  2.087e+00 -10.077  < 2e-16 ***
## TotalBsmtSF           4.907e+01  4.870e+00  10.077  < 2e-16 ***
## `1stFlrSF`            4.738e+01  5.381e+00   8.806  < 2e-16 ***
## `2ndFlrSF`            5.196e+01  3.517e+00  14.772  < 2e-16 ***
## KitchenAbvGr         -2.858e+04  4.167e+03  -6.857 1.04e-11 ***
## TotRmsAbvGrd          1.948e+03  9.273e+02   2.101 0.035837 *  
## Fireplaces            5.097e+03  1.453e+03   3.508 0.000465 ***
## SaleConditionAdjLand  5.109e+03  1.524e+04   0.335 0.737559    
## SaleConditionAlloca   2.383e+04  9.660e+03   2.467 0.013729 *  
## SaleConditionFamily  -2.492e+03  7.264e+03  -0.343 0.731642    
## SaleConditionNormal   1.323e+04  3.109e+03   4.254 2.24e-05 ***
## SaleConditionPartial  3.137e+04  4.335e+03   7.235 7.56e-13 ***
## PoolQCFa             -1.262e+05  2.995e+04  -4.213 2.67e-05 ***
## PoolQCGd             -4.450e+04  3.074e+04  -1.447 0.148002    
## PoolQCNone           -9.865e+04  2.167e+04  -4.551 5.79e-06 ***
## MasVnrArea            2.456e+01  5.048e+00   4.865 1.27e-06 ***
## LotShapeIR2           1.624e+04  4.836e+03   3.359 0.000803 ***
## LotShapeIR3           9.599e+03  1.002e+04   0.958 0.338379    
## LotShapeReg          -4.817e+03  1.752e+03  -2.750 0.006040 ** 
## LandContourHLS        2.501e+04  5.760e+03   4.341 1.52e-05 ***
## LandContourLow        1.922e+04  6.633e+03   2.898 0.003809 ** 
## LandContourLvl        1.006e+04  4.043e+03   2.489 0.012935 *  
## RoofMatlCompShg       7.008e+05  4.115e+04  17.030  < 2e-16 ***
## RoofMatlMembran       7.324e+05  5.091e+04  14.386  < 2e-16 ***
## RoofMatlMetal         7.224e+05  5.115e+04  14.125  < 2e-16 ***
## RoofMatlRoll          6.891e+05  5.075e+04  13.577  < 2e-16 ***
## RoofMatlTar&Grv       6.811e+05  4.123e+04  16.519  < 2e-16 ***
## RoofMatlWdShake       7.015e+05  4.351e+04  16.121  < 2e-16 ***
## RoofMatlWdShngl       7.724e+05  4.255e+04  18.153  < 2e-16 ***
## BsmtQualFa           -2.732e+04  6.946e+03  -3.933 8.81e-05 ***
## BsmtQualGd           -2.994e+04  3.650e+03  -8.201 5.26e-16 ***
## BsmtQualNone         -4.483e+03  8.412e+03  -0.533 0.594152    
## BsmtQualTA           -3.266e+04  4.355e+03  -7.499 1.13e-13 ***
## KitchenQualFa        -4.125e+04  6.543e+03  -6.304 3.86e-10 ***
## KitchenQualGd        -3.072e+04  3.777e+03  -8.133 9.06e-16 ***
## KitchenQualTA        -4.308e+04  4.180e+03 -10.306  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29280 on 1421 degrees of freedom
## Multiple R-squared:  0.8677, Adjusted R-squared:  0.8642 
## F-statistic: 245.3 on 38 and 1421 DF,  p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.v1.step)

The residual analysis indicates that there is still some room to improve the model.

Predict

Selected Model 1 (ml.v1.step) for the predictions.

house_test_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/test.csv?raw=true")


# replace NAs with 0 or a sensible default where applicable
house_test_df <- house_test_df %>% mutate(
                    LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
                    MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
                    GarageYrBlt = ifelse(is.na(GarageYrBlt), mean(GarageYrBlt, na.rm = TRUE), GarageYrBlt),
                    FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
                    BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
                    TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF),
                    PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
                    Street = ifelse(is.na(Street), "None", Street),
                    Alley = ifelse(is.na(Alley), "None", Alley),
                    GarageType = ifelse(is.na(GarageType), "None", GarageType),
                    GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
                    GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
                    GarageCond = ifelse(is.na(GarageCond), "None", GarageCond),
                    BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
                    KitchenQual = ifelse(is.na(KitchenQual), "Gd", KitchenQual),
                    GrLivArea = ifelse(is.na(GrLivArea), 0, GrLivArea),
                    BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF)
                  )



# identify variables with null values
all_nas <- colSums(sapply(house_test_df[names(house_test_df)], is.na))
all_nas[which(all_nas > 0)]
##     MSZoning    Utilities  Exterior1st  Exterior2nd   MasVnrType     BsmtCond 
##            4            2            1            1           16           45 
## BsmtExposure BsmtFinType1 BsmtFinType2   BsmtFinSF2 BsmtFullBath BsmtHalfBath 
##           44           42           42            1            2            2 
##   Functional   GarageCars   GarageArea        Fence  MiscFeature     SaleType 
##            2            1            1         1169         1408            1
house_test_df <- house_test_df %>%
   mutate(ExterCond=(as.factor(ExterCond)),
            MSSubClass=(as.factor(MSSubClass)),
            MSZoning=(as.factor(MSZoning)),
            Street=(as.factor(Street)),
            Alley=(as.factor(Alley)),
            LotShape=(as.factor(LotShape)),
            LandContour=(as.factor(LandContour)),
            LotConfig=(as.factor(LotConfig)),
            HouseStyle=(as.factor(HouseStyle)),
            RoofStyle=(as.factor(RoofStyle)),
            RoofMatl=(as.factor(RoofMatl)),
            BsmtQual=(as.factor(BsmtQual)),
            BsmtCond=(as.factor(BsmtCond)),
            BsmtFinType1=(as.factor(BsmtFinType1)),
            KitchenQual=(as.factor(KitchenQual)),
            FireplaceQu=(as.factor(FireplaceQu)), 
            GarageType=(as.factor(GarageType)), 
            GarageQual=(as.factor(GarageQual)), 
            SaleCondition=(as.factor(SaleCondition)), 
            OverallCond=(as.factor(OverallCond)), 
            CentralAir=(as.factor(CentralAir)),
            PoolQC=(as.factor(PoolQC))
   )


# predict new values
house_test_df$SalePrice <- predict.lm(ml.v1.step, house_test_df, type = "response")


# plot housing prices vs neighborhood
ggplot(house_test_df, aes(x=SalePrice, fill=Neighborhood)) + geom_histogram()

# create results dataframe
result_df <- house_test_df %>% dplyr::select(Id,SalePrice)

# identify na predictions
all_nas <- colSums(sapply(result_df[names(result_df)], is.na))
all_nas[which(all_nas > 0)]
## named numeric(0)
# write results
write.csv(result_df,"./Final/house-prices/result.csv", row.names = FALSE)
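
Kaggle scores this competition on the RMSE of log(SalePrice). An in-sample sketch of that metric for the chosen model (training data only, so it will look optimistic next to the leaderboard score; pmax guards against non-positive predictions):

pred_train <- predict(ml.v1.step, full_df)
sqrt(mean((log(pmax(pred_train, 1)) - log(full_df$SalePrice))^2, na.rm = TRUE))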

Submit to Kaggle competition

The model score, once submitted to the Kaggle competition, was 0.16978.

Submit

Leader Board