Form the \(A\) matrix. Then introduce decay and form the \(B\) matrix as we did in the course notes.
# form A matrix
A = matrix(c(0, 1/2, 1/2, 0, 0, 0,
             0, 0, 0, 0, 0, 0,
             1/3, 1/3, 0, 0, 1/3, 0,
             0, 0, 0, 0, 1/2, 1/2,
             0, 0, 0, 1/2, 0, 1/2,
             0, 0, 0, 1, 0, 0),
           nrow = 6,
           byrow = TRUE)
Since row 2 is all zeros (page 2 has no outgoing links), we replace it with a uniform probability of 1/6 of landing on any of the six pages:
# replace row 2
A[2,] = rep(1/6, 6)
A
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0000000 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3333333 0.3333333 0.0000000 0.0000000 0.3333333 0.0000000
## [4,] 0.0000000 0.0000000 0.0000000 0.0000000 0.5000000 0.5000000
## [5,] 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.5000000
## [6,] 0.0000000 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000
Introduce decay and form the B matrix.
d = 0.85 # decay
n = 6    # number of pages

B = d * A + ((1-d)/n)
B
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0250000 0.4500000 0.4500000 0.0250000 0.0250000 0.0250000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3083333 0.3083333 0.0250000 0.0250000 0.3083333 0.0250000
## [4,] 0.0250000 0.0250000 0.0250000 0.0250000 0.4500000 0.4500000
## [5,] 0.0250000 0.0250000 0.0250000 0.4500000 0.0250000 0.4500000
## [6,] 0.0250000 0.0250000 0.0250000 0.8750000 0.0250000 0.0250000
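Each row of \(A\) now sums to 1, so each row of \(B = dA + \frac{1-d}{n}\) also sums to \(d + n \cdot \frac{1-d}{n} = 1\): \(B\) remains row-stochastic. A quick sanity check using the objects above:
# every row of B should sum to 1
rowSums(B)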
Start with a uniform rank vector \(r\) and perform power iterations on \(B\) until convergence. That is, compute the solution \(r = B^n \times r\). Attempt this for a sufficiently large \(n\) so that \(r\) actually converges.
library(matrixcalc)
# define uniform rank vector: r
r = rep(1/n, n)

# find convergence at n
n = 10
matrix.power(t(B), n) %*% r
## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
n = 20
matrix.power(t(B), n) %*% r
## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
n = 30
matrix.power(t(B), n) %*% r
## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
n = 40
matrix.power(t(B), n) %*% r
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
n = 50
matrix.power(t(B), n) %*% r
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
all.equal(matrix.power(t(B),40) %*% r, matrix.power(t(B),50) %*% r)
## [1] TRUE
# store the convergence
convergence = matrix.power(t(B), 40) %*% r
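Instead of trying fixed powers by hand, one could also iterate \(r \leftarrow B^T r\) until successive vectors agree within a tolerance. A minimal sketch using the \(B\) above (the 1e-10 tolerance is my choice):
# iterate until successive rank vectors differ by less than the tolerance
r_iter = rep(1/6, 6)
repeat {
  r_next = t(B) %*% r_iter
  if (max(abs(r_next - r_iter)) < 1e-10) break
  r_iter = r_next
}
r_iter # matches the stored convergence vector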
Compute the eigen-decomposition of \(B\) and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.
eig_decomp = eigen(B)
eig_decomp
## eigen() decomposition
## $values
## [1] 1.00000000 0.57619235 -0.42500001 -0.42499999 -0.34991524 -0.08461044
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4082483 -0.7278031 -5.345224e-01 5.345225e-01 -0.795670150
## [2,] -0.4082483 -0.3721164 -5.216180e-09 -5.216180e-09 0.059710287
## [3,] -0.4082483 -0.5389259 5.345225e-01 -5.345225e-01 0.602762996
## [4,] -0.4082483 0.1174605 -2.672613e-01 2.672612e-01 0.002611877
## [5,] -0.4082483 0.1174605 -2.672613e-01 2.672612e-01 0.002611877
## [6,] -0.4082483 0.1174605 5.345225e-01 -5.345224e-01 0.002611877
## [,6]
## [1,] 0.486246420
## [2,] -0.673469294
## [3,] 0.556554233
## [4,] -0.009145393
## [5,] -0.009145393
## [6,] -0.009145393
We can see 1 is the largest eigenvalue.
Normalize the corresponding eigenvector so it sums to 1, and sum it:
sum(as.numeric(eig_decomp$vectors[,1]/sum(eig_decomp$vectors[,1])))
## [1] 1
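One caveat: because \(B\) is row-stochastic, the leading right eigenvector of \(B\) is the uniform vector, as the output above shows. The PageRank vector is the leading eigenvector of \(B^T\) (the left eigenvector of \(B\)). A sketch of the verification the prompt asks for, using the objects defined above; the two should agree up to numerical tolerance:
eig_t = eigen(t(B))
r_eig = Re(eig_t$vectors[,1]) / sum(Re(eig_t$vectors[,1]))
all.equal(r_eig, convergence[,1])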
Use the igraph package in R and its page.rank method to compute the Page Rank of the graph as given in \(A\). Note that you don’t need to apply decay; the package starts with a connected graph and applies decay internally. Verify that you do get the same PageRank vector as the two approaches above.
library(igraph)
# graph the matrix A
A_graph = graph_from_adjacency_matrix(A, weighted=TRUE)
plot(A_graph)
Verify equality with convergence vector.
page.rank(A_graph)
## $vector
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
##
## $value
## [1] 1
##
## $options
## NULL
convergence
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
all.equal(page.rank(A_graph)$vector, convergence[,1])
## [1] TRUE
Sign in to Kaggle.
Download the data.
library(tidyverse)
test_data = '/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/DIGIT/test.csv'
train_data = '/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/DIGIT/train.csv'

test = read.csv(test_data)
train = read.csv(train_data)
dim(test)
## [1] 28000 784
dim(train)
## [1] 42000 785
Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.)
# plot single image
m = matrix(unlist(train[10,-1])/255, nrow = 28, byrow = TRUE)
image(m)
# plot first 10 images
par(mfrow=c(2,5))
for (i in 1:10) {
  m = matrix(unlist(train[i,-1])/255, nrow = 28, byrow = TRUE)
  image(t(apply(m,2,rev)), axes=FALSE) # flip the matrix so the digit displays upright
}
# print the first 10 labels
train[1:10,1]
## [1] 1 0 1 4 0 0 7 3 5 3
What is the frequency distribution of the numbers in the dataset?
# frequency distribution table
freq = table(train[,1]) |>
  as.data.frame() |>
  rename(number = Var1, count = Freq) |>
  mutate(freq = round(prop.table(table(train[,1])), 4))
freq
number | count | freq |
---|---|---|
0 | 4132 | 0.0984 |
1 | 4684 | 0.1115 |
2 | 4177 | 0.0995 |
3 | 4351 | 0.1036 |
4 | 4072 | 0.0970 |
5 | 3795 | 0.0904 |
6 | 4137 | 0.0985 |
7 | 4401 | 0.1048 |
8 | 4063 | 0.0967 |
9 | 4188 | 0.0997 |
sum(freq$freq)
## [1] 1.0001
The total is slightly above 1 due to rounding.
ggplot(train, aes(x=train[,1])) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=1) +
scale_x_continuous(breaks = 0:9) +
labs(x='number')
For each number, provide the mean pixel intensity. What does this tell you?
train |>
  mutate(mean_pix = rowMeans(across(where(is.numeric)))) |>
group_by(label) |>
summarise(mean_pix = mean(mean_pix))
label | mean_pix |
---|---|
0 | 44.11772 |
1 | 19.34964 |
2 | 38.05490 |
3 | 36.08049 |
4 | 30.87481 |
5 | 32.91837 |
6 | 35.33875 |
7 | 29.22069 |
8 | 38.46134 |
9 | 31.29013 |
We can see 0 has the highest (brightest) average pixel intensity, while 1 has the lowest. This tracks with how much ink each digit requires: a 0 covers far more pixels than a 1, so mean intensity reflects the amount of information needed to draw each number.
Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?
library(stats)
train_pca = prcomp(train[,2:785])
# calculate cumulative variances
train_cv = cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2)
# store components for 95% variance
train_cv_95 = min(which(train_cv >= .95))
train_cv_95
## [1] 154
154 components account for 95% of the variance.
# total possible components
train_cv = cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2)
train_cv_100 = min(which(train_cv >= 1))
train_cv_100
## [1] 704
704 components account for 100% of the variance. The components between the 95% and 100% marks each contribute very little variance; and fewer than the full 784 are possible because some pixel columns (mostly along the image borders) are constant across all images and carry no variance at all.
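A quick check of this claim, counting pixel columns with zero variance in the training images (using the train object loaded above):
# constant pixels (mostly image borders) contribute no variance;
# the count should be close to 784 - 704 = 80
sum(apply(train[,-1], 2, var) == 0)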
Plot the first 10 images generated by PCA. They will appear to be noise. Why?
# plot first 10 PCA images
par(mfrow=c(2,5))
# extract the rotation (loadings) matrix
train_pca_rot = train_pca$rotation

for (i in 1:10) {
  a = array(train_pca_rot[,i], dim = c(28, 28))
  image(1:28, 1:28, a, axes=FALSE)
}
These images look noisy because each principal component is a direction of maximal variance across all the digits, not an image of any individual digit; each one mixes pixel patterns from every class.
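Although individual components look like noise, weighted combinations of them reconstruct recognizable digits. A minimal sketch, reconstructing the first training image from the top 154 components (the names come from the PCA above; the choice of the first image and k = 154 is mine):
# project the first image onto k components and map back to pixel space
k = 154
recon = train_pca$x[1, 1:k] %*% t(train_pca$rotation[, 1:k]) + train_pca$center
m = matrix(recon, nrow = 28, byrow = TRUE)
image(t(apply(m, 2, rev)), axes = FALSE)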
Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?
# select only 8s
eights = train |>
  filter(label == 8)

# pca
eights_pca = prcomp(eights[,2:785])

# calculate cumulative variances
eights_cv = cumsum(eights_pca$sdev^2) / sum(eights_pca$sdev^2)
# store components for 100% variance
eights_cv_100 = min(which(eights_cv >= 1))
# plot first 10 images
par(mfrow=c(2,5))
# extract the eights rotation (loadings) matrix
eights_pca_rot = eights_pca$rotation

for (i in 1:10) {
  a = array(eights_pca_rot[,i], dim = c(28, 28))
  image(1:28, 1:28, a, axes=FALSE)
}
The images are still noisy, but clearer than for the full set, and they appear to go from clearest to blurriest. This is likely because only 248 components are needed to account for 100% of the variance among the 8s alone, so there is much less overall variation for the components to capture.
An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)
library(nnet)
# build multinomial model
model = multinom(label ~ ., train, MaxNWts=50000)
## # weights: 7860 (7065 variable)
## initial value 96708.573906
## iter 10 value 27932.238987
## iter 20 value 22901.936560
## iter 30 value 21677.526948
## iter 40 value 21254.278400
## iter 50 value 21077.662091
## iter 60 value 21002.266892
## iter 70 value 20928.779026
## iter 80 value 20866.883697
## iter 90 value 20792.605325
## iter 100 value 20727.274628
## final value 20727.274628
## stopped after 100 iterations
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# test accuracy
predictions = predict(model, train)
mtab = table(predictions, train$label)
confusionMatrix(mtab)
## Confusion Matrix and Statistics
##
##
## predictions 0 1 2 3 4 5 6 7 8 9
## 0 3953 0 10 5 11 24 43 6 16 13
## 1 24 4605 146 96 83 77 54 113 397 83
## 2 17 12 3639 76 36 18 72 38 21 14
## 3 9 13 73 3762 11 88 3 7 66 51
## 4 8 4 26 3 3448 12 17 21 18 63
## 5 27 13 21 183 55 3283 101 13 178 46
## 6 10 2 13 6 24 44 3769 4 13 0
## 7 35 13 111 91 18 51 10 4053 35 179
## 8 38 22 111 91 47 141 41 7 3247 25
## 9 11 0 27 38 339 57 27 139 72 3714
##
## Overall Statistics
##
## Accuracy : 0.8922
## 95% CI : (0.8892, 0.8952)
## No Information Rate : 0.1115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8802
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.95668 0.9831 0.87120 0.86463 0.84676 0.86509
## Specificity 0.99662 0.9712 0.99196 0.99147 0.99547 0.98333
## Pos Pred Value 0.96864 0.8110 0.92290 0.92138 0.95249 0.83750
## Neg Pred Value 0.99528 0.9978 0.98586 0.98447 0.98374 0.98655
## Prevalence 0.09838 0.1115 0.09945 0.10360 0.09695 0.09036
## Detection Rate 0.09412 0.1096 0.08664 0.08957 0.08210 0.07817
## Detection Prevalence 0.09717 0.1352 0.09388 0.09721 0.08619 0.09333
## Balanced Accuracy 0.97665 0.9772 0.93158 0.92805 0.92111 0.92421
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.91105 0.9209 0.79916 0.88682
## Specificity 0.99694 0.9856 0.98621 0.98122
## Pos Pred Value 0.97014 0.8819 0.86127 0.83951
## Neg Pred Value 0.99035 0.9907 0.97866 0.98739
## Prevalence 0.09850 0.1048 0.09674 0.09971
## Detection Rate 0.08974 0.0965 0.07731 0.08843
## Detection Prevalence 0.09250 0.1094 0.08976 0.10533
## Balanced Accuracy 0.95399 0.9532 0.89269 0.93402
The classification model is about 89% accurate on the training data.
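Note that this accuracy is computed on the same data the model was fit to, so it is likely optimistic. A hedged sketch of a hold-out check (the seed and the 80/20 split are my own choices, not part of the original analysis):
set.seed(605)                                # assumption: arbitrary seed
idx = sample(nrow(train), 0.8 * nrow(train)) # 80% fit / 20% validation
model_ho = multinom(label ~ ., train[idx, ], MaxNWts = 50000)
preds_ho = predict(model_ho, train[-idx, ])
mean(preds_ho == train[-idx, "label"])       # out-of-sample accuracy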
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
house_train = read.csv('/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/HOUSE/train.csv')
house_test = read.csv('/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/HOUSE/test.csv')
names(house_train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
dim(house_train)
## [1] 1460 81
summary(house_train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
dim(house_train)
## [1] 1460 81
We have 1460 rows and 81 columns of data.
ggplot(house_train, aes(x=SalePrice)) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10000) +
labs(x = 'Sale Price')
ggplot(house_train, aes(x=OverallCond)) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=1) +
labs(x = 'Overall Condition')
ggplot(house_train, aes(x=YearBuilt)) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10) +
labs(x = 'Year Built')
Scatterplot: Sale Price vs. Total Rooms Above Ground, colored by Overall Condition:
ggplot(house_train, aes(TotRmsAbvGrd, SalePrice)) +
geom_jitter(aes(color=OverallCond), size=3) +
labs(x='Rooms Above Ground', y='Sale Price') +
scale_color_continuous(name='Overall Condition')
Derive a correlation matrix for any three quantitative variables in the dataset.
# vector of desired columns
cols = c('YearBuilt','OverallCond','TotRmsAbvGrd','SalePrice')
cor_mat = cor(house_train[,cols])
cor_mat
## YearBuilt OverallCond TotRmsAbvGrd SalePrice
## YearBuilt 1.00000000 -0.37598320 0.09558913 0.52289733
## OverallCond -0.37598320 1.00000000 -0.05758317 -0.07785589
## TotRmsAbvGrd 0.09558913 -0.05758317 1.00000000 0.53372316
## SalePrice 0.52289733 -0.07785589 0.53372316 1.00000000
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.
\(H_0:\) The true correlation between each pair of variables is 0.
\(H_A:\) The true correlation between each pair of variables is not 0.
We test each pair and report an 80% confidence interval.
cor.test(house_train$SalePrice, house_train$YearBuilt, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$SalePrice and house_train$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4980766 0.5468619
## sample estimates:
## cor
## 0.5228973
We see a moderate positive correlation (about 0.52) between Sale Price and Year Built.
cor.test(house_train$SalePrice, house_train$OverallCond, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$SalePrice and house_train$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.1111272 -0.0444103
## sample estimates:
## cor
## -0.07785589
cor.test(house_train$SalePrice, house_train$TotRmsAbvGrd, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$SalePrice and house_train$TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5092841 0.5573021
## sample estimates:
## cor
## 0.5337232
Sale Price has moderate positive correlations with Year Built and Total Rooms Above Ground, and a weak negative one with Overall Condition. In each case the p-value is small and the 80% confidence interval excludes 0, so we reject the null hypothesis that the correlations are 0.
Given the formula to estimate familywise error,
\(FWE = 1 - (1-\alpha)^n\), where \(\alpha\) is the significance level for a single hypothesis test and \(n\) is the total number of tests, we have:
n = 3    # number of tests
a = 0.05 # alpha

FWE = function(n, a){
  1 - (1-a)^n
}
FWE(n,a)
## [1] 0.142625
The estimated familywise error rate is about 14.3%, meaning a 14.3% chance of at least one false positive across the three tests. With only three tests this is a modest concern, but it is large enough that a correction could be warranted.
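If we wanted to control for it, a Bonferroni correction is the simplest option. A sketch using the three p-values reported above (the < 2.2e-16 values are just R's printed bounds):
p_raw = c(2.2e-16, 0.002912, 2.2e-16)  # p-values from the three cor.test calls
p.adjust(p_raw, method = "bonferroni") # all three remain well below 0.05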
Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# invert the correlation matrix
prec_mat = solve(cor_mat)
prec_mat
## YearBuilt OverallCond TotRmsAbvGrd SalePrice
## YearBuilt 1.7744022 0.6060981 0.4688685 -1.1308879
## OverallCond 0.6060981 1.2134925 0.1827151 -0.3199688
## TotRmsAbvGrd 0.4688685 0.1827151 1.5227280 -1.0436598
## SalePrice -1.1308879 -0.3199688 -1.0436598 2.1234522
# multiply the correlation matrix by the precision matrix
cor_mat %*% prec_mat
## YearBuilt OverallCond TotRmsAbvGrd SalePrice
## YearBuilt 1.000000e+00 -2.775558e-17 0.000000e+00 0.000000e+00
## OverallCond 1.249001e-16 1.000000e+00 5.551115e-17 -5.551115e-17
## TotRmsAbvGrd 1.110223e-16 -2.775558e-17 1.000000e+00 0.000000e+00
## SalePrice 2.220446e-16 0.000000e+00 0.000000e+00 1.000000e+00
# multiply the precision matrix by the correlation matrix
prec_mat %*% cor_mat
## YearBuilt OverallCond TotRmsAbvGrd SalePrice
## YearBuilt 1.000000e+00 2.775558e-17 -1.110223e-16 0.000000e+00
## OverallCond 8.326673e-17 1.000000e+00 0.000000e+00 5.551115e-17
## TotRmsAbvGrd 1.110223e-16 -2.775558e-17 1.000000e+00 2.220446e-16
## SalePrice 0.000000e+00 -5.551115e-17 0.000000e+00 1.000000e+00
# conduct LU decomposition on the matrix
c_mat_decomp = lu.decomposition(cor_mat)
c_mat_decomp
## $L
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.00000000 0.000000 0
## [2,] -0.37598320 1.00000000 0.000000 0
## [3,] 0.09558913 -0.02520654 1.000000 0
## [4,] 0.52289733 0.13829449 0.491492 1
##
## $U
## [,1] [,2] [,3] [,4]
## [1,] 1 -0.3759832 0.09558913 0.5228973
## [2,] 0 0.8586366 -0.02164326 0.1187447
## [3,] 0 0.0000000 0.99031717 0.4867330
## [4,] 0 0.0000000 0.00000000 0.4709312
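As a sanity check, multiplying the factors back together should reproduce the correlation matrix:
# L %*% U should equal cor_mat (ignoring dimnames)
all.equal(c_mat_decomp$L %*% c_mat_decomp$U, cor_mat, check.attributes = FALSE)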
Recalling an earlier histogram, we can see that sale price is right-skewed.
ggplot(house_train, aes(x=SalePrice)) +
geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.9, binwidth=10000) +
labs(x = 'Sale Price')
And this checks out, as the mean sale price is higher than its median.
mean(house_train$SalePrice) - median(house_train$SalePrice)
## [1] 17921.2
min(house_train$SalePrice)
## [1] 34900
The minimum value is above 0, so no shifting is necessary.
library(MASS)
# fit an exponential distribution; the estimate is the rate parameter
lambda = fitdistr(house_train$SalePrice, densfun = 'exponential')
lambda
## rate
## 5.527268e-06
## (1.446552e-07)
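For an exponential distribution, the maximum-likelihood estimate of the rate is simply the reciprocal of the sample mean, so we can sanity-check the fit:
1 / mean(house_train$SalePrice) # should match lambda$estimate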
# take 1000 samples using optimal value
samps = rexp(1000, lambda$estimate)
par(mfrow=c(1,2))
hist(house_train$SalePrice, breaks=40, xlab='Price', main='Sale Price')
hist(samps, breaks=40, xlab='Samples', main='Fitted Distribution')
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data.
# 5th percentile, lambda
lambda_5p = qexp(.05, rate=lambda$estimate)
lambda_5p
## [1] 9280.044
# 95th percentile, lambda
lambda_95p = qexp(.95, rate=lambda$estimate)
lambda_95p
## [1] 541991.5
# 5th percentile, empirical
house_5p = quantile(house_train$SalePrice, 0.05)
house_5p
## 5%
## 88000
# 95th percentile, empirical
house_95p = quantile(house_train$SalePrice, 0.95)
house_95p
## 95%
## 326100
summary(house_train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
summary(samps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.1 52231.4 133148.7 176819.6 249091.4 1317234.3
We can see the exponential samples shift the distribution's lower end toward 0 and actually increase the right skew, so the exponential is a rough fit at best for sale prices.
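The prompt also asks for a 95% confidence interval from the empirical data assuming normality, a step omitted above. A minimal sketch, reading this as a normal-approximation interval for the mean sale price:
x = house_train$SalePrice
mean(x) + c(-1, 1) * qnorm(0.975) * sd(x) / sqrt(length(x))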
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
I’m going to see if the data satisfies the conditions for linear regression. To start, I will isolate the numeric variables and take a look at a pairs plot to see which variables appear to have linear relationships.
# copy the numeric columns to a new dataframe: model.df
model.df = house_train |>
  dplyr::select(-Id) |>
  keep(is.numeric)
glimpse(model.df)
## Rows: 1,460
## Columns: 37
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
I see some columns with NAs. Let’s take a look.
colSums(is.na(model.df))
## MSSubClass LotFrontage LotArea OverallQual OverallCond
## 0 259 0 0 0
## YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
## 0 0 8 0 0
## BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 0 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0 0
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt
## 0 0 0 0 81
## GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## 0 0 0 0 0
## X3SsnPorch ScreenPorch PoolArea MiscVal MoSold
## 0 0 0 0 0
## YrSold SalePrice
## 0 0
It looks like LotFrontage, MasVnrArea, and GarageYrBlt have NAs, though not many.
LotFrontage = Linear feet of street connected to property
MasVnrArea = Masonry veneer area
GarageYrBlt = Garage year built
I think these columns could all be useful to our model, so rather than remove the columns or impute the NAs, I’m simply going to remove the rows containing these NAs, since that’s at most about 340 out of 1,460 observations.
model.df = model.df |>
  na.omit()
glimpse(model.df)
## Rows: 1,121
## Columns: 37
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 50, 190, 20, 60, 20, 45, 90,…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, 51, 50, 70, 85, 91, 51, 72, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 6120, 7420…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 5, 5, 9, 7, 7, 4, 5, 5, 8, 7, 8,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 8, 5, 5, 6, 5, 7, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1931, 1939, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1950, 1950, 19…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 0, 0, 0, 286, 306, 0, 0, 0,…
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 0, 851, 906, 998, 0,…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 952, 140, 134, 177, 14…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 952, 991, 1040, 11…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1022, 1077, 1040, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 752, 0, 0, 1142, 0, 0, 0…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 1774, 1077, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3, 1, 2,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 2, 2, 3, 4, 3, 2, 2, 3, 3, 4, 3, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 8, 5, 5, 11, 7, 5, 6, 6, 6, 9, 6, 7…
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1,…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1931, 1939, 19…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 3, 1, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 468, 205, 384, 736, 8…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 90, 0, 0, 147, 160, 48, 0, 0…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 0, 4, 0, 21, 33, 112, 0, 102,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 205, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2…
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 4, 1, 2, 7, 8, 7, 10, 6, 5, 11, 6…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2008, 2008, 20…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
This removes 339 rows from our data. If the variables prove insignificant to the model, I will restore the rows to the data. Now let’s take a look at a pairs plot to observe which variables may have a linear relationship.
Now let’s take a look at the linear regression model.
model.full = lm(SalePrice ~ ., model.df)
summary(model.full)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442865 -16873 -2581 14998 318042
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.232e+05 1.701e+06 -0.190 0.849317
## MSSubClass -2.005e+02 3.449e+01 -5.814 8.03e-09 ***
## LotFrontage -1.161e+02 6.124e+01 -1.896 0.058203 .
## LotArea 5.454e-01 1.573e-01 3.466 0.000548 ***
## OverallQual 1.870e+04 1.478e+03 12.646 < 2e-16 ***
## OverallCond 5.227e+03 1.367e+03 3.824 0.000139 ***
## YearBuilt 3.170e+02 8.762e+01 3.617 0.000311 ***
## YearRemodAdd 1.206e+02 8.661e+01 1.392 0.164174
## MasVnrArea 3.160e+01 7.006e+00 4.511 7.15e-06 ***
## BsmtFinSF1 1.739e+01 5.835e+00 2.980 0.002947 **
## BsmtFinSF2 8.362e+00 8.763e+00 0.954 0.340205
## BsmtUnfSF 5.006e+00 5.275e+00 0.949 0.342890
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.591e+01 7.356e+00 6.241 6.21e-10 ***
## X2ndFlrSF 4.668e+01 6.099e+00 7.654 4.28e-14 ***
## LowQualFinSF 3.415e+01 2.788e+01 1.225 0.220788
## GrLivArea NA NA NA NA
## BsmtFullBath 8.980e+03 3.194e+03 2.812 0.005018 **
## BsmtHalfBath 2.490e+03 5.071e+03 0.491 0.623487
## FullBath 5.390e+03 3.529e+03 1.527 0.126941
## HalfBath -1.119e+03 3.320e+03 -0.337 0.736244
## BedroomAbvGr -1.023e+04 2.154e+03 -4.750 2.30e-06 ***
## KitchenAbvGr -2.193e+04 6.704e+03 -3.271 0.001105 **
## TotRmsAbvGrd 5.440e+03 1.486e+03 3.661 0.000263 ***
## Fireplaces 4.375e+03 2.188e+03 2.000 0.045793 *
## GarageYrBlt -4.914e+01 9.093e+01 -0.540 0.589011
## GarageCars 1.679e+04 3.487e+03 4.815 1.68e-06 ***
## GarageArea 6.488e+00 1.211e+01 0.536 0.592338
## WoodDeckSF 2.155e+01 1.002e+01 2.151 0.031713 *
## OpenPorchSF -2.315e+00 1.948e+01 -0.119 0.905404
## EnclosedPorch 7.233e+00 2.061e+01 0.351 0.725733
## X3SsnPorch 3.458e+01 3.749e+01 0.922 0.356593
## ScreenPorch 5.797e+01 2.040e+01 2.842 0.004572 **
## PoolArea -6.126e+01 2.984e+01 -2.053 0.040326 *
## MiscVal -3.850e+00 6.955e+00 -0.554 0.579980
## MoSold -2.240e+02 4.227e+02 -0.530 0.596213
## YrSold -2.536e+02 8.454e+02 -0.300 0.764216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36790 on 1086 degrees of freedom
## Multiple R-squared: 0.8095, Adjusted R-squared: 0.8036
## F-statistic: 135.7 on 34 and 1086 DF, p-value: < 2.2e-16
model.full.pvals = summary(model.full)$coefficients[,4]
We see a number of variables that don’t appear to be significant, but we do see an adjusted R-squared of about 0.80, which is a good start. GarageYrBlt doesn’t look significant, so I am going to remove that column and restore the observations we removed, in order to use the remaining variables without NAs. Masonry veneer area is highly significant, and lot frontage may be, so we will keep those for now.
We can also remove TotalBsmtSF and GrLivArea, which are not defined due to singularities, as well as OpenPorchSF, YrSold, EnclosedPorch, HalfBath, BsmtHalfBath, MoSold, MiscVal, and GarageArea, which are not significant.
cols = c('Id', 'GarageYrBlt', 'TotalBsmtSF', 'GrLivArea', 'OpenPorchSF', 'YrSold', 'EnclosedPorch', 'HalfBath', 'BsmtHalfBath', 'MoSold', 'MiscVal', 'GarageArea')

model.df2 = house_train |>
  dplyr::select(-cols) |>
  keep(is.numeric) |>
  na.omit()
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(cols)
##
## # Now:
## data %>% select(all_of(cols))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
model.v2 = lm(SalePrice ~ ., model.df2)
summary(model.v2)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -452259 -17560 -2381 14179 323849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.791e+05 1.415e+05 -6.919 7.48e-12 ***
## MSSubClass -1.888e+02 3.199e+01 -5.901 4.72e-09 ***
## LotFrontage -8.906e+01 5.775e+01 -1.542 0.123282
## LotArea 5.532e-01 1.546e-01 3.578 0.000360 ***
## OverallQual 1.754e+04 1.373e+03 12.778 < 2e-16 ***
## OverallCond 4.177e+03 1.215e+03 3.438 0.000607 ***
## YearBuilt 2.869e+02 6.165e+01 4.654 3.62e-06 ***
## YearRemodAdd 1.803e+02 7.548e+01 2.389 0.017069 *
## MasVnrArea 3.543e+01 6.827e+00 5.190 2.47e-07 ***
## BsmtFinSF1 1.875e+01 5.425e+00 3.456 0.000567 ***
## BsmtFinSF2 8.159e+00 8.338e+00 0.979 0.328011
## BsmtUnfSF 7.629e+00 4.862e+00 1.569 0.116910
## X1stFlrSF 4.632e+01 6.905e+00 6.709 3.05e-11 ***
## X2ndFlrSF 4.618e+01 5.039e+00 9.164 < 2e-16 ***
## LowQualFinSF 3.543e+01 2.175e+01 1.629 0.103524
## BsmtFullBath 9.805e+03 2.879e+03 3.406 0.000682 ***
## FullBath 7.124e+03 2.994e+03 2.380 0.017490 *
## BedroomAbvGr -1.003e+04 1.938e+03 -5.176 2.66e-07 ***
## KitchenAbvGr -1.276e+04 5.765e+03 -2.213 0.027102 *
## TotRmsAbvGrd 5.048e+03 1.415e+03 3.568 0.000374 ***
## Fireplaces 4.617e+03 2.086e+03 2.214 0.027054 *
## GarageCars 1.100e+04 1.911e+03 5.754 1.11e-08 ***
## WoodDeckSF 2.413e+01 9.618e+00 2.509 0.012241 *
## X3SsnPorch 3.043e+01 3.692e+01 0.824 0.409918
## ScreenPorch 5.650e+01 1.965e+01 2.876 0.004101 **
## PoolArea -6.394e+01 2.883e+01 -2.218 0.026755 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36460 on 1169 degrees of freedom
## Multiple R-squared: 0.8119, Adjusted R-squared: 0.8078
## F-statistic: 201.8 on 25 and 1169 DF, p-value: < 2.2e-16
The model seems to be improving. The adjusted R-squared holds at roughly 0.81, and the residual distribution is approaching symmetry. Let’s remove X3SsnPorch and BsmtFinSF2 from the model and try again.
cols = c('X3SsnPorch','BsmtFinSF2')
model.df3 = model.df2 |>
  dplyr::select(-cols)

model.v3 = lm(SalePrice ~ ., model.df3)
summary(model.v3)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -450535 -17624 -2527 14253 322671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.796e+05 1.415e+05 -6.923 7.25e-12 ***
## MSSubClass -1.894e+02 3.198e+01 -5.923 4.16e-09 ***
## LotFrontage -8.679e+01 5.763e+01 -1.506 0.132333
## LotArea 5.694e-01 1.536e-01 3.708 0.000219 ***
## OverallQual 1.760e+04 1.371e+03 12.843 < 2e-16 ***
## OverallCond 4.221e+03 1.214e+03 3.477 0.000526 ***
## YearBuilt 2.911e+02 6.149e+01 4.735 2.46e-06 ***
## YearRemodAdd 1.766e+02 7.535e+01 2.343 0.019293 *
## MasVnrArea 3.550e+01 6.825e+00 5.201 2.33e-07 ***
## BsmtFinSF1 1.615e+01 4.690e+00 3.444 0.000593 ***
## BsmtUnfSF 5.367e+00 4.263e+00 1.259 0.208272
## X1stFlrSF 4.874e+01 6.477e+00 7.526 1.04e-13 ***
## X2ndFlrSF 4.625e+01 5.038e+00 9.181 < 2e-16 ***
## LowQualFinSF 3.608e+01 2.173e+01 1.660 0.097197 .
## BsmtFullBath 1.035e+04 2.815e+03 3.676 0.000247 ***
## FullBath 7.136e+03 2.990e+03 2.387 0.017163 *
## BedroomAbvGr -9.991e+03 1.936e+03 -5.159 2.91e-07 ***
## KitchenAbvGr -1.335e+04 5.738e+03 -2.326 0.020182 *
## TotRmsAbvGrd 4.923e+03 1.411e+03 3.489 0.000502 ***
## Fireplaces 4.536e+03 2.084e+03 2.177 0.029705 *
## GarageCars 1.097e+04 1.910e+03 5.743 1.19e-08 ***
## WoodDeckSF 2.382e+01 9.592e+00 2.483 0.013166 *
## ScreenPorch 5.700e+01 1.960e+01 2.908 0.003705 **
## PoolArea -6.232e+01 2.873e+01 -2.169 0.030303 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36450 on 1171 degrees of freedom
## Multiple R-squared: 0.8116, Adjusted R-squared: 0.8079
## F-statistic: 219.3 on 23 and 1171 DF, p-value: < 2.2e-16
Removing LotFrontage, LowQualFinSF, and BsmtUnfSF, and we should be good to go.
cols = c('LotFrontage','LowQualFinSF','BsmtUnfSF')
model.df4 = model.df3 |>
  dplyr::select(-cols)

model.v4 = lm(SalePrice ~ ., model.df4)
summary(model.v4)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -453173 -17058 -2301 14054 320504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.656e+05 1.409e+05 -6.851 1.18e-11 ***
## MSSubClass -1.722e+02 2.994e+01 -5.750 1.14e-08 ***
## LotArea 5.079e-01 1.488e-01 3.413 0.000664 ***
## OverallQual 1.813e+04 1.337e+03 13.560 < 2e-16 ***
## OverallCond 3.897e+03 1.208e+03 3.227 0.001285 **
## YearBuilt 2.708e+02 6.046e+01 4.479 8.23e-06 ***
## YearRemodAdd 1.881e+02 7.530e+01 2.499 0.012607 *
## MasVnrArea 3.546e+01 6.813e+00 5.205 2.29e-07 ***
## BsmtFinSF1 1.197e+01 3.511e+00 3.409 0.000673 ***
## X1stFlrSF 5.054e+01 5.711e+00 8.848 < 2e-16 ***
## X2ndFlrSF 4.455e+01 4.989e+00 8.928 < 2e-16 ***
## BsmtFullBath 9.916e+03 2.789e+03 3.555 0.000393 ***
## FullBath 7.463e+03 2.989e+03 2.497 0.012672 *
## BedroomAbvGr -1.013e+04 1.931e+03 -5.245 1.85e-07 ***
## KitchenAbvGr -1.526e+04 5.690e+03 -2.682 0.007411 **
## TotRmsAbvGrd 5.189e+03 1.395e+03 3.718 0.000210 ***
## Fireplaces 4.097e+03 2.077e+03 1.973 0.048747 *
## GarageCars 1.061e+04 1.905e+03 5.569 3.18e-08 ***
## WoodDeckSF 2.443e+01 9.564e+00 2.555 0.010747 *
## ScreenPorch 5.905e+01 1.959e+01 3.015 0.002625 **
## PoolArea -6.474e+01 2.839e+01 -2.280 0.022760 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36510 on 1174 degrees of freedom
## Multiple R-squared: 0.8106, Adjusted R-squared: 0.8073
## F-statistic: 251.2 on 20 and 1174 DF, p-value: < 2.2e-16
Now we see that all variables in our model are significant. Let’s take a look at the residuals to see whether they meet the conditions for linear regression.
hist(model.v4$residuals)
The residuals look approximately normally distributed… good.
plot(model.v4)
We can see the residuals are somewhat non-linear and somewhat right-skewed. They are roughly homoscedastic, but there are outliers affecting the model. Let’s try reducing the model to the most significant variables and see if this improves the fit.
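A common remedy for right-skewed prices and skewed residuals, not pursued in this writeup, is to model the log of the response. A hedged sketch using the data frame above:
# refit on the log scale; residuals are typically more symmetric
model.log = lm(log(SalePrice) ~ ., model.df4)
hist(model.log$residuals)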
pvals = summary(model.v4)$coefficients[-1,4]

pvals |>
  as.data.frame() |>
  arrange(pvals)
variable | pvals |
---|---|
OverallQual | 0.0000000 |
X2ndFlrSF | 0.0000000 |
X1stFlrSF | 0.0000000 |
MSSubClass | 0.0000000 |
GarageCars | 0.0000000 |
BedroomAbvGr | 0.0000002 |
MasVnrArea | 0.0000002 |
YearBuilt | 0.0000082 |
TotRmsAbvGrd | 0.0002100 |
BsmtFullBath | 0.0003933 |
LotArea | 0.0006644 |
BsmtFinSF1 | 0.0006733 |
OverallCond | 0.0012846 |
ScreenPorch | 0.0026253 |
KitchenAbvGr | 0.0074112 |
WoodDeckSF | 0.0107471 |
YearRemodAdd | 0.0126070 |
FullBath | 0.0126725 |
PoolArea | 0.0227601 |
Fireplaces | 0.0487469 |
Let’s keep the 12 most significant variables, dropping the 8 with the largest p-values.
val_names = pvals |>
  as.data.frame() |>
  arrange(pvals) |>
  tail(8) |>
  row.names()

model.df5 = model.df4 |>
  dplyr::select(-val_names)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(val_names)
##
## # Now:
## data %>% select(all_of(val_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
model.v5 = lm(SalePrice ~ ., model.df5)
summary(model.v5)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -492766 -17508 -1858 15974 273713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.454e+05 8.984e+04 -7.185 1.19e-12 ***
## MSSubClass -2.078e+02 2.828e+01 -7.348 3.74e-13 ***
## LotArea 5.191e-01 1.520e-01 3.414 0.000662 ***
## OverallQual 2.188e+04 1.268e+03 17.260 < 2e-16 ***
## YearBuilt 2.987e+02 4.717e+01 6.332 3.44e-10 ***
## MasVnrArea 3.295e+01 6.914e+00 4.766 2.11e-06 ***
## BsmtFinSF1 1.217e+01 3.523e+00 3.455 0.000571 ***
## X1stFlrSF 5.400e+01 5.411e+00 9.979 < 2e-16 ***
## X2ndFlrSF 5.169e+01 4.767e+00 10.843 < 2e-16 ***
## BsmtFullBath 9.962e+03 2.839e+03 3.509 0.000466 ***
## BedroomAbvGr -1.041e+04 1.931e+03 -5.389 8.54e-08 ***
## TotRmsAbvGrd 4.244e+03 1.367e+03 3.105 0.001945 **
## GarageCars 1.097e+04 1.947e+03 5.632 2.22e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37560 on 1182 degrees of freedom
## Multiple R-squared: 0.7981, Adjusted R-squared: 0.7961
## F-statistic: 389.5 on 12 and 1182 DF, p-value: < 2.2e-16
So we lose a little R-squared, and the residual distribution looks more asymmetrical; let’s take a look at the residuals.
hist(summary(model.v5)$residuals)
plot(model.v5)
We don’t observe any noticeable improvement in the residuals with this model. Taking a closer look at the data, we learn that MSSubClass is a categorical data column describing the class of the buildings.
plot(SalePrice ~ MSSubClass, model.df5)
Let’s remove this variable from the model and see if it improves the fit.
model.df6 = model.df5 |>
  dplyr::select(-MSSubClass)

model.v6 = lm(SalePrice ~ ., model.df6)
summary(model.v6)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -505085 -18286 -1076 16635 280446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.201e+05 9.176e+04 -6.758 2.19e-11 ***
## LotArea 7.237e-01 1.528e-01 4.737 2.44e-06 ***
## OverallQual 2.197e+04 1.296e+03 16.956 < 2e-16 ***
## YearBuilt 2.757e+02 4.811e+01 5.731 1.26e-08 ***
## MasVnrArea 3.104e+01 7.062e+00 4.396 1.20e-05 ***
## BsmtFinSF1 1.286e+01 3.600e+00 3.571 0.00037 ***
## X1stFlrSF 5.633e+01 5.522e+00 10.202 < 2e-16 ***
## X2ndFlrSF 4.335e+01 4.733e+00 9.160 < 2e-16 ***
## BsmtFullBath 7.798e+03 2.886e+03 2.702 0.00699 **
## BedroomAbvGr -8.496e+03 1.956e+03 -4.344 1.52e-05 ***
## TotRmsAbvGrd 3.987e+03 1.397e+03 2.855 0.00438 **
## GarageCars 1.245e+04 1.980e+03 6.291 4.43e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38390 on 1183 degrees of freedom
## Multiple R-squared: 0.7889, Adjusted R-squared: 0.787
## F-statistic: 402 on 11 and 1183 DF, p-value: < 2.2e-16
hist(summary(model.v6)$residuals)
plot(model.v6)
Again, no improvement. Let’s try reducing to the variables with the highest t-values and see how the model fits.
t_vals = summary(model.v6)$coefficients[-1,3]

t_vals |>
  as.data.frame() |>
  arrange(desc(t_vals))
variable | t_vals |
---|---|
OverallQual | 16.955977 |
X1stFlrSF | 10.202467 |
X2ndFlrSF | 9.160032 |
GarageCars | 6.291123 |
YearBuilt | 5.731199 |
LotArea | 4.736543 |
MasVnrArea | 4.395768 |
BsmtFinSF1 | 3.570969 |
TotRmsAbvGrd | 2.854904 |
BsmtFullBath | 2.702133 |
BedroomAbvGr | -4.344397 |
vals = t_vals |>
  as.data.frame() |>
  arrange(desc(t_vals)) |>
  head(5) |>
  row.names()

model.df7 = model.df6 |>
  dplyr::select(c(vals, 'SalePrice'))

model.v7 = lm(SalePrice ~ ., model.df7)
summary(model.v7)
##
## Call:
## lm(formula = SalePrice ~ ., data = model.df7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -428994 -20841 -1773 18441 288174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.379e+05 9.416e+04 -7.837 1.02e-14 ***
## OverallQual 2.297e+04 1.334e+03 17.220 < 2e-16 ***
## X1stFlrSF 7.783e+01 4.048e+00 19.227 < 2e-16 ***
## X2ndFlrSF 4.748e+01 3.226e+00 14.719 < 2e-16 ***
## GarageCars 1.361e+04 2.066e+03 6.585 6.82e-11 ***
## YearBuilt 3.286e+02 4.963e+01 6.620 5.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40450 on 1189 degrees of freedom
## Multiple R-squared: 0.7645, Adjusted R-squared: 0.7635
## F-statistic: 772 on 5 and 1189 DF, p-value: < 2.2e-16
We see we haven’t lost much R-squared, and the symmetry of the residual distribution improves.
hist(summary(model.v7)$residuals)
plot(model.v7)
Still not good. Previously I bypassed pairs plots because the number of variables was grinding my machine to a halt, but now we’ve subset far enough down to take a look:
library(GGally)
ggpairs(model.df7)
We see OverallQual and GarageCars are discrete variables, and X2ndFlrSF has many 0s, as not every house has a second floor. What happens if we create a new column combining first- and second-floor square footage, include the interaction between YearBuilt and OverallQual, and fit SalePrice against that?
cols = c('X1stFlrSF','X2ndFlrSF','SalePrice','YearBuilt','OverallQual')
model.df8 = model.df7 |>
  dplyr::select(cols) |>
  mutate(CombineFtg = X1stFlrSF + X2ndFlrSF)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(cols)
##
## # Now:
## data %>% select(all_of(cols))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
model.v8 = lm(SalePrice ~ OverallQual*YearBuilt + CombineFtg, model.df8)
summary(model.v8)
##
## Call:
## lm(formula = SalePrice ~ OverallQual * YearBuilt + CombineFtg,
## data = model.df8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -421172 -21408 -1269 19324 274759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.894e+06 3.560e+05 5.320 1.24e-07 ***
## OverallQual -4.507e+05 5.615e+04 -8.026 2.39e-15 ***
## YearBuilt -1.000e+03 1.813e+02 -5.517 4.22e-08 ***
## CombineFtg 6.114e+01 2.955e+00 20.687 < 2e-16 ***
## OverallQual:YearBuilt 2.419e+02 2.846e+01 8.498 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41160 on 1190 degrees of freedom
## Multiple R-squared: 0.7559, Adjusted R-squared: 0.7551
## F-statistic: 921.3 on 4 and 1190 DF, p-value: < 2.2e-16
hist(model.v8$residuals)
plot(model.v8)
We now observe that all variables are significant, the adjusted R-squared of roughly 75% is acceptable, and the distribution of residuals comes close enough to satisfying the conditions for linear regression. Now we can use this model to make predictions on the test data. First we have to transform the test data to match our training data.
test_tran = house_test |>
  mutate(CombineFtg = X1stFlrSF + X2ndFlrSF)

predictions = predict(model.v8, test_tran)
predictions = predictions |>
  as.data.frame() |>
  mutate(Id = test_tran$Id) |>
  rename(SalePrice = predictions) |>
  dplyr::select(c(2,1))
head(predictions)
Id | SalePrice |
---|---|
1461 | 106016.3 |
1462 | 154797.8 |
1463 | 158367.1 |
1464 | 189661.0 |
1465 | 229468.3 |
1466 | 190522.7 |
write.csv(predictions,'/Users/joshiden/Documents/Classes/CUNY SPS/Fall 2022/DATA 605 /HW/DATA-605/HW/FINAL EXAM/predictions.csv', row.names=FALSE)
Unfortunately these predictions did not score well. In retrospect, I think the data required further transformations in order to make more accurate predictions. Thanks for reading!