Problem 1. 30 Points
IS 605 FUNDAMENTALS OF COMPUTATIONAL MATHEMATICS - 2022
You’ll verify for yourself that PageRank works by performing calculations on a small universe of web pages. Let’s use the six-page universe from the previous discussion. For this directed graph, perform the following calculations in R.
A <- matrix(c(
0, 1/2, 1/2, 0, 0, 0,
0, 0, 0, 0, 0, 0,
1/3, 1/3, 0, 0, 1/3, 0,
0, 0, 0, 0, 1/2, 1/2,
0, 0, 0, 1/2, 0, 1/2,
0, 0, 0, 1, 0, 0
),
byrow = TRUE, nrow = 6
)
Since node 2 has only inbound links and no outbound links (a dangling node), row 2 of A is all zeros and the matrix is not row-stochastic. We first update A so that a surfer on node 2 jumps to every node with equal probability, 1/(number of nodes) = 1/6.
A_update<- matrix(c(
0, 1/2, 1/2, 0, 0, 0,
1/6, 1/6, 1/6, 1/6, 1/6, 1/6,
1/3, 1/3, 0, 0, 1/3, 0,
0, 0, 0, 0, 1/2, 1/2,
0, 0, 0, 1/2, 0, 1/2,
0, 0, 0, 1, 0, 0
),
byrow = TRUE, nrow = 6
)
#Verify that row-wise probabilities sum to 1
apply(A_update, 1, sum)
## [1] 1 1 1 1 1 1
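The same fix can be applied programmatically by replacing every all-zero (dangling) row with a uniform 1/n row; a minimal check against the hand-built A_update:
# Replace each dangling (all-zero) row of A with a uniform 1/n row
n <- nrow(A)
A_fix <- A
dangling <- rowSums(A_fix) == 0   # TRUE only for node 2 here
A_fix[dangling, ] <- 1/n
all.equal(A_fix, A_update)        # should be TRUE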
With the updated link matrix, \(B = 0.85\,A + \frac{0.15}{n}J\), where \(J\) is the 6 x 6 all-ones matrix and \(n = 6\) (in R, adding the scalar \(0.15/n\) broadcasts it to every entry).
B <- 0.85*A_update + 0.15/(nrow(A_update))
B
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0250000 0.4500000 0.4500000 0.0250000 0.0250000 0.0250000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3083333 0.3083333 0.0250000 0.0250000 0.3083333 0.0250000
## [4,] 0.0250000 0.0250000 0.0250000 0.0250000 0.4500000 0.4500000
## [5,] 0.0250000 0.0250000 0.0250000 0.4500000 0.0250000 0.4500000
## [6,] 0.0250000 0.0250000 0.0250000 0.8750000 0.0250000 0.0250000
library(matrixcalc)
r0 <- matrix(c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6))
matrix.power(t(B), 5) %*% r0
## [,1]
## [1,] 0.05716521
## [2,] 0.08331157
## [3,] 0.06394177
## [4,] 0.33889812
## [5,] 0.19600722
## [6,] 0.26067610
matrix.power(t(B), 10) %*% r0
## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
matrix.power(t(B), 20) %*% r0
## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
matrix.power(t(B), 30) %*% r0
## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
matrix.power(t(B), 40) %*% r0
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
It looks like we converge somewhere between 30 and 40 iterations.
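Instead of eyeballing fixed powers, the same iteration can be run to a tolerance (a minimal sketch reusing B and r0; the 1e-10 threshold and 1000-iteration cap are arbitrary choices):
# Power iteration: keep multiplying by t(B) until the rank vector stops changing
r <- r0
for (i in 1:1000) {
  r_new <- t(B) %*% r
  if (max(abs(r_new - r)) < 1e-10) break
  r <- r_new
}
i         # iterations needed to converge
t(r_new)  # converged PageRank vector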
The largest eigenvalue is 1! So far so good.
eig_B <- eigen(t(B))
eig_B$values
## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
Something doesn’t look right, though: the entries of the leading eigenvector don’t sum to 1.
eig_vec <- eig_B$vectors[,1]
markov <-matrix.power(t(B), 40) %*% r0
eig_vec
## [1] 0.1044385+0i 0.1488249+0i 0.1159674+0i 0.7043472+0i 0.4037861+0i
## [6] 0.5425377+0i
t(markov)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.05170475 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
sum(eig_vec)
## [1] 2.019902+0i
sum(t(markov))
## [1] 1
After some research: R returns eigenvectors normalized to unit length, not scaled to sum to 1, so we rescale the leading eigenvector so that its entries sum to 1.
#eig_B$vectors
c1 <- colSums(eig_B$vectors)
scaled_eig_B <- scale(eig_B$vectors, center = FALSE, scale = c1)
## Warning in scale.default(eig_B$vectors, center = FALSE, scale = c1): imaginary
## parts discarded in coercion
scaled_eig_B[,1]
## [1] 0.05170475+0i 0.07367926+0i 0.05741241+0i 0.34870369+0i 0.19990381+0i
## [6] 0.26859608+0i
sum(scaled_eig_B[,1])
## [1] 1+0i
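Equivalently, the leading eigenvector can simply be divided by the sum of its entries (a minimal sketch; eig_vec is the first eigenvector extracted above, and its imaginary parts are all zero):
# Rescale the leading eigenvector so its entries sum to 1
pr_from_eigen <- Re(eig_vec) / sum(Re(eig_vec))
pr_from_eigen  # matches the Markov-chain result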
library(igraph)
## Warning: package 'igraph' was built under R version 4.1.2
##
## Attaching package: 'igraph'
## The following object is masked from 'package:matrixcalc':
##
## %s%
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
page_rank_graph <- graph_from_adjacency_matrix(A, weighted = TRUE)
plot(page_rank_graph)
We get the same results as above!
as.matrix(page.rank(page_rank_graph)$vector)
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
Final Problem 2. 40 points.
Go to Kaggle.com, sign up for the Digit Recognizer competition, and download the data. You will not be required to submit work to Kaggle, but you do need the data. “MNIST (‘Modified National Institute of Standards and Technology’) is the de facto ‘hello world’ dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.”
Plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
train <- train %>% mutate_at(vars(starts_with("pixel")), ~ ./255)
numbers <- train %>% select(label) %>% mutate(label = as.numeric(label))
hist(numbers$label)
The histogram shows the frequency of each digit label; the classes are roughly balanced. The per-digit mean pixel intensities below just tell me how much ink is on the screen for each number. Note the loop index runs from 1 to 10 while the labels run from 0 to 9, so “number 1” and “number 9” in the printout are really digits 0 and 8, which have the highest mean pixel intensity.
num_sum <- train %>% group_by(label) %>% summarise_all(mean)
num_sum2 <- num_sum %>% dplyr::select(-label) %>% rowMeans()
for (i in 1:length(num_sum2)){
print(paste("The number ", i, " has a mean of ", num_sum2[i]))
}
## [1] "The number 1 has a mean of 0.173231330286229"
## [1] "The number 2 has a mean of 0.075972720428906"
## [1] "The number 3 has a mean of 0.149415262873165"
## [1] "The number 4 has a mean of 0.141657603055012"
## [1] "The number 5 has a mean of 0.121212097314368"
## [1] "The number 6 has a mean of 0.129231294625887"
## [1] "The number 7 has a mean of 0.138730078688473"
## [1] "The number 8 has a mean of 0.1147021021542"
## [1] "The number 9 has a mean of 0.150981134516322"
## [1] "The number 10 has a mean of 0.12281787715086"
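To key the summary on the actual digit label rather than the loop index, the same means can be paired with num_sum$label (a minimal sketch using the objects above):
# Pair each digit label (0-9) with its mean pixel intensity
data.frame(digit = num_sum$label, mean_intensity = num_sum2)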
# Create a 28*28 matrix with pixel color values
m = matrix(unlist(train[10,-1]),nrow = 28,byrow = T)
# Plot that matrix
image(m,col=grey.colors(255))
rotate <- function(x) t(apply(x, 2, rev)) # reverses (rotates the matrix)
# Plot a bunch of images
par(mfrow=c(2,3))
lapply(1:10,
function(x) image(
rotate(matrix(unlist(train[x,-1]),nrow = 28,byrow = T)),
col=grey.colors(255),
xlab=train[x,1]
)
)
Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)
784 components are possible because that is the number of pixel columns in the matrix.
About 20 components are needed to explain 95% of the variance.
train <- train[,-1]
train_scaled = as.data.frame(scale(train,scale = FALSE,center = TRUE))
#head(train_scaled)
digits_covMatrix = cov(train_scaled)
pca.train<-prcomp(digits_covMatrix)
varianceExplained<-as.data.frame(pca.train$sdev^2/sum(pca.train$sdev^2))
varianceExplained = cbind(c(1:784), cumsum(varianceExplained))
colnames(varianceExplained)<-c("NmbrPCs","CumVar")
head(varianceExplained)
## NmbrPCs CumVar
## 1 1 0.2532704
## 2 2 0.4210926
## 3 3 0.5443227
## 4 4 0.6382699
## 5 5 0.7099512
## 6 6 0.7691757
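The number of components needed for 95% of the variance can be read straight off the cumulative table (note this PCA was run on the covariance matrix rather than on the raw pixels, so the variance-explained figures are not directly comparable to a standard pixel-level PCA):
# Smallest number of components whose cumulative variance reaches 95%
min(varianceExplained$NmbrPCs[varianceExplained$CumVar >= 0.95])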
We have reduced the dimensionality of the data. There is some information loss, but we have kept the essential information.
rotate<-pca.train$rotation[,1:20]
trainFinal = as.matrix(train_scaled)%*%(rotate)
# Plot a bunch of images
for (i in 1:10){
plot(2,3, xlim=c(1,28), ylim=c(1,28))
OpenImageR::imageShow(array(pca.train$rotation[,i],c(28,28)))
}
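One way to see that the essential information survives is to project an image onto the first 20 loadings and map it back to pixel space (a rough sketch reusing train_scaled and pca.train; row 10 is an arbitrary choice, and because the PCA was run on the covariance matrix the loadings are simply treated as an orthonormal 784-dimensional basis):
# Project image 10 onto the first 20 loadings and reconstruct it
V <- pca.train$rotation[, 1:20]             # 784 x 20 loading matrix
x <- unlist(train_scaled[10, ])             # one centered image (784 values)
x_hat <- as.numeric(V %*% crossprod(V, x))  # V %*% t(V) %*% x
par(mfrow = c(1, 2))
image(matrix(x, 28, 28, byrow = TRUE), col = grey.colors(255), main = "original (centered)")
image(matrix(x_hat, 28, 28, byrow = TRUE), col = grey.colors(255), main = "20-component approximation")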
8. Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)
train <- read.csv("/Users/williamaiken/Downloads/digit-recognizer/train.csv")
train8 <- train %>% filter(label == '8')
train8 <- train8[,-1] # drop the label column, keeping only the rows labeled 8
train_scaled8 = as.data.frame(scale(train8,scale = FALSE,center = TRUE))
digits_covMatrix8 = cov(train_scaled8)
pca.train8<-prcomp(digits_covMatrix8)
varianceExplained<-as.data.frame(pca.train8$sdev^2/sum(pca.train8$sdev^2))
varianceExplained = cbind(c(1:784), cumsum(varianceExplained))
colnames(varianceExplained)<-c("NmbrPCs","CumVar")
head(varianceExplained)
## NmbrPCs CumVar
## 1 1 0.2532704
## 2 2 0.4210926
## 3 3 0.5443227
## 4 4 0.6382699
## 5 5 0.7099512
## 6 6 0.7691757
rotate8<-pca.train8$rotation[,1:18]
trainFinal = as.matrix(train_scaled8)%*%(rotate8)
Those are 8s!
# Plot a bunch of images
for (i in 1:10){
plot(2,3, xlim=c(1,28), ylim=c(1,28))
OpenImageR::imageShow(array(pca.train8$rotation[,i],c(28,28)))
}
A naive approach would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)
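For reference, a literal multinomial (softmax) regression could be fit with nnet::multinom; this is only a sketch under the assumption that the nnet package is installed, and it is slow on the full 42,000 x 784 training set (MaxNWts must be raised because the model has roughly 785 x 9 coefficients; mnist, mn_fit, and mn_pred are names introduced here). The approach actually used below is an h2o deep-learning classifier.
library(nnet)
# Multinomial logistic regression: digit label ~ all pixel columns
mnist <- train
mnist$label <- as.factor(mnist$label)   # multinom needs a factor response
mn_fit <- nnet::multinom(label ~ ., data = mnist,
                         MaxNWts = 10000,  # allow ~7,065 coefficients
                         maxit = 100)
mn_pred <- predict(mn_fit, newdata = mnist)
mean(mn_pred == mnist$label)                        # classification accuracy
table(observed = mnist$label, predicted = mn_pred)  # 10 x 10 confusion matrix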
library(h2o)
## Warning: package 'h2o' was built under R version 4.1.2
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
## start a local h2o cluster
localH2O = h2o.init(max_mem_size = '6g', # use 6 GB of RAM
nthreads = -1) # use all CPUs (8 on my personal computer :3)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/k5/0f3n91110ss_s7xm5hk13lf80000gn/T//RtmpmZ25eL/file9f3a1052928f/h2o_williamaiken_started_from_r.out
## /var/folders/k5/0f3n91110ss_s7xm5hk13lf80000gn/T//RtmpmZ25eL/file9f3a1d13f2a8/h2o_williamaiken_started_from_r.err
##
##
## Starting H2O JVM and connecting: .... Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 7 seconds 256 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.36.0.4
## H2O cluster version age: 1 month and 22 days
## H2O cluster name: H2O_started_from_R_williamaiken_hrd182
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.00 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.1.1 (2021-08-10)
## MNIST data as H2O
train[,1] = as.factor(train[,1]) # convert digit labels to factor for classification
train_h2o = as.h2o(train)
##
  |======================================================================| 100%
test_h2o = as.h2o(test)
##
  |======================================================================| 100%
## set timer
s <- proc.time()
## train model
model =
h2o.deeplearning(x = 2:785, # column numbers for predictors
y = 1, # column number for label
training_frame = train_h2o, # data in H2O format
activation = "RectifierWithDropout", # algorithm
input_dropout_ratio = 0.2, # % of inputs dropout
hidden_dropout_ratios = c(0.5,0.5), # % for nodes dropout
balance_classes = TRUE,
hidden = c(100,100), # two layers of 100 nodes
momentum_stable = 0.99,
nesterov_accelerated_gradient = T, # use it for speed
epochs = 15) # no. of epochs
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [pixel729, pixel644, pixel645, pixel448, pixel727, pixel728, pixel560, pixel52, pixel760, pixel10, pixel54, pixel53, pixel168, pixel56, pixel11, pixel55, pixel57, pixel16, pixel18, pixel17, pixel19, pixel754, pixel755, pixel756, pixel757, pixel758, pixel759, pixel83, pixel196, pixel82, pixel85, pixel671, pixel84, pixel111, pixel672, pixel112, pixel673, pixel476, pixel392, pixel700, pixel701, pixel141, pixel780, pixel30, pixel781, pixel782, pixel420, pixel783, pixel31, pixel421, pixel140, pixel699, pixel139, pixel8, pixel9, pixel6, pixel7, pixel4, pixel5, pixel2, pixel3, pixel0, pixel21, pixel1, pixel20, pixel23, pixel532, pixel730, pixel22, pixel731, pixel25, pixel24, pixel27, pixel26, pixel29, pixel28].
## momentum_stable cannot be specified if adaptive_rate is enabled..
##
  |======================================================================| 100%
## print confusion matrix
h2o.confusionMatrix(model)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 2 3 4 5 6 7 8 9 Error Rate
## 0 924 0 3 2 0 3 6 3 1 0 0.0191 = 18 / 942
## 1 0 955 7 2 2 1 2 7 2 1 0.0245 = 24 / 979
## 2 4 2 1010 2 2 1 2 19 4 0 0.0344 = 36 / 1,046
## 3 0 0 14 912 0 9 1 20 2 0 0.0480 = 46 / 958
## 4 0 0 8 0 947 0 4 3 2 15 0.0327 = 32 / 979
## 5 2 0 1 11 3 952 14 7 3 6 0.0470 = 47 / 999
## 6 4 0 5 0 3 3 1022 9 1 0 0.0239 = 25 / 1,047
## 7 4 3 7 1 2 1 2 975 0 4 0.0240 = 24 / 999
## 8 1 5 10 8 2 10 4 10 976 3 0.0515 = 53 / 1,029
## 9 2 1 2 2 24 3 0 32 5 883 0.0744 = 71 / 954
## Totals 941 966 1067 940 985 983 1057 1085 996 912 0.0379 = 376 / 9,932
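The overall classification accuracy implied by this training-data confusion matrix is about 96.2%:
# Accuracy = 1 - overall error rate reported above
(9932 - 376) / 9932  # ~0.962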
## print time elapsed (s - proc.time() returns negative values; the run took roughly 152 seconds)
s - proc.time()
## user system elapsed
## -2.055 -0.215 -151.653
## classify test set
h2o_y_test <- h2o.predict(model, test_h2o)
##
  |======================================================================| 100%
## convert H2O format into data frame and save as csv
df_y_test = as.data.frame(h2o_y_test)
df_y_test = data.frame(ImageId = seq(1,length(df_y_test$predict)), Label = df_y_test$predict)
#write.csv(df_y_test, file = "submission-r-h2o.csv", row.names=F)
## shut down the virtual H2O cluster
h2o.shutdown(prompt = F)
Final Problem 3. 30 points. You are to compete in the House Prices: Advanced Regression Techniques competition (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). I want you to do the following.
Descriptive and Inferential Statistics.
The variables show many different distribution shapes. Some of this data could benefit from normalization (transformation), which I’m not going to address due to time constraints.
library(ggplot2)
library(reshape2)
melt.housing_train = melt(housing_train)
## Using MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, KitchenQual, Functional, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MiscFeature, SaleType, SaleCondition as id variables
ggplot(data = melt.housing_train, aes(x = value)) +
stat_density() +
facet_wrap(~variable, scales = "free")
## Warning: Removed 348 rows containing non-finite values (stat_density).
library(gtsummary)
housing_train %>% tbl_summary(statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{n} / {N} ({p}%)"), digits = all_continuous() ~ 2, missing_text = "(Missing)")
Characteristic | N = 1,460¹ |
---|---|
Id | 730.50 (421.61) |
MSSubClass | 56.90 (42.30) |
MSZoning | |
C (all) | 10 / 1,460 (0.7%) |
FV | 65 / 1,460 (4.5%) |
RH | 16 / 1,460 (1.1%) |
RL | 1,151 / 1,460 (79%) |
RM | 218 / 1,460 (15%) |
LotFrontage | 70.05 (24.28) |
(Missing) | 259 |
LotArea | 10,516.83 (9,981.26) |
Street | |
Grvl | 6 / 1,460 (0.4%) |
Pave | 1,454 / 1,460 (100%) |
Alley | |
Grvl | 50 / 91 (55%) |
Pave | 41 / 91 (45%) |
(Missing) | 1,369 |
LotShape | |
IR1 | 484 / 1,460 (33%) |
IR2 | 41 / 1,460 (2.8%) |
IR3 | 10 / 1,460 (0.7%) |
Reg | 925 / 1,460 (63%) |
LandContour | |
Bnk | 63 / 1,460 (4.3%) |
HLS | 50 / 1,460 (3.4%) |
Low | 36 / 1,460 (2.5%) |
Lvl | 1,311 / 1,460 (90%) |
Utilities | |
AllPub | 1,459 / 1,460 (100%) |
NoSeWa | 1 / 1,460 (<0.1%) |
LotConfig | |
Corner | 263 / 1,460 (18%) |
CulDSac | 94 / 1,460 (6.4%) |
FR2 | 47 / 1,460 (3.2%) |
FR3 | 4 / 1,460 (0.3%) |
Inside | 1,052 / 1,460 (72%) |
LandSlope | |
Gtl | 1,382 / 1,460 (95%) |
Mod | 65 / 1,460 (4.5%) |
Sev | 13 / 1,460 (0.9%) |
Neighborhood | |
Blmngtn | 17 / 1,460 (1.2%) |
Blueste | 2 / 1,460 (0.1%) |
BrDale | 16 / 1,460 (1.1%) |
BrkSide | 58 / 1,460 (4.0%) |
ClearCr | 28 / 1,460 (1.9%) |
CollgCr | 150 / 1,460 (10%) |
Crawfor | 51 / 1,460 (3.5%) |
Edwards | 100 / 1,460 (6.8%) |
Gilbert | 79 / 1,460 (5.4%) |
IDOTRR | 37 / 1,460 (2.5%) |
MeadowV | 17 / 1,460 (1.2%) |
Mitchel | 49 / 1,460 (3.4%) |
NAmes | 225 / 1,460 (15%) |
NoRidge | 41 / 1,460 (2.8%) |
NPkVill | 9 / 1,460 (0.6%) |
NridgHt | 77 / 1,460 (5.3%) |
NWAmes | 73 / 1,460 (5.0%) |
OldTown | 113 / 1,460 (7.7%) |
Sawyer | 74 / 1,460 (5.1%) |
SawyerW | 59 / 1,460 (4.0%) |
Somerst | 86 / 1,460 (5.9%) |
StoneBr | 25 / 1,460 (1.7%) |
SWISU | 25 / 1,460 (1.7%) |
Timber | 38 / 1,460 (2.6%) |
Veenker | 11 / 1,460 (0.8%) |
Condition1 | |
Artery | 48 / 1,460 (3.3%) |
Feedr | 81 / 1,460 (5.5%) |
Norm | 1,260 / 1,460 (86%) |
PosA | 8 / 1,460 (0.5%) |
PosN | 19 / 1,460 (1.3%) |
RRAe | 11 / 1,460 (0.8%) |
RRAn | 26 / 1,460 (1.8%) |
RRNe | 2 / 1,460 (0.1%) |
RRNn | 5 / 1,460 (0.3%) |
Condition2 | |
Artery | 2 / 1,460 (0.1%) |
Feedr | 6 / 1,460 (0.4%) |
Norm | 1,445 / 1,460 (99%) |
PosA | 1 / 1,460 (<0.1%) |
PosN | 2 / 1,460 (0.1%) |
RRAe | 1 / 1,460 (<0.1%) |
RRAn | 1 / 1,460 (<0.1%) |
RRNn | 2 / 1,460 (0.1%) |
BldgType | |
1Fam | 1,220 / 1,460 (84%) |
2fmCon | 31 / 1,460 (2.1%) |
Duplex | 52 / 1,460 (3.6%) |
Twnhs | 43 / 1,460 (2.9%) |
TwnhsE | 114 / 1,460 (7.8%) |
HouseStyle | |
1.5Fin | 154 / 1,460 (11%) |
1.5Unf | 14 / 1,460 (1.0%) |
1Story | 726 / 1,460 (50%) |
2.5Fin | 8 / 1,460 (0.5%) |
2.5Unf | 11 / 1,460 (0.8%) |
2Story | 445 / 1,460 (30%) |
SFoyer | 37 / 1,460 (2.5%) |
SLvl | 65 / 1,460 (4.5%) |
OverallQual | 6.10 (1.38) |
OverallCond | |
1 | 1 / 1,460 (<0.1%) |
2 | 5 / 1,460 (0.3%) |
3 | 25 / 1,460 (1.7%) |
4 | 57 / 1,460 (3.9%) |
5 | 821 / 1,460 (56%) |
6 | 252 / 1,460 (17%) |
7 | 205 / 1,460 (14%) |
8 | 72 / 1,460 (4.9%) |
9 | 22 / 1,460 (1.5%) |
YearBuilt | 1,971.27 (30.20) |
YearRemodAdd | 1,984.87 (20.65) |
RoofStyle | |
Flat | 13 / 1,460 (0.9%) |
Gable | 1,141 / 1,460 (78%) |
Gambrel | 11 / 1,460 (0.8%) |
Hip | 286 / 1,460 (20%) |
Mansard | 7 / 1,460 (0.5%) |
Shed | 2 / 1,460 (0.1%) |
RoofMatl | |
ClyTile | 1 / 1,460 (<0.1%) |
CompShg | 1,434 / 1,460 (98%) |
Membran | 1 / 1,460 (<0.1%) |
Metal | 1 / 1,460 (<0.1%) |
Roll | 1 / 1,460 (<0.1%) |
Tar&Grv | 11 / 1,460 (0.8%) |
WdShake | 5 / 1,460 (0.3%) |
WdShngl | 6 / 1,460 (0.4%) |
Exterior1st | |
AsbShng | 20 / 1,460 (1.4%) |
AsphShn | 1 / 1,460 (<0.1%) |
BrkComm | 2 / 1,460 (0.1%) |
BrkFace | 50 / 1,460 (3.4%) |
CBlock | 1 / 1,460 (<0.1%) |
CemntBd | 61 / 1,460 (4.2%) |
HdBoard | 222 / 1,460 (15%) |
ImStucc | 1 / 1,460 (<0.1%) |
MetalSd | 220 / 1,460 (15%) |
Plywood | 108 / 1,460 (7.4%) |
Stone | 2 / 1,460 (0.1%) |
Stucco | 25 / 1,460 (1.7%) |
VinylSd | 515 / 1,460 (35%) |
Wd Sdng | 206 / 1,460 (14%) |
WdShing | 26 / 1,460 (1.8%) |
Exterior2nd | |
AsbShng | 20 / 1,460 (1.4%) |
AsphShn | 3 / 1,460 (0.2%) |
Brk Cmn | 7 / 1,460 (0.5%) |
BrkFace | 25 / 1,460 (1.7%) |
CBlock | 1 / 1,460 (<0.1%) |
CmentBd | 60 / 1,460 (4.1%) |
HdBoard | 207 / 1,460 (14%) |
ImStucc | 10 / 1,460 (0.7%) |
MetalSd | 214 / 1,460 (15%) |
Other | 1 / 1,460 (<0.1%) |
Plywood | 142 / 1,460 (9.7%) |
Stone | 5 / 1,460 (0.3%) |
Stucco | 26 / 1,460 (1.8%) |
VinylSd | 504 / 1,460 (35%) |
Wd Sdng | 197 / 1,460 (13%) |
Wd Shng | 38 / 1,460 (2.6%) |
MasVnrType | |
BrkCmn | 15 / 1,452 (1.0%) |
BrkFace | 445 / 1,452 (31%) |
None | 864 / 1,452 (60%) |
Stone | 128 / 1,452 (8.8%) |
(Missing) | 8 |
MasVnrArea | 103.69 (181.07) |
(Missing) | 8 |
ExterQual | |
Ex | 52 / 1,460 (3.6%) |
Fa | 14 / 1,460 (1.0%) |
Gd | 488 / 1,460 (33%) |
TA | 906 / 1,460 (62%) |
ExterCond | |
Ex | 3 / 1,460 (0.2%) |
Fa | 28 / 1,460 (1.9%) |
Gd | 146 / 1,460 (10%) |
Po | 1 / 1,460 (<0.1%) |
TA | 1,282 / 1,460 (88%) |
Foundation | |
BrkTil | 146 / 1,460 (10%) |
CBlock | 634 / 1,460 (43%) |
PConc | 647 / 1,460 (44%) |
Slab | 24 / 1,460 (1.6%) |
Stone | 6 / 1,460 (0.4%) |
Wood | 3 / 1,460 (0.2%) |
BsmtQual | |
Ex | 121 / 1,423 (8.5%) |
Fa | 35 / 1,423 (2.5%) |
Gd | 618 / 1,423 (43%) |
TA | 649 / 1,423 (46%) |
(Missing) | 37 |
BsmtCond | |
Fa | 45 / 1,423 (3.2%) |
Gd | 65 / 1,423 (4.6%) |
Po | 2 / 1,423 (0.1%) |
TA | 1,311 / 1,423 (92%) |
(Missing) | 37 |
BsmtExposure | |
Av | 221 / 1,422 (16%) |
Gd | 134 / 1,422 (9.4%) |
Mn | 114 / 1,422 (8.0%) |
No | 953 / 1,422 (67%) |
(Missing) | 38 |
BsmtFinType1 | |
ALQ | 220 / 1,423 (15%) |
BLQ | 148 / 1,423 (10%) |
GLQ | 418 / 1,423 (29%) |
LwQ | 74 / 1,423 (5.2%) |
Rec | 133 / 1,423 (9.3%) |
Unf | 430 / 1,423 (30%) |
(Missing) | 37 |
BsmtFinSF1 | 443.64 (456.10) |
BsmtFinType2 | |
ALQ | 19 / 1,422 (1.3%) |
BLQ | 33 / 1,422 (2.3%) |
GLQ | 14 / 1,422 (1.0%) |
LwQ | 46 / 1,422 (3.2%) |
Rec | 54 / 1,422 (3.8%) |
Unf | 1,256 / 1,422 (88%) |
(Missing) | 38 |
BsmtFinSF2 | 46.55 (161.32) |
BsmtUnfSF | 567.24 (441.87) |
TotalBsmtSF | 1,057.43 (438.71) |
Heating | |
Floor | 1 / 1,460 (<0.1%) |
GasA | 1,428 / 1,460 (98%) |
GasW | 18 / 1,460 (1.2%) |
Grav | 7 / 1,460 (0.5%) |
OthW | 2 / 1,460 (0.1%) |
Wall | 4 / 1,460 (0.3%) |
HeatingQC | |
Ex | 741 / 1,460 (51%) |
Fa | 49 / 1,460 (3.4%) |
Gd | 241 / 1,460 (17%) |
Po | 1 / 1,460 (<0.1%) |
TA | 428 / 1,460 (29%) |
CentralAir | |
N | 95 / 1,460 (6.5%) |
Y | 1,365 / 1,460 (93%) |
Electrical | |
FuseA | 94 / 1,459 (6.4%) |
FuseF | 27 / 1,459 (1.9%) |
FuseP | 3 / 1,459 (0.2%) |
Mix | 1 / 1,459 (<0.1%) |
SBrkr | 1,334 / 1,459 (91%) |
(Missing) | 1 |
X1stFlrSF | 1,162.63 (386.59) |
X2ndFlrSF | 346.99 (436.53) |
LowQualFinSF | 5.84 (48.62) |
GrLivArea | 1,515.46 (525.48) |
BsmtFullBath | |
0 | 856 / 1,460 (59%) |
1 | 588 / 1,460 (40%) |
2 | 15 / 1,460 (1.0%) |
3 | 1 / 1,460 (<0.1%) |
BsmtHalfBath | |
0 | 1,378 / 1,460 (94%) |
1 | 80 / 1,460 (5.5%) |
2 | 2 / 1,460 (0.1%) |
FullBath | |
0 | 9 / 1,460 (0.6%) |
1 | 650 / 1,460 (45%) |
2 | 768 / 1,460 (53%) |
3 | 33 / 1,460 (2.3%) |
HalfBath | |
0 | 913 / 1,460 (63%) |
1 | 535 / 1,460 (37%) |
2 | 12 / 1,460 (0.8%) |
BedroomAbvGr | |
0 | 6 / 1,460 (0.4%) |
1 | 50 / 1,460 (3.4%) |
2 | 358 / 1,460 (25%) |
3 | 804 / 1,460 (55%) |
4 | 213 / 1,460 (15%) |
5 | 21 / 1,460 (1.4%) |
6 | 7 / 1,460 (0.5%) |
8 | 1 / 1,460 (<0.1%) |
KitchenAbvGr | |
0 | 1 / 1,460 (<0.1%) |
1 | 1,392 / 1,460 (95%) |
2 | 65 / 1,460 (4.5%) |
3 | 2 / 1,460 (0.1%) |
KitchenQual | |
Ex | 100 / 1,460 (6.8%) |
Fa | 39 / 1,460 (2.7%) |
Gd | 586 / 1,460 (40%) |
TA | 735 / 1,460 (50%) |
TotRmsAbvGrd | 6.52 (1.63) |
Functional | |
Maj1 | 14 / 1,460 (1.0%) |
Maj2 | 5 / 1,460 (0.3%) |
Min1 | 31 / 1,460 (2.1%) |
Min2 | 34 / 1,460 (2.3%) |
Mod | 15 / 1,460 (1.0%) |
Sev | 1 / 1,460 (<0.1%) |
Typ | 1,360 / 1,460 (93%) |
Fireplaces | |
0 | 690 / 1,460 (47%) |
1 | 650 / 1,460 (45%) |
2 | 115 / 1,460 (7.9%) |
3 | 5 / 1,460 (0.3%) |
FireplaceQu | |
Ex | 24 / 770 (3.1%) |
Fa | 33 / 770 (4.3%) |
Gd | 380 / 770 (49%) |
Po | 20 / 770 (2.6%) |
TA | 313 / 770 (41%) |
(Missing) | 690 |
GarageType | |
2Types | 6 / 1,379 (0.4%) |
Attchd | 870 / 1,379 (63%) |
Basment | 19 / 1,379 (1.4%) |
BuiltIn | 88 / 1,379 (6.4%) |
CarPort | 9 / 1,379 (0.7%) |
Detchd | 387 / 1,379 (28%) |
(Missing) | 81 |
GarageYrBlt | 1,978.51 (24.69) |
(Missing) | 81 |
GarageFinish | |
Fin | 352 / 1,379 (26%) |
RFn | 422 / 1,379 (31%) |
Unf | 605 / 1,379 (44%) |
(Missing) | 81 |
GarageCars | |
0 | 81 / 1,460 (5.5%) |
1 | 369 / 1,460 (25%) |
2 | 824 / 1,460 (56%) |
3 | 181 / 1,460 (12%) |
4 | 5 / 1,460 (0.3%) |
GarageArea | 472.98 (213.80) |
GarageQual | |
Ex | 3 / 1,379 (0.2%) |
Fa | 48 / 1,379 (3.5%) |
Gd | 14 / 1,379 (1.0%) |
Po | 3 / 1,379 (0.2%) |
TA | 1,311 / 1,379 (95%) |
(Missing) | 81 |
GarageCond | |
Ex | 2 / 1,379 (0.1%) |
Fa | 35 / 1,379 (2.5%) |
Gd | 9 / 1,379 (0.7%) |
Po | 7 / 1,379 (0.5%) |
TA | 1,326 / 1,379 (96%) |
(Missing) | 81 |
PavedDrive | |
N | 90 / 1,460 (6.2%) |
P | 30 / 1,460 (2.1%) |
Y | 1,340 / 1,460 (92%) |
WoodDeckSF | 94.24 (125.34) |
OpenPorchSF | 46.66 (66.26) |
EnclosedPorch | 21.95 (61.12) |
X3SsnPorch | 3.41 (29.32) |
ScreenPorch | 15.06 (55.76) |
PoolArea | |
0 | 1,453 / 1,460 (100%) |
480 | 1 / 1,460 (<0.1%) |
512 | 1 / 1,460 (<0.1%) |
519 | 1 / 1,460 (<0.1%) |
555 | 1 / 1,460 (<0.1%) |
576 | 1 / 1,460 (<0.1%) |
648 | 1 / 1,460 (<0.1%) |
738 | 1 / 1,460 (<0.1%) |
PoolQC | |
Ex | 2 / 7 (29%) |
Fa | 2 / 7 (29%) |
Gd | 3 / 7 (43%) |
(Missing) | 1,453 |
Fence | |
GdPrv | 59 / 281 (21%) |
GdWo | 54 / 281 (19%) |
MnPrv | 157 / 281 (56%) |
MnWw | 11 / 281 (3.9%) |
(Missing) | 1,179 |
MiscFeature | |
Gar2 | 2 / 54 (3.7%) |
Othr | 2 / 54 (3.7%) |
Shed | 49 / 54 (91%) |
TenC | 1 / 54 (1.9%) |
(Missing) | 1,406 |
MiscVal | 43.49 (496.12) |
MoSold | 6.32 (2.70) |
YrSold | |
2006 | 314 / 1,460 (22%) |
2007 | 329 / 1,460 (23%) |
2008 | 304 / 1,460 (21%) |
2009 | 338 / 1,460 (23%) |
2010 | 175 / 1,460 (12%) |
SaleType | |
COD | 43 / 1,460 (2.9%) |
Con | 2 / 1,460 (0.1%) |
ConLD | 9 / 1,460 (0.6%) |
ConLI | 5 / 1,460 (0.3%) |
ConLw | 5 / 1,460 (0.3%) |
CWD | 4 / 1,460 (0.3%) |
New | 122 / 1,460 (8.4%) |
Oth | 3 / 1,460 (0.2%) |
WD | 1,267 / 1,460 (87%) |
SaleCondition | |
Abnorml | 101 / 1,460 (6.9%) |
AdjLand | 4 / 1,460 (0.3%) |
Alloca | 12 / 1,460 (0.8%) |
Family | 20 / 1,460 (1.4%) |
Normal | 1,198 / 1,460 (82%) |
Partial | 125 / 1,460 (8.6%) |
SalePrice | 180,921.20 (79,442.50) |
¹ Mean (SD); n / N (%)
pairs(housing_train %>% select(OverallQual, OverallCond, SalePrice))
library(corrplot)
## corrplot 0.92 loaded
subset <- housing_train %>% select(OverallQual, OverallCond, SalePrice)
M <- stats::cor(subset)
kableExtra::kable(round(M,2))
 | OverallQual | OverallCond | SalePrice |
---|---|---|---|
OverallQual | 1.00 | -0.09 | 0.79 |
OverallCond | -0.09 | 1.00 | -0.08 |
SalePrice | 0.79 | -0.08 | 1.00 |
cor.test(subset$SalePrice, subset$OverallQual, conf.level=0.8)
##
## Pearson's product-moment correlation
##
## data: subset$SalePrice and subset$OverallQual
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780752 0.8032204
## sample estimates:
## cor
## 0.7909816
cor.test(subset$SalePrice, subset$OverallCond, conf.level=0.8)
##
## Pearson's product-moment correlation
##
## data: subset$SalePrice and subset$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.1111272 -0.0444103
## sample estimates:
## cor
## -0.07785589
cor.test(subset$OverallQual, subset$OverallCond, conf.level=0.8)
##
## Pearson's product-moment correlation
##
## data: subset$OverallQual and subset$OverallCond
## t = -3.5253, df = 1458, p-value = 0.0004362
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.12510797 -0.05855136
## sample estimates:
## cor
## -0.09193234
Looking at the scatter plots and the correlations, there is a positive relationship between SalePrice and OverallQual. The correlation coefficient is about 0.79, which corresponds to an R-squared of about 0.63, so a majority of the variation in SalePrice can be explained by OverallQual alone. Since the 80% confidence interval does not include zero, we can reject the null hypothesis that the true correlation is zero.
There is a weak negative relationship between OverallQual and OverallCond. The correlation coefficient is about -0.09, which corresponds to an R-squared of about 0.008, so almost none of the variation in OverallQual can be explained by OverallCond alone. The 80% confidence interval still does not include zero, so we can reject the null hypothesis that the true correlation is zero, but looking at the scatter plot alone it is hard to see any relationship; I would be hard-pressed to draw a best-fit line through that panel.
There is also a weak negative relationship between SalePrice and OverallCond. The correlation coefficient is about -0.08, which corresponds to an R-squared of about 0.006, so almost none of the variation in SalePrice can be explained by OverallCond alone. The 80% confidence interval again does not include zero, so we can reject the null hypothesis that the true correlation is zero. The scatter plot, however, suggests two regimes: a positive relationship between SalePrice and OverallCond for conditions ranging from ‘Poor’ to ‘Average’, and essentially no relationship from ‘Above Average’ to ‘Very Excellent’. If we recompute the correlation between OverallQual and OverallCond restricted to conditions from ‘Poor’ to ‘Average’ (OverallCond < 6), this is what we see.
cor.test(subset$OverallQual[subset$OverallCond < 6], subset$OverallCond[subset$OverallCond < 6], conf.level=0.8)
##
## Pearson's product-moment correlation
##
## data: subset$OverallQual[subset$OverallCond < 6] and subset$OverallCond[subset$OverallCond < 6]
## t = 12.284, df = 907, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.3405843 0.4135671
## sample estimates:
## cor
## 0.3776621
I’m not concerned about familywise error: even after adjusting the confidence intervals for multiple testing, they still don’t include zero.
print(psych::corr.test(subset, ci=0.8), short=FALSE)
## Call:psych::corr.test(x = subset, ci = 0.8)
## Correlation matrix
## OverallQual OverallCond SalePrice
## OverallQual 1.00 -0.09 0.79
## OverallCond -0.09 1.00 -0.08
## SalePrice 0.79 -0.08 1.00
## Sample Size
## [1] 1460
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## OverallQual OverallCond SalePrice
## OverallQual 0 0 0
## OverallCond 0 0 0
## SalePrice 0 0 0
##
## Confidence intervals based upon normal theory. To get bootstrapped values, try cor.ci
## raw.lower raw.r raw.upper raw.p lower.adj upper.adj
## OvrlQ-OvrlC -0.14 -0.09 -0.04 0 -0.15 -0.03
## OvrlQ-SlPrc 0.77 0.79 0.81 0 0.77 0.81
## OvrlC-SlPrc -0.13 -0.08 -0.03 0 -0.13 -0.03
Linear Algebra and Correlation.
#could have done this with solve() or ginv()
library(matlib)
## Warning: package 'matlib' was built under R version 4.1.2
##
## Attaching package: 'matlib'
## The following object is masked from 'package:matrixcalc':
##
## vec
P = inv(t(M)) # M is symmetric, so t(M) == M; P is the precision matrix (inverse of the correlation matrix)
print(P)
##
## [1,] 2.67793985 0.08177049 -2.11183483
## [2,] 0.08177049 1.00859536 0.01384614
## [3,] -2.11183483 0.01384614 2.67150050
MP = M%*%P
print(round(MP))
##
## OverallQual 1 0 0
## OverallCond 0 1 0
## SalePrice 0 0 1
PM = P%*%M
print(round(PM))
## OverallQual OverallCond SalePrice
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
library(matrixcalc)
lu.decomposition(MP)
## $L
## [,1] [,2] [,3]
## [1,] 1.000000e+00 0.000000e+00 0
## [2,] -5.084237e-09 1.000000e+00 0
## [3,] 4.214407e-09 -4.254262e-10 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 -2.24059e-09 3.363727e-09
## [2,] 0 1.00000e+00 3.309696e-09
## [3,] 0 0.00000e+00 1.000000e+00
lu.decomposition(PM)
## $L
## [,1] [,2] [,3]
## [1,] 1.000000e+00 0.000000e+00 0
## [2,] -2.240590e-09 1.000000e+00 0
## [3,] 3.363727e-09 3.309696e-09 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 -5.084237e-09 4.214407e-09
## [2,] 0 1.000000e+00 -4.254262e-10
## [3,] 0 0.000000e+00 1.000000e+00
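The products MP and PM are (numerically) identity matrices, so their LU factors are trivial. The correlation matrix itself can be LU-decomposed the same way (a minimal check using matrixcalc, which is already loaded):
# LU decomposition of the correlation matrix M, with a reconstruction check
luM <- lu.decomposition(M)
luM$L
luM$U
luM$L %*% luM$U  # reproduces M up to rounding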
Calculus-Based Probability & Statistics.
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 4.1.2
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: survival
plotdist(housing_train$BsmtFinSF2, histo = TRUE, demp = TRUE)
exp.f <- fitdistr(housing_train$BsmtFinSF2, "exponential") # fit an exponential by maximum likelihood (rate = 1/mean)
exp.f
## rate
## 0.0214825932
## (0.0005622252)
lambda = 0.0214825932
samples <- rexp(1000,lambda)
orig <- hist(housing_train$BsmtFinSF2)
exp_samp <- hist(samples)
plot(orig, col = "lightblue")
plot(exp_samp, col = "green", add = TRUE)
The original distribution looked more like a chi-squared distribution; the simulated sample is, by construction, a much closer approximation to an exponential.
#qexp(0.05, rate = lambda)
-base::log(0.95)/lambda
## [1] 2.387668
#qexp(0.95, rate = lambda)
-base::log(0.05)/lambda
## [1] 139.4493
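The same percentiles come directly from qexp, as a check on the hand computation (lambda is the fitted rate above):
qexp(c(0.05, 0.95), rate = lambda)  # ~2.39 and ~139.45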
quantile(housing_train$BsmtUnfSF, c(0.05, 0.95))
## 5% 95%
## 0 1468
I’m inspecting the missingness and removing any feature where roughly half or more of the values are missing (plus the Id column, which carries no predictive information).
library(naniar)
gg_miss_var(housing_train, show_pct = TRUE)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
housing_train <- housing_train %>% dplyr::select(-c("PoolQC", "MiscFeature", "Alley", "Fence", "Id", "FireplaceQu"))
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
housing_train <- housing_train %>%
mutate_if(is.numeric, function(x) ifelse(is.na(x), mean(x, na.rm = T), x)) %>%
mutate_if(base::is.character, function(x) ifelse(is.na(x), getmode(x), x))
gg_miss_var(housing_train, show_pct = TRUE)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
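For reference, the getmode helper used for the categorical imputation behaves like this (toy vector, purely illustrative):
getmode(c("Gd", "TA", "TA", "Ex"))  # returns "TA", the most frequent value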
y <- housing_train %>% dplyr::select(SalePrice)
X <- housing_train %>% dplyr::select(-c( "SalePrice"))
housing_train3 <- bind_cols(y, X)
full_model <- lm(housing_train3) # lm() on a data frame regresses the first column (SalePrice) on all the others, i.e. lm(SalePrice ~ ., data = housing_train3)
summary(full_model)
##
## Call:
## lm(formula = housing_train3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181141 -9352 315 9708 181141
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.228e+05 1.054e+06 -0.686 0.492996
## MSSubClass -3.898e+01 8.277e+01 -0.471 0.637818
## MSZoningFV 3.463e+04 1.197e+04 2.894 0.003871 **
## MSZoningRH 2.682e+04 1.192e+04 2.251 0.024578 *
## MSZoningRL 2.833e+04 1.020e+04 2.779 0.005534 **
## MSZoningRM 2.420e+04 9.540e+03 2.536 0.011322 *
## LotFrontage 5.749e+01 4.347e+01 1.322 0.186251
## LotArea 7.440e-01 1.090e-01 6.823 1.40e-11 ***
## StreetPave 3.078e+04 1.209e+04 2.546 0.011019 *
## LotShapeIR2 3.474e+03 4.237e+03 0.820 0.412433
## LotShapeIR3 3.543e+03 8.904e+03 0.398 0.690726
## LotShapeReg 1.867e+03 1.607e+03 1.161 0.245682
## LandContourHLS 8.688e+03 5.137e+03 1.691 0.091072 .
## LandContourLow -9.357e+03 6.381e+03 -1.466 0.142812
## LandContourLvl 6.330e+03 3.691e+03 1.715 0.086574 .
## UtilitiesNoSeWa -3.160e+04 2.646e+04 -1.194 0.232678
## LotConfigCulDSac 9.001e+03 3.314e+03 2.716 0.006697 **
## LotConfigFR2 -7.421e+03 4.033e+03 -1.840 0.066020 .
## LotConfigFR3 -1.359e+04 1.262e+04 -1.077 0.281728
## LotConfigInside -1.006e+03 1.791e+03 -0.562 0.574399
## LandSlopeMod 6.494e+03 3.979e+03 1.632 0.102942
## LandSlopeSev -4.358e+04 1.142e+04 -3.816 0.000142 ***
## NeighborhoodBlueste -1.979e+02 1.906e+04 -0.010 0.991719
## NeighborhoodBrDale 7.374e+02 1.100e+04 0.067 0.946589
## NeighborhoodBrkSide -4.030e+03 9.451e+03 -0.426 0.669833
## NeighborhoodClearCr -1.383e+04 9.205e+03 -1.502 0.133255
## NeighborhoodCollgCr -1.016e+04 7.264e+03 -1.398 0.162381
## NeighborhoodCrawfor 1.178e+04 8.563e+03 1.375 0.169264
## NeighborhoodEdwards -2.007e+04 8.009e+03 -2.506 0.012343 *
## NeighborhoodGilbert -1.203e+04 7.694e+03 -1.563 0.118245
## NeighborhoodIDOTRR -9.722e+03 1.075e+04 -0.905 0.365792
## NeighborhoodMeadowV -5.224e+03 1.123e+04 -0.465 0.641968
## NeighborhoodMitchel -2.150e+04 8.166e+03 -2.633 0.008571 **
## NeighborhoodNAmes -1.623e+04 7.823e+03 -2.075 0.038183 *
## NeighborhoodNoRidge 2.645e+04 8.441e+03 3.133 0.001772 **
## NeighborhoodNPkVill 1.265e+04 1.406e+04 0.900 0.368435
## NeighborhoodNridgHt 1.750e+04 7.509e+03 2.330 0.019961 *
## NeighborhoodNWAmes -1.818e+04 8.045e+03 -2.260 0.023996 *
## NeighborhoodOldTown -1.336e+04 9.600e+03 -1.392 0.164205
## NeighborhoodSawyer -1.134e+04 8.138e+03 -1.394 0.163599
## NeighborhoodSawyerW -5.068e+03 7.798e+03 -0.650 0.515915
## NeighborhoodSomerst -1.808e+03 9.029e+03 -0.200 0.841300
## NeighborhoodStoneBr 3.798e+04 8.331e+03 4.558 5.67e-06 ***
## NeighborhoodSWISU -7.934e+03 9.716e+03 -0.817 0.414330
## NeighborhoodTimber -9.955e+03 8.125e+03 -1.225 0.220739
## NeighborhoodVeenker 3.721e+02 1.053e+04 0.035 0.971825
## Condition1Feedr 6.067e+03 4.996e+03 1.214 0.224807
## Condition1Norm 1.466e+04 4.156e+03 3.527 0.000435 ***
## Condition1PosA 4.990e+03 1.001e+04 0.498 0.618369
## Condition1PosN 1.176e+04 7.438e+03 1.581 0.114030
## Condition1RRAe -1.613e+04 9.085e+03 -1.776 0.075995 .
## Condition1RRAn 7.458e+03 6.841e+03 1.090 0.275863
## Condition1RRNe -1.515e+03 1.758e+04 -0.086 0.931354
## Condition1RRNn 6.175e+03 1.285e+04 0.480 0.630993
## Condition2Feedr -6.539e+03 2.344e+04 -0.279 0.780266
## Condition2Norm -7.121e+03 2.030e+04 -0.351 0.725872
## Condition2PosA 3.410e+04 3.721e+04 0.917 0.359516
## Condition2PosN -2.383e+05 2.771e+04 -8.600 < 2e-16 ***
## Condition2RRAe -1.272e+05 4.639e+04 -2.742 0.006198 **
## Condition2RRAn -1.930e+04 3.167e+04 -0.610 0.542239
## Condition2RRNn 1.722e+03 2.716e+04 0.063 0.949465
## BldgType2fmCon -3.593e+03 1.249e+04 -0.288 0.773662
## BldgTypeDuplex -7.280e+03 7.352e+03 -0.990 0.322268
## BldgTypeTwnhs -1.912e+04 1.001e+04 -1.911 0.056273 .
## BldgTypeTwnhsE -1.658e+04 8.995e+03 -1.844 0.065452 .
## HouseStyle1.5Unf 1.386e+04 7.881e+03 1.759 0.078807 .
## HouseStyle1Story 8.341e+03 4.324e+03 1.929 0.053965 .
## HouseStyle2.5Fin -2.500e+04 1.226e+04 -2.039 0.041683 *
## HouseStyle2.5Unf -8.517e+03 9.276e+03 -0.918 0.358665
## HouseStyle2Story -5.280e+03 3.500e+03 -1.509 0.131661
## HouseStyleSFoyer 3.076e+03 6.237e+03 0.493 0.622011
## HouseStyleSLvl 2.796e+03 5.508e+03 0.508 0.611881
## OverallQual 6.347e+03 1.010e+03 6.287 4.47e-10 ***
## OverallCond 5.675e+03 8.726e+02 6.504 1.14e-10 ***
## YearBuilt 3.089e+02 7.612e+01 4.059 5.24e-05 ***
## YearRemodAdd 9.442e+01 5.536e+01 1.705 0.088376 .
## RoofStyleGable 5.801e+03 1.847e+04 0.314 0.753507
## RoofStyleGambrel 8.696e+03 2.020e+04 0.431 0.666872
## RoofStyleHip 5.677e+03 1.853e+04 0.306 0.759371
## RoofStyleMansard 1.748e+04 2.149e+04 0.813 0.416182
## RoofStyleShed 9.593e+04 3.476e+04 2.760 0.005865 **
## RoofMatlCompShg 6.684e+05 3.317e+04 20.148 < 2e-16 ***
## RoofMatlMembran 7.603e+05 4.778e+04 15.911 < 2e-16 ***
## RoofMatlMetal 7.327e+05 4.676e+04 15.670 < 2e-16 ***
## RoofMatlRoll 6.539e+05 4.166e+04 15.698 < 2e-16 ***
## RoofMatlTar&Grv 6.711e+05 3.797e+04 17.674 < 2e-16 ***
## RoofMatlWdShake 6.599e+05 3.674e+04 17.963 < 2e-16 ***
## RoofMatlWdShngl 7.221e+05 3.448e+04 20.945 < 2e-16 ***
## Exterior1stAsphShn -1.497e+04 3.296e+04 -0.454 0.649827
## Exterior1stBrkComm -1.536e+04 2.777e+04 -0.553 0.580203
## Exterior1stBrkFace 2.191e+03 1.274e+04 0.172 0.863468
## Exterior1stCBlock -2.271e+04 2.727e+04 -0.833 0.405135
## Exterior1stCemntBd -1.556e+04 1.911e+04 -0.814 0.415570
## Exterior1stHdBoard -1.970e+04 1.292e+04 -1.525 0.127426
## Exterior1stImStucc -5.242e+04 2.753e+04 -1.904 0.057138 .
## Exterior1stMetalSd -1.144e+04 1.461e+04 -0.783 0.433572
## Exterior1stPlywood -2.028e+04 1.274e+04 -1.591 0.111756
## Exterior1stStone -1.618e+04 2.403e+04 -0.673 0.500884
## Exterior1stStucco -1.021e+04 1.414e+04 -0.722 0.470573
## Exterior1stVinylSd -1.946e+04 1.326e+04 -1.467 0.142596
## Exterior1stWd Sdng -1.932e+04 1.233e+04 -1.567 0.117388
## Exterior1stWdShing -1.417e+04 1.334e+04 -1.062 0.288513
## Exterior2ndAsphShn 1.278e+04 2.223e+04 0.575 0.565369
## Exterior2ndBrk Cmn 1.059e+04 2.015e+04 0.526 0.599147
## Exterior2ndBrkFace 8.193e+03 1.314e+04 0.624 0.533053
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd 1.395e+04 1.879e+04 0.743 0.457848
## Exterior2ndHdBoard 1.318e+04 1.240e+04 1.063 0.288108
## Exterior2ndImStucc 3.027e+04 1.423e+04 2.127 0.033641 *
## Exterior2ndMetalSd 9.546e+03 1.422e+04 0.671 0.502070
## Exterior2ndOther -1.123e+04 2.727e+04 -0.412 0.680713
## Exterior2ndPlywood 1.075e+04 1.205e+04 0.892 0.372563
## Exterior2ndStone -5.665e+03 1.704e+04 -0.333 0.739533
## Exterior2ndStucco 9.762e+03 1.359e+04 0.718 0.472752
## Exterior2ndVinylSd 1.680e+04 1.281e+04 1.311 0.190059
## Exterior2ndWd Sdng 1.488e+04 1.192e+04 1.248 0.212096
## Exterior2ndWd Shng 8.748e+03 1.245e+04 0.702 0.482524
## MasVnrTypeBrkFace 6.216e+03 6.790e+03 0.915 0.360162
## MasVnrTypeNone 9.704e+03 6.841e+03 1.418 0.156332
## MasVnrTypeStone 1.203e+04 7.172e+03 1.678 0.093638 .
## MasVnrArea 1.992e+01 5.762e+00 3.458 0.000563 ***
## ExterQualFa -6.508e+03 1.099e+04 -0.592 0.553820
## ExterQualGd -2.072e+04 4.803e+03 -4.314 1.73e-05 ***
## ExterQualTA -2.129e+04 5.307e+03 -4.011 6.40e-05 ***
## ExterCondFa -6.065e+03 1.805e+04 -0.336 0.736978
## ExterCondGd -9.800e+03 1.724e+04 -0.568 0.569893
## ExterCondPo 2.330e+03 3.139e+04 0.074 0.940853
## ExterCondTA -6.979e+03 1.720e+04 -0.406 0.685099
## FoundationCBlock 3.168e+03 3.186e+03 0.994 0.320248
## FoundationPConc 4.446e+03 3.431e+03 1.296 0.195319
## FoundationSlab 5.663e+03 7.802e+03 0.726 0.468126
## FoundationStone 6.253e+03 1.105e+04 0.566 0.571479
## FoundationWood -2.867e+04 1.479e+04 -1.939 0.052778 .
## BsmtQualFa -1.375e+04 6.311e+03 -2.178 0.029590 *
## BsmtQualGd -1.851e+04 3.318e+03 -5.580 2.96e-08 ***
## BsmtQualTA -1.573e+04 4.130e+03 -3.809 0.000146 ***
## BsmtCondGd 1.388e+02 5.285e+03 0.026 0.979058
## BsmtCondPo 7.368e+04 2.993e+04 2.462 0.013957 *
## BsmtCondTA 3.382e+03 4.255e+03 0.795 0.426875
## BsmtExposureGd 1.364e+04 3.003e+03 4.540 6.17e-06 ***
## BsmtExposureMn -4.301e+03 3.037e+03 -1.416 0.156922
## BsmtExposureNo -6.025e+03 2.183e+03 -2.760 0.005871 **
## BsmtFinType1BLQ 1.376e+03 2.776e+03 0.495 0.620345
## BsmtFinType1GLQ 5.707e+03 2.516e+03 2.268 0.023490 *
## BsmtFinType1LwQ -3.643e+03 3.754e+03 -0.970 0.332044
## BsmtFinType1Rec -6.368e+02 2.988e+03 -0.213 0.831286
## BsmtFinType1Unf 3.672e+03 2.903e+03 1.265 0.206153
## BsmtFinSF1 3.428e+01 4.661e+00 7.355 3.47e-13 ***
## BsmtFinType2BLQ -1.370e+04 7.551e+03 -1.814 0.069950 .
## BsmtFinType2GLQ -2.272e+03 9.334e+03 -0.243 0.807733
## BsmtFinType2LwQ -1.687e+04 7.366e+03 -2.290 0.022197 *
## BsmtFinType2Rec -1.118e+04 7.100e+03 -1.574 0.115649
## BsmtFinType2Unf -1.070e+04 7.520e+03 -1.424 0.154842
## BsmtFinSF2 2.375e+01 8.546e+00 2.779 0.005537 **
## BsmtUnfSF 1.485e+01 3.984e+00 3.726 0.000203 ***
## TotalBsmtSF NA NA NA NA
## HeatingGasA -2.646e+03 2.475e+04 -0.107 0.914867
## HeatingGasW -5.346e+03 2.561e+04 -0.209 0.834647
## HeatingGrav -7.573e+03 2.720e+04 -0.278 0.780727
## HeatingOthW -2.451e+04 3.085e+04 -0.794 0.427068
## HeatingWall 7.571e+03 2.873e+04 0.264 0.792202
## HeatingQCFa 6.140e+02 4.677e+03 0.131 0.895568
## HeatingQCGd -3.211e+03 2.066e+03 -1.554 0.120370
## HeatingQCPo 2.245e+03 2.675e+04 0.084 0.933147
## HeatingQCTA -3.424e+03 2.069e+03 -1.655 0.098227 .
## CentralAirY -3.698e+02 3.883e+03 -0.095 0.924138
## ElectricalFuseF 5.950e+02 5.779e+03 0.103 0.918014
## ElectricalFuseP -4.018e+03 1.853e+04 -0.217 0.828424
## ElectricalMix -4.783e+04 4.473e+04 -1.069 0.285078
## ElectricalSBrkr -1.420e+03 2.954e+03 -0.481 0.630706
## X1stFlrSF 5.108e+01 5.219e+00 9.788 < 2e-16 ***
## X2ndFlrSF 6.684e+01 5.528e+00 12.091 < 2e-16 ***
## LowQualFinSF 1.530e+01 1.829e+01 0.836 0.403173
## GrLivArea NA NA NA NA
## BsmtFullBath 6.340e+02 1.974e+03 0.321 0.748179
## BsmtHalfBath -7.764e+02 3.025e+03 -0.257 0.797487
## FullBath 3.847e+03 2.206e+03 1.744 0.081457 .
## HalfBath 1.014e+03 2.090e+03 0.485 0.627584
## BedroomAbvGr -3.459e+03 1.362e+03 -2.539 0.011224 *
## KitchenAbvGr -1.261e+04 5.673e+03 -2.223 0.026400 *
## KitchenQualFa -1.972e+04 6.206e+03 -3.178 0.001520 **
## KitchenQualGd -2.491e+04 3.461e+03 -7.198 1.06e-12 ***
## KitchenQualTA -2.273e+04 3.899e+03 -5.829 7.11e-09 ***
## TotRmsAbvGrd 1.063e+03 9.430e+02 1.127 0.260055
## FunctionalMaj2 1.867e+02 1.445e+04 0.013 0.989693
## FunctionalMin1 9.749e+03 8.610e+03 1.132 0.257759
## FunctionalMin2 1.145e+04 8.623e+03 1.328 0.184501
## FunctionalMod 5.927e+02 1.041e+04 0.057 0.954619
## FunctionalSev -3.836e+04 2.915e+04 -1.316 0.188481
## FunctionalTyp 2.135e+04 7.457e+03 2.863 0.004261 **
## Fireplaces 2.483e+03 1.338e+03 1.856 0.063681 .
## GarageTypeAttchd 1.882e+04 1.097e+04 1.715 0.086542 .
## GarageTypeBasment 2.104e+04 1.269e+04 1.658 0.097553 .
## GarageTypeBuiltIn 1.727e+04 1.140e+04 1.515 0.130127
## GarageTypeCarPort 2.322e+04 1.457e+04 1.593 0.111325
## GarageTypeDetchd 2.132e+04 1.094e+04 1.949 0.051492 .
## GarageYrBlt 2.237e+01 5.659e+01 0.395 0.692730
## GarageFinishRFn -2.912e+03 1.965e+03 -1.482 0.138633
## GarageFinishUnf 4.891e+02 2.397e+03 0.204 0.838333
## GarageCars 2.935e+03 2.199e+03 1.334 0.182333
## GarageArea 1.439e+01 7.749e+00 1.856 0.063644 .
## GarageQualFa -1.179e+05 3.001e+04 -3.928 9.03e-05 ***
## GarageQualGd -1.104e+05 3.072e+04 -3.595 0.000337 ***
## GarageQualPo -1.319e+05 3.843e+04 -3.433 0.000617 ***
## GarageQualTA -1.115e+05 2.968e+04 -3.755 0.000181 ***
## GarageCondFa 1.060e+05 3.460e+04 3.065 0.002226 **
## GarageCondGd 1.061e+05 3.572e+04 2.970 0.003039 **
## GarageCondPo 1.084e+05 3.718e+04 2.916 0.003605 **
## GarageCondTA 1.085e+05 3.426e+04 3.166 0.001582 **
## PavedDriveP -5.030e+03 5.475e+03 -0.919 0.358461
## PavedDriveY -1.471e+03 3.426e+03 -0.429 0.667821
## WoodDeckSF 1.334e+01 5.833e+00 2.287 0.022370 *
## OpenPorchSF 5.963e+00 1.143e+01 0.522 0.602094
## EnclosedPorch 4.197e+00 1.240e+01 0.339 0.734959
## X3SsnPorch 3.325e+01 2.244e+01 1.482 0.138578
## ScreenPorch 2.939e+01 1.230e+01 2.389 0.017044 *
## PoolArea 8.349e+01 1.834e+01 4.553 5.81e-06 ***
## MiscVal 1.380e-01 1.420e+00 0.097 0.922609
## MoSold -4.365e+02 2.446e+02 -1.784 0.074604 .
## YrSold -4.175e+02 5.162e+02 -0.809 0.418761
## SaleTypeCon 2.790e+04 1.766e+04 1.580 0.114351
## SaleTypeConLD 1.532e+04 9.702e+03 1.579 0.114595
## SaleTypeConLI 6.265e+03 1.151e+04 0.544 0.586478
## SaleTypeConLw 1.199e+03 1.217e+04 0.099 0.921525
## SaleTypeCWD 1.530e+04 1.293e+04 1.184 0.236802
## SaleTypeNew 2.247e+04 1.552e+04 1.448 0.147809
## SaleTypeOth 8.551e+03 1.443e+04 0.593 0.553519
## SaleTypeWD -1.782e+01 4.193e+03 -0.004 0.996609
## SaleConditionAdjLand 1.021e+04 1.454e+04 0.702 0.482725
## SaleConditionAlloca 4.024e+03 8.591e+03 0.468 0.639582
## SaleConditionFamily -1.739e+03 6.088e+03 -0.286 0.775139
## SaleConditionNormal 5.133e+03 2.884e+03 1.780 0.075360 .
## SaleConditionPartial -2.594e+03 1.493e+04 -0.174 0.862106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22820 on 1230 degrees of freedom
## Multiple R-squared: 0.9305, Adjusted R-squared: 0.9175
## F-statistic: 71.86 on 229 and 1230 DF, p-value: < 2.2e-16
housing_train4 <- housing_train3 %>%
dplyr::select(SalePrice,
MSZoning,
LotArea,
Street,
LandContour,
LotConfig,
LandSlope,
Neighborhood,
Condition1,
BldgType,
OverallQual,
OverallCond,
YearBuilt,
YearRemodAdd,
RoofMatl,
MasVnrType,
MasVnrArea,
ExterQual,
BsmtQual,
BsmtCond,
BsmtExposure,
BsmtFinType1,
BsmtFinSF1,
BsmtFinType2,
BsmtFinSF2,
BsmtUnfSF,
X1stFlrSF,
X2ndFlrSF,
FullBath,
BedroomAbvGr,
KitchenAbvGr,
KitchenQual,
Fireplaces,
GarageQual,
GarageCond,
WoodDeckSF,
ScreenPorch,
PoolArea)
reduced_model <- lm(housing_train4)
summary(reduced_model)
##
## Call:
## lm(formula = housing_train4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -355676 -10595 525 10304 157343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.518e+06 1.550e+05 -9.798 < 2e-16 ***
## MSZoningFV 2.607e+04 1.221e+04 2.135 0.032956 *
## MSZoningRH 1.880e+04 1.215e+04 1.548 0.121902
## MSZoningRL 2.193e+04 1.022e+04 2.146 0.032052 *
## MSZoningRM 2.012e+04 9.566e+03 2.103 0.035648 *
## LotArea 6.658e-01 1.024e-01 6.500 1.13e-10 ***
## StreetPave 2.703e+04 1.228e+04 2.201 0.027919 *
## LandContourHLS 1.633e+04 5.229e+03 3.123 0.001826 **
## LandContourLow -4.908e+02 6.412e+03 -0.077 0.938996
## LandContourLvl 9.020e+03 3.732e+03 2.417 0.015778 *
## LotConfigCulDSac 6.796e+03 3.256e+03 2.087 0.037045 *
## LotConfigFR2 -8.174e+03 4.164e+03 -1.963 0.049849 *
## LotConfigFR3 -1.645e+04 1.341e+04 -1.226 0.220382
## LotConfigInside -1.969e+03 1.808e+03 -1.090 0.276110
## LandSlopeMod 5.648e+03 4.045e+03 1.396 0.162854
## LandSlopeSev -3.233e+04 1.069e+04 -3.024 0.002546 **
## NeighborhoodBlueste -6.145e+03 2.003e+04 -0.307 0.759031
## NeighborhoodBrDale -1.303e+04 1.095e+04 -1.189 0.234454
## NeighborhoodBrkSide -1.861e+04 9.233e+03 -2.016 0.044004 *
## NeighborhoodClearCr -2.511e+04 9.215e+03 -2.725 0.006516 **
## NeighborhoodCollgCr -1.652e+04 7.374e+03 -2.241 0.025187 *
## NeighborhoodCrawfor 4.832e+01 8.586e+03 0.006 0.995510
## NeighborhoodEdwards -3.349e+04 8.015e+03 -4.178 3.12e-05 ***
## NeighborhoodGilbert -2.161e+04 7.832e+03 -2.759 0.005881 **
## NeighborhoodIDOTRR -2.275e+04 1.063e+04 -2.139 0.032605 *
## NeighborhoodMeadowV -1.820e+04 1.038e+04 -1.753 0.079817 .
## NeighborhoodMitchel -3.459e+04 8.237e+03 -4.199 2.86e-05 ***
## NeighborhoodNAmes -2.625e+04 7.798e+03 -3.366 0.000785 ***
## NeighborhoodNoRidge 2.395e+04 8.515e+03 2.813 0.004976 **
## NeighborhoodNPkVill -7.147e+02 1.116e+04 -0.064 0.948942
## NeighborhoodNridgHt 1.513e+04 7.663e+03 1.974 0.048615 *
## NeighborhoodNWAmes -2.883e+04 7.995e+03 -3.606 0.000322 ***
## NeighborhoodOldTown -2.699e+04 9.577e+03 -2.818 0.004903 **
## NeighborhoodSawyer -2.656e+04 8.193e+03 -3.242 0.001216 **
## NeighborhoodSawyerW -1.730e+04 7.882e+03 -2.195 0.028326 *
## NeighborhoodSomerst -1.032e+03 9.288e+03 -0.111 0.911569
## NeighborhoodStoneBr 3.013e+04 8.495e+03 3.547 0.000403 ***
## NeighborhoodSWISU -2.053e+04 9.671e+03 -2.123 0.033925 *
## NeighborhoodTimber -2.087e+04 8.357e+03 -2.498 0.012624 *
## NeighborhoodVeenker -5.803e+03 1.058e+04 -0.549 0.583391
## Condition1Feedr 5.491e+03 4.950e+03 1.109 0.267464
## Condition1Norm 1.169e+04 4.078e+03 2.867 0.004213 **
## Condition1PosA 3.739e+03 1.021e+04 0.366 0.714258
## Condition1PosN -1.119e+04 7.276e+03 -1.538 0.124208
## Condition1RRAe -1.235e+04 9.002e+03 -1.371 0.170457
## Condition1RRAn 7.843e+03 6.641e+03 1.181 0.237798
## Condition1RRNe -4.874e+03 1.871e+04 -0.261 0.794475
## Condition1RRNn 5.582e+03 1.305e+04 0.428 0.669000
## BldgType2fmCon -6.405e+03 5.809e+03 -1.103 0.270414
## BldgTypeDuplex -4.780e+03 5.963e+03 -0.802 0.422948
## BldgTypeTwnhs -3.515e+04 5.441e+03 -6.461 1.45e-10 ***
## BldgTypeTwnhsE -2.570e+04 3.654e+03 -7.034 3.19e-12 ***
## OverallQual 6.360e+03 1.003e+03 6.341 3.10e-10 ***
## OverallCond 5.610e+03 8.258e+02 6.794 1.64e-11 ***
## YearBuilt 3.884e+02 6.219e+01 6.246 5.64e-10 ***
## YearRemodAdd 8.351e+01 5.364e+01 1.557 0.119791
## RoofMatlCompShg 6.192e+05 3.120e+04 19.846 < 2e-16 ***
## RoofMatlMembran 6.755e+05 4.294e+04 15.732 < 2e-16 ***
## RoofMatlMetal 6.590e+05 4.236e+04 15.556 < 2e-16 ***
## RoofMatlRoll 6.100e+05 4.082e+04 14.941 < 2e-16 ***
## RoofMatlTar&Grv 6.005e+05 3.222e+04 18.640 < 2e-16 ***
## RoofMatlWdShake 6.334e+05 3.353e+04 18.888 < 2e-16 ***
## RoofMatlWdShngl 6.773e+05 3.283e+04 20.629 < 2e-16 ***
## MasVnrTypeBrkFace 1.104e+04 6.975e+03 1.583 0.113729
## MasVnrTypeNone 1.529e+04 7.021e+03 2.178 0.029593 *
## MasVnrTypeStone 1.855e+04 7.414e+03 2.502 0.012480 *
## MasVnrArea 1.716e+01 5.973e+00 2.872 0.004143 **
## ExterQualFa -7.378e+03 1.009e+04 -0.731 0.464732
## ExterQualGd -1.858e+04 4.891e+03 -3.798 0.000152 ***
## ExterQualTA -2.150e+04 5.390e+03 -3.988 7.02e-05 ***
## BsmtQualFa -1.736e+04 6.426e+03 -2.702 0.006981 **
## BsmtQualGd -2.675e+04 3.414e+03 -7.836 9.40e-15 ***
## BsmtQualTA -2.245e+04 4.203e+03 -5.341 1.08e-07 ***
## BsmtCondGd 6.329e+03 5.444e+03 1.163 0.245195
## BsmtCondPo 1.670e+04 2.206e+04 0.757 0.449018
## BsmtCondTA 6.894e+03 4.302e+03 1.602 0.109293
## BsmtExposureGd 1.508e+04 3.164e+03 4.765 2.09e-06 ***
## BsmtExposureMn -1.284e+02 3.093e+03 -0.042 0.966891
## BsmtExposureNo -4.213e+03 2.126e+03 -1.982 0.047690 *
## BsmtFinType1BLQ 2.259e+03 2.835e+03 0.797 0.425775
## BsmtFinType1GLQ 5.746e+03 2.589e+03 2.220 0.026598 *
## BsmtFinType1LwQ -5.103e+03 3.786e+03 -1.348 0.177886
## BsmtFinType1Rec -2.078e+02 3.058e+03 -0.068 0.945845
## BsmtFinType1Unf 2.822e+03 2.973e+03 0.949 0.342718
## BsmtFinSF1 3.155e+01 3.938e+00 8.013 2.40e-15 ***
## BsmtFinType2BLQ -1.525e+04 7.893e+03 -1.932 0.053597 .
## BsmtFinType2GLQ -4.713e+03 9.655e+03 -0.488 0.625576
## BsmtFinType2LwQ -1.909e+04 7.687e+03 -2.484 0.013114 *
## BsmtFinType2Rec -1.500e+04 7.334e+03 -2.045 0.041016 *
## BsmtFinType2Unf -1.085e+04 7.815e+03 -1.388 0.165476
## BsmtFinSF2 2.277e+01 8.453e+00 2.693 0.007159 **
## BsmtUnfSF 1.560e+01 3.247e+00 4.803 1.74e-06 ***
## X1stFlrSF 5.567e+01 4.103e+00 13.570 < 2e-16 ***
## X2ndFlrSF 5.544e+01 2.844e+00 19.495 < 2e-16 ***
## FullBath 3.439e+03 2.041e+03 1.685 0.092225 .
## BedroomAbvGr -3.048e+03 1.235e+03 -2.468 0.013717 *
## KitchenAbvGr -1.454e+04 5.398e+03 -2.693 0.007170 **
## KitchenQualFa -2.420e+04 6.129e+03 -3.949 8.24e-05 ***
## KitchenQualGd -2.495e+04 3.617e+03 -6.899 8.03e-12 ***
## KitchenQualTA -2.594e+04 4.064e+03 -6.383 2.39e-10 ***
## Fireplaces 3.276e+03 1.369e+03 2.393 0.016869 *
## GarageQualFa -1.197e+05 2.935e+04 -4.078 4.81e-05 ***
## GarageQualGd -1.033e+05 3.003e+04 -3.441 0.000596 ***
## GarageQualPo -1.343e+05 3.643e+04 -3.687 0.000236 ***
## GarageQualTA -1.139e+05 2.911e+04 -3.913 9.55e-05 ***
## GarageCondFa 9.970e+04 3.477e+04 2.867 0.004204 **
## GarageCondGd 9.022e+04 3.559e+04 2.535 0.011358 *
## GarageCondPo 9.979e+04 3.727e+04 2.677 0.007515 **
## GarageCondTA 1.005e+05 3.443e+04 2.919 0.003568 **
## WoodDeckSF 8.518e+00 6.044e+00 1.409 0.158985
## ScreenPorch 2.931e+01 1.281e+01 2.289 0.022258 *
## PoolArea 9.550e+01 1.846e+01 5.172 2.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24990 on 1348 degrees of freedom
## Multiple R-squared: 0.9086, Adjusted R-squared: 0.9011
## F-statistic: 120.7 on 111 and 1348 DF, p-value: < 2.2e-16
The p-value for the F-statistic is less than 0.05, so we can reject the null hypothesis that this model is no better than an intercept-only model.
The R-squared is 0.9086, which means that roughly 90% of the variation in the response variable is explained by the predictors.
Many of the standard errors are about the same size as their coefficient estimates, which is not a good sign; we would prefer them to be 5-10x smaller.
The residuals show some skew but otherwise look evenly distributed, and the Q-Q plot looks good except for the right tail.
Having already made a Kaggle submission, I already know the punchline: this model is too large and appears to over-fit the data, so it performs poorly on unseen data.
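A quick way to test the over-fitting suspicion without a Kaggle submission is a simple holdout split (a rough sketch; the 70/30 split, the seed, and the hp, fit_part, and pred names are arbitrary choices, and rare factor levels can still make the holdout fit rank-deficient):
# Convert character columns to factors on the full data so predict() knows all levels
hp <- housing_train4 %>% mutate_if(is.character, as.factor)
set.seed(605)                                         # arbitrary seed
idx <- sample(nrow(hp), size = round(0.7 * nrow(hp)))
fit_part <- lm(SalePrice ~ ., data = hp[idx, ])
pred <- predict(fit_part, newdata = hp[-idx, ])
# Compare in-sample RMSE with held-out RMSE; a large gap points to over-fitting
sqrt(mean(residuals(fit_part)^2))
sqrt(mean((hp$SalePrice[-idx] - pred)^2, na.rm = TRUE))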
plot(reduced_model)
## Warning: not plotting observations with leverage one:
## 121, 186, 272, 1276, 1299
SalePrice <- predict(reduced_model,test)
prediction <- data.frame( Id = test[,"Id"], SalePrice = SalePrice)
# replace NA with mean house price
# prediction <- replace(prediction,is.na(prediction), mean(prediction$SalePrice, na.rm = TRUE))
# Tried replacing missing predictions with zeros and didn't change my score
prediction <- replace(prediction,is.na(prediction), 0)
#write.csv(prediction, "prediction_2.csv", row.names = FALSE)