Problem 1 - Playing with PageRank (30 Points)
You’ll verify for yourself that PageRank works by performing calculations on a small universe of web pages. Let’s use the 6 page universe that we had in the previous discussion For this directed graph, perform the following calculations in R.
- Form the A matrix. Then, introduce decay and form the B matrix as we did in the course notes. (5 Points)
# create transition matrix
A = matrix(c(0,0,1/3,0,0,0, 1/2,0,1/3,0,0,0, 1/2,0,0,0,0,0, 0,0,0,0,1/2,1, 0,0,1/3,1/2,0,0, 0,0,0,1/2,1/2,0),6)
print('A Transition Matrix')## [1] "A Transition Matrix"
print(round(A,3))## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.000 0.500 0.5 0.0 0.000 0.0
## [2,] 0.000 0.000 0.0 0.0 0.000 0.0
## [3,] 0.333 0.333 0.0 0.0 0.333 0.0
## [4,] 0.000 0.000 0.0 0.0 0.500 0.5
## [5,] 0.000 0.000 0.0 0.5 0.000 0.5
## [6,] 0.000 0.000 0.0 1.0 0.000 0.0
# replace 2nd row with random prob for the 6 pages 1/6
ri <- matrix(c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6),ncol=6)
A[2,] <- ri
A <- t(A)
print(round(A,3))## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0 0.167 0.333 0.0 0.0 0
## [2,] 0.5 0.167 0.333 0.0 0.0 0
## [3,] 0.5 0.167 0.000 0.0 0.0 0
## [4,] 0.0 0.167 0.000 0.0 0.5 1
## [5,] 0.0 0.167 0.333 0.5 0.0 0
## [6,] 0.0 0.167 0.000 0.5 0.5 0
# calculate the decay
decay <- 0.85
n <- nrow(A)
B <- decay*A + (1-decay)/n
print('---')## [1] "---"
print('\ B Matrix with Decay')## [1] " B Matrix with Decay"
print(round(B,4))## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.025 0.1667 0.3083 0.025 0.025 0.025
## [2,] 0.450 0.1667 0.3083 0.025 0.025 0.025
## [3,] 0.450 0.1667 0.0250 0.025 0.025 0.025
## [4,] 0.025 0.1667 0.0250 0.025 0.450 0.875
## [5,] 0.025 0.1667 0.3083 0.450 0.025 0.025
## [6,] 0.025 0.1667 0.0250 0.450 0.450 0.025
- Start with a uniform rank vector r and perform power iterations on B till convergence. That is, compute the solution r = Bn × r. Attempt this for a sufficiently large n so that r actually converges. (5 Points)
ri = matrix(c(1/6,1/6,1/6,1/6,1/6,1/6),6)
ri## [,1]
## [1,] 0.1666667
## [2,] 0.1666667
## [3,] 0.1666667
## [4,] 0.1666667
## [5,] 0.1666667
## [6,] 0.1666667
n <- 10
r10 <- matrix.power(B, n) %*% ri
r10## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
n <- 20
r20 <- matrix.power(B, n) %*% ri
r20## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
n <- 30
r30 <- matrix.power(B, n) %*% ri
r30## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
# by n = 30 the result vector converges
print('---')## [1] "---"
print('1.b) Matrix Converges by n=30 ')## [1] "1.b) Matrix Converges by n=30 "
rf <- r30
rf## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
- Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.(10 points)
ev <- eigen(B)
ev## eigen() decomposition
## $values
## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
##
## $vectors
## [,1] [,2] [,3]
## [1,] 0.1044385+0i 0.2931457+0i 1.227106e-15+0.000000e+00i
## [2,] 0.1488249+0i 0.5093703+0i -1.431623e-17-0.000000e+00i
## [3,] 0.1159674+0i 0.3414619+0i -1.382539e-15-0.000000e+00i
## [4,] 0.7043472+0i -0.5890805+0i -7.071068e-01+0.000000e+00i
## [5,] 0.4037861+0i -0.1413606+0i 7.071068e-01+0.000000e+00i
## [6,] 0.5425377+0i -0.4135367+0i 0.000000e+00-4.057108e-09i
## [,4] [,5] [,6]
## [1,] 1.227106e-15-0.000000e+00i -0.06471710+0i -0.212296003+0i
## [2,] -1.431623e-17+0.000000e+00i 0.01388698+0i 0.854071294+0i
## [3,] -1.382539e-15+0.000000e+00i 0.07298180+0i -0.363638739+0i
## [4,] -7.071068e-01-0.000000e+00i -0.66058664+0i 0.018399984+0i
## [5,] 7.071068e-01+0.000000e+00i 0.73761812+0i -0.304719509+0i
## [6,] 0.000000e+00+4.057108e-09i -0.09918316+0i 0.008182973+0i
# max eigenvalue
print('---')## [1] "---"
print('eigen values')## [1] "eigen values"
ev$values## [1] 1.00000000+0i 0.57619235+0i -0.42500000+0i -0.42500000-0i -0.34991524+0i
## [6] -0.08461044+0i
print('max eigen value is 1')## [1] "max eigen value is 1"
which.max(ev$values)## [1] 1
# same vector and sums to 1
e_vec <- as.numeric((1/sum(ev$vectors[,1]))*ev$vectors[,1])
print('same vector')## [1] "same vector"
e_vec## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
print('vector sums to 1')## [1] "vector sums to 1"
sum(e_vec)## [1] 1
Use the graph package in R and its page.rank method to compute the Page Rank of the graph as given in A. Note that you don’t need to apply decay. The package starts with a connected graph and applies decay internally. Verify that you do get the same PageRank vector as the two approaches above. (10 points)
A_2 <- matrix(c(0,1/2,1/2,0,0,0,0,0,0,0,0,0,1/3,1/3,0,0,1/3,0,0,0,0,0,1/2,1/2,0,0,0,1/2,0,1/2,0,0,0,1,0,0),nrow=6,ncol=6,byrow = T)
igraph_A <- graph_from_adjacency_matrix(A_2, weighted=TRUE,mode = "directed")
plot(igraph_A)ipageRank <- as.matrix(page.rank(igraph_A)$vector)
round(ipageRank,4) == round(rf,4)## [,1]
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
Problem 2 - digit-recognizer
MNIST (“Modified National Institute of Standards and Technology”) is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.
Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)
train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/digit-recognizer/train.csv?raw=true", show_col_types = FALSE)
# print first 10
head(train_df, n = 10) %>%
mutate(id = row_number()) %>%
pivot_longer(cols = -c(label, id)) %>%
mutate(
pixel = as.numeric(str_extract(name, "\\d+")),
x = pixel %% 28,
y = 28 - pixel %/% 28
) %>%
ggplot(aes(x, y, fill = value)) +
geom_tile() +
facet_wrap(~ label + id)labels = train_df[,1]
train_data <- train_df[,-1]/255
# print first 10
head(cbind(labels,train_data), n = 10) %>%
mutate(id = row_number()) %>%
pivot_longer(cols = -c(label, id)) %>%
mutate(
pixel = as.numeric(str_extract(name, "\\d+")),
x = pixel %% 28,
y = 28 - pixel %/% 28
) %>%
ggplot(aes(x, y, fill = value)) +
geom_tile() +
facet_wrap(~ label + id)What is the frequency distribution of the numbers in the dataset? (5 points)
train_vec=as.vector(unlist(train_data))
hist(train_vec, breaks = 100)train_arr=array(train_vec, dim=c(nrow(train_data),28,28))
train_arr<-train_arr/255
logInd<-train_arr>0
hist(train_arr[logInd], breaks = 100, freq = FALSE)For each number, provide the mean pixel intensity. What does this tell you? (5 points)
train_freq <- as.data.frame(table(labels)/nrow(train_data))
train_freq## label Freq
## 1 0 0.09838095
## 2 1 0.11152381
## 3 2 0.09945238
## 4 3 0.10359524
## 5 4 0.09695238
## 6 5 0.09035714
## 7 6 0.09850000
## 8 7 0.10478571
## 9 8 0.09673810
## 10 9 0.09971429
ggplot(train_freq, aes(x = label, y = Freq, fill = Freq)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "lightgray", high = "red")Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)
# create vector and array
train_vec=as.vector(unlist(train_data))
train_arr=array(train_vec, dim=c(nrow(train_data),28,28))
# convert this to 784 rows (pixels) and 42000 columns (images)
train_flat_mat<-t(matrix(train_arr, ncol=28*28, byrow = FALSE))
train_pca<-prcomp(train_flat_mat, scale = TRUE)
train_cumvar <- (cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2))
# 95% variance
cumvar_95 <- which.max(train_cumvar >= .95)
print(paste0("At 95% variance there were ", (cumvar_95), " components generated."))## [1] "At 95% variance there were 124 components generated."
var_explained = train_pca$sdev^2 / sum(train_pca$sdev^2)
qplot(c(1:50), var_explained[1:50]) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot First 50 Only") Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points) | There are 704 components generated. There are more components because of the increasing the accounted accuracy.
# 95% variance
cumvar_100 <- which.max(train_cumvar >= 1)
print(paste0("At 100% variance there were ", (cumvar_100), " components generated."))## [1] "At 100% variance there were 704 components generated."
Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points) | Images generated by PCA typically have noise.
par(mfrow = c (2, 5))
for (i in 1:10){
image(1:28, 1:28, array(train_pca$x[,i], dim = c(28, 28)))
}Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)
# create array and matrix
eights<-which(labels == 8)
eights_arr<-train_arr[eights,,]
eights_flat_mat<-t(matrix(eights_arr, ncol=28*28, byrow = FALSE))
# pca
eights_pca <- prcomp(eights_flat_mat, scale = TRUE)
eights_cumvar <- (cumsum(eights_pca$sdev^2) / sum(eights_pca$sdev^2))
# 100% variance
eights_100 <- which.max(eights_cumvar >= 1)
print(paste0("At 100% variance there were ", (eights_100), " components generated."))## [1] "At 100% variance there were 537 components generated."
var_explained = train_pca$sdev^2 / sum(train_pca$sdev^2)
qplot(c(1:50), var_explained[1:50]) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot First 50 Only") par(mfrow = c (2, 5))
for (i in 1:10){
image(1:28, 1:28, array(eights_pca$x[,i], dim = c(28, 28)))
}An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)
# create model
train_labels_new <- as.factor(train_df$label)
train_flat_new<-as.data.frame(cbind(labels, t(train_flat_mat) ))
# multinomial model
nnet_m <- nnet::multinom(label ~., data = train_flat_new, MaxNWts = 10000000)## # weights: 7860 (7065 variable)
## initial value 96708.573906
## iter 10 value 25322.714106
## iter 20 value 20402.086316
## iter 30 value 19312.872829
## iter 40 value 18703.256586
## iter 50 value 18197.815143
## iter 60 value 17732.985798
## iter 70 value 16739.962157
## iter 80 value 14961.658448
## iter 90 value 13446.085942
## iter 100 value 12442.636014
## final value 12442.636014
## stopped after 100 iterations
# predict images
prediction_model <- predict(nnet_m, train_flat_new)
confusionMatrix(prediction_model, train_labels_new)## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 3994 0 19 11 4 35 17 5 19 12
## 1 3 4588 59 32 20 39 28 37 134 18
## 2 11 13 3753 88 17 19 17 37 21 10
## 3 8 12 65 3879 9 91 1 9 91 53
## 4 13 6 60 10 3852 55 35 44 38 136
## 5 33 12 20 162 5 3386 45 10 132 33
## 6 35 3 38 15 22 52 3973 2 19 3
## 7 7 8 54 35 7 28 2 4076 18 87
## 8 20 32 85 78 22 51 17 4 3519 25
## 9 8 10 24 41 114 39 2 177 72 3811
##
## Overall Statistics
##
## Accuracy : 0.9245
## 95% CI : (0.922, 0.9271)
## No Information Rate : 0.1115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9161
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.96660 0.9795 0.89849 0.89152 0.94597 0.89223
## Specificity 0.99678 0.9901 0.99384 0.99100 0.98953 0.98817
## Pos Pred Value 0.97036 0.9254 0.94155 0.91963 0.90657 0.88223
## Neg Pred Value 0.99636 0.9974 0.98885 0.98751 0.99417 0.98928
## Prevalence 0.09838 0.1115 0.09945 0.10360 0.09695 0.09036
## Detection Rate 0.09510 0.1092 0.08936 0.09236 0.09171 0.08062
## Detection Prevalence 0.09800 0.1180 0.09490 0.10043 0.10117 0.09138
## Balanced Accuracy 0.98169 0.9848 0.94617 0.94126 0.96775 0.94020
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.9604 0.92615 0.86611 0.90998
## Specificity 0.9950 0.99346 0.99120 0.98712
## Pos Pred Value 0.9546 0.94308 0.91331 0.88669
## Neg Pred Value 0.9957 0.99137 0.98574 0.99000
## Prevalence 0.0985 0.10479 0.09674 0.09971
## Detection Rate 0.0946 0.09705 0.08379 0.09074
## Detection Prevalence 0.0991 0.10290 0.09174 0.10233
## Balanced Accuracy 0.9777 0.95981 0.92865 0.94855
Problem 3: Advanced Regression Techniques
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
# read in training data
house_train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/train.csv?raw=true")
house_train_df %>% select_if(is.numeric) %>% summary()## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
# reokace b/a with 0 where applicable
house_train_df <- house_train_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(MasVnrArea), mean(GarageYrBlt, na.rm = TRUE), MasVnrArea),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC)
)
#Print number of null
print('---- nulls ----')## [1] "---- nulls ----"
all_nas <- colSums(sapply(house_train_df[names(house_train_df)], is.na))
all_nas[which(all_nas > 0)]## Alley MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 1369 8 37 37 38 37
## BsmtFinType2 Electrical GarageType GarageFinish GarageQual GarageCond
## 38 1 81 81 81 81
## Fence MiscFeature
## 1179 1406
Provide univariate descriptive statistics and appropriate plots for the training data set.
# SalePrice
ggplot(house_train_df, aes(x = SalePrice)) +
geom_histogram() +
scale_x_continuous(labels = comma)# GrLivArea
ggplot(house_train_df, aes(x = GrLivArea)) +
geom_histogram()# Neighborhood
ggplot(house_train_df, aes(x = Neighborhood)) +
geom_bar() +
coord_flip()# BedroomAbvGr
ggplot(house_train_df, aes(x = BedroomAbvGr)) +
geom_bar()# LotArea
ggplot(house_train_df, aes(x = LotArea)) +
geom_histogram()Provide a scatter plot matrix for at least two of the independent variables and the dependent variable.
house_train_df %>%
dplyr::select('LotFrontage','LotArea','OverallQual','OverallCond','YearBuilt','YearRemodAdd'
,'FullBath','BedroomAbvGr','TotRmsAbvGrd','GrLivArea','YrSold','SalePrice') %>%
ggpairs()ggplot(house_train_df, aes(y=SalePrice, x=GrLivArea)) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=LotFrontage)) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=YearRemodAdd)) + geom_point()Derive a correlation matrix for any three quantitative variables in the dataset.
n_df <- house_train_df %>%
dplyr::select('LotArea','GrLivArea','FullBath') %>%
drop_na()
mycol=colorRampPalette(c("red","orange","yellow","white","green", "dark green"))(20)
mycor=cor(n_df)
mycor## LotArea GrLivArea FullBath
## LotArea 1.0000000 0.2631162 0.1260306
## GrLivArea 0.2631162 1.0000000 0.6300116
## FullBath 0.1260306 0.6300116 1.0000000
corrplot(mycor, method="ellipse", type="upper",
addCoef.col=TRUE, tl.cex=.7, number.cex=.5, insig="blank",
order="hclust", hclust.method="centroid", number.digits=2,
col=mycol)Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
- LotArea and FullBath - p-value below 0.2 reject the null hypothesis that a correlation does not exist. correlation is = 0.126
- GrLivArea and FullBath - p-value below 0.2 reject the null hypothesis that a correlation does not exist. correlation is = 0.630
Would you be worried about familywise error? Why or why not? 5 points
cor.test(house_train_df$LotArea, house_train_df$GrLivArea, conf.level = 0.8)##
## Pearson's product-moment correlation
##
## data: house_train_df$LotArea and house_train_df$GrLivArea
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2315997 0.2940809
## sample estimates:
## cor
## 0.2631162
cor.test(house_train_df$LotArea, house_train_df$FullBath, conf.level = 0.8)##
## Pearson's product-moment correlation
##
## data: house_train_df$LotArea and house_train_df$FullBath
## t = 4.851, df = 1458, p-value = 1.36e-06
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.09286177 0.15892007
## sample estimates:
## cor
## 0.1260306
cor.test(house_train_df$GrLivArea, house_train_df$FullBath, conf.level = 0.8)##
## Pearson's product-moment correlation
##
## data: house_train_df$GrLivArea and house_train_df$FullBath
## t = 30.977, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6093339 0.6498331
## sample estimates:
## cor
## 0.6300116
Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points
mycor.inv = solve(mycor)
mycor.inv## LotArea GrLivArea FullBath
## LotArea 1.07740995 -0.328207 0.07098756
## GrLivArea -0.32820697 1.758121 -1.06627226
## FullBath 0.07098756 -1.066272 1.66281734
m_mi <- mycor %*% mycor.inv
mycor.inv %*% mycor## LotArea GrLivArea FullBath
## LotArea 1.000000e+00 -1.300003e-17 2.775558e-17
## GrLivArea 9.626782e-18 1.000000e+00 0.000000e+00
## FullBath 6.761552e-18 -1.106864e-16 1.000000e+00
lu.decomposition(m_mi)## $L
## [,1] [,2] [,3]
## [1,] 1.000000e+00 0.000000e+00 0
## [2,] -1.300003e-17 1.000000e+00 0
## [3,] 2.775558e-17 -2.671969e-34 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 9.626782e-18 6.761552e-18
## [2,] 0 1.000000e+00 -1.106864e-16
## [3,] 0 0.000000e+00 1.000000e+00
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R- devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points
ggplot(house_train_df, aes(x=GrLivArea)) + geom_histogram()m <- min(house_train_df$GrLivArea)
m## [1] 334
GrLivArea_shift <- house_train_df$GrLivArea - m
ggplot() + geom_histogram(aes(x=GrLivArea_shift))exp_GrLivArea <- fitdistr(GrLivArea_shift,'exponential')
lamb_GrLivArea <- exp_GrLivArea$estimate
lamb_GrLivArea## rate
## 0.0008464077
exp_sample <- rexp(1000, lamb_GrLivArea)
summary(exp_sample)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.74 313.46 776.80 1125.80 1562.82 10167.15
ggplot() + geom_histogram(aes(x=exp_sample))l <- qexp(0.05, lamb_GrLivArea)
l## [1] 60.60117
u <- qexp(0.95, lamb_GrLivArea)
u## [1] 3539.349
quantile(exp_sample, probs = c(0.05,0.95))## 5% 95%
## 62.45815 3353.36821
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 points
Load data
# read in training data
house_train_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/train.csv?raw=true")
house_train_df %>% select_if(is.numeric) %>% summary()## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
# reokace b/a with 0 where applicable
house_train_df <- house_train_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(MasVnrArea), mean(GarageYrBlt, na.rm = TRUE), MasVnrArea),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
Street = ifelse(is.na(Street), "None", Street),
Alley = ifelse(is.na(Alley), "None", Alley),
GarageType = ifelse(is.na(GarageType), "None", GarageType),
GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
GarageCond = ifelse(is.na(GarageCond), "None", GarageCond),
BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
KitchenQual = ifelse(is.na(KitchenQual), "None", KitchenQual),
GrLivArea = ifelse(is.na(GrLivArea), 0, GrLivArea)
)
#Print number of null
print('---- nulls ----')## [1] "---- nulls ----"
all_nas <- colSums(sapply(house_train_df[names(house_train_df)], is.na))
all_nas[which(all_nas > 0)]## MasVnrType BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical
## 8 37 38 37 38 1
## Fence MiscFeature
## 1179 1406
Baseline - Full Numeric Model
# moduel with numeric variables only
houseNum_df <- house_train_df %>% dplyr::select_if(is.numeric)
ml.num <- lm(SalePrice ~., houseNum_df)
# stepwize optimizaiton
ml.num.step <- stats::step(ml.num, criteria='AIC', trace=FALSE,direction='both')
summary(ml.num.step)##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + LowQualFinSF + BsmtFullBath +
## FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces +
## GarageCars + WoodDeckSF + ScreenPorch, data = houseNum_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -486479 -16700 -2066 13824 287942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.744e+05 1.245e+05 -7.826 9.70e-15 ***
## MSSubClass -1.690e+02 2.596e+01 -6.512 1.03e-10 ***
## LotArea 4.142e-01 9.987e-02 4.147 3.56e-05 ***
## OverallQual 1.758e+04 1.172e+03 15.002 < 2e-16 ***
## OverallCond 4.359e+03 1.008e+03 4.326 1.62e-05 ***
## YearBuilt 2.970e+02 5.408e+01 5.492 4.68e-08 ***
## YearRemodAdd 1.668e+02 6.544e+01 2.548 0.01092 *
## MasVnrArea 3.156e+01 5.885e+00 5.363 9.53e-08 ***
## BsmtFinSF1 1.609e+01 3.908e+00 4.117 4.06e-05 ***
## BsmtUnfSF 7.005e+00 3.628e+00 1.931 0.05370 .
## `1stFlrSF` 4.917e+01 5.245e+00 9.375 < 2e-16 ***
## `2ndFlrSF` 4.574e+01 4.238e+00 10.794 < 2e-16 ***
## LowQualFinSF 2.863e+01 1.960e+01 1.461 0.14428
## BsmtFullBath 9.670e+03 2.423e+03 3.991 6.91e-05 ***
## FullBath 4.843e+03 2.586e+03 1.873 0.06132 .
## BedroomAbvGr -1.029e+04 1.675e+03 -6.143 1.05e-09 ***
## KitchenAbvGr -1.357e+04 5.143e+03 -2.639 0.00842 **
## TotRmsAbvGrd 5.088e+03 1.223e+03 4.159 3.38e-05 ***
## Fireplaces 3.448e+03 1.741e+03 1.981 0.04782 *
## GarageCars 1.049e+04 1.691e+03 6.202 7.26e-10 ***
## WoodDeckSF 2.515e+01 7.869e+00 3.196 0.00143 **
## ScreenPorch 5.352e+01 1.686e+01 3.174 0.00154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34720 on 1438 degrees of freedom
## Multiple R-squared: 0.8118, Adjusted R-squared: 0.809
## F-statistic: 295.3 on 21 and 1438 DF, p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.num.step)lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
BsmtUnfSF + 1stFlrSF + 2ndFlrSF + LowQualFinSF
+ BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
Fireplaces + GarageCars + WoodDeckSF + ScreenPorch, data =
houseNum_df)
Analysis - Candidate Variable
candidates <- c('SalePrice','MSSubClass','LotArea','LotFrontage','OverallQual','YearBuilt','YearRemodAdd',
'BsmtFinSF1','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath',
'FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea',
'WoodDeckSF','OpenPorchSF','FireplaceQu','SaleCondition','OverallCond','CentralAir','PoolQC',
'ExterCond','MasVnrArea','TotalBsmtSF','ScreenPorch')
full_df <- house_train_df %>%
dplyr::select(all_of(candidates)) %>%
mutate(ExterCond=as.numeric(as.factor(ExterCond)),
FullBath=as.numeric(as.factor(FullBath)),
FireplaceQu=as.numeric(as.factor(FireplaceQu)),
SaleCondition=as.numeric(as.factor(SaleCondition)),
GarageCars=as.numeric(as.factor(GarageCars)),
OverallCond=as.numeric(as.factor(OverallCond)),
CentralAir=as.numeric(as.factor(CentralAir)),
PoolQC=as.numeric(as.factor(PoolQC))
)
ames_corr <- full_df
ames_correlations <- cor(ames_corr, method = c("pearson"))
corrplot(ames_correlations, method="color")t <- ames_correlations
t[t < 0.3 ] <- 0
corrplot(t, method="color")t <- as.data.frame(ames_correlations)
t <- t[t$SalePrice > 0.3]
t <- t %>% dplyr::filter(SalePrice > 0.3)
candidates <- names(t)
candidates## [1] "SalePrice" "OverallQual" "YearBuilt" "YearRemodAdd" "BsmtFinSF1"
## [6] "TotalBsmtSF" "1stFlrSF" "2ndFlrSF" "GrLivArea" "FullBath"
## [11] "TotRmsAbvGrd" "Fireplaces" "GarageCars" "GarageArea" "WoodDeckSF"
## [16] "OpenPorchSF" "MasVnrArea"
ames_corr %>%
dplyr::select(all_of(candidates)) %>%
ggpairs()corrplot(as.matrix(t), method="color")ggplot(house_train_df, aes(y=SalePrice, x=OverallQual)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=YearRemodAdd)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=BsmtFinSF1)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=house_train_df$'1stFlrSF')) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=house_train_df$'2ndFlrSF')) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=GrLivArea)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=FullBath)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=TotRmsAbvGrd)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=Fireplaces)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=GarageCars)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=GarageArea)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=WoodDeckSF)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=OpenPorchSF)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=MasVnrArea)) +
scale_y_continuous(labels = comma) + geom_point()ggplot(house_train_df, aes(x=log(SalePrice))) + geom_histogram()ggplot(house_train_df, aes(x=YearBuilt-min(house_train_df$YearBuilt))) + geom_histogram()ggplot(house_train_df, aes(x=YearRemodAdd-min(house_train_df$YearRemodAdd))) + geom_histogram()ggplot(house_train_df, aes(x=log(TotalBsmtSF))) + geom_histogram()ggplot(house_train_df, aes(x=house_train_df$'1stFlrSF')) + geom_histogram()ggplot(house_train_df, aes(x=GrLivArea)) + geom_histogram()ggplot(house_train_df, aes(x=GarageArea)) + geom_histogram()ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) +
scale_y_continuous(labels = comma) + geom_point() +
facet_wrap(vars(Neighborhood))ggplot(house_train_df, aes(y=SalePrice, x=YearBuilt)) +
scale_x_log10() + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) +
scale_x_log10() + geom_point()ggplot(house_train_df, aes(y=SalePrice, x=TotalBsmtSF)) +
scale_x_sqrt() + geom_point()[1] “SalePrice” “OverallQual” “YearBuilt” “YearRemodAdd” “BsmtFinSF1”
“TotalBsmtSF” “1stFlrSF” “2ndFlrSF”
[9] “GrLivArea” “FullBath” “TotRmsAbvGrd” “Fireplaces” “GarageCars”
“GarageArea” “WoodDeckSF” “OpenPorchSF” [17] “MasVnrArea”
Full Model
candidates <- c('SalePrice','OverallQual','YearBuilt','YearRemodAdd','BsmtFinSF1','BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF','2ndFlrSF','GrLivArea','BsmtFullBath','FullBath','HalfBath','KitchenAbvGr',
'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
'FireplaceQu',
'SaleCondition','OverallCond','CentralAir','PoolQC','ExterCond','YearRemodAdd',
'MasVnrArea',
'MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','LotConfig','HouseStyle',
'RoofStyle','RoofMatl','BsmtQual','BsmtCond','BsmtFinType1','KitchenQual','GarageType',
'GarageQual')
full_df <- house_train_df %>%
dplyr::select(all_of(candidates)) %>%
mutate(ExterCond=(as.factor(ExterCond)),
MSSubClass=(as.factor(MSSubClass)),
MSZoning=(as.factor(MSZoning)),
Street=(as.factor(Street)),
Alley=(as.factor(Alley)),
LotShape=(as.factor(LotShape)),
LandContour=(as.factor(LandContour)),
LotConfig=(as.factor(LotConfig)),
HouseStyle=(as.factor(HouseStyle)),
RoofStyle=(as.factor(RoofStyle)),
RoofMatl=(as.factor(RoofMatl)),
BsmtQual=(as.factor(BsmtQual)),
BsmtCond=(as.factor(BsmtCond)),
BsmtFinType1=(as.factor(BsmtFinType1)),
KitchenQual=(as.factor(KitchenQual)),
FireplaceQu=(as.factor(FireplaceQu)),
GarageType=(as.factor(GarageType)),
GarageQual=(as.factor(GarageQual)),
SaleCondition=(as.factor(SaleCondition)),
OverallCond=c(as.factor(OverallCond)),
CentralAir=(as.factor(CentralAir)),
PoolQC=(as.factor(PoolQC))
)
#full_df <- ames_corr
ml.full <- lm(formula = SalePrice ~ ., data = full_df)
# stepwize optimizaiton
ml.full.step <- stats::step(ml.full, criteria='AIC', trace=FALSE,direction='both')
summary(ml.full.step)##
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + BsmtUnfSF +
## TotalBsmtSF + `1stFlrSF` + `2ndFlrSF` + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageCars + GarageArea + WoodDeckSF + SaleCondition +
## OverallCond + PoolQC + MasVnrArea + MSSubClass + MSZoning +
## LotShape + LandContour + LotConfig + RoofMatl + BsmtQual +
## BsmtFinType1 + KitchenQual, data = full_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395914 -12122 114 11539 187150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.291e+06 1.568e+05 -8.231 4.33e-16 ***
## OverallQual 9.752e+03 1.089e+03 8.959 < 2e-16 ***
## YearBuilt 3.741e+02 7.345e+01 5.093 4.02e-07 ***
## BsmtUnfSF -2.062e+01 2.960e+00 -6.968 5.03e-12 ***
## TotalBsmtSF 5.005e+01 5.335e+00 9.381 < 2e-16 ***
## `1stFlrSF` 4.129e+01 5.661e+00 7.293 5.14e-13 ***
## `2ndFlrSF` 5.928e+01 5.018e+00 11.813 < 2e-16 ***
## KitchenAbvGr -2.355e+04 6.122e+03 -3.847 0.000125 ***
## TotRmsAbvGrd 1.870e+03 9.462e+02 1.976 0.048310 *
## Fireplaces 5.752e+03 1.443e+03 3.987 7.04e-05 ***
## GarageCars 4.705e+03 2.406e+03 1.956 0.050703 .
## GarageArea 1.192e+01 8.139e+00 1.465 0.143286
## WoodDeckSF 1.249e+01 6.487e+00 1.926 0.054347 .
## SaleConditionAdjLand 1.534e+04 1.738e+04 0.883 0.377615
## SaleConditionAlloca 2.308e+04 1.061e+04 2.175 0.029767 *
## SaleConditionFamily -1.785e+03 6.992e+03 -0.255 0.798489
## SaleConditionNormal 9.051e+03 3.063e+03 2.955 0.003186 **
## SaleConditionPartial 2.455e+04 4.257e+03 5.766 1.01e-08 ***
## OverallCond2 -6.698e+03 3.172e+04 -0.211 0.832778
## OverallCond3 -2.448e+04 2.997e+04 -0.817 0.414156
## OverallCond4 -1.467e+04 2.961e+04 -0.496 0.620300
## OverallCond5 -5.122e+03 2.952e+04 -0.173 0.862293
## OverallCond6 1.666e+03 2.954e+04 0.056 0.955027
## OverallCond7 1.020e+04 2.951e+04 0.345 0.729784
## OverallCond8 1.001e+04 2.965e+04 0.337 0.735846
## OverallCond9 2.637e+04 3.030e+04 0.870 0.384342
## PoolQCFa -1.225e+05 2.885e+04 -4.244 2.34e-05 ***
## PoolQCGd -5.575e+04 3.001e+04 -1.858 0.063419 .
## PoolQCNone -1.017e+05 2.116e+04 -4.807 1.70e-06 ***
## MasVnrArea 2.539e+01 4.968e+00 5.110 3.68e-07 ***
## MSSubClass30 1.500e+04 4.756e+03 3.155 0.001642 **
## MSSubClass40 1.484e+04 1.459e+04 1.017 0.309422
## MSSubClass45 1.350e+04 8.693e+03 1.553 0.120763
## MSSubClass50 3.569e+02 4.269e+03 0.084 0.933391
## MSSubClass60 -1.016e+04 4.861e+03 -2.090 0.036811 *
## MSSubClass70 1.104e+03 6.269e+03 0.176 0.860275
## MSSubClass75 -5.633e+03 9.191e+03 -0.613 0.540022
## MSSubClass80 -6.682e+03 4.289e+03 -1.558 0.119501
## MSSubClass85 -6.026e+03 6.714e+03 -0.898 0.369591
## MSSubClass90 -1.214e+04 6.646e+03 -1.827 0.067915 .
## MSSubClass120 -8.254e+03 3.657e+03 -2.257 0.024191 *
## MSSubClass160 -2.460e+04 5.564e+03 -4.422 1.06e-05 ***
## MSSubClass180 2.043e+03 9.719e+03 0.210 0.833539
## MSSubClass190 3.840e+03 6.892e+03 0.557 0.577553
## MSZoningFV 2.585e+04 1.025e+04 2.523 0.011764 *
## MSZoningRH 1.825e+04 1.184e+04 1.541 0.123441
## MSZoningRL 1.601e+04 9.467e+03 1.692 0.090954 .
## MSZoningRM 8.414e+03 9.522e+03 0.884 0.377067
## LotShapeIR2 1.525e+04 4.729e+03 3.225 0.001292 **
## LotShapeIR3 1.176e+04 9.581e+03 1.227 0.220048
## LotShapeReg -1.024e+03 1.796e+03 -0.570 0.568658
## LandContourHLS 2.267e+04 5.542e+03 4.091 4.56e-05 ***
## LandContourLow 1.667e+04 6.680e+03 2.496 0.012694 *
## LandContourLvl 8.705e+03 3.981e+03 2.187 0.028932 *
## LotConfigCulDSac 8.311e+03 3.641e+03 2.283 0.022604 *
## LotConfigFR2 -7.351e+03 4.574e+03 -1.607 0.108239
## LotConfigFR3 -7.840e+03 1.418e+04 -0.553 0.580538
## LotConfigInside -9.145e+02 2.019e+03 -0.453 0.650667
## RoofMatlCompShg 6.822e+05 4.084e+04 16.702 < 2e-16 ***
## RoofMatlMembran 7.073e+05 5.021e+04 14.088 < 2e-16 ***
## RoofMatlMetal 7.045e+05 5.041e+04 13.974 < 2e-16 ***
## RoofMatlRoll 6.746e+05 5.036e+04 13.396 < 2e-16 ***
## RoofMatlTar&Grv 6.611e+05 4.101e+04 16.120 < 2e-16 ***
## RoofMatlWdShake 6.761e+05 4.307e+04 15.697 < 2e-16 ***
## RoofMatlWdShngl 7.545e+05 4.228e+04 17.846 < 2e-16 ***
## BsmtQualFa -2.957e+04 6.809e+03 -4.342 1.52e-05 ***
## BsmtQualGd -2.861e+04 3.532e+03 -8.098 1.24e-15 ***
## BsmtQualTA -2.893e+04 4.412e+03 -6.557 7.82e-11 ***
## BsmtFinType1BLQ 4.454e+02 3.052e+03 0.146 0.883982
## BsmtFinType1GLQ 9.453e+03 2.739e+03 3.451 0.000575 ***
## BsmtFinType1LwQ -5.333e+03 3.904e+03 -1.366 0.172204
## BsmtFinType1Rec -2.835e+03 3.235e+03 -0.876 0.381029
## BsmtFinType1Unf 5.868e+03 3.178e+03 1.846 0.065093 .
## KitchenQualFa -3.251e+04 6.599e+03 -4.927 9.39e-07 ***
## KitchenQualGd -3.120e+04 3.626e+03 -8.602 < 2e-16 ***
## KitchenQualTA -3.525e+04 4.092e+03 -8.615 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27690 on 1347 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.8848, Adjusted R-squared: 0.8783
## F-statistic: 137.9 on 75 and 1347 DF, p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.full.step)Model 1
Removed the following variable to generate the final model - GarageCars - WoodDeckSF - GarageArea - OverallCond - MSSubClass - MSZoning - BsmtFinType1 - LotConfig
candidates <- c('SalePrice','OverallQual','YearBuilt','BsmtUnfSF','TotalBsmtSF',
'1stFlrSF','2ndFlrSF','KitchenAbvGr', 'TotRmsAbvGrd','Fireplaces',
'SaleCondition',
'PoolQC', 'MasVnrArea','LotShape', 'LandContour',
'RoofMatl', 'BsmtQual','KitchenQual')
df1 <- full_df %>% dplyr::select(all_of(candidates))
# model v1
ml.v1 <- lm(formula = SalePrice ~ ., data = df1)
# stepwize optimization
ml.v1.step <- stats::step(ml.v1, criteria='AIC', trace=FALSE,direction='both')
summary(ml.v1.step)##
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + BsmtUnfSF +
## TotalBsmtSF + `1stFlrSF` + `2ndFlrSF` + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + SaleCondition + PoolQC + MasVnrArea + LotShape +
## LandContour + RoofMatl + BsmtQual + KitchenQual, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -401554 -14380 -87 13885 186274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.204e+05 9.850e+04 -9.344 < 2e-16 ***
## OverallQual 1.192e+04 1.030e+03 11.567 < 2e-16 ***
## YearBuilt 1.859e+02 4.437e+01 4.191 2.95e-05 ***
## BsmtUnfSF -2.103e+01 2.087e+00 -10.077 < 2e-16 ***
## TotalBsmtSF 4.907e+01 4.870e+00 10.077 < 2e-16 ***
## `1stFlrSF` 4.738e+01 5.381e+00 8.806 < 2e-16 ***
## `2ndFlrSF` 5.196e+01 3.517e+00 14.772 < 2e-16 ***
## KitchenAbvGr -2.858e+04 4.167e+03 -6.857 1.04e-11 ***
## TotRmsAbvGrd 1.948e+03 9.273e+02 2.101 0.035837 *
## Fireplaces 5.097e+03 1.453e+03 3.508 0.000465 ***
## SaleConditionAdjLand 5.109e+03 1.524e+04 0.335 0.737559
## SaleConditionAlloca 2.383e+04 9.660e+03 2.467 0.013729 *
## SaleConditionFamily -2.492e+03 7.264e+03 -0.343 0.731642
## SaleConditionNormal 1.323e+04 3.109e+03 4.254 2.24e-05 ***
## SaleConditionPartial 3.137e+04 4.335e+03 7.235 7.56e-13 ***
## PoolQCFa -1.262e+05 2.995e+04 -4.213 2.67e-05 ***
## PoolQCGd -4.450e+04 3.074e+04 -1.447 0.148002
## PoolQCNone -9.865e+04 2.167e+04 -4.551 5.79e-06 ***
## MasVnrArea 2.456e+01 5.048e+00 4.865 1.27e-06 ***
## LotShapeIR2 1.624e+04 4.836e+03 3.359 0.000803 ***
## LotShapeIR3 9.599e+03 1.002e+04 0.958 0.338379
## LotShapeReg -4.817e+03 1.752e+03 -2.750 0.006040 **
## LandContourHLS 2.501e+04 5.760e+03 4.341 1.52e-05 ***
## LandContourLow 1.922e+04 6.633e+03 2.898 0.003809 **
## LandContourLvl 1.006e+04 4.043e+03 2.489 0.012935 *
## RoofMatlCompShg 7.008e+05 4.115e+04 17.030 < 2e-16 ***
## RoofMatlMembran 7.324e+05 5.091e+04 14.386 < 2e-16 ***
## RoofMatlMetal 7.224e+05 5.115e+04 14.125 < 2e-16 ***
## RoofMatlRoll 6.891e+05 5.075e+04 13.577 < 2e-16 ***
## RoofMatlTar&Grv 6.811e+05 4.123e+04 16.519 < 2e-16 ***
## RoofMatlWdShake 7.015e+05 4.351e+04 16.121 < 2e-16 ***
## RoofMatlWdShngl 7.724e+05 4.255e+04 18.153 < 2e-16 ***
## BsmtQualFa -2.732e+04 6.946e+03 -3.933 8.81e-05 ***
## BsmtQualGd -2.994e+04 3.650e+03 -8.201 5.26e-16 ***
## BsmtQualNone -4.483e+03 8.412e+03 -0.533 0.594152
## BsmtQualTA -3.266e+04 4.355e+03 -7.499 1.13e-13 ***
## KitchenQualFa -4.125e+04 6.543e+03 -6.304 3.86e-10 ***
## KitchenQualGd -3.072e+04 3.777e+03 -8.133 9.06e-16 ***
## KitchenQualTA -4.308e+04 4.180e+03 -10.306 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29280 on 1421 degrees of freedom
## Multiple R-squared: 0.8677, Adjusted R-squared: 0.8642
## F-statistic: 245.3 on 38 and 1421 DF, p-value: < 2.2e-16
# residuals
par(mfrow=c(2,2))
plot(ml.v1.step)The residual analysis indicates that there is still a little bit of work that we could do to improve the model.
Predict
Selected model 2 for the predictions
house_test_df <- read_csv("https://github.com/dsimband/DATA605/blob/main/Final/house-prices/test.csv?raw=true")
# reokace b/a with 0 where applicable
house_test_df <- house_test_df %>% mutate(
LotFrontage = ifelse(is.na(LotFrontage), 0, LotFrontage),
MasVnrArea = ifelse(is.na(MasVnrArea), 0, MasVnrArea),
GarageYrBlt = ifelse(is.na(MasVnrArea), mean(GarageYrBlt, na.rm = TRUE), MasVnrArea),
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF),
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
Street = ifelse(is.na(Street), "None", Street),
Alley = ifelse(is.na(Alley), "None", Alley),
GarageType = ifelse(is.na(GarageType), "None", GarageType),
GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
GarageCond = ifelse(is.na(GarageCond), "None", GarageCond),
BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
KitchenQual = ifelse(is.na(KitchenQual), "Gd", KitchenQual),
GrLivArea = ifelse(is.na(GrLivArea), 0, GrLivArea),
BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF)
)
# identify variables with null values
all_nas <- colSums(sapply(house_test_df[names(house_test_df)], is.na))
all_nas[which(all_nas > 0)]## MSZoning Utilities Exterior1st Exterior2nd MasVnrType BsmtCond
## 4 2 1 1 16 45
## BsmtExposure BsmtFinType1 BsmtFinType2 BsmtFinSF2 BsmtFullBath BsmtHalfBath
## 44 42 42 1 2 2
## Functional GarageCars GarageArea Fence MiscFeature SaleType
## 2 1 1 1169 1408 1
house_test_df <- house_test_df %>%
mutate(ExterCond=(as.factor(ExterCond)),
MSSubClass=(as.factor(MSSubClass)),
MSZoning=(as.factor(MSZoning)),
Street=(as.factor(Street)),
Alley=(as.factor(Alley)),
LotShape=(as.factor(LotShape)),
LandContour=(as.factor(LandContour)),
LotConfig=(as.factor(LotConfig)),
HouseStyle=(as.factor(HouseStyle)),
RoofStyle=(as.factor(RoofStyle)),
RoofMatl=(as.factor(RoofMatl)),
BsmtQual=(as.factor(BsmtQual)),
BsmtCond=(as.factor(BsmtCond)),
BsmtFinType1=(as.factor(BsmtFinType1)),
KitchenQual=(as.factor(KitchenQual)),
FireplaceQu=(as.factor(FireplaceQu)),
GarageType=(as.factor(GarageType)),
GarageQual=(as.factor(GarageQual)),
SaleCondition=(as.factor(SaleCondition)),
OverallCond=c(as.factor(OverallCond)),
CentralAir=(as.factor(CentralAir)),
PoolQC=(as.factor(PoolQC))
)
# predict new values
house_test_df$SalePrice <- predict.lm(ml.v1.step, house_test_df, type = "response")
# plot housing prices vs neighborhood
ggplot(house_test_df, aes(x=SalePrice, fill=Neighborhood)) + geom_histogram()# create results dataframe
result_df <- house_test_df %>% dplyr::select(Id,SalePrice)
# identify na predictions
all_nas <- colSums(sapply(result_df[names(result_df)], is.na))
all_nas[which(all_nas > 0)]## named numeric(0)
# write results
write.csv(result_df,"./Final/house-prices/result.csv", row.names = FALSE)Submit to Kaggle competion
The model score once submited to the Kaggle competion was 0.16978.
Submit
Leader Board