\(~\)
# For Problem 1
library(igraph)
library(openintro)
library(Matrix)
# For Problem 2
library(OpenImageR)
library(caret)
library(nnet)
# For Problem 3
library(MASS)
library(matrixcalc)
library(GGally)
# For all Problems
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
\(~\)
\(~\)
\(~\)
\(r_f = A * r_i = \begin{bmatrix} 0.0556 \\ 0.139 \\ 0.0835 \\ 0.25 \\ 0.139\\ 0.167 \end{bmatrix}\)
\(~\)
\(~\)
\(~\)
# create matrix
<- matrix(c(0, (1/2), (1/2), 0, 0, 0, 0, 0, 0, 0, 0, 0, (1/3), (1/3), 0, 0, (1/3), 0, 0, 0, 0, 0, (1/2), (1/2), 0, 0, 0, (1/2), 0, (1/2), 0, 0, 0, 1, 0, 0), nrow = 6, byrow = T)
A A
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0000000 0.5000000 0.5 0.0 0.0000000 0.0
## [2,] 0.0000000 0.0000000 0.0 0.0 0.0000000 0.0
## [3,] 0.3333333 0.3333333 0.0 0.0 0.3333333 0.0
## [4,] 0.0000000 0.0000000 0.0 0.0 0.5000000 0.5
## [5,] 0.0000000 0.0000000 0.0 0.5 0.0000000 0.5
## [6,] 0.0000000 0.0000000 0.0 1.0 0.0000000 0.0
#formatC(A, format = "f", digits = 4)
<- 6 # number of webpages
n
# creating new row to replace matrix A second row
<- rep(1/6, n)
row2
# Removing second row from matrix A
<- A[-2,]
A_2
# Adding r into the second row and creating matrix A_2
<- matrix(rbind(A_2[1,], row2, A_2[- (1), ]), 6)
A_2 A_2
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0000000 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3333333 0.3333333 0.0000000 0.0000000 0.3333333 0.0000000
## [4,] 0.0000000 0.0000000 0.0000000 0.0000000 0.5000000 0.5000000
## [5,] 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.5000000
## [6,] 0.0000000 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000
#A_2 <- A + (apply(A, 1, sum) !=1) * 1 / n
#formatC(A_2, format = "f", digits = 4)
# decay form B matrix
<- 0.85
decay <- nrow(A_2)
n
<- decay * A_2 + ((1 - decay) / n)
B formatC(B, format = "f", digits = 4)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "0.0250" "0.4500" "0.4500" "0.0250" "0.0250" "0.0250"
## [2,] "0.1667" "0.1667" "0.1667" "0.1667" "0.1667" "0.1667"
## [3,] "0.3083" "0.3083" "0.0250" "0.0250" "0.3083" "0.0250"
## [4,] "0.0250" "0.0250" "0.0250" "0.0250" "0.4500" "0.4500"
## [5,] "0.0250" "0.0250" "0.0250" "0.4500" "0.0250" "0.4500"
## [6,] "0.0250" "0.0250" "0.0250" "0.8750" "0.0250" "0.0250"
\(~\)
# We will take the second row of 0.167 to perform the iterations on B when r = B^n * r
# We'll do a couple of iterations increasing by 10 starting at 0
<- 0
n <- matrix.power(t(B), n) %*% row2
r_0 r_0
## [,1]
## [1,] 0.1666667
## [2,] 0.1666667
## [3,] 0.1666667
## [4,] 0.1666667
## [5,] 0.1666667
## [6,] 0.1666667
<- 10
n <- matrix.power(t(B), n) %*% row2
r_10 r_10
## [,1]
## [1,] 0.05205661
## [2,] 0.07428990
## [3,] 0.05782138
## [4,] 0.34797267
## [5,] 0.19975859
## [6,] 0.26810085
<- 20
n <- matrix.power(t(B), n) %*% row2
r_20 r_20
## [,1]
## [1,] 0.05170616
## [2,] 0.07368173
## [3,] 0.05741406
## [4,] 0.34870083
## [5,] 0.19990313
## [6,] 0.26859408
<- 30
n <- matrix.power(t(B), n) %*% row2
r_30 r_30
## [,1]
## [1,] 0.05170475
## [2,] 0.07367927
## [3,] 0.05741242
## [4,] 0.34870367
## [5,] 0.19990381
## [6,] 0.26859607
<- 40
n <- matrix.power(t(B), n) %*% row2
r_40 r_40
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
<- 50
n <- matrix.power(t(B), n) %*% row2
r_50 r_50
## [,1]
## [1,] 0.05170475
## [2,] 0.07367926
## [3,] 0.05741241
## [4,] 0.34870369
## [5,] 0.19990381
## [6,] 0.26859608
# By iteration 30 we found the convergence
<- matrix.power(t(B), 30) %*% row2 iter_converg
\(~\)
<- eigen(B)
eigen_decomp eigen_decomp
## eigen() decomposition
## $values
## [1] 1.00000000 0.57619235 -0.42500001 -0.42499999 -0.34991524 -0.08461044
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4082483 -0.7278031 -5.345224e-01 5.345225e-01 -0.795670150
## [2,] -0.4082483 -0.3721164 -5.216180e-09 -5.216180e-09 0.059710287
## [3,] -0.4082483 -0.5389259 5.345225e-01 -5.345225e-01 0.602762996
## [4,] -0.4082483 0.1174605 -2.672613e-01 2.672612e-01 0.002611877
## [5,] -0.4082483 0.1174605 -2.672613e-01 2.672612e-01 0.002611877
## [6,] -0.4082483 0.1174605 5.345225e-01 -5.345224e-01 0.002611877
## [,6]
## [1,] 0.486246420
## [2,] -0.673469294
## [3,] 0.556554233
## [4,] -0.009145393
## [5,] -0.009145393
## [6,] -0.009145393
# turning values as numeric and getting the max value of 1
<- which.max(eigen(B)$values)
max_value_decomp print(paste('The largest eigenvalue is', max(max_value_decomp)))
## [1] "The largest eigenvalue is 1"
# corresponding vector of max eigenvalue
<- as.numeric((1/sum(eigen_decomp$vectors[,1]))*eigen_decomp$vectors[,1])
eigen_decomp2 sum(eigen_decomp2)
## [1] 1
\(~\)
# Here we are using the library 'igraph' installed at the begining to complete the question
#graph_A <- graph.adjacency(A, weighted = T)
<- graph_from_adjacency_matrix(A, weighted = T)
graph_A plot(graph_A)
# verifying that you get the same PageRank vector as the two approached above
<- page.rank(graph_A)$vector
pageRank
<- (round(iter_converg, 4) == round(pageRank, 4))
results results
## [,1]
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
\(~\)
# Load data set from computer, file is too large to upload to free github account
# for knitting purposes I am not printing out the entire data set because it is too long.
<- read.csv('/Users/letiix3/Desktop/Data-605/Week-15/digit-recognizer/train.csv')
train head(train[1:15], n = 5) # set to load columns 1-10 and 5 rows
## label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9
## 1 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 0 0 0
## 4 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## pixel10 pixel11 pixel12 pixel13
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
\(~\)
# label is the first column and divide all pixels by 255
= train[,1]
labels <- train[,-1]/255
train_data
# dimensions of data frame
dim(train_data)
## [1] 42000 784
# creating a plot to see how one image looks
<- matrix(unlist(train_data[10, -1]), nrow = 28, byrow = T)
image1 image(image1, col = grey.colors(255))
# image must be rotated
<- function(x) t(apply(x, 2, rev))
rotate
# final product; will be using this code to apply it to the first 10 images
<- rotate(matrix(unlist(train_data[10, -1]), nrow = 28, byrow = T))
image1 image(image1, col = grey.colors(255))
# Apply code above to images 1:10 of the whole data set
par(mfrow = c (2, 5))
# images must be rotated
<- function(x) t(apply(x, 2, rev))
rotate
# Using a for loop for all 10 images
for (i in 1:10){
<- rotate(matrix(unlist(train_data[i, -1]), nrow = 28, byrow = T))
m image(m, col = grey.colors(255))
}
\(~\)
The frequency distribution of the numbers ranges between 0.095 - 0.11.
<- as.data.frame(table(labels)/42000)
train_frequency
# bar graph for frequency distribution
%>%
train_frequency ggplot(aes(x = labels, y = Freq, fill = Freq)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "blue", high = "red")
table(labels)/42000
## labels
## 0 1 2 3 4 5 6
## 0.09838095 0.11152381 0.09945238 0.10359524 0.09695238 0.09035714 0.09850000
## 7 8 9
## 0.10478571 0.09673810 0.09971429
\(~\)
Based on this I am going to assume the low values means the mean of each number are closer to the center of the images.
# Using colMeans() I was only getting 0's.
colMeans(train_data)[1:20]
## pixel0 pixel1 pixel2 pixel3 pixel4 pixel5
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## pixel6 pixel7 pixel8 pixel9 pixel10 pixel11
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## pixel12 pixel13 pixel14 pixel15 pixel16 pixel17
## 1.176471e-05 4.388422e-05 2.016807e-05 8.403361e-07 0.000000e+00 0.000000e+00
## pixel18 pixel19
## 0.000000e+00 0.000000e+00
\(~\)
# covariance
<- train_data
train_cov
# pca
<- prcomp(train_cov)
train_pca <- (cumsum(train_pca$sdev^2) / sum(train_pca$sdev^2))
train_cumvar
# 95% variance
<- which.max(train_cumvar >= .95)
cumvar_95 print(paste0("At 95% variance there were ", (cumvar_95), " components generated."))
## [1] "At 95% variance there were 154 components generated."
# 100% variance
print(paste0("At 100% variance thereshould be 784 components generated representing each column in the dataset."))
## [1] "At 100% variance thereshould be 784 components generated representing each column in the dataset."
plot(train_cumvar)
\(~\)
Images generated by PCA typically have noise.
par(mfrow = c (2, 5))
# images must be rotated
<- function(x) t(apply(x, 2, rev))
rotate
# Using my for loop it reproduced static only
for (i in 1:10){
image(1:28, 1:28, array(train_pca$x[,i], dim = c(28, 28)))
}
# Using code from another student you can visualize more of a number that's blurry
for (i in 1:10){
<- matrix(train_pca$rotation[1:784], nrow = 28, ncol = 28)
img image(img, useRaster = T, axes = F)
}
\(~\)
You see a more pronounced figure 8.
# Selecting only the 8's
<- train_data %>%
eights filter(labels == 8)
<- eights[,2:ncol(eights)]
eights
# Reducing pixels to 255
<- eights / 255
eights_reduced <- prcomp(eights_reduced)
eights_pca
#
<- (cumsum(eights_pca$sdev^2) / sum(eights_pca$sdev^2))
eights_cumvar
# 100% variance
<- which.max(eights_cumvar >= 1)
eights_100 eights_100
## [1] 537
plot(eights_cumvar)
par(mfrow = c (2, 5))
# images must be rotated
<- function(x) t(apply(x, 2, rev))
rotate
# When using my code, same thing happens as before
for (i in 1:10){
<- rotate(matrix(eights_pca$x[,i], nrow = 28, byrow = T))
m image(m, col = grey.colors(255))
}
for (i in 1:10){
<- matrix(eights_pca$rotation[1:784], nrow = 28, ncol = 28)
img image(img, useRaster = T, axes = F)
}
\(~\)
# create model
<- as.factor(train$label)
train_label <- train[2:785] / 255
train_data <- train$label
train_data_label
<- nnet::multinom(labels ~., data = train_data, MaxNWts = 10000000) model
## # weights: 7860 (7065 variable)
## initial value 96708.573906
## iter 10 value 25322.714106
## iter 20 value 20402.086316
## iter 30 value 19312.872829
## iter 40 value 18703.256586
## iter 50 value 18197.815143
## iter 60 value 17732.985798
## iter 70 value 16739.962157
## iter 80 value 14961.658448
## iter 90 value 13446.085942
## iter 100 value 12442.636014
## final value 12442.636014
## stopped after 100 iterations
The accuracy of the confusion matrix is 92.45%
# make the prediction and confusion matrix
<- predict(model, train_data)
prediction_model confusionMatrix(prediction_model, train_label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 3994 0 19 11 4 35 17 5 19 12
## 1 3 4588 59 32 20 39 28 37 134 18
## 2 11 13 3753 88 17 19 17 37 21 10
## 3 8 12 65 3879 9 91 1 9 91 53
## 4 13 6 60 10 3852 55 35 44 38 136
## 5 33 12 20 162 5 3386 45 10 132 33
## 6 35 3 38 15 22 52 3973 2 19 3
## 7 7 8 54 35 7 28 2 4076 18 87
## 8 20 32 85 78 22 51 17 4 3519 25
## 9 8 10 24 41 114 39 2 177 72 3811
##
## Overall Statistics
##
## Accuracy : 0.9245
## 95% CI : (0.922, 0.9271)
## No Information Rate : 0.1115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9161
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.96660 0.9795 0.89849 0.89152 0.94597 0.89223
## Specificity 0.99678 0.9901 0.99384 0.99100 0.98953 0.98817
## Pos Pred Value 0.97036 0.9254 0.94155 0.91963 0.90657 0.88223
## Neg Pred Value 0.99636 0.9974 0.98885 0.98751 0.99417 0.98928
## Prevalence 0.09838 0.1115 0.09945 0.10360 0.09695 0.09036
## Detection Rate 0.09510 0.1092 0.08936 0.09236 0.09171 0.08062
## Detection Prevalence 0.09800 0.1180 0.09490 0.10043 0.10117 0.09138
## Balanced Accuracy 0.98169 0.9848 0.94617 0.94126 0.96775 0.94020
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.9604 0.92615 0.86611 0.90998
## Specificity 0.9950 0.99346 0.99120 0.98712
## Pos Pred Value 0.9546 0.94308 0.91331 0.88669
## Neg Pred Value 0.9957 0.99137 0.98574 0.99000
## Prevalence 0.0985 0.10479 0.09674 0.09971
## Detection Rate 0.0946 0.09705 0.08379 0.09074
## Detection Prevalence 0.0991 0.10290 0.09174 0.10233
## Balanced Accuracy 0.9777 0.95981 0.92865 0.94855
\(~\)
\(~\)
# Read csv file renaming train.csv to house_train
<- read.csv('/Users/letiix3/Desktop/Data-605/Week-15/House_Price/train.csv', header = T, sep = ",")
house_train head(house_train[1:15], n = 5)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1 AllPub Inside Gtl CollgCr Norm Norm
## 2 AllPub FR2 Gtl Veenker Feedr Norm
## 3 AllPub Inside Gtl CollgCr Norm Norm
## 4 AllPub Corner Gtl Crawfor Norm Norm
## 5 AllPub FR2 Gtl NoRidge Norm Norm
# glimpse of data set columns 1 - 10
glimpse(house_train[1:10])
## Rows: 1,460
## Columns: 10
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20, 2…
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "RL"…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, NA…
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120,…
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "P…
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", "R…
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "L…
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
# summary of data set with filter of numeric values only
%>%
house_train select_if(is.numeric) %>%
summary()
## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
\(~\)
# plot to see overview of SalePrice
%>%
house_trainggplot( aes(x = SalePrice)) +
geom_histogram( binwidth = 30, fill = "pink", color = "#e9ecef", alpha = 0.9) +
ggtitle("Histogram of 'SalePrice'") +
theme_ipsum() +
theme(
plot.title = element_text(size = 15)
)
# selecting random columns to create a Correlogram
%>%
house_trainggpairs(columns = 44:53, ggplot2::aes(colour = 'species'))
\(~\)
colnames(house_train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# plot for SalePrice and LotArea
%>%
house_train ggplot(aes(x = SalePrice, y = LotArea, color = YearBuilt)) +
geom_point(size = 3) +
geom_smooth(method = "lm", color = 'red') +
theme_ipsum()
## `geom_smooth()` using formula 'y ~ x'
# plot for YearBuilt and SalePrice
%>%
house_train ggplot(aes(x = YearBuilt, y = SalePrice, color = YrSold)) +
geom_point(size = 3) +
geom_smooth(method = "lm", color = 'red') +
theme_ipsum()
## `geom_smooth()` using formula 'y ~ x'
\(~\)
<- house_train %>%
cor_house ::select(LotArea, YearBuilt, X1stFlrSF)
dplyr<- cor(cor_house)
cor_m cor_m
## LotArea YearBuilt X1stFlrSF
## LotArea 1.00000000 0.01422765 0.2994746
## YearBuilt 0.01422765 1.00000000 0.2819859
## X1stFlrSF 0.29947458 0.28198586 1.0000000
\(~\)
Based on the tests below, we can reject the null hypothesis that states that the correlation is zero and accept the alternative hypothesis. We should be worried about familywise error.
cor.test(house_train$LotArea, house_train$YearBuilt, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$LotArea and house_train$YearBuilt
## t = 0.54332, df = 1458, p-value = 0.587
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.01934322 0.04776648
## sample estimates:
## cor
## 0.01422765
cor.test(house_train$LotArea, house_train$X1stFlrSF, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$LotArea and house_train$X1stFlrSF
## t = 11.985, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2686127 0.3297222
## sample estimates:
## cor
## 0.2994746
cor.test(house_train$X1stFlrSF, house_train$YearBuilt, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: house_train$X1stFlrSF and house_train$YearBuilt
## t = 11.223, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2507977 0.3125892
## sample estimates:
## cor
## 0.2819859
\(~\)
# invert correlation matrix from above
<- solve(cor_m)
invert_cor invert_cor
## LotArea YearBuilt X1stFlrSF
## LotArea 1.1050234 0.0842977 -0.3546972
## YearBuilt 0.0842977 1.0928157 -0.3334036
## X1stFlrSF -0.3546972 -0.3334036 1.2002379
# precision
<- round(cor_m %*% invert_cor)
precision_cor precision_cor
## LotArea YearBuilt X1stFlrSF
## LotArea 1 0 0
## YearBuilt 0 1 0
## X1stFlrSF 0 0 1
# LU decomposition
<- lu.decomposition(cor_m)
lu_decom_cor lu_decom_cor
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] 0.01422765 1.0000000 0
## [3,] 0.29947458 0.2777813 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.01422765 0.2994746
## [2,] 0 0.99979757 0.2777250
## [3,] 0 0.00000000 0.8331682
\(~\)
# Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
%>%
house_trainfilter(OpenPorchSF < 350) %>%
ggplot( aes(x = OpenPorchSF)) +
geom_histogram( binwidth = 15, fill = "purple", color = "#e9ecef", alpha = 0.9) +
ggtitle("Histogram of 'OpenPorchSF'") +
theme_ipsum() +
theme(
plot.title = element_text(size=15)
)
# exponential distribution
<- fitdistr(house_train$OpenPorchSF, 'exponential')
expon_OpenPorchSF
# lambda
<- expon_OpenPorchSF$estimate
lamb_OpenPorchSF lamb_OpenPorchSF
## rate
## 0.02143151
# sample of 1000
<- rexp(1000, lamb_OpenPorchSF)
exp_samp summary(exp_samp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0342 14.4774 31.2669 46.6193 65.6411 400.7690
# Plot histogram and compare it with original histogram
hist(exp_samp, main = "Histogram of Exponential Sample of 'OpenPorchSF'")
\(~\)
# 5th and 95th percentiles
<- qexp(0.05, lamb_OpenPorchSF)
lower lower
## [1] 2.393359
<- qexp(0.95, lamb_OpenPorchSF)
upper upper
## [1] 139.7817
# empirical 5th and 95th percentile
quantile(house_train$OpenPorchSF, c(0.05, 0.95))
## 5% 95%
## 0.00 175.05
print(paste('The 5th percentile is 0.00 and the 95th percentile is 175.05.'))
## [1] "The 5th percentile is 0.00 and the 95th percentile is 175.05."
\(~\)
# selection columns that are numeric only
<- house_train %>%
house_train2 ::select_if(is.numeric)
dplyr
# Check for missing values in data
colSums(is.na(house_train2))
## Id MSSubClass LotFrontage LotArea OverallQual
## 0 0 259 0 0
## OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
## 0 0 0 8 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF
## 0 0 0 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## 0 0 0 0 0
## GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
## 81 0 0 0 0
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## 0 0 0 0 0
## MoSold YrSold SalePrice
## 0 0 0
<- lm(SalePrice ~., house_train2)
model_house summary(model_house)
##
## Call:
## lm(formula = SalePrice ~ ., data = house_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442182 -16955 -2824 15125 318183
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.351e+05 1.701e+06 -0.197 0.843909
## Id -1.205e+00 2.658e+00 -0.453 0.650332
## MSSubClass -2.001e+02 3.451e+01 -5.797 8.84e-09 ***
## LotFrontage -1.160e+02 6.126e+01 -1.894 0.058503 .
## LotArea 5.422e-01 1.575e-01 3.442 0.000599 ***
## OverallQual 1.866e+04 1.482e+03 12.592 < 2e-16 ***
## OverallCond 5.239e+03 1.368e+03 3.830 0.000135 ***
## YearBuilt 3.164e+02 8.766e+01 3.610 0.000321 ***
## YearRemodAdd 1.194e+02 8.668e+01 1.378 0.168607
## MasVnrArea 3.141e+01 7.022e+00 4.473 8.54e-06 ***
## BsmtFinSF1 1.736e+01 5.838e+00 2.973 0.003014 **
## BsmtFinSF2 8.342e+00 8.766e+00 0.952 0.341532
## BsmtUnfSF 5.005e+00 5.277e+00 0.948 0.343173
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.597e+01 7.360e+00 6.246 6.02e-10 ***
## X2ndFlrSF 4.663e+01 6.102e+00 7.641 4.72e-14 ***
## LowQualFinSF 3.341e+01 2.794e+01 1.196 0.232009
## GrLivArea NA NA NA NA
## BsmtFullBath 9.043e+03 3.198e+03 2.828 0.004776 **
## BsmtHalfBath 2.465e+03 5.073e+03 0.486 0.627135
## FullBath 5.433e+03 3.531e+03 1.539 0.124182
## HalfBath -1.098e+03 3.321e+03 -0.331 0.740945
## BedroomAbvGr -1.022e+04 2.155e+03 -4.742 2.40e-06 ***
## KitchenAbvGr -2.202e+04 6.710e+03 -3.282 0.001063 **
## TotRmsAbvGrd 5.464e+03 1.487e+03 3.674 0.000251 ***
## Fireplaces 4.372e+03 2.189e+03 1.998 0.046020 *
## GarageYrBlt -4.728e+01 9.106e+01 -0.519 0.603742
## GarageCars 1.685e+04 3.491e+03 4.827 1.58e-06 ***
## GarageArea 6.274e+00 1.213e+01 0.517 0.605002
## WoodDeckSF 2.144e+01 1.002e+01 2.139 0.032662 *
## OpenPorchSF -2.252e+00 1.949e+01 -0.116 0.907998
## EnclosedPorch 7.295e+00 2.062e+01 0.354 0.723590
## X3SsnPorch 3.349e+01 3.758e+01 0.891 0.373163
## ScreenPorch 5.805e+01 2.041e+01 2.844 0.004532 **
## PoolArea -6.052e+01 2.990e+01 -2.024 0.043204 *
## MiscVal -3.761e+00 6.960e+00 -0.540 0.589016
## MoSold -2.217e+02 4.229e+02 -0.524 0.600188
## YrSold -2.474e+02 8.458e+02 -0.293 0.769917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36800 on 1085 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8096, Adjusted R-squared: 0.8034
## F-statistic: 131.8 on 35 and 1085 DF, p-value: < 2.2e-16
plot(model_house)
<- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold, house_train2)
model_house2 summary(model_house2)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
## BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF +
## X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath +
## FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF +
## OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch +
## PoolArea + MiscVal + MoSold + YrSold, data = house_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442865 -16873 -2581 14998 318042
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.232e+05 1.701e+06 -0.190 0.849317
## MSSubClass -2.005e+02 3.449e+01 -5.814 8.03e-09 ***
## LotFrontage -1.161e+02 6.124e+01 -1.896 0.058203 .
## LotArea 5.454e-01 1.573e-01 3.466 0.000548 ***
## OverallQual 1.870e+04 1.478e+03 12.646 < 2e-16 ***
## OverallCond 5.227e+03 1.367e+03 3.824 0.000139 ***
## YearBuilt 3.170e+02 8.762e+01 3.617 0.000311 ***
## YearRemodAdd 1.206e+02 8.661e+01 1.392 0.164174
## MasVnrArea 3.160e+01 7.006e+00 4.511 7.15e-06 ***
## BsmtFinSF1 1.739e+01 5.835e+00 2.980 0.002947 **
## BsmtFinSF2 8.362e+00 8.763e+00 0.954 0.340205
## BsmtUnfSF 5.006e+00 5.275e+00 0.949 0.342890
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.591e+01 7.356e+00 6.241 6.21e-10 ***
## X2ndFlrSF 4.668e+01 6.099e+00 7.654 4.28e-14 ***
## LowQualFinSF 3.415e+01 2.788e+01 1.225 0.220788
## GrLivArea NA NA NA NA
## BsmtFullBath 8.980e+03 3.194e+03 2.812 0.005018 **
## BsmtHalfBath 2.490e+03 5.071e+03 0.491 0.623487
## FullBath 5.390e+03 3.529e+03 1.527 0.126941
## HalfBath -1.119e+03 3.320e+03 -0.337 0.736244
## BedroomAbvGr -1.023e+04 2.154e+03 -4.750 2.30e-06 ***
## KitchenAbvGr -2.193e+04 6.704e+03 -3.271 0.001105 **
## TotRmsAbvGrd 5.440e+03 1.486e+03 3.661 0.000263 ***
## Fireplaces 4.375e+03 2.188e+03 2.000 0.045793 *
## GarageYrBlt -4.914e+01 9.093e+01 -0.540 0.589011
## GarageCars 1.679e+04 3.487e+03 4.815 1.68e-06 ***
## GarageArea 6.488e+00 1.211e+01 0.536 0.592338
## WoodDeckSF 2.155e+01 1.002e+01 2.151 0.031713 *
## OpenPorchSF -2.315e+00 1.948e+01 -0.119 0.905404
## EnclosedPorch 7.233e+00 2.061e+01 0.351 0.725733
## X3SsnPorch 3.458e+01 3.749e+01 0.922 0.356593
## ScreenPorch 5.797e+01 2.040e+01 2.842 0.004572 **
## PoolArea -6.126e+01 2.984e+01 -2.053 0.040326 *
## MiscVal -3.850e+00 6.955e+00 -0.554 0.579980
## MoSold -2.240e+02 4.227e+02 -0.530 0.596213
## YrSold -2.536e+02 8.454e+02 -0.300 0.764216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36790 on 1086 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8095, Adjusted R-squared: 0.8036
## F-statistic: 135.7 on 34 and 1086 DF, p-value: < 2.2e-16
plot(model_house2)
# Using only significant columns
<- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch + PoolArea, house_train2)
model_house3 summary(model_house3)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 +
## X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr +
## TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + ScreenPorch +
## PoolArea, data = house_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -451835 -17454 -2047 14390 317137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.454e+05 1.003e+05 -8.427 < 2e-16 ***
## MSSubClass -1.821e+02 3.184e+01 -5.719 1.36e-08 ***
## LotFrontage -9.469e+01 5.782e+01 -1.638 0.101765
## LotArea 5.560e-01 1.544e-01 3.602 0.000329 ***
## OverallQual 1.897e+04 1.323e+03 14.339 < 2e-16 ***
## OverallCond 5.179e+03 1.105e+03 4.687 3.10e-06 ***
## YearBuilt 3.958e+02 5.090e+01 7.775 1.64e-14 ***
## MasVnrArea 3.239e+01 6.789e+00 4.770 2.07e-06 ***
## BsmtFinSF1 1.037e+01 3.495e+00 2.967 0.003067 **
## X1stFlrSF 5.745e+01 5.596e+00 10.266 < 2e-16 ***
## X2ndFlrSF 4.995e+01 4.845e+00 10.309 < 2e-16 ***
## BsmtFullBath 9.782e+03 2.779e+03 3.521 0.000447 ***
## BedroomAbvGr -1.025e+04 1.902e+03 -5.388 8.61e-08 ***
## KitchenAbvGr -1.324e+04 5.656e+03 -2.340 0.019451 *
## TotRmsAbvGrd 5.553e+03 1.398e+03 3.971 7.58e-05 ***
## Fireplaces 3.310e+03 2.070e+03 1.599 0.110111
## GarageCars 1.101e+04 1.916e+03 5.747 1.16e-08 ***
## WoodDeckSF 2.416e+01 9.638e+00 2.507 0.012305 *
## ScreenPorch 5.425e+01 1.969e+01 2.755 0.005956 **
## PoolArea -5.982e+01 2.883e+01 -2.075 0.038214 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36680 on 1175 degrees of freedom
## (265 observations deleted due to missingness)
## Multiple R-squared: 0.8086, Adjusted R-squared: 0.8055
## F-statistic: 261.3 on 19 and 1175 DF, p-value: < 2.2e-16
plot(model_house3)
# Removing 2 more columns
<- lm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageCars + WoodDeckSF + ScreenPorch + PoolArea, house_train2)
model_house4 summary(model_house4)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF +
## X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr +
## TotRmsAbvGrd + GarageCars + WoodDeckSF + ScreenPorch + PoolArea,
## data = house_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -474172 -16306 -2075 13986 299145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.297e+05 8.859e+04 -9.366 < 2e-16 ***
## MSSubClass -1.623e+02 2.582e+01 -6.284 4.37e-10 ***
## LotArea 4.401e-01 9.932e-02 4.431 1.01e-05 ***
## OverallQual 1.884e+04 1.121e+03 16.813 < 2e-16 ***
## OverallCond 5.066e+03 9.229e+02 5.489 4.77e-08 ***
## YearBuilt 3.868e+02 4.497e+01 8.602 < 2e-16 ***
## MasVnrArea 2.960e+01 5.876e+00 5.037 5.34e-07 ***
## BsmtFinSF1 1.020e+01 2.973e+00 3.430 0.000621 ***
## X1stFlrSF 5.910e+01 4.551e+00 12.985 < 2e-16 ***
## X2ndFlrSF 4.969e+01 4.071e+00 12.206 < 2e-16 ***
## BsmtFullBath 8.982e+03 2.374e+03 3.784 0.000161 ***
## BedroomAbvGr -1.046e+04 1.641e+03 -6.371 2.52e-10 ***
## KitchenAbvGr -1.441e+04 5.050e+03 -2.854 0.004373 **
## TotRmsAbvGrd 5.468e+03 1.214e+03 4.503 7.25e-06 ***
## GarageCars 1.045e+04 1.695e+03 6.165 9.13e-10 ***
## WoodDeckSF 2.516e+01 7.905e+00 3.182 0.001492 **
## ScreenPorch 5.435e+01 1.673e+01 3.249 0.001185 **
## PoolArea -3.060e+01 2.342e+01 -1.307 0.191504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34830 on 1434 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8092, Adjusted R-squared: 0.807
## F-statistic: 357.9 on 17 and 1434 DF, p-value: < 2.2e-16
# Using model_house4 we'll create a scatterplot
plot(model_house4)
\(~\)
# Predict prices for test data
<- read.csv('/Users/letiix3/Desktop/Data-605/Week-15/House_Price/test.csv')
house_test <- house_test %>%
house_test2 ::select_if(is.numeric) %>%
dplyrreplace(is.na(.),0)
<- predict(model_house4, house_test2, type = "response")
prediction head(prediction)
## 1 2 3 4 5 6
## 125568.8 172694.7 169683.1 198876.9 198717.8 181050.2
# Prepare data frame for submission
<- data.frame(Id = house_test2$Id, SalePrice = prediction)
kaggle_prediction head(kaggle_prediction)
## Id SalePrice
## 1 1461 125568.8
## 2 1462 172694.7
## 3 1463 169683.1
## 4 1464 198876.9
## 5 1465 198717.8
## 6 1466 181050.2
# commenting out so it doesn't reproduce
#write.csv(kaggle_prediction, file = "submission_prediction.csv", row.names=FALSE)
\(~\)
Username: letisalba
Score: 0.34307
\(~\)
\(~\)