#Load libraries
suppressMessages(suppressWarnings(library(readr)))
suppressMessages(suppressWarnings(library(kableExtra)))
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(knitr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(usdm)))
suppressMessages(suppressWarnings(library(mice)))
suppressMessages(suppressWarnings(library(ggiraph)))
suppressMessages(suppressWarnings(library(cowplot)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(corrgram)))
suppressMessages(suppressWarnings(library(caTools)))
suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(ROCR)))
suppressMessages(suppressWarnings(library(pROC)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(qqplotr)))
suppressMessages(suppressWarnings(library(moments)))
suppressMessages(suppressWarnings(library(car)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(geoR)))
suppressMessages(suppressWarnings(library(xtable)))
suppressMessages(suppressWarnings(library(plyr)))
suppressMessages(suppressWarnings(library(Hmisc)))
suppressMessages(suppressWarnings(library(corrplot)))
suppressMessages(suppressWarnings(library(PerformanceAnalytics)))
suppressMessages(suppressWarnings(library(ggpubr)))
suppressMessages(suppressWarnings(library(matrixcalc)))
suppressMessages(suppressWarnings(library(alr3)))
suppressMessages(suppressWarnings(library(bestglm)))
suppressMessages(suppressWarnings(library(car)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(scales)))
suppressMessages(suppressWarnings(library(Matrix)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(mlr)))
suppressMessages(suppressWarnings(library(corrr)))
#download the files and then load them from storage
F1 <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/F1.csv", stringsAsFactors = FALSE)
F1
## Y1 Y2 Y3 Y4 X1 X2 X3 X4
## 1 20.3 20.8 28.4 20.2 9.3 7.4 9.5 9.3
## 2 19.1 14.6 21.5 18.6 4.1 6.4 3.7 12.4
## 3 19.3 18.0 20.8 22.6 22.4 8.5 11.7 19.9
## 4 20.9 7.3 22.2 11.4 9.1 9.5 7.4 6.9
## 5 22.0 19.4 21.6 23.6 15.8 11.8 5.3 -1.0
## 6 23.5 13.5 21.8 24.0 7.1 8.8 7.4 10.6
## 7 13.8 14.7 25.2 26.0 15.9 8.4 7.4 6.4
## 8 18.8 15.3 22.5 26.8 6.9 5.1 8.6 10.6
## 9 20.9 12.6 21.1 19.7 16.0 11.4 9.1 1.2
## 10 18.6 13.0 21.7 22.7 6.7 15.1 11.4 7.7
## 11 22.3 13.1 21.4 16.8 8.2 12.6 8.4 15.5
## 12 17.6 10.3 20.8 20.2 16.0 8.0 7.3 6.9
## 13 20.8 14.9 23.0 21.7 6.4 10.3 11.3 13.7
## 14 28.7 14.8 17.4 20.9 11.8 10.4 4.4 3.7
## 15 15.2 16.2 21.3 26.9 3.5 9.5 9.3 4.4
## 16 20.9 15.7 15.1 16.3 21.7 9.5 10.9 11.5
## 17 18.4 16.3 17.8 19.9 12.2 15.1 10.9 4.2
## 18 10.3 11.5 26.4 15.5 9.3 6.6 7.7 13.9
## 19 26.3 12.2 21.6 26.5 8.0 15.4 7.7 12.9
## 20 28.1 11.8 22.5 21.7 6.2 8.2 11.5 1.2
#define X and Y.
X <- F1$X1
Y <- F1$Y1
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
# get quartiles
#"x" is 3d quartile of X variable
#"y" is 2d quartile of X variable
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.50 6.85 9.20 10.83 15.82 22.40
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.30 18.55 20.55 20.29 22.07 28.70
The 3rd quartile of the X variable = 15.82 The 1st quartile of the Y variable = 18.55 So, x = 15.82 and y = 18.55
x <- 15.82
y <- 18.55
df<-data.frame(cbind(X,Y))
PA_and_B <- nrow(subset(df, X > x & Y > y))/nrow(df)
PA <- nrow(subset(df, X > x))/nrow(df)
PB <- nrow(subset(df, Y > y))/nrow(df)
PC <- nrow(subset(df, X < x))/nrow(df)
PC_and_B <- nrow(subset(df, X < x & Y > y))/nrow(df)
# a. P(X>x | Y>y)
pA_given_B <- PA_and_B/PB
pA_given_B
## [1] 0.2
P(X>x | Y>y) = .2 or 20%, which means that there is 20% probablity of X>x or X will be greater than than it 3rd quartile value of 15.82 given that the Y is greater than its 1st quartile value of 18.55.
# b. P(X>x, Y>y)
PA_and_B
## [1] 0.15
P(X>x, Y>y) = .15 or 15%, which means that there is 15% probablity of X>x or X will be greater than than it 3rd quartile value of 15.82 while Y is greater than its 1st quartile value of 18.55.
# c. P(X<x|Y>y)
PC_given_B <- PC_and_B/PB
PC_given_B
## [1] 0.8
P(X < x|Y>y) = .8 or 80%, which means that there is 80% probablity of X < x or X will be smaller than than it 3rd quartile value of 15.82 given that the Y is greater than its 1st quartile value of 18.55.
data_tbl <- as.data.frame(cbind.data.frame(X, Y, t1 = ifelse(X >
x, ">1st quartile", "<=1st quartile"), Total = ifelse(Y > y, ">3d quartile",
"<=3d quartile")))
tbl <- addmargins(table(data_tbl$t1, data_tbl$Total, dnn = c("X/Y")))
tbl
## NA
## X/Y <=3d quartile >3d quartile Sum
## <=1st quartile 3 12 15
## >1st quartile 2 3 5
## Sum 5 15 20
A <- tbl[2, 3]
A
## [1] 5
B <- tbl[3, 2]
B
## [1] 15
total <- tbl[3, 3]
total
## [1] 20
A_AND_B <- tbl[2, 2]
# P(A)
PA <- A/total
# P(B)
PB <- B/total
# P(A INT B)
PA_INT_B <- A_AND_B/total
# P(A|B) = P(A INT B) / P(B)
PA_GIVEN_B <- PA_INT_B/PB
# P(A) * P(B)
PA_INTO_PB <- (PA * PB)
PA_GIVEN_B
## [1] 0.2
PA_INTO_PB
## [1] 0.1875
chisq.test(X,Y)
## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: X and Y
## X-squared = 306.67, df = 289, p-value = 0.2272
we can see that P(A|B) is 0.2 and P(A) * P(B) is 0.1875. They are very near. Therefore splitting the training data in this manner is going to make them independent. Also from the Chisq test we can see that P-Value is .22, so we cannot reject the null hypothesis.
#download the train data
train <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/train.csv", header = TRUE, stringsAsFactors = FALSE)
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Subsetting the dataset to get only the numeric columns.
First i will plot scatterplots for Gross living area and Sale price.
ggplot(train, aes(x=GrLivArea, y=SalePrice)) + geom_jitter(color='seagreen4') + theme_classic() +
labs(title ='Scatter Plot of Gross living area vs Sale price') + theme(plot.title = element_text(hjust = 5000))
Scatter plot for Masonry veneer area in square feet and scale price
ggplot(train, aes(x=MasVnrArea, y=SalePrice)) + geom_jitter(color='seagreen4') + theme_classic() +
labs(title ='Scatter Plot of Masonry veneer area vs Sale price') + theme(plot.title = element_text(hjust = 5000))
## Warning: Removed 8 rows containing missing values (geom_point).
From the above scatter plots we can see that there is some kind of relation between the independent and dependent variables.
Lets now so some descriptive analysis on the available data.
num <- unlist(lapply(train, is.numeric))
train_num <- train[, num]
summary(train_num)
## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
Selected variables are: SalePrice,TotalBsmtSF,GrLivArea
train_cor <- train[c("SalePrice", "TotalBsmtSF", "GrLivArea")]
train_cor_matrix <- cor(train_cor, use = "complete.obs")
train_cor_matrix
## SalePrice TotalBsmtSF GrLivArea
## SalePrice 1.0000000 0.6135806 0.7086245
## TotalBsmtSF 0.6135806 1.0000000 0.4548682
## GrLivArea 0.7086245 0.4548682 1.0000000
The Matrix suggests that there are strong to moderate corelation exists between these three variables. ‘Saleprice’ has strong corelations with ‘TotalBsmtSF’ and ‘GrLivArea’ with corelation coefficients of .61 and .708 respectively while ‘TotalBsmtSF’ and ‘GrLivArea’ have moderate corelation between them with coefficient of .45.
pairs.panels(train_cor_matrix)
cor.test(train$TotalBsmtSF, train$SalePrice, method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
cor.test(train$GrLivArea, train$SalePrice, method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
cor.test(train$GrLivArea, train$TotalBsmtSF, method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$TotalBsmtSF
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4278380 0.4810855
## sample estimates:
## cor
## 0.4548682
Since all three p-values are less than .05, the variables are significantly correlated.
Yes, there are variables in this dataset that might have impact on the corelation of the the pairs of selected variables that are being tested here. There is a scope for familywise error which might cause rejecting of true Null hypothesis.
Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
train_cor_matrix
## SalePrice TotalBsmtSF GrLivArea
## SalePrice 1.0000000 0.6135806 0.7086245
## TotalBsmtSF 0.6135806 1.0000000 0.4548682
## GrLivArea 0.7086245 0.4548682 1.0000000
pre_matrix <- solve(train_cor_matrix)
pre_matrix
## SalePrice TotalBsmtSF GrLivArea
## SalePrice 2.5582310 -0.93946422 -1.38549273
## TotalBsmtSF -0.9394642 1.60588442 -0.06473842
## GrLivArea -1.3854927 -0.06473842 2.01124151
round((train_cor_matrix %*% pre_matrix), 2)
## SalePrice TotalBsmtSF GrLivArea
## SalePrice 1 0 0
## TotalBsmtSF 0 1 0
## GrLivArea 0 0 1
round((pre_matrix %*% train_cor_matrix), 2)
## SalePrice TotalBsmtSF GrLivArea
## SalePrice 1 0 0
## TotalBsmtSF 0 1 0
## GrLivArea 0 0 1
Both the matrix is identical.
cor_lu <- lu(train_cor_matrix)
cor_lu_exd <- expand(cor_lu)
L_cor <- cor_lu_exd$L
U_cor <- cor_lu_exd$U
L_cor
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.00000000 . .
## [2,] 0.61358055 1.00000000 .
## [3,] 0.70862448 0.03218829 1.00000000
U_cor
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.6135806 0.7086245
## [2,] . 0.6235189 0.0200700
## [3,] . . 0.4972053
pre_lu <- lu(pre_matrix)
pre_lu_exd <- expand(pre_lu)
L_pre <- pre_lu_exd$L
U_pre <- pre_lu_exd$U
L_pre
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.0000000 . .
## [2,] -0.3672320 1.0000000 .
## [3,] -0.5415823 -0.4548682 1.0000000
U_pre
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 2.5582310 -0.9394642 -1.3854927
## [2,] . 1.2608831 -0.5735356
## [3,] . . 1.0000000
Lets multiply the lower and uper matrix and see if it returns the original matrices or not.
L_cor %*% U_cor
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.6135806 0.7086245
## [2,] 0.6135806 1.0000000 0.4548682
## [3,] 0.7086245 0.4548682 1.0000000
L_pre %*% U_pre
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 2.5582310 -0.93946422 -1.38549273
## [2,] -0.9394642 1.60588442 -0.06473842
## [3,] -1.3854927 -0.06473842 2.01124151
It returns the original matrices.
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of ??? for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, ???)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
min(train$GrLivArea)
## [1] 334
The minimum value is 334, so i dont think we need shifting.
Fitting the exponential probability density function:
expo <- fitdistr(train$GrLivArea, densfun = "exponential")
options(scipen = 999)
expo$estimate
## rate
## 0.000659864
smpl <- rexp(1000, expo$estimate)
hist(train$GrLivArea)
hist(smpl)
The samples data is more skewed than the original data.
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF):
P = ecdf(smpl)
plot(P)
c <- quantile(P, c(0.05, 0.95))
c
## 5% 95%
## 69.04413 4516.50970
The 5th and 95th percentiles of the samples (simulated data) are 87.16779 and 4759.05707 respectively.
Generate a 95% confidence interval from the empirical data, assuming normality:
er <- qnorm(0.975) * sd(train$GrLivArea)/sqrt(length(train$GrLivArea))
conf_95 <- c(mean(train$GrLivArea) - er, mean(train$GrLivArea) + er)
conf_95
## [1] 1488.509 1542.418
The 95% confidence interval is 1488.509 1542.418
Provide the empirical 5th percentile and 95th percentile of the data:
e <- quantile(X, c(0.05, 0.95))
e
## 5% 95%
## 4.070 21.735
The 5th and 95th percentiles are 4.070 and 21.735 respectively.
#download the train data
test <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/test.csv", header = TRUE, stringsAsFactors = FALSE)
train <- cbind.data.frame(train, RecType = "Train")
test <- cbind.data.frame(test, RecType = "Test")
train_test <- rbind.data.frame(train, test, stringsAsFactors = FALSE)
We will see the missing values in the dataset. For this i have used Amelia package
missmap(train_test, main = "Missing values vs observed", color='dodgerblue')
We can see there are lots of missing values. We will replace the missing values.
#Missing value handling
train_test[sapply(train_test, is.factor)] <- lapply(train_test[sapply(train_test, is.factor)], as.character)
train_test$GarageYrBlt[is.na(train_test$GarageYrBlt)] <- train_test$YearBuilt[is.na(train_test$GarageYrBlt)]
train_test$LotFrontage[is.na(train_test$LotFrontage)] <- 0
train_test$MasVnrArea[is.na(train_test$MasVnrArea)] <- 0
train_test$Alley[is.na(train_test$Alley)] <- 'None'
train_test$Utilities[is.na(train_test$Utilities)] <- 'NoSeWa'
train_test$MasVnrType[is.na(train_test$MasVnrType)] <- 'None'
train_test$BsmtQual[is.na(train_test$BsmtQual)] <- 'None'
train_test$BsmtCond[is.na(train_test$BsmtCond)] <- 'Xa'
train_test$BsmtExposure[is.na(train_test$BsmtExposure)] <- 'Xb'
train_test$BsmtFinType1[is.na(train_test$BsmtFinType1)] <- 'Xc'
train_test$BsmtFinType2[is.na(train_test$BsmtFinType2)] <- 'Xd'
train_test$GarageType [is.na(train_test$GarageType )] <- 'Xe'
train_test$GarageFinish[is.na(train_test$GarageFinish)] <- 'Xf'
train_test$GarageQual[is.na(train_test$GarageQual)] <- 'Xg'
train_test$GarageCond[is.na(train_test$GarageCond)] <- 'Xh'
train_test$Electrical[is.na(train_test$Electrical)] <- 'None'
train_test$FireplaceQu[is.na(train_test$FireplaceQu)] <- 'None'
train_test$PoolQC[is.na(train_test$PoolQC)] <- 'None'
train_test$Fence[is.na(train_test$Fence)] <- 'None'
train_test$MiscFeature[is.na(train_test$MiscFeature)] <- 'None'
train_test$MSZoning[is.na(train_test$MSZoning)] <- 'C (all)'
train_test$Utilities[is.na(train_test$Utilities)] <- 'AllPub'
train_test$BsmtFullBath[is.na(train_test$BsmtFullBath)] <- 0
train_test$BsmtHalfBath[is.na(train_test$BsmtHalfBath)] <- 0
train_test$Exterior1st[is.na(train_test$Exterior1st)] <- 'BrkFace'
train_test$Exterior2nd[is.na(train_test$Exterior2nd)] <- 'BrkFace'
train_test$Functional[is.na(train_test$Functional)] <- 'Typ'
train_test$BsmtFinSF1[is.na(train_test$BsmtFinSF1)] <- 0
train_test$BsmtFinSF2[is.na(train_test$BsmtFinSF2)] <- 0
train_test$BsmtUnfSF[is.na(train_test$BsmtUnfSF)] <- 0
train_test$TotalBsmtSF[is.na(train_test$TotalBsmtSF)] <- 0
train_test$GarageCars[is.na(train_test$GarageCars)] <- 0
train_test$GarageArea[is.na(train_test$GarageArea)] <- 0
train_test$SaleType[is.na(train_test$SaleType)] <- 'None'
train_test$SalePrice[is.na(train$SalePrice)] <- 0
train_test[sapply(train_test, is.character)] <- lapply(train_test[sapply(train_test, is.character)], as.factor)
missmap(train_test, main = "Missing values vs observed", color='dodgerblue')
Creating dummy values:
train_test <- createDummyFeatures(train_test, method = "reference")
Outlier treatment using mean:
train_test$TotalBsmtSF[train_test$TotalBsmtSF > 6000] <- mean(train_test$TotalBsmtSF[train_test$TotalBsmtSF < 6000])
train_test$X1stFlrSF[train_test$X1stFlrSF > 4000] <- mean(train_test$X1stFlrSF[train_test$X1stFlrSF < 4000])
# Split the combined transformed dataset into train and test.
data_train <- train_test[train_test$Train == 1, ]
data_test <- train_test[train_test$Train == 0, ]
# remove id and train/test flag variables from train dataset
data_train <- subset(data_train, select = -c(Id,Train))
# remove SalePrice variables from test dataset
data_test <- subset(data_test, select = -c(SalePrice,Train))
model1 <- lm(SalePrice ~ ., data = data_train)
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ ., data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -176856 -9170 0 9595 176856
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 50689.0010 1058003.9350 0.048
## MSSubClass -55.2068 82.5886 -0.668
## LotFrontage 6.4440 22.9780 0.280
## LotArea 0.7171 0.1084 6.615
## OverallQual 6793.1233 1012.8117 6.707
## OverallCond 5746.9516 870.2750 6.604
## YearBuilt 332.7269 80.2992 4.144
## YearRemodAdd 105.2342 55.4179 1.899
## MasVnrArea 20.7177 5.7846 3.582
## BsmtFinSF1 -53.8230 10.1663 -5.294
## BsmtFinSF2 -60.4971 12.7641 -4.740
## BsmtUnfSF -71.2768 10.6182 -6.713
## TotalBsmtSF 91.9050 11.1332 8.255
## X1stFlrSF 44.7238 5.6343 7.938
## X2ndFlrSF 62.0943 5.6794 10.933
## LowQualFinSF -3.9424 19.0313 -0.207
## GrLivArea NA NA NA
## BsmtFullBath 1530.8180 1976.4359 0.775
## BsmtHalfBath -483.5565 3021.9614 -0.160
## FullBath 3653.7232 2194.5297 1.665
## HalfBath 1820.5860 2093.5514 0.870
## BedroomAbvGr -3613.9719 1361.0583 -2.655
## KitchenAbvGr -13758.1760 5675.1922 -2.424
## TotRmsAbvGrd 1820.0935 954.6153 1.907
## Fireplaces 6136.5930 2558.0173 2.399
## GarageYrBlt -43.2256 59.1637 -0.731
## GarageCars 3998.7968 2276.2568 1.757
## GarageArea 19.1837 7.8565 2.442
## WoodDeckSF 15.1037 5.8619 2.577
## OpenPorchSF 0.4612 11.5602 0.040
## EnclosedPorch 3.2862 12.4680 0.264
## X3SsnPorch 34.0992 22.3310 1.527
## ScreenPorch 35.4880 12.4837 2.843
## PoolArea 679.8321 226.5222 3.001
## MiscVal 0.3329 6.1047 0.055
## MoSold -470.0028 244.7582 -1.920
## YrSold -555.5184 514.2482 -1.080
## MSZoning.FV 32368.6035 11991.3668 2.699
## MSZoning.RH 22264.5836 11889.5405 1.873
## MSZoning.RL 24939.1691 10209.7142 2.443
## MSZoning.RM 21597.6512 9578.8161 2.255
## Pave 32943.8397 12161.4633 2.709
## Alley.None -1195.2818 4209.3297 -0.284
## Alley.Pave -798.4830 6010.9140 -0.133
## LotShape.IR2 4950.9585 4209.0660 1.176
## LotShape.IR3 6301.9099 8799.9104 0.716
## LotShape.Reg 1714.0784 1634.1378 1.049
## LandContour.HLS 7596.2278 5121.6497 1.483
## LandContour.Low -11283.9130 6399.2483 -1.763
## LandContour.Lvl 5406.5527 3698.8123 1.462
## NoSeWa -36754.2087 26340.1673 -1.395
## LotConfig.CulDSac 7846.5972 3247.0002 2.417
## LotConfig.FR2 -7774.6258 3994.4303 -1.946
## LotConfig.FR3 -17364.5956 12539.9268 -1.385
## LotConfig.Inside -1588.8280 1754.5448 -0.906
## LandSlope.Mod 7562.4065 3967.0352 1.906
## LandSlope.Sev -41944.4765 11367.3478 -3.690
## Neighborhood.Blueste 7346.8466 19195.8672 0.383
## Neighborhood.BrDale -2784.9689 10950.3445 -0.254
## Neighborhood.BrkSide -5632.4232 9462.4767 -0.595
## Neighborhood.ClearCr -14428.1968 9194.2720 -1.569
## Neighborhood.CollgCr -10074.3769 7247.9563 -1.390
## Neighborhood.Crawfor 11794.8702 8528.9234 1.383
## Neighborhood.Edwards -21466.0733 7990.3517 -2.686
## Neighborhood.Gilbert -11027.7674 7656.6699 -1.440
## Neighborhood.IDOTRR -12175.4592 10725.5453 -1.135
## Neighborhood.MeadowV -6846.9204 11155.9525 -0.614
## Neighborhood.Mitchel -20799.5958 8156.3894 -2.550
## Neighborhood.NAmes -17234.9247 7831.4109 -2.201
## Neighborhood.NoRidge 25670.2938 8411.6426 3.052
## Neighborhood.NPkVill 13767.7905 13999.8855 0.983
## Neighborhood.NridgHt 18135.3828 7506.0347 2.416
## Neighborhood.NWAmes -17247.1156 7988.6001 -2.159
## Neighborhood.OldTown -14150.2066 9656.0406 -1.465
## Neighborhood.Sawyer -10999.0385 8104.8033 -1.357
## Neighborhood.SawyerW -2769.1570 7777.8750 -0.356
## Neighborhood.Somerst -2273.3989 8996.0717 -0.253
## Neighborhood.StoneBr 39211.4348 8276.5754 4.738
## Neighborhood.SWISU -8868.6284 9663.1893 -0.918
## Neighborhood.Timber -10051.1403 8124.5569 -1.237
## Neighborhood.Veenker -133.5884 10475.1850 -0.013
## Condition1.Feedr 7248.7615 5001.1014 1.449
## Condition1.Norm 16431.1558 4177.7955 3.933
## Condition1.PosA 10210.2183 9917.1421 1.030
## Condition1.PosN 15074.7180 7458.3092 2.021
## Condition1.RRAe -15660.7293 9046.2706 -1.731
## Condition1.RRAn 13073.4902 6930.1084 1.886
## Condition1.RRNe -3507.3862 17448.5854 -0.201
## Condition1.RRNn 11393.5511 12823.4239 0.888
## Condition2.Feedr -5562.1856 23372.0914 -0.238
## Condition2.Norm -10123.3463 20253.0443 -0.500
## Condition2.PosA 44010.0113 36915.3183 1.192
## Condition2.PosN -238028.6293 27584.9350 -8.629
## Condition2.RRAe -127924.1298 64974.9579 -1.969
## Condition2.RRAn -23276.0583 31462.3156 -0.740
## Condition2.RRNn -3269.1597 27026.1939 -0.121
## BldgType.2fmCon -3262.0367 12468.3219 -0.262
## BldgType.Duplex -7023.1556 7398.3669 -0.949
## BldgType.Twnhs -19170.7485 9946.0704 -1.927
## BldgType.TwnhsE -15087.9058 8980.9617 -1.680
## HouseStyle.1.5Unf 11478.1041 7920.1433 1.449
## HouseStyle.1Story 5129.2357 4375.1508 1.172
## HouseStyle.2.5Fin -16637.2641 12367.1848 -1.345
## HouseStyle.2.5Unf -9649.1947 9179.2375 -1.051
## HouseStyle.2Story -5928.7905 3492.3136 -1.698
## HouseStyle.SFoyer 1215.9024 6249.0104 0.195
## HouseStyle.SLvl 3525.4545 5575.7096 0.632
## RoofStyle.Gable 9308.6550 18428.0613 0.505
## RoofStyle.Gambrel 12059.0245 20171.4278 0.598
## RoofStyle.Hip 9111.2795 18503.5835 0.492
## RoofStyle.Mansard 20166.7008 21440.5001 0.941
## RoofStyle.Shed 100880.7892 34483.1489 2.926
## RoofMatl.CompShg -54296.9586 11367.2977 -4.777
## RoofMatl.Membran 41083.7971 34697.1186 1.184
## RoofMatl.Metal 9834.9706 33718.9583 0.292
## RoofMatl.Roll -66711.0218 27853.0229 -2.395
## RoofMatl.Tar.Grv -52615.4146 21259.8848 -2.475
## RoofMatl.WdShake -63038.3677 18578.2800 -3.393
## RoofMatl.WdShngl NA NA NA
## Exterior1st.AsphShn -24195.4755 32928.8507 -0.735
## Exterior1st.BrkComm -3023.5319 27749.1176 -0.109
## Exterior1st.BrkFace 7068.4398 12737.8031 0.555
## Exterior1st.CBlock -13593.9162 27210.2182 -0.500
## Exterior1st.CemntBd -12686.5376 18978.1897 -0.668
## Exterior1st.HdBoard -13545.8252 12924.9211 -1.048
## Exterior1st.ImStucc -22821.8303 28107.2603 -0.812
## Exterior1st.MetalSd -6271.8254 14574.6095 -0.430
## Exterior1st.Plywood -14551.3915 12744.5251 -1.142
## Exterior1st.Stone -1700.3219 24262.9298 -0.070
## Exterior1st.Stucco -7937.1013 14057.4876 -0.565
## Exterior1st.VinylSd -14356.5799 13305.7748 -1.079
## Exterior1st.Wd.Sdng -14434.6938 12353.9010 -1.168
## Exterior1st.WdShing -10289.8294 13330.6819 -0.772
## Exterior2nd.AsphShn 12180.8473 22153.0182 0.550
## Exterior2nd.Brk.Cmn 4809.2516 20037.3534 0.240
## Exterior2nd.BrkFace 4450.0695 13192.9607 0.337
## Exterior2nd.CBlock NA NA NA
## Exterior2nd.CmentBd 13038.4093 18662.3035 0.699
## Exterior2nd.HdBoard 8669.7864 12399.9421 0.699
## Exterior2nd.ImStucc 17346.8038 14310.8094 1.212
## Exterior2nd.MetalSd 5936.4460 14183.5770 0.419
## Exterior2nd.Other -18046.9297 27052.9052 -0.667
## Exterior2nd.Plywood 7069.2296 12027.5892 0.588
## Exterior2nd.Stone -10722.8301 17100.9661 -0.627
## Exterior2nd.Stucco 6206.5429 13577.4090 0.457
## Exterior2nd.VinylSd 13226.3548 12776.4154 1.035
## Exterior2nd.Wd.Sdng 12446.6804 11908.3766 1.045
## Exterior2nd.Wd.Shng 5766.9319 12443.3492 0.463
## MasVnrType.BrkFace 4260.1179 6827.5285 0.624
## MasVnrType.None 7258.4283 6899.7149 1.052
## MasVnrType.Stone 9288.4757 7226.5287 1.285
## ExterQual.Fa -7381.9536 11077.3828 -0.666
## ExterQual.Gd -20674.8230 4781.9894 -4.323
## ExterQual.TA -19968.0053 5297.2892 -3.769
## ExterCond.Fa -2752.1863 18037.5478 -0.153
## ExterCond.Gd -7210.9145 17211.0281 -0.419
## ExterCond.Po 6791.5332 31629.3968 0.215
## ExterCond.TA -4172.5334 17176.3980 -0.243
## Foundation.CBlock 2919.2579 3164.5389 0.922
## Foundation.PConc 3926.5051 3416.6052 1.149
## Foundation.Slab -7107.7539 10032.5960 -0.708
## Foundation.Stone 10019.6240 11395.3194 0.879
## Foundation.Wood -26521.3624 14720.3859 -1.802
## BsmtQual.Fa -11693.8164 6324.6293 -1.849
## BsmtQual.Gd -18112.9265 3322.4962 -5.452
## BsmtQual.None 37028.3295 36604.5517 1.012
## BsmtQual.TA -14420.0303 4133.3147 -3.489
## BsmtCond.Gd 116.1538 5271.6628 0.022
## BsmtCond.Po 67283.9021 29783.0774 2.259
## BsmtCond.TA 2882.8125 4237.8355 0.680
## BsmtCond.Xa NA NA NA
## BsmtExposure.Gd 14279.3862 2993.0810 4.771
## BsmtExposure.Mn -3519.6978 3014.4004 -1.168
## BsmtExposure.No -5216.1079 2177.1604 -2.396
## BsmtExposure.Xb -11104.6108 22959.4392 -0.484
## BsmtFinType1.BLQ 2920.5408 2792.8595 1.046
## BsmtFinType1.GLQ 5672.5895 2518.4459 2.252
## BsmtFinType1.LwQ -3398.6030 3739.2399 -0.909
## BsmtFinType1.Rec 159.8008 2993.1071 0.053
## BsmtFinType1.Unf 2693.4108 2909.4863 0.926
## BsmtFinType1.Xc NA NA NA
## BsmtFinType2.BLQ -12902.5450 7554.8913 -1.708
## BsmtFinType2.GLQ -2793.6188 9339.0802 -0.299
## BsmtFinType2.LwQ -14181.3034 7386.0856 -1.920
## BsmtFinType2.Rec -10080.5241 7102.2129 -1.419
## BsmtFinType2.Unf -8153.2144 7562.2190 -1.078
## BsmtFinType2.Xd -28466.2919 24945.1604 -1.141
## Heating.GasA 8673.7104 25537.5471 0.340
## Heating.GasW 6062.2166 26321.6313 0.230
## Heating.Grav 775.1689 28029.8596 0.028
## Heating.OthW -12340.4600 31401.0400 -0.393
## Heating.Wall 22259.8027 29685.3382 0.750
## HeatingQC.Fa 475.1322 4703.8348 0.101
## HeatingQC.Gd -4087.1845 2068.1473 -1.976
## HeatingQC.Po 2250.6789 26516.8270 0.085
## HeatingQC.TA -3293.9653 2067.9185 -1.593
## Y -63.8371 3864.3371 -0.017
## Electrical.FuseF 6.7674 5741.3344 0.001
## Electrical.FuseP -8720.2176 18552.4726 -0.470
## Electrical.Mix -42334.2987 44393.8001 -0.954
## Electrical.None 10953.3425 24035.9446 0.456
## Electrical.SBrkr -2033.1728 2944.8710 -0.690
## KitchenQual.Fa -19738.5036 6187.0103 -3.190
## KitchenQual.Gd -23667.9092 3476.7663 -6.807
## KitchenQual.TA -22719.4821 3918.1827 -5.798
## Functional.Maj2 -1325.4995 14356.4816 -0.092
## Functional.Min1 6944.2282 8583.7018 0.809
## Functional.Min2 8728.2441 8620.9007 1.012
## Functional.Mod -5292.6771 10535.8713 -0.502
## Functional.Sev -41005.9777 29566.0313 -1.387
## Functional.Typ 18234.1483 7450.5590 2.447
## FireplaceQu.Fa -859.2110 6871.6106 -0.125
## FireplaceQu.Gd 2831.6547 5311.9619 0.533
## FireplaceQu.None 8755.1760 6219.5933 1.408
## FireplaceQu.Po 12398.4490 7906.4940 1.568
## FireplaceQu.TA 3823.8994 5521.7779 0.693
## GarageType.Attchd 20173.2972 10985.8658 1.836
## GarageType.Basment 24640.4455 12741.7138 1.934
## GarageType.BuiltIn 19959.3727 11452.1918 1.743
## GarageType.CarPort 25327.8378 14664.5199 1.727
## GarageType.Detchd 23289.0183 11003.8188 2.116
## GarageType.Xe 22803.9775 20768.7623 1.098
## GarageFinish.RFn -2373.6793 1957.9719 -1.212
## GarageFinish.Unf -610.3553 2425.3806 -0.252
## GarageFinish.Xf NA NA NA
## GarageQual.Fa -126512.1485 30162.5660 -4.194
## GarageQual.Gd -121022.6143 30955.2407 -3.910
## GarageQual.Po -144089.8550 38501.9031 -3.742
## GarageQual.TA -120117.7800 29863.3903 -4.022
## GarageQual.Xg NA NA NA
## GarageCond.Fa 112696.3238 34780.9610 3.240
## GarageCond.Gd 111760.4259 36144.7929 3.092
## GarageCond.Po 118816.6649 37350.1430 3.181
## GarageCond.TA 114372.5942 34476.0278 3.317
## GarageCond.Xh NA NA NA
## PavedDrive.P -3138.2039 5557.2082 -0.565
## PavedDrive.Y -107.8460 3459.9058 -0.031
## PoolQC.Fa -158215.0117 40812.1637 -3.877
## PoolQC.Gd -128651.1973 36766.3454 -3.499
## PoolQC.None 251031.6645 122502.1228 2.049
## Fence.GdWo 8006.5217 4900.7688 1.634
## Fence.MnPrv 9478.6905 4000.2866 2.370
## Fence.MnWw 3311.8991 8207.3731 0.404
## Fence.None 8897.4672 3668.5233 2.425
## MiscFeature.None 4930.3832 97063.3133 0.051
## MiscFeature.Othr 18980.0713 90633.2774 0.209
## MiscFeature.Shed 7132.8294 92995.8744 0.077
## MiscFeature.TenC 35670.3586 96465.3906 0.370
## SaleType.Con 25412.6404 17533.2722 1.449
## SaleType.ConLD 16662.3932 9665.1446 1.724
## SaleType.ConLI 4554.0371 11546.3439 0.394
## SaleType.ConLw 663.9818 12134.3903 0.055
## SaleType.CWD 15164.2739 12853.3066 1.180
## SaleType.New 21778.8614 15393.4483 1.415
## SaleType.None NA NA NA
## SaleType.Oth 8139.3331 14480.4758 0.562
## SaleType.WD -346.1417 4173.1517 -0.083
## SaleCondition.AdjLand 9352.4799 14581.6309 0.641
## SaleCondition.Alloca 711.6552 8853.9899 0.080
## SaleCondition.Family 826.3703 6080.5347 0.136
## SaleCondition.Normal 6733.0346 2901.9214 2.320
## SaleCondition.Partial -729.8389 14817.4379 -0.049
## Pr(>|t|)
## (Intercept) 0.961796
## MSSubClass 0.503970
## LotFrontage 0.779187
## LotArea 0.000000000055541275 ***
## OverallQual 0.000000000030413426 ***
## OverallCond 0.000000000059996271 ***
## YearBuilt 0.000036575706995979 ***
## YearRemodAdd 0.057813 .
## MasVnrArea 0.000355 ***
## BsmtFinSF1 0.000000141848206828 ***
## BsmtFinSF2 0.000002395504235854 ***
## BsmtUnfSF 0.000000000029327932 ***
## TotalBsmtSF 0.000000000000000395 ***
## X1stFlrSF 0.000000000000004675 ***
## X2ndFlrSF < 0.0000000000000002 ***
## LowQualFinSF 0.835924
## GrLivArea NA
## BsmtFullBath 0.438766
## BsmtHalfBath 0.872897
## FullBath 0.096188 .
## HalfBath 0.384683
## BedroomAbvGr 0.008029 **
## KitchenAbvGr 0.015485 *
## TotRmsAbvGrd 0.056806 .
## Fireplaces 0.016592 *
## GarageYrBlt 0.465159
## GarageCars 0.079215 .
## GarageArea 0.014758 *
## WoodDeckSF 0.010095 *
## OpenPorchSF 0.968186
## EnclosedPorch 0.792156
## X3SsnPorch 0.127027
## ScreenPorch 0.004548 **
## PoolArea 0.002745 **
## MiscVal 0.956521
## MoSold 0.055059 .
## YrSold 0.280245
## MSZoning.FV 0.007045 **
## MSZoning.RH 0.061363 .
## MSZoning.RL 0.014721 *
## MSZoning.RM 0.024329 *
## Pave 0.006847 **
## Alley.None 0.776490
## Alley.Pave 0.894343
## LotShape.IR2 0.239723
## LotShape.IR3 0.474047
## LotShape.Reg 0.294425
## LandContour.HLS 0.138293
## LandContour.Low 0.078100 .
## LandContour.Lvl 0.144084
## NoSeWa 0.163162
## LotConfig.CulDSac 0.015815 *
## LotConfig.FR2 0.051842 .
## LotConfig.FR3 0.166386
## LotConfig.Inside 0.365355
## LandSlope.Mod 0.056847 .
## LandSlope.Sev 0.000234 ***
## Neighborhood.Blueste 0.701987
## Neighborhood.BrDale 0.799286
## Neighborhood.BrkSide 0.551796
## Neighborhood.ClearCr 0.116850
## Neighborhood.CollgCr 0.164797
## Neighborhood.Crawfor 0.166943
## Neighborhood.Edwards 0.007320 **
## Neighborhood.Gilbert 0.150047
## Neighborhood.IDOTRR 0.256524
## Neighborhood.MeadowV 0.539499
## Neighborhood.Mitchel 0.010892 *
## Neighborhood.NAmes 0.027943 *
## Neighborhood.NoRidge 0.002325 **
## Neighborhood.NPkVill 0.325597
## Neighborhood.NridgHt 0.015835 *
## Neighborhood.NWAmes 0.031049 *
## Neighborhood.OldTown 0.143065
## Neighborhood.Sawyer 0.175003
## Neighborhood.SawyerW 0.721880
## Neighborhood.Somerst 0.800535
## Neighborhood.StoneBr 0.000002418783958876 ***
## Neighborhood.SWISU 0.358920
## Neighborhood.Timber 0.216279
## Neighborhood.Veenker 0.989827
## Condition1.Feedr 0.147476
## Condition1.Norm 0.000088678513726580 ***
## Condition1.PosA 0.303426
## Condition1.PosN 0.043480 *
## Condition1.RRAe 0.083675 .
## Condition1.RRAn 0.059471 .
## Condition1.RRNe 0.840723
## Condition1.RRNn 0.374451
## Condition2.Feedr 0.811934
## Condition2.Norm 0.617277
## Condition2.PosA 0.233422
## Condition2.PosN < 0.0000000000000002 ***
## Condition2.RRAe 0.049202 *
## Condition2.RRAn 0.459561
## Condition2.RRNn 0.903741
## BldgType.2fmCon 0.793654
## BldgType.Duplex 0.342666
## BldgType.Twnhs 0.054156 .
## BldgType.TwnhsE 0.093218 .
## HouseStyle.1.5Unf 0.147533
## HouseStyle.1Story 0.241285
## HouseStyle.2.5Fin 0.178789
## HouseStyle.2.5Unf 0.293378
## HouseStyle.2Story 0.089828 .
## HouseStyle.SFoyer 0.845758
## HouseStyle.SLvl 0.527318
## RoofStyle.Gable 0.613556
## RoofStyle.Gambrel 0.550068
## RoofStyle.Hip 0.622522
## RoofStyle.Mansard 0.347104
## RoofStyle.Shed 0.003503 **
## RoofMatl.CompShg 0.000002001571365855 ***
## RoofMatl.Membran 0.236619
## RoofMatl.Metal 0.770585
## RoofMatl.Roll 0.016767 *
## RoofMatl.Tar.Grv 0.013465 *
## RoofMatl.WdShake 0.000713 ***
## RoofMatl.WdShngl NA
## Exterior1st.AsphShn 0.462616
## Exterior1st.BrkComm 0.913253
## Exterior1st.BrkFace 0.579053
## Exterior1st.CBlock 0.617456
## Exterior1st.CemntBd 0.503955
## Exterior1st.HdBoard 0.294830
## Exterior1st.ImStucc 0.416978
## Exterior1st.MetalSd 0.667036
## Exterior1st.Plywood 0.253774
## Exterior1st.Stone 0.944142
## Exterior1st.Stucco 0.572439
## Exterior1st.VinylSd 0.280815
## Exterior1st.Wd.Sdng 0.242863
## Exterior1st.WdShing 0.440330
## Exterior2nd.AsphShn 0.582524
## Exterior2nd.Brk.Cmn 0.810360
## Exterior2nd.BrkFace 0.735945
## Exterior2nd.CBlock NA
## Exterior2nd.CmentBd 0.484906
## Exterior2nd.HdBoard 0.484575
## Exterior2nd.ImStucc 0.225693
## Exterior2nd.MetalSd 0.675624
## Exterior2nd.Other 0.504837
## Exterior2nd.Plywood 0.556809
## Exterior2nd.Stone 0.530758
## Exterior2nd.Stucco 0.647665
## Exterior2nd.VinylSd 0.300775
## Exterior2nd.Wd.Sdng 0.296138
## Exterior2nd.Wd.Shng 0.643122
## MasVnrType.BrkFace 0.532770
## MasVnrType.None 0.293015
## MasVnrType.Stone 0.198923
## ExterQual.Fa 0.505284
## ExterQual.Gd 0.000016624325938440 ***
## ExterQual.TA 0.000171 ***
## ExterCond.Fa 0.878754
## ExterCond.Gd 0.675312
## ExterCond.Po 0.830020
## ExterCond.TA 0.808107
## Foundation.CBlock 0.356457
## Foundation.PConc 0.250684
## Foundation.Slab 0.478793
## Foundation.Stone 0.379427
## Foundation.Wood 0.071846 .
## BsmtQual.Fa 0.064712 .
## BsmtQual.Gd 0.000000060484298429 ***
## BsmtQual.None 0.311943
## BsmtQual.TA 0.000503 ***
## BsmtCond.Gd 0.982425
## BsmtCond.Po 0.024053 *
## BsmtCond.TA 0.496473
## BsmtCond.Xa NA
## BsmtExposure.Gd 0.000002058928569490 ***
## BsmtExposure.Mn 0.243188
## BsmtExposure.No 0.016734 *
## BsmtExposure.Xb 0.628713
## BsmtFinType1.BLQ 0.295901
## BsmtFinType1.GLQ 0.024475 *
## BsmtFinType1.LwQ 0.363583
## BsmtFinType1.Rec 0.957430
## BsmtFinType1.Unf 0.354769
## BsmtFinType1.Xc NA
## BsmtFinType2.BLQ 0.087923 .
## BsmtFinType2.GLQ 0.764891
## BsmtFinType2.LwQ 0.055093 .
## BsmtFinType2.Rec 0.156055
## BsmtFinType2.Unf 0.281182
## BsmtFinType2.Xd 0.254032
## Heating.GasA 0.734183
## Heating.GasW 0.817888
## Heating.Grav 0.977942
## Heating.OthW 0.694392
## Heating.Wall 0.453486
## HeatingQC.Fa 0.919560
## HeatingQC.Gd 0.048353 *
## HeatingQC.Po 0.932373
## HeatingQC.TA 0.111447
## Y 0.986823
## Electrical.FuseF 0.999060
## Electrical.FuseP 0.638419
## Electrical.Mix 0.340473
## Electrical.None 0.648683
## Electrical.SBrkr 0.490068
## KitchenQual.Fa 0.001458 **
## KitchenQual.Gd 0.000000000015617754 ***
## KitchenQual.TA 0.000000008536782491 ***
## Functional.Maj2 0.926453
## Functional.Min1 0.418674
## Functional.Min2 0.311525
## Functional.Mod 0.615514
## Functional.Sev 0.165720
## Functional.Typ 0.014533 *
## FireplaceQu.Fa 0.900514
## FireplaceQu.Gd 0.594082
## FireplaceQu.None 0.159484
## FireplaceQu.Po 0.117112
## FireplaceQu.TA 0.488749
## GarageType.Attchd 0.066560 .
## GarageType.Basment 0.053366 .
## GarageType.BuiltIn 0.081616 .
## GarageType.CarPort 0.084396 .
## GarageType.Detchd 0.034511 *
## GarageType.Xe 0.272426
## GarageFinish.RFn 0.225629
## GarageFinish.Unf 0.801352
## GarageFinish.Xf NA
## GarageQual.Fa 0.000029367740820109 ***
## GarageQual.Gd 0.000097592545543651 ***
## GarageQual.Po 0.000191 ***
## GarageQual.TA 0.000061227032228221 ***
## GarageQual.Xg NA
## GarageCond.Fa 0.001227 **
## GarageCond.Gd 0.002034 **
## GarageCond.Po 0.001504 **
## GarageCond.TA 0.000935 ***
## GarageCond.Xh NA
## PavedDrive.P 0.572377
## PavedDrive.Y 0.975139
## PoolQC.Fa 0.000112 ***
## PoolQC.Gd 0.000484 ***
## PoolQC.None 0.040658 *
## Fence.GdWo 0.102577
## Fence.MnPrv 0.017969 *
## Fence.MnWw 0.686632
## Fence.None 0.015439 *
## MiscFeature.None 0.959497
## MiscFeature.Othr 0.834159
## MiscFeature.Shed 0.938875
## MiscFeature.TenC 0.711616
## SaleType.Con 0.147487
## SaleType.ConLD 0.084970 .
## SaleType.ConLI 0.693345
## SaleType.ConLw 0.956371
## SaleType.CWD 0.238314
## SaleType.New 0.157381
## SaleType.None NA
## SaleType.Oth 0.574159
## SaleType.WD 0.933909
## SaleCondition.AdjLand 0.521393
## SaleCondition.Alloca 0.935951
## SaleCondition.Family 0.891920
## SaleCondition.Normal 0.020496 *
## SaleCondition.Partial 0.960724
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22570 on 1207 degrees of freedom
## Multiple R-squared: 0.9332, Adjusted R-squared: 0.9193
## F-statistic: 66.94 on 252 and 1207 DF, p-value: < 0.00000000000000022
R-squared is 0.93, This means 93% variance of the sale price can be explained by predictor variables in the model. F-statistic is 66.94 with 1207 of degree of freedom and p-value is also very small.
model2 <- lm(SalePrice ~ LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF +
X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + Fireplaces + GarageArea + ScreenPorch + PoolArea +
MSZoning.FV + MSZoning.RL + Pave + LotConfig.CulDSac + LandSlope.Sev + Neighborhood.Edwards +
Neighborhood.Mitchel + Neighborhood.NAmes + Neighborhood.NoRidge + Neighborhood.NridgHt + Neighborhood.NWAmes +
Neighborhood.StoneBr + Condition1.Norm + Condition2.PosN + Condition2.RRAe + RoofStyle.Shed +
RoofMatl.CompShg + RoofMatl.Tar.Grv + RoofMatl.WdShake + ExterQual.Gd + ExterQual.TA + BsmtQual.Gd + BsmtQual.TA +
BsmtExposure.Gd + BsmtExposure.No + KitchenQual.Fa + KitchenQual.Gd + KitchenQual.TA +
Functional.Typ + GarageQual.Fa + GarageQual.Gd + GarageQual.Po + GarageQual.TA + GarageCond.Fa +
GarageCond.Gd + GarageCond.Po + GarageCond.TA + PoolQC.Fa + PoolQC.Gd + PoolQC.None, data = data_train)
summary(model2)
##
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond +
## YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
## TotalBsmtSF + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr +
## Fireplaces + GarageArea + ScreenPorch + PoolArea + MSZoning.FV +
## MSZoning.RL + Pave + LotConfig.CulDSac + LandSlope.Sev +
## Neighborhood.Edwards + Neighborhood.Mitchel + Neighborhood.NAmes +
## Neighborhood.NoRidge + Neighborhood.NridgHt + Neighborhood.NWAmes +
## Neighborhood.StoneBr + Condition1.Norm + Condition2.PosN +
## Condition2.RRAe + RoofStyle.Shed + RoofMatl.CompShg + RoofMatl.Tar.Grv +
## RoofMatl.WdShake + ExterQual.Gd + ExterQual.TA + BsmtQual.Gd +
## BsmtQual.TA + BsmtExposure.Gd + BsmtExposure.No + KitchenQual.Fa +
## KitchenQual.Gd + KitchenQual.TA + Functional.Typ + GarageQual.Fa +
## GarageQual.Gd + GarageQual.Po + GarageQual.TA + GarageCond.Fa +
## GarageCond.Gd + GarageCond.Po + GarageCond.TA + PoolQC.Fa +
## PoolQC.Gd + PoolQC.None, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189512 -11179 0 11300 189512
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -938190.68680 124186.41803 -7.555
## LotArea 0.73554 0.08809 8.350
## OverallQual 7595.54490 905.78430 8.386
## OverallCond 7000.29721 689.52510 10.152
## YearBuilt 378.70965 42.18476 8.977
## MasVnrArea 9.35417 4.46110 2.097
## BsmtFinSF1 -49.99432 8.59135 -5.819
## BsmtFinSF2 -62.47200 9.58936 -6.515
## BsmtUnfSF -64.99445 8.76936 -7.412
## TotalBsmtSF 89.82543 9.09925 9.872
## X1stFlrSF 57.99102 4.01495 14.444
## X2ndFlrSF 60.90316 2.58524 23.558
## BedroomAbvGr -2987.13098 1114.72709 -2.680
## KitchenAbvGr -23415.19408 3354.71928 -6.980
## Fireplaces 3571.10089 1257.95527 2.839
## GarageArea 32.79753 4.79670 6.838
## ScreenPorch 30.48516 12.28064 2.482
## PoolArea 567.08748 170.86035 3.319
## MSZoning.FV 23137.87498 3874.44977 5.972
## MSZoning.RL 12134.39489 2168.61810 5.595
## Pave 47703.24966 10609.97969 4.496
## LotConfig.CulDSac 7224.96443 2751.28865 2.626
## LandSlope.Sev -48862.33442 9040.62587 -5.405
## Neighborhood.Edwards -12037.74394 2819.69870 -4.269
## Neighborhood.Mitchel -16232.09807 3818.56820 -4.251
## Neighborhood.NAmes -10573.24544 2198.59669 -4.809
## Neighborhood.NoRidge 31073.51663 4479.96403 6.936
## Neighborhood.NridgHt 19984.90501 3616.65653 5.526
## Neighborhood.NWAmes -14416.30363 3247.89442 -4.439
## Neighborhood.StoneBr 34315.91779 5265.91340 6.517
## Condition1.Norm 8586.78138 1974.78021 4.348
## Condition2.PosN -224536.76857 18115.51178 -12.395
## Condition2.RRAe -90279.30221 37925.02755 -2.380
## RoofStyle.Shed 65340.78938 28694.28331 2.277
## RoofMatl.CompShg -40099.69136 9054.74520 -4.429
## RoofMatl.Tar.Grv -60414.85471 11756.19954 -5.139
## RoofMatl.WdShake -49513.75006 15416.32738 -3.212
## ExterQual.Gd -25348.96717 4024.72647 -6.298
## ExterQual.TA -27049.54576 4224.18382 -6.403
## BsmtQual.Gd -22890.78640 2549.24314 -8.979
## BsmtQual.TA -18593.85046 2732.45006 -6.805
## BsmtExposure.Gd 16233.52733 2758.07699 5.886
## BsmtExposure.No -4733.71014 1638.95501 -2.888
## KitchenQual.Fa -23677.02118 5571.32885 -4.250
## KitchenQual.Gd -25199.44588 3401.10654 -7.409
## KitchenQual.TA -27154.87080 3794.09727 -7.157
## Functional.Typ 14567.61731 2787.61949 5.226
## GarageQual.Fa -99047.72202 27266.19186 -3.633
## GarageQual.Gd -94081.71264 27875.22260 -3.375
## GarageQual.Po -104831.42807 33276.15816 -3.150
## GarageQual.TA -99518.81798 26984.43691 -3.688
## GarageCond.Fa 87558.61820 27775.92974 3.152
## GarageCond.Gd 74122.37368 28700.60498 2.583
## GarageCond.Po 86241.02267 30269.59928 2.849
## GarageCond.TA 89485.97371 27301.79633 3.278
## PoolQC.Fa -117774.30077 26623.72670 -4.424
## PoolQC.Gd -100054.08168 32196.77108 -3.108
## PoolQC.None 213222.69770 92957.18077 2.294
## Pr(>|t|)
## (Intercept) 0.0000000000000754 ***
## LotArea < 0.0000000000000002 ***
## OverallQual < 0.0000000000000002 ***
## OverallCond < 0.0000000000000002 ***
## YearBuilt < 0.0000000000000002 ***
## MasVnrArea 0.036187 *
## BsmtFinSF1 0.0000000073193130 ***
## BsmtFinSF2 0.0000000001012574 ***
## BsmtUnfSF 0.0000000000002150 ***
## TotalBsmtSF < 0.0000000000000002 ***
## X1stFlrSF < 0.0000000000000002 ***
## X2ndFlrSF < 0.0000000000000002 ***
## BedroomAbvGr 0.007455 **
## KitchenAbvGr 0.0000000000045475 ***
## Fireplaces 0.004593 **
## GarageArea 0.0000000000119914 ***
## ScreenPorch 0.013167 *
## PoolArea 0.000927 ***
## MSZoning.FV 0.0000000029673728 ***
## MSZoning.RL 0.0000000264275476 ***
## Pave 0.0000074925231460 ***
## LotConfig.CulDSac 0.008733 **
## LandSlope.Sev 0.0000000761778431 ***
## Neighborhood.Edwards 0.0000209400557431 ***
## Neighborhood.Mitchel 0.0000227054093716 ***
## Neighborhood.NAmes 0.0000016796722710 ***
## Neighborhood.NoRidge 0.0000000000061357 ***
## Neighborhood.NridgHt 0.0000000390532310 ***
## Neighborhood.NWAmes 0.0000097611614729 ***
## Neighborhood.StoneBr 0.0000000001000278 ***
## Condition1.Norm 0.0000147160750638 ***
## Condition2.PosN < 0.0000000000000002 ***
## Condition2.RRAe 0.017424 *
## RoofStyle.Shed 0.022928 *
## RoofMatl.CompShg 0.0000102218882677 ***
## RoofMatl.Tar.Grv 0.0000003152216592 ***
## RoofMatl.WdShake 0.001349 **
## ExterQual.Gd 0.0000000004018333 ***
## ExterQual.TA 0.0000000002067092 ***
## BsmtQual.Gd < 0.0000000000000002 ***
## BsmtQual.TA 0.0000000000149461 ***
## BsmtExposure.Gd 0.0000000049485673 ***
## BsmtExposure.No 0.003933 **
## KitchenQual.Fa 0.0000228093086633 ***
## KitchenQual.Gd 0.0000000000002187 ***
## KitchenQual.TA 0.0000000000013241 ***
## Functional.Typ 0.0000001995910146 ***
## GarageQual.Fa 0.000291 ***
## GarageQual.Gd 0.000758 ***
## GarageQual.Po 0.001665 **
## GarageQual.TA 0.000235 ***
## GarageCond.Fa 0.001654 **
## GarageCond.Gd 0.009906 **
## GarageCond.Po 0.004448 **
## GarageCond.TA 0.001072 **
## PoolQC.Fa 0.0000104544586912 ***
## PoolQC.Gd 0.001924 **
## PoolQC.None 0.021951 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24290 on 1402 degrees of freedom
## Multiple R-squared: 0.9102, Adjusted R-squared: 0.9065
## F-statistic: 249.2 on 57 and 1402 DF, p-value: < 0.00000000000000022
This model did not give any performance improvement. R-squared is 0.91, This means 91% variance of the sale price can be explained by predictor variables in the model. F-statistic is 249.2 with 1402 of degree of freedom and p-value remains the same.
model1_data <- data_test
model2_data <- data_test
# modelColumns <- colnames(HouseDF) testDF_model <-
# testData[,colnames(testData) %in% modelColumns]
model1_data$salePrice <- predict(model1, data_test)
## Warning in predict.lm(model1, data_test): prediction from a rank-deficient
## fit may be misleading
model2_data$salePrice <- predict(model2, data_test)
Id <- data_test$Id
# Kaggle dataset for model1
salePrice <- model1_data$salePrice
kaggleData1 <- data.frame(cbind(Id, salePrice))
kaggleData1[is.na(kaggleData1)] <- 0
# write.csv(kaggleData_modelDF,'kaggleData_model.csv')
# Kaggle dataset for model2
salePrice <- model2_data$salePrice
kaggleData2 <- data.frame(cbind(Id, salePrice))
kaggleData2[is.na(kaggleData2)] <- 0
write.csv(kaggleData1,'C:/Users/rites/Documents/GitHub/Data605_Final_Project/kaggleout1.csv', row.names = F)
write.csv(kaggleData2,'C:/Users/rites/Documents/GitHub/Data605_Final_Project/kaggleout2.csv', row.names = F)
For my 1st model: R-squared is 0.93, This means 93% variance of the sale price can be explained by predictor variables in the model. F-statistic is 66.94 with 1207 of degree of freedom and p-value is also very small. Kaggke score is 0.47336
For my 2nd model: This model did not give any performance improvement. R-squared is 0.91, This means 91% variance of the sale price can be explained by predictor variables in the model. F-statistic is 249.2 with 1402 of degree of freedom and p-value remains the same.