FINAL PROJECT

IS 605 FUNDAMENTALS OF COMPUTATIONAL MATHEMATICS

#Load libraries
suppressMessages(suppressWarnings(library(readr)))
suppressMessages(suppressWarnings(library(kableExtra)))
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(knitr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(usdm)))
suppressMessages(suppressWarnings(library(mice)))
suppressMessages(suppressWarnings(library(ggiraph)))
suppressMessages(suppressWarnings(library(cowplot)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(corrgram)))
suppressMessages(suppressWarnings(library(caTools)))
suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(ROCR)))
suppressMessages(suppressWarnings(library(pROC)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(qqplotr)))
suppressMessages(suppressWarnings(library(moments)))
suppressMessages(suppressWarnings(library(car)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(geoR)))
suppressMessages(suppressWarnings(library(xtable)))
suppressMessages(suppressWarnings(library(plyr)))
suppressMessages(suppressWarnings(library(Hmisc)))
suppressMessages(suppressWarnings(library(corrplot)))
suppressMessages(suppressWarnings(library(PerformanceAnalytics)))
suppressMessages(suppressWarnings(library(ggpubr)))
suppressMessages(suppressWarnings(library(matrixcalc)))
suppressMessages(suppressWarnings(library(alr3)))
suppressMessages(suppressWarnings(library(bestglm)))
suppressMessages(suppressWarnings(library(car)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(scales)))
suppressMessages(suppressWarnings(library(Matrix)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(mlr)))
suppressMessages(suppressWarnings(library(corrr)))

Problem 1

Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below, and define that as Y.

# Read the Problem 1 data set directly from the project repository and show it.
f1_url <- "https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/F1.csv"
F1 <- read.csv(file = f1_url, stringsAsFactors = FALSE)
F1
##      Y1   Y2   Y3   Y4   X1   X2   X3   X4
## 1  20.3 20.8 28.4 20.2  9.3  7.4  9.5  9.3
## 2  19.1 14.6 21.5 18.6  4.1  6.4  3.7 12.4
## 3  19.3 18.0 20.8 22.6 22.4  8.5 11.7 19.9
## 4  20.9  7.3 22.2 11.4  9.1  9.5  7.4  6.9
## 5  22.0 19.4 21.6 23.6 15.8 11.8  5.3 -1.0
## 6  23.5 13.5 21.8 24.0  7.1  8.8  7.4 10.6
## 7  13.8 14.7 25.2 26.0 15.9  8.4  7.4  6.4
## 8  18.8 15.3 22.5 26.8  6.9  5.1  8.6 10.6
## 9  20.9 12.6 21.1 19.7 16.0 11.4  9.1  1.2
## 10 18.6 13.0 21.7 22.7  6.7 15.1 11.4  7.7
## 11 22.3 13.1 21.4 16.8  8.2 12.6  8.4 15.5
## 12 17.6 10.3 20.8 20.2 16.0  8.0  7.3  6.9
## 13 20.8 14.9 23.0 21.7  6.4 10.3 11.3 13.7
## 14 28.7 14.8 17.4 20.9 11.8 10.4  4.4  3.7
## 15 15.2 16.2 21.3 26.9  3.5  9.5  9.3  4.4
## 16 20.9 15.7 15.1 16.3 21.7  9.5 10.9 11.5
## 17 18.4 16.3 17.8 19.9 12.2 15.1 10.9  4.2
## 18 10.3 11.5 26.4 15.5  9.3  6.6  7.7 13.9
## 19 26.3 12.2 21.6 26.5  8.0 15.4  7.7 12.9
## 20 28.1 11.8 22.5 21.7  6.2  8.2 11.5  1.2
# X is the chosen independent variable (column X1); Y is the chosen
# dependent variable (column Y1).
X <- F1[["X1"]]
Y <- F1[["Y1"]]

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

# Get the quartiles of both variables from summary():
#   "x" is the 3rd quartile of the X variable
#   "y" is the 1st quartile of the Y variable
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.50    6.85    9.20   10.83   15.82   22.40
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.30   18.55   20.55   20.29   22.07   28.70

The 3rd quartile of the X variable = 15.82 The 1st quartile of the Y variable = 18.55 So, x = 15.82 and y = 18.55

# Thresholds: x is the 3rd quartile of X and y is the 1st quartile of Y.
# They are computed from the data rather than typed in from the rounded
# summary() printout (15.82 / 18.55), so an observation lying between the
# exact and the rounded quartile cannot be misclassified.
x <- quantile(X, probs = 0.75, names = FALSE)
y <- quantile(Y, probs = 0.25, names = FALSE)
df <- data.frame(X = X, Y = Y)
n_obs <- nrow(df)

# Empirical probabilities: number of rows satisfying the event / total rows.
PA_and_B <- sum(df$X > x & df$Y > y, na.rm = TRUE) / n_obs  # P(X > x, Y > y)
PA <- sum(df$X > x, na.rm = TRUE) / n_obs                   # P(X > x)
PB <- sum(df$Y > y, na.rm = TRUE) / n_obs                   # P(Y > y)
PC <- sum(df$X < x, na.rm = TRUE) / n_obs                   # P(X < x)
PC_and_B <- sum(df$X < x & df$Y > y, na.rm = TRUE) / n_obs  # P(X < x, Y > y)

a. P(X>x | Y>y)

# a. Conditional probability P(X>x | Y>y) = P(X>x, Y>y) / P(Y>y);
#    the outer parentheses assign and print in one step.
(pA_given_B <- PA_and_B / PB)
## [1] 0.2

P(X>x | Y>y) = 0.2, or 20%: given that Y is greater than its 1st quartile value of 18.55, there is a 20% probability that X is greater than its 3rd quartile value of 15.82.

b. P(X>x, Y>y)

# b. Joint probability P(X>x, Y>y), computed earlier as PA_and_B
print(PA_and_B)
## [1] 0.15

P(X>x, Y>y) = 0.15, or 15%: there is a 15% probability that X is greater than its 3rd quartile value of 15.82 and, at the same time, Y is greater than its 1st quartile value of 18.55.

c. P(X < x | Y>y)

# c. Conditional probability P(X<x | Y>y) = P(X<x, Y>y) / P(Y>y);
#    the outer parentheses assign and print in one step.
(PC_given_B <- PC_and_B / PB)
## [1] 0.8

P(X < x | Y>y) = 0.8, or 80%: given that Y is greater than its 1st quartile value of 18.55, there is an 80% probability that X is smaller than its 3rd quartile value of 15.82.

Table of Counts

# Cross-tabulate the observations by whether X exceeds x (the 3rd quartile of
# X) and whether Y exceeds y (the 1st quartile of Y). The labels name the
# quartile each variable is actually compared against, and both table
# dimensions are named so the header does not print "NA".
data_tbl <- data.frame(
  X, Y,
  t1 = ifelse(X > x, ">3d quartile", "<=3d quartile"),       # X group (rows)
  Total = ifelse(Y > y, ">1st quartile", "<=1st quartile")   # Y group (columns)
)

tbl <- addmargins(table(data_tbl$t1, data_tbl$Total, dnn = c("X", "Y")))
tbl
##                Y
## X               <=1st quartile >1st quartile Sum
##   <=3d quartile              3            12  15
##   >3d quartile               2             3   5
##   Sum                        5            15  20

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

Mathematical test:

# Pull the marginal and joint counts out of the margin-augmented table:
# row 2 is the "X > x" group, column 2 is the "Y > y" group, and row 3 /
# column 3 hold the sums added by addmargins().
A <- tbl[2, 3]
A
## [1] 5
B <- tbl[3, 2]
B
## [1] 15
total <- tbl[3, 3]
total
## [1] 20
A_AND_B <- tbl[2, 2]

# P(A)
PA <- A/total

# P(B)
PB <- B/total

# P(A INT B)
PA_INT_B <- A_AND_B/total


# P(A|B) = P(A INT B) / P(B)

PA_GIVEN_B <- PA_INT_B/PB

# P(A) * P(B)

PA_INTO_PB <- (PA * PB)

# Independence means P(A INT B) = P(A) * P(B), so the joint probability --
# not the conditional one -- is the quantity to compare with the product.
PA_INT_B
## [1] 0.15
PA_GIVEN_B
## [1] 0.2
PA_INTO_PB
## [1] 0.1875

Chi Square test

# Test for association between the two indicator variables (X > x, Y > y)
# using the 2 x 2 table of counts; the "Sum" margins are dropped first.
# Passing the raw X and Y vectors instead would cross-tabulate every distinct
# value of each (df = 289), which is not the test the question asks for.
chisq.test(tbl[1:2, 1:2])
## Warning in chisq.test(tbl[1:2, 1:2]): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl[1:2, 1:2]
## X-squared = 0.088889, df = 1, p-value = 0.7656

We can see that P(A∩B) = 3/20 = 0.15 while P(A) * P(B) = 0.25 * 0.75 = 0.1875 (equivalently, P(A|B) = 0.2 versus P(A) = 0.25). The values are close but not equal, so splitting the data in this manner does not by itself make the two variables exactly independent. However, the Chi-square test on the 2 x 2 table of counts gives a p-value of about 0.77, far above 0.05, so we cannot reject the null hypothesis that A and B are independent. Because some expected cell counts are below 5 (hence the warning), the Chi-square approximation is rough here and an exact test such as Fisher's would be a safer check.

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

# Load the Kaggle House Prices training set from the project repository.
train_url <- "https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/train.csv"
train <- read.csv(file = train_url, header = TRUE, stringsAsFactors = FALSE)

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Subsetting the dataset to get only the numeric columns.

First I will plot a scatterplot of gross living area against sale price.

# Jittered scatter plot of above-ground living area against sale price.
# hjust is a justification value on a 0-1 scale (0 = left, 1 = right), so
# 0.5 centres the title; the previous value of 5000 was far outside that range.
ggplot(train, aes(x = GrLivArea, y = SalePrice)) +
  geom_jitter(color = "seagreen4") +
  theme_classic() +
  labs(title = "Scatter Plot of Gross living area vs Sale price") +
  theme(plot.title = element_text(hjust = 0.5))

Scatter plot for Masonry veneer area in square feet and sale price

# Jittered scatter plot of masonry veneer area against sale price.
# hjust is a justification value on a 0-1 scale (0 = left, 1 = right), so
# 0.5 centres the title; the previous value of 5000 was far outside that range.
ggplot(train, aes(x = MasVnrArea, y = SalePrice)) +
  geom_jitter(color = "seagreen4") +
  theme_classic() +
  labs(title = "Scatter Plot of Masonry veneer area vs Sale price") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 8 rows containing missing values (geom_point).

From the above scatter plots we can see that there is some kind of relation between the independent and dependent variables.

Let's now do some descriptive analysis on the available data.

Descriptive statistics and plots

# Flag the numeric columns, keep only those, and summarise each one.
num <- vapply(train, is.numeric, logical(1))
train_num <- train[, num]
summary(train_num)
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   : 21.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 59.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 69.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 70.05   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 80.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                   NA's   :259                      
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 

Bar plots to check relation to Sale price.

Histograms for Ordinal Data

Derive a correlation matrix for any THREE quantitative variables in the dataset

Selected variables are: SalePrice,TotalBsmtSF,GrLivArea

# Correlation matrix of the three selected variables, using only the rows in
# which none of them is missing.
cor_vars <- c("SalePrice", "TotalBsmtSF", "GrLivArea")
train_cor <- train[cor_vars]
train_cor_matrix <- cor(train_cor, use = "complete.obs")
train_cor_matrix
##             SalePrice TotalBsmtSF GrLivArea
## SalePrice   1.0000000   0.6135806 0.7086245
## TotalBsmtSF 0.6135806   1.0000000 0.4548682
## GrLivArea   0.7086245   0.4548682 1.0000000

The matrix shows moderate to strong positive correlations among the three variables. ‘SalePrice’ is strongly correlated with ‘GrLivArea’ (r = 0.709) and with ‘TotalBsmtSF’ (r = 0.614), while ‘TotalBsmtSF’ and ‘GrLivArea’ are moderately correlated with each other (r = 0.455).

# Scatterplot matrix of the three variables themselves. pairs.panels() is
# given the data (train_cor), not the 3 x 3 correlation matrix: passing the
# matrix would treat its three rows as if they were three observations.
pairs.panels(train_cor)

Correlation test between each pair:

Test between ‘TotalBsmtSF’ and ‘SalePrice’

# Pearson test of zero correlation between basement area and sale price,
# reporting an 80% confidence interval for the correlation.
cor.test(train$TotalBsmtSF, train$SalePrice, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806

Test between ‘GrLivArea’ and ‘SalePrice’

# Pearson test of zero correlation between living area and sale price,
# reporting an 80% confidence interval for the correlation.
cor.test(train$GrLivArea, train$SalePrice, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245

Test between ‘GrLivArea’ and ‘TotalBsmtSF’

# Pearson test of zero correlation between living area and basement area,
# reporting an 80% confidence interval for the correlation.
cor.test(train$GrLivArea, train$TotalBsmtSF, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GrLivArea and train$TotalBsmtSF
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4278380 0.4810855
## sample estimates:
##       cor 
## 0.4548682

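Since all three p-values are less than .05, and none of the 80% confidence intervals contains 0, we reject the null hypothesis of zero correlation for each pair: the variables are significantly correlated.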

Would you be worried about familywise error? Why or why not?

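Yes, familywise error is a potential concern. We are running three hypothesis tests, so the chance of rejecting at least one true null hypothesis is larger than the error rate of any single test; with three independent tests at a 20% level it could be as high as 1 - 0.8^3 ≈ 0.49. In this case, however, all three p-values are below 2.2e-16, so the conclusions would be unchanged even after a correction such as Bonferroni. Separately, other variables in this dataset might influence the correlation of the pairs of variables being tested here.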

Linear Algebra and Correlation:

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Correlation Matrix:

# Show the 3 x 3 correlation matrix again; it is the matrix inverted below.
train_cor_matrix
##             SalePrice TotalBsmtSF GrLivArea
## SalePrice   1.0000000   0.6135806 0.7086245
## TotalBsmtSF 0.6135806   1.0000000 0.4548682
## GrLivArea   0.7086245   0.4548682 1.0000000

Precision matrix:

# The precision matrix is the inverse of the correlation matrix;
# the outer parentheses assign and print in one step.
(pre_matrix <- solve(a = train_cor_matrix))
##              SalePrice TotalBsmtSF   GrLivArea
## SalePrice    2.5582310 -0.93946422 -1.38549273
## TotalBsmtSF -0.9394642  1.60588442 -0.06473842
## GrLivArea   -1.3854927 -0.06473842  2.01124151

Multiplication of correlation matrix by the precision matrix:

# Correlation matrix times its inverse, rounded to two decimals.
cor_by_pre <- train_cor_matrix %*% pre_matrix
round(cor_by_pre, digits = 2)
##             SalePrice TotalBsmtSF GrLivArea
## SalePrice           1           0         0
## TotalBsmtSF         0           1         0
## GrLivArea           0           0         1

Multiplication of precision matrix by the correlation matrix:

# Inverse times the correlation matrix, rounded to two decimals.
pre_by_cor <- pre_matrix %*% train_cor_matrix
round(pre_by_cor, digits = 2)
##             SalePrice TotalBsmtSF GrLivArea
## SalePrice           1           0         0
## TotalBsmtSF         0           1         0
## GrLivArea           0           0         1

Both products are identical: each equals the 3 x 3 identity matrix, as expected when a matrix is multiplied by its inverse.

LU decomposition of corelation matrix:

# LU-factorise the correlation matrix with the Matrix package, then unpack
# the lower (L) and upper (U) triangular factors from the expanded result.
cor_lu <- Matrix::lu(train_cor_matrix)
cor_lu_exd <- Matrix::expand(cor_lu)

L_cor <- cor_lu_exd[["L"]]
U_cor <- cor_lu_exd[["U"]]

L_cor
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,] 1.00000000          .          .
## [2,] 0.61358055 1.00000000          .
## [3,] 0.70862448 0.03218829 1.00000000
U_cor
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000 0.6135806 0.7086245
## [2,]         . 0.6235189 0.0200700
## [3,]         .         . 0.4972053

LU decomposition of precision matrix:

# LU-factorise the precision matrix with the Matrix package, then unpack
# the lower (L) and upper (U) triangular factors from the expanded result.
pre_lu <- Matrix::lu(pre_matrix)
pre_lu_exd <- Matrix::expand(pre_lu)

L_pre <- pre_lu_exd[["L"]]
U_pre <- pre_lu_exd[["U"]]

L_pre
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,]  1.0000000          .          .
## [2,] -0.3672320  1.0000000          .
## [3,] -0.5415823 -0.4548682  1.0000000
U_pre
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]      
## [1,]  2.5582310 -0.9394642 -1.3854927
## [2,]          .  1.2608831 -0.5735356
## [3,]          .          .  1.0000000

Let's multiply the lower and upper triangular matrices and see whether the products return the original matrices.

# Multiplying each pair of factors back together: L %*% U should reproduce
# the matrix that was decomposed (correlation matrix first, then precision).
L_cor %*% U_cor
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.6135806 0.7086245
## [2,] 0.6135806 1.0000000 0.4548682
## [3,] 0.7086245 0.4548682 1.0000000
L_pre %*% U_pre
## 3 x 3 Matrix of class "dgeMatrix"
##            [,1]        [,2]        [,3]
## [1,]  2.5582310 -0.93946422 -1.38549273
## [2,] -0.9394642  1.60588442 -0.06473842
## [3,] -1.3854927 -0.06473842  2.01124151

It returns the original matrices.

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Check the smallest value to decide whether the variable must be shifted
# above zero before fitting.
min(train$GrLivArea)
## [1] 334

The minimum value is 334, which is already above zero, so no shifting is needed.

Fitting the exponential probability density function:

# Fit an exponential distribution to GrLivArea with MASS::fitdistr().
expo <- fitdistr(train$GrLivArea, densfun = "exponential")
options(scipen = 999)  # print the small rate in fixed, not scientific, notation
expo$estimate
##        rate 
## 0.000659864
# Fix the random seed so the simulated sample, and everything derived from
# it, is identical on every run.
set.seed(605)
smpl <- rexp(1000, rate = expo$estimate)

Histogram of the samples and the original :

# Histogram of the original variable.
hist(train$GrLivArea)

# Histogram of the 1000 values simulated from the fitted exponential.
hist(smpl)

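The simulated sample is much more right-skewed than the original data: the exponential sample piles up near zero, whereas GrLivArea has no values below 334 and is centred near 1,500 (median 1464).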

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF):

# Empirical CDF of the simulated exponential sample.
P <- ecdf(smpl)
plot(P)

# 5th and 95th percentiles of the simulated sample.
c <- quantile(P, c(0.05, 0.95))
c
##         5%        95% 
##   69.04413 4516.50970

# Percentiles of the fitted exponential distribution itself, obtained by
# inverting its CDF F(q) = 1 - exp(-rate * q), i.e. q = -log(1 - p) / rate.
exp_pct <- qexp(c(0.05, 0.95), rate = unname(expo$estimate))
round(exp_pct, 2)
## [1]   77.73 4539.92

The 5th and 95th percentiles of the fitted exponential distribution are 77.73 and 4539.92 respectively. The percentiles of the simulated sample shown above (69.04413 and 4516.50970) are close to these and differ only through sampling variation.

Generate a 95% confidence interval from the empirical data, assuming normality:

# Normal-theory 95% confidence interval for the mean of GrLivArea:
# mean -/+ qnorm(0.975) * sd / sqrt(n).
gr_liv_area <- train$GrLivArea
er <- qnorm(0.975) * sd(gr_liv_area) / sqrt(length(gr_liv_area))
conf_95 <- mean(gr_liv_area) + c(-1, 1) * er
conf_95
## [1] 1488.509 1542.418

The 95% confidence interval for the mean of GrLivArea is (1488.509, 1542.418).

Provide the empirical 5th percentile and 95th percentile of the data:

# Empirical 5th and 95th percentiles of GrLivArea, the variable the
# exponential distribution was fitted to. (X is the Problem 1 variable and is
# not the data being analysed in this section.)
e <- quantile(train$GrLivArea, c(0.05, 0.95))
e
##     5%    95% 
##  848.0 2466.1

The empirical 5th and 95th percentiles of GrLivArea are 848.0 and 2466.1 respectively. This range is far narrower than the 5th and 95th percentiles of the simulated exponential sample (69.04 and 4516.51), which shows that the exponential distribution is a poor description of this variable.

Modeling:

Data preparation

# Download the test data
test <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data605_Final_Project/master/test.csv", header = TRUE, stringsAsFactors = FALSE)
 
# Tag every row with its origin so the two sets can be separated again later.
train <- cbind.data.frame(train, RecType = "Train")
test <- cbind.data.frame(test, RecType = "Test")

# Stack train and test so that both receive the same cleaning steps.
train_test <- rbind.data.frame(train, test, stringsAsFactors = FALSE)

We will look at the missing values in the dataset. For this I have used the Amelia package.

Missing value plot before treatment:

# Amelia missingness map of the combined data before any imputation.
missmap(train_test, main = "Missing values vs observed",  color='dodgerblue')

We can see there are lots of missing values. We will replace the missing values.

#Missing value handling

# Convert factor columns to character so that new placeholder values can be
# assigned without having to be existing factor levels.
is_fct <- vapply(train_test, is.factor, logical(1))
train_test[is_fct] <- lapply(train_test[is_fct], as.character)

# A missing garage year falls back to the year the house itself was built.
no_garage_yr <- is.na(train_test$GarageYrBlt)
train_test$GarageYrBlt[no_garage_yr] <- train_test$YearBuilt[no_garage_yr]

# Replacement value for the NAs of every other column, one entry per column.
# NOTE(review): Utilities is filled with 'NoSeWa'. The original script also
# had a later 'AllPub' fill for Utilities, but it could never take effect
# because no NAs were left by then -- confirm which value is intended.
na_fill <- list(
  LotFrontage = 0,
  MasVnrArea = 0,
  Alley = 'None',
  Utilities = 'NoSeWa',
  MasVnrType = 'None',
  BsmtQual = 'None',
  BsmtCond = 'Xa',
  BsmtExposure = 'Xb',
  BsmtFinType1 = 'Xc',
  BsmtFinType2 = 'Xd',
  GarageType = 'Xe',
  GarageFinish = 'Xf',
  GarageQual = 'Xg',
  GarageCond = 'Xh',
  Electrical = 'None',
  FireplaceQu = 'None',
  PoolQC = 'None',
  Fence = 'None',
  MiscFeature = 'None',
  MSZoning = 'C (all)',
  BsmtFullBath = 0,
  BsmtHalfBath = 0,
  Exterior1st = 'BrkFace',
  Exterior2nd = 'BrkFace',
  Functional = 'Typ',
  BsmtFinSF1 = 0,
  BsmtFinSF2 = 0,
  BsmtUnfSF = 0,
  TotalBsmtSF = 0,
  GarageCars = 0,
  GarageArea = 0,
  SaleType = 'None',
  # SalePrice is indexed by its own NAs in train_test; the mask was previously
  # built from the shorter `train` data frame, so nothing was ever filled.
  SalePrice = 0
)
for (col in names(na_fill)) {
  is_missing <- is.na(train_test[[col]])
  train_test[[col]][is_missing] <- na_fill[[col]]
}

# Turn the character columns back into factors.
is_chr <- vapply(train_test, is.character, logical(1))
train_test[is_chr] <- lapply(train_test[is_chr], as.factor)

Missing value plot after treatment:

# Amelia missingness map of the combined data after the replacements above.
missmap(train_test, main = "Missing values vs observed",  color='dodgerblue')

Creating dummy values:

# Replace the factor columns with dummy (indicator) columns.
# NOTE(review): method = "reference" presumably omits one reference level per
# factor -- confirm against the mlr documentation.
train_test <- createDummyFeatures(train_test, method = "reference")

Outlier treatment using mean:

# Replace extreme basement and first-floor areas with the mean of the values
# below the same cut-off.
bsmt_sf <- train_test$TotalBsmtSF
train_test$TotalBsmtSF[bsmt_sf > 6000] <- mean(bsmt_sf[bsmt_sf < 6000])
first_flr_sf <- train_test$X1stFlrSF
train_test$X1stFlrSF[first_flr_sf > 4000] <- mean(first_flr_sf[first_flr_sf < 4000])

# Split the combined, transformed data back into train and test rows using
# the Train indicator column.
is_train_row <- train_test$Train == 1
is_test_row <- train_test$Train == 0
data_train <- train_test[is_train_row, ]
data_test <- train_test[is_test_row, ]

# Drop the id and the train/test flag from the training data.
keep_train <- !(names(data_train) %in% c("Id", "Train"))
data_train <- data_train[, keep_train, drop = FALSE]

# Drop SalePrice and the train/test flag from the test data.
keep_test <- !(names(data_test) %in% c("SalePrice", "Train"))
data_test <- data_test[, keep_test, drop = FALSE]

Model 1: Taking all the variables:

# Model 1: linear regression of SalePrice on every other column of data_train.
model1 <- lm(SalePrice ~ ., data = data_train)
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ ., data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -176856   -9170       0    9595  176856 
## 
## Coefficients: (9 not defined because of singularities)
##                           Estimate   Std. Error t value
## (Intercept)             50689.0010 1058003.9350   0.048
## MSSubClass                -55.2068      82.5886  -0.668
## LotFrontage                 6.4440      22.9780   0.280
## LotArea                     0.7171       0.1084   6.615
## OverallQual              6793.1233    1012.8117   6.707
## OverallCond              5746.9516     870.2750   6.604
## YearBuilt                 332.7269      80.2992   4.144
## YearRemodAdd              105.2342      55.4179   1.899
## MasVnrArea                 20.7177       5.7846   3.582
## BsmtFinSF1                -53.8230      10.1663  -5.294
## BsmtFinSF2                -60.4971      12.7641  -4.740
## BsmtUnfSF                 -71.2768      10.6182  -6.713
## TotalBsmtSF                91.9050      11.1332   8.255
## X1stFlrSF                  44.7238       5.6343   7.938
## X2ndFlrSF                  62.0943       5.6794  10.933
## LowQualFinSF               -3.9424      19.0313  -0.207
## GrLivArea                       NA           NA      NA
## BsmtFullBath             1530.8180    1976.4359   0.775
## BsmtHalfBath             -483.5565    3021.9614  -0.160
## FullBath                 3653.7232    2194.5297   1.665
## HalfBath                 1820.5860    2093.5514   0.870
## BedroomAbvGr            -3613.9719    1361.0583  -2.655
## KitchenAbvGr           -13758.1760    5675.1922  -2.424
## TotRmsAbvGrd             1820.0935     954.6153   1.907
## Fireplaces               6136.5930    2558.0173   2.399
## GarageYrBlt               -43.2256      59.1637  -0.731
## GarageCars               3998.7968    2276.2568   1.757
## GarageArea                 19.1837       7.8565   2.442
## WoodDeckSF                 15.1037       5.8619   2.577
## OpenPorchSF                 0.4612      11.5602   0.040
## EnclosedPorch               3.2862      12.4680   0.264
## X3SsnPorch                 34.0992      22.3310   1.527
## ScreenPorch                35.4880      12.4837   2.843
## PoolArea                  679.8321     226.5222   3.001
## MiscVal                     0.3329       6.1047   0.055
## MoSold                   -470.0028     244.7582  -1.920
## YrSold                   -555.5184     514.2482  -1.080
## MSZoning.FV             32368.6035   11991.3668   2.699
## MSZoning.RH             22264.5836   11889.5405   1.873
## MSZoning.RL             24939.1691   10209.7142   2.443
## MSZoning.RM             21597.6512    9578.8161   2.255
## Pave                    32943.8397   12161.4633   2.709
## Alley.None              -1195.2818    4209.3297  -0.284
## Alley.Pave               -798.4830    6010.9140  -0.133
## LotShape.IR2             4950.9585    4209.0660   1.176
## LotShape.IR3             6301.9099    8799.9104   0.716
## LotShape.Reg             1714.0784    1634.1378   1.049
## LandContour.HLS          7596.2278    5121.6497   1.483
## LandContour.Low        -11283.9130    6399.2483  -1.763
## LandContour.Lvl          5406.5527    3698.8123   1.462
## NoSeWa                 -36754.2087   26340.1673  -1.395
## LotConfig.CulDSac        7846.5972    3247.0002   2.417
## LotConfig.FR2           -7774.6258    3994.4303  -1.946
## LotConfig.FR3          -17364.5956   12539.9268  -1.385
## LotConfig.Inside        -1588.8280    1754.5448  -0.906
## LandSlope.Mod            7562.4065    3967.0352   1.906
## LandSlope.Sev          -41944.4765   11367.3478  -3.690
## Neighborhood.Blueste     7346.8466   19195.8672   0.383
## Neighborhood.BrDale     -2784.9689   10950.3445  -0.254
## Neighborhood.BrkSide    -5632.4232    9462.4767  -0.595
## Neighborhood.ClearCr   -14428.1968    9194.2720  -1.569
## Neighborhood.CollgCr   -10074.3769    7247.9563  -1.390
## Neighborhood.Crawfor    11794.8702    8528.9234   1.383
## Neighborhood.Edwards   -21466.0733    7990.3517  -2.686
## Neighborhood.Gilbert   -11027.7674    7656.6699  -1.440
## Neighborhood.IDOTRR    -12175.4592   10725.5453  -1.135
## Neighborhood.MeadowV    -6846.9204   11155.9525  -0.614
## Neighborhood.Mitchel   -20799.5958    8156.3894  -2.550
## Neighborhood.NAmes     -17234.9247    7831.4109  -2.201
## Neighborhood.NoRidge    25670.2938    8411.6426   3.052
## Neighborhood.NPkVill    13767.7905   13999.8855   0.983
## Neighborhood.NridgHt    18135.3828    7506.0347   2.416
## Neighborhood.NWAmes    -17247.1156    7988.6001  -2.159
## Neighborhood.OldTown   -14150.2066    9656.0406  -1.465
## Neighborhood.Sawyer    -10999.0385    8104.8033  -1.357
## Neighborhood.SawyerW    -2769.1570    7777.8750  -0.356
## Neighborhood.Somerst    -2273.3989    8996.0717  -0.253
## Neighborhood.StoneBr    39211.4348    8276.5754   4.738
## Neighborhood.SWISU      -8868.6284    9663.1893  -0.918
## Neighborhood.Timber    -10051.1403    8124.5569  -1.237
## Neighborhood.Veenker     -133.5884   10475.1850  -0.013
## Condition1.Feedr         7248.7615    5001.1014   1.449
## Condition1.Norm         16431.1558    4177.7955   3.933
## Condition1.PosA         10210.2183    9917.1421   1.030
## Condition1.PosN         15074.7180    7458.3092   2.021
## Condition1.RRAe        -15660.7293    9046.2706  -1.731
## Condition1.RRAn         13073.4902    6930.1084   1.886
## Condition1.RRNe         -3507.3862   17448.5854  -0.201
## Condition1.RRNn         11393.5511   12823.4239   0.888
## Condition2.Feedr        -5562.1856   23372.0914  -0.238
## Condition2.Norm        -10123.3463   20253.0443  -0.500
## Condition2.PosA         44010.0113   36915.3183   1.192
## Condition2.PosN       -238028.6293   27584.9350  -8.629
## Condition2.RRAe       -127924.1298   64974.9579  -1.969
## Condition2.RRAn        -23276.0583   31462.3156  -0.740
## Condition2.RRNn         -3269.1597   27026.1939  -0.121
## BldgType.2fmCon         -3262.0367   12468.3219  -0.262
## BldgType.Duplex         -7023.1556    7398.3669  -0.949
## BldgType.Twnhs         -19170.7485    9946.0704  -1.927
## BldgType.TwnhsE        -15087.9058    8980.9617  -1.680
## HouseStyle.1.5Unf       11478.1041    7920.1433   1.449
## HouseStyle.1Story        5129.2357    4375.1508   1.172
## HouseStyle.2.5Fin      -16637.2641   12367.1848  -1.345
## HouseStyle.2.5Unf       -9649.1947    9179.2375  -1.051
## HouseStyle.2Story       -5928.7905    3492.3136  -1.698
## HouseStyle.SFoyer        1215.9024    6249.0104   0.195
## HouseStyle.SLvl          3525.4545    5575.7096   0.632
## RoofStyle.Gable          9308.6550   18428.0613   0.505
## RoofStyle.Gambrel       12059.0245   20171.4278   0.598
## RoofStyle.Hip            9111.2795   18503.5835   0.492
## RoofStyle.Mansard       20166.7008   21440.5001   0.941
## RoofStyle.Shed         100880.7892   34483.1489   2.926
## RoofMatl.CompShg       -54296.9586   11367.2977  -4.777
## RoofMatl.Membran        41083.7971   34697.1186   1.184
## RoofMatl.Metal           9834.9706   33718.9583   0.292
## RoofMatl.Roll          -66711.0218   27853.0229  -2.395
## RoofMatl.Tar.Grv       -52615.4146   21259.8848  -2.475
## RoofMatl.WdShake       -63038.3677   18578.2800  -3.393
## RoofMatl.WdShngl                NA           NA      NA
## Exterior1st.AsphShn    -24195.4755   32928.8507  -0.735
## Exterior1st.BrkComm     -3023.5319   27749.1176  -0.109
## Exterior1st.BrkFace      7068.4398   12737.8031   0.555
## Exterior1st.CBlock     -13593.9162   27210.2182  -0.500
## Exterior1st.CemntBd    -12686.5376   18978.1897  -0.668
## Exterior1st.HdBoard    -13545.8252   12924.9211  -1.048
## Exterior1st.ImStucc    -22821.8303   28107.2603  -0.812
## Exterior1st.MetalSd     -6271.8254   14574.6095  -0.430
## Exterior1st.Plywood    -14551.3915   12744.5251  -1.142
## Exterior1st.Stone       -1700.3219   24262.9298  -0.070
## Exterior1st.Stucco      -7937.1013   14057.4876  -0.565
## Exterior1st.VinylSd    -14356.5799   13305.7748  -1.079
## Exterior1st.Wd.Sdng    -14434.6938   12353.9010  -1.168
## Exterior1st.WdShing    -10289.8294   13330.6819  -0.772
## Exterior2nd.AsphShn     12180.8473   22153.0182   0.550
## Exterior2nd.Brk.Cmn      4809.2516   20037.3534   0.240
## Exterior2nd.BrkFace      4450.0695   13192.9607   0.337
## Exterior2nd.CBlock              NA           NA      NA
## Exterior2nd.CmentBd     13038.4093   18662.3035   0.699
## Exterior2nd.HdBoard      8669.7864   12399.9421   0.699
## Exterior2nd.ImStucc     17346.8038   14310.8094   1.212
## Exterior2nd.MetalSd      5936.4460   14183.5770   0.419
## Exterior2nd.Other      -18046.9297   27052.9052  -0.667
## Exterior2nd.Plywood      7069.2296   12027.5892   0.588
## Exterior2nd.Stone      -10722.8301   17100.9661  -0.627
## Exterior2nd.Stucco       6206.5429   13577.4090   0.457
## Exterior2nd.VinylSd     13226.3548   12776.4154   1.035
## Exterior2nd.Wd.Sdng     12446.6804   11908.3766   1.045
## Exterior2nd.Wd.Shng      5766.9319   12443.3492   0.463
## MasVnrType.BrkFace       4260.1179    6827.5285   0.624
## MasVnrType.None          7258.4283    6899.7149   1.052
## MasVnrType.Stone         9288.4757    7226.5287   1.285
## ExterQual.Fa            -7381.9536   11077.3828  -0.666
## ExterQual.Gd           -20674.8230    4781.9894  -4.323
## ExterQual.TA           -19968.0053    5297.2892  -3.769
## ExterCond.Fa            -2752.1863   18037.5478  -0.153
## ExterCond.Gd            -7210.9145   17211.0281  -0.419
## ExterCond.Po             6791.5332   31629.3968   0.215
## ExterCond.TA            -4172.5334   17176.3980  -0.243
## Foundation.CBlock        2919.2579    3164.5389   0.922
## Foundation.PConc         3926.5051    3416.6052   1.149
## Foundation.Slab         -7107.7539   10032.5960  -0.708
## Foundation.Stone        10019.6240   11395.3194   0.879
## Foundation.Wood        -26521.3624   14720.3859  -1.802
## BsmtQual.Fa            -11693.8164    6324.6293  -1.849
## BsmtQual.Gd            -18112.9265    3322.4962  -5.452
## BsmtQual.None           37028.3295   36604.5517   1.012
## BsmtQual.TA            -14420.0303    4133.3147  -3.489
## BsmtCond.Gd               116.1538    5271.6628   0.022
## BsmtCond.Po             67283.9021   29783.0774   2.259
## BsmtCond.TA              2882.8125    4237.8355   0.680
## BsmtCond.Xa                     NA           NA      NA
## BsmtExposure.Gd         14279.3862    2993.0810   4.771
## BsmtExposure.Mn         -3519.6978    3014.4004  -1.168
## BsmtExposure.No         -5216.1079    2177.1604  -2.396
## BsmtExposure.Xb        -11104.6108   22959.4392  -0.484
## BsmtFinType1.BLQ         2920.5408    2792.8595   1.046
## BsmtFinType1.GLQ         5672.5895    2518.4459   2.252
## BsmtFinType1.LwQ        -3398.6030    3739.2399  -0.909
## BsmtFinType1.Rec          159.8008    2993.1071   0.053
## BsmtFinType1.Unf         2693.4108    2909.4863   0.926
## BsmtFinType1.Xc                 NA           NA      NA
## BsmtFinType2.BLQ       -12902.5450    7554.8913  -1.708
## BsmtFinType2.GLQ        -2793.6188    9339.0802  -0.299
## BsmtFinType2.LwQ       -14181.3034    7386.0856  -1.920
## BsmtFinType2.Rec       -10080.5241    7102.2129  -1.419
## BsmtFinType2.Unf        -8153.2144    7562.2190  -1.078
## BsmtFinType2.Xd        -28466.2919   24945.1604  -1.141
## Heating.GasA             8673.7104   25537.5471   0.340
## Heating.GasW             6062.2166   26321.6313   0.230
## Heating.Grav              775.1689   28029.8596   0.028
## Heating.OthW           -12340.4600   31401.0400  -0.393
## Heating.Wall            22259.8027   29685.3382   0.750
## HeatingQC.Fa              475.1322    4703.8348   0.101
## HeatingQC.Gd            -4087.1845    2068.1473  -1.976
## HeatingQC.Po             2250.6789   26516.8270   0.085
## HeatingQC.TA            -3293.9653    2067.9185  -1.593
## Y                         -63.8371    3864.3371  -0.017
## Electrical.FuseF            6.7674    5741.3344   0.001
## Electrical.FuseP        -8720.2176   18552.4726  -0.470
## Electrical.Mix         -42334.2987   44393.8001  -0.954
## Electrical.None         10953.3425   24035.9446   0.456
## Electrical.SBrkr        -2033.1728    2944.8710  -0.690
## KitchenQual.Fa         -19738.5036    6187.0103  -3.190
## KitchenQual.Gd         -23667.9092    3476.7663  -6.807
## KitchenQual.TA         -22719.4821    3918.1827  -5.798
## Functional.Maj2         -1325.4995   14356.4816  -0.092
## Functional.Min1          6944.2282    8583.7018   0.809
## Functional.Min2          8728.2441    8620.9007   1.012
## Functional.Mod          -5292.6771   10535.8713  -0.502
## Functional.Sev         -41005.9777   29566.0313  -1.387
## Functional.Typ          18234.1483    7450.5590   2.447
## FireplaceQu.Fa           -859.2110    6871.6106  -0.125
## FireplaceQu.Gd           2831.6547    5311.9619   0.533
## FireplaceQu.None         8755.1760    6219.5933   1.408
## FireplaceQu.Po          12398.4490    7906.4940   1.568
## FireplaceQu.TA           3823.8994    5521.7779   0.693
## GarageType.Attchd       20173.2972   10985.8658   1.836
## GarageType.Basment      24640.4455   12741.7138   1.934
## GarageType.BuiltIn      19959.3727   11452.1918   1.743
## GarageType.CarPort      25327.8378   14664.5199   1.727
## GarageType.Detchd       23289.0183   11003.8188   2.116
## GarageType.Xe           22803.9775   20768.7623   1.098
## GarageFinish.RFn        -2373.6793    1957.9719  -1.212
## GarageFinish.Unf         -610.3553    2425.3806  -0.252
## GarageFinish.Xf                 NA           NA      NA
## GarageQual.Fa         -126512.1485   30162.5660  -4.194
## GarageQual.Gd         -121022.6143   30955.2407  -3.910
## GarageQual.Po         -144089.8550   38501.9031  -3.742
## GarageQual.TA         -120117.7800   29863.3903  -4.022
## GarageQual.Xg                   NA           NA      NA
## GarageCond.Fa          112696.3238   34780.9610   3.240
## GarageCond.Gd          111760.4259   36144.7929   3.092
## GarageCond.Po          118816.6649   37350.1430   3.181
## GarageCond.TA          114372.5942   34476.0278   3.317
## GarageCond.Xh                   NA           NA      NA
## PavedDrive.P            -3138.2039    5557.2082  -0.565
## PavedDrive.Y             -107.8460    3459.9058  -0.031
## PoolQC.Fa             -158215.0117   40812.1637  -3.877
## PoolQC.Gd             -128651.1973   36766.3454  -3.499
## PoolQC.None            251031.6645  122502.1228   2.049
## Fence.GdWo               8006.5217    4900.7688   1.634
## Fence.MnPrv              9478.6905    4000.2866   2.370
## Fence.MnWw               3311.8991    8207.3731   0.404
## Fence.None               8897.4672    3668.5233   2.425
## MiscFeature.None         4930.3832   97063.3133   0.051
## MiscFeature.Othr        18980.0713   90633.2774   0.209
## MiscFeature.Shed         7132.8294   92995.8744   0.077
## MiscFeature.TenC        35670.3586   96465.3906   0.370
## SaleType.Con            25412.6404   17533.2722   1.449
## SaleType.ConLD          16662.3932    9665.1446   1.724
## SaleType.ConLI           4554.0371   11546.3439   0.394
## SaleType.ConLw            663.9818   12134.3903   0.055
## SaleType.CWD            15164.2739   12853.3066   1.180
## SaleType.New            21778.8614   15393.4483   1.415
## SaleType.None                   NA           NA      NA
## SaleType.Oth             8139.3331   14480.4758   0.562
## SaleType.WD              -346.1417    4173.1517  -0.083
## SaleCondition.AdjLand    9352.4799   14581.6309   0.641
## SaleCondition.Alloca      711.6552    8853.9899   0.080
## SaleCondition.Family      826.3703    6080.5347   0.136
## SaleCondition.Normal     6733.0346    2901.9214   2.320
## SaleCondition.Partial    -729.8389   14817.4379  -0.049
##                                   Pr(>|t|)    
## (Intercept)                       0.961796    
## MSSubClass                        0.503970    
## LotFrontage                       0.779187    
## LotArea               0.000000000055541275 ***
## OverallQual           0.000000000030413426 ***
## OverallCond           0.000000000059996271 ***
## YearBuilt             0.000036575706995979 ***
## YearRemodAdd                      0.057813 .  
## MasVnrArea                        0.000355 ***
## BsmtFinSF1            0.000000141848206828 ***
## BsmtFinSF2            0.000002395504235854 ***
## BsmtUnfSF             0.000000000029327932 ***
## TotalBsmtSF           0.000000000000000395 ***
## X1stFlrSF             0.000000000000004675 ***
## X2ndFlrSF             < 0.0000000000000002 ***
## LowQualFinSF                      0.835924    
## GrLivArea                               NA    
## BsmtFullBath                      0.438766    
## BsmtHalfBath                      0.872897    
## FullBath                          0.096188 .  
## HalfBath                          0.384683    
## BedroomAbvGr                      0.008029 ** 
## KitchenAbvGr                      0.015485 *  
## TotRmsAbvGrd                      0.056806 .  
## Fireplaces                        0.016592 *  
## GarageYrBlt                       0.465159    
## GarageCars                        0.079215 .  
## GarageArea                        0.014758 *  
## WoodDeckSF                        0.010095 *  
## OpenPorchSF                       0.968186    
## EnclosedPorch                     0.792156    
## X3SsnPorch                        0.127027    
## ScreenPorch                       0.004548 ** 
## PoolArea                          0.002745 ** 
## MiscVal                           0.956521    
## MoSold                            0.055059 .  
## YrSold                            0.280245    
## MSZoning.FV                       0.007045 ** 
## MSZoning.RH                       0.061363 .  
## MSZoning.RL                       0.014721 *  
## MSZoning.RM                       0.024329 *  
## Pave                              0.006847 ** 
## Alley.None                        0.776490    
## Alley.Pave                        0.894343    
## LotShape.IR2                      0.239723    
## LotShape.IR3                      0.474047    
## LotShape.Reg                      0.294425    
## LandContour.HLS                   0.138293    
## LandContour.Low                   0.078100 .  
## LandContour.Lvl                   0.144084    
## NoSeWa                            0.163162    
## LotConfig.CulDSac                 0.015815 *  
## LotConfig.FR2                     0.051842 .  
## LotConfig.FR3                     0.166386    
## LotConfig.Inside                  0.365355    
## LandSlope.Mod                     0.056847 .  
## LandSlope.Sev                     0.000234 ***
## Neighborhood.Blueste              0.701987    
## Neighborhood.BrDale               0.799286    
## Neighborhood.BrkSide              0.551796    
## Neighborhood.ClearCr              0.116850    
## Neighborhood.CollgCr              0.164797    
## Neighborhood.Crawfor              0.166943    
## Neighborhood.Edwards              0.007320 ** 
## Neighborhood.Gilbert              0.150047    
## Neighborhood.IDOTRR               0.256524    
## Neighborhood.MeadowV              0.539499    
## Neighborhood.Mitchel              0.010892 *  
## Neighborhood.NAmes                0.027943 *  
## Neighborhood.NoRidge              0.002325 ** 
## Neighborhood.NPkVill              0.325597    
## Neighborhood.NridgHt              0.015835 *  
## Neighborhood.NWAmes               0.031049 *  
## Neighborhood.OldTown              0.143065    
## Neighborhood.Sawyer               0.175003    
## Neighborhood.SawyerW              0.721880    
## Neighborhood.Somerst              0.800535    
## Neighborhood.StoneBr  0.000002418783958876 ***
## Neighborhood.SWISU                0.358920    
## Neighborhood.Timber               0.216279    
## Neighborhood.Veenker              0.989827    
## Condition1.Feedr                  0.147476    
## Condition1.Norm       0.000088678513726580 ***
## Condition1.PosA                   0.303426    
## Condition1.PosN                   0.043480 *  
## Condition1.RRAe                   0.083675 .  
## Condition1.RRAn                   0.059471 .  
## Condition1.RRNe                   0.840723    
## Condition1.RRNn                   0.374451    
## Condition2.Feedr                  0.811934    
## Condition2.Norm                   0.617277    
## Condition2.PosA                   0.233422    
## Condition2.PosN       < 0.0000000000000002 ***
## Condition2.RRAe                   0.049202 *  
## Condition2.RRAn                   0.459561    
## Condition2.RRNn                   0.903741    
## BldgType.2fmCon                   0.793654    
## BldgType.Duplex                   0.342666    
## BldgType.Twnhs                    0.054156 .  
## BldgType.TwnhsE                   0.093218 .  
## HouseStyle.1.5Unf                 0.147533    
## HouseStyle.1Story                 0.241285    
## HouseStyle.2.5Fin                 0.178789    
## HouseStyle.2.5Unf                 0.293378    
## HouseStyle.2Story                 0.089828 .  
## HouseStyle.SFoyer                 0.845758    
## HouseStyle.SLvl                   0.527318    
## RoofStyle.Gable                   0.613556    
## RoofStyle.Gambrel                 0.550068    
## RoofStyle.Hip                     0.622522    
## RoofStyle.Mansard                 0.347104    
## RoofStyle.Shed                    0.003503 ** 
## RoofMatl.CompShg      0.000002001571365855 ***
## RoofMatl.Membran                  0.236619    
## RoofMatl.Metal                    0.770585    
## RoofMatl.Roll                     0.016767 *  
## RoofMatl.Tar.Grv                  0.013465 *  
## RoofMatl.WdShake                  0.000713 ***
## RoofMatl.WdShngl                        NA    
## Exterior1st.AsphShn               0.462616    
## Exterior1st.BrkComm               0.913253    
## Exterior1st.BrkFace               0.579053    
## Exterior1st.CBlock                0.617456    
## Exterior1st.CemntBd               0.503955    
## Exterior1st.HdBoard               0.294830    
## Exterior1st.ImStucc               0.416978    
## Exterior1st.MetalSd               0.667036    
## Exterior1st.Plywood               0.253774    
## Exterior1st.Stone                 0.944142    
## Exterior1st.Stucco                0.572439    
## Exterior1st.VinylSd               0.280815    
## Exterior1st.Wd.Sdng               0.242863    
## Exterior1st.WdShing               0.440330    
## Exterior2nd.AsphShn               0.582524    
## Exterior2nd.Brk.Cmn               0.810360    
## Exterior2nd.BrkFace               0.735945    
## Exterior2nd.CBlock                      NA    
## Exterior2nd.CmentBd               0.484906    
## Exterior2nd.HdBoard               0.484575    
## Exterior2nd.ImStucc               0.225693    
## Exterior2nd.MetalSd               0.675624    
## Exterior2nd.Other                 0.504837    
## Exterior2nd.Plywood               0.556809    
## Exterior2nd.Stone                 0.530758    
## Exterior2nd.Stucco                0.647665    
## Exterior2nd.VinylSd               0.300775    
## Exterior2nd.Wd.Sdng               0.296138    
## Exterior2nd.Wd.Shng               0.643122    
## MasVnrType.BrkFace                0.532770    
## MasVnrType.None                   0.293015    
## MasVnrType.Stone                  0.198923    
## ExterQual.Fa                      0.505284    
## ExterQual.Gd          0.000016624325938440 ***
## ExterQual.TA                      0.000171 ***
## ExterCond.Fa                      0.878754    
## ExterCond.Gd                      0.675312    
## ExterCond.Po                      0.830020    
## ExterCond.TA                      0.808107    
## Foundation.CBlock                 0.356457    
## Foundation.PConc                  0.250684    
## Foundation.Slab                   0.478793    
## Foundation.Stone                  0.379427    
## Foundation.Wood                   0.071846 .  
## BsmtQual.Fa                       0.064712 .  
## BsmtQual.Gd           0.000000060484298429 ***
## BsmtQual.None                     0.311943    
## BsmtQual.TA                       0.000503 ***
## BsmtCond.Gd                       0.982425    
## BsmtCond.Po                       0.024053 *  
## BsmtCond.TA                       0.496473    
## BsmtCond.Xa                             NA    
## BsmtExposure.Gd       0.000002058928569490 ***
## BsmtExposure.Mn                   0.243188    
## BsmtExposure.No                   0.016734 *  
## BsmtExposure.Xb                   0.628713    
## BsmtFinType1.BLQ                  0.295901    
## BsmtFinType1.GLQ                  0.024475 *  
## BsmtFinType1.LwQ                  0.363583    
## BsmtFinType1.Rec                  0.957430    
## BsmtFinType1.Unf                  0.354769    
## BsmtFinType1.Xc                         NA    
## BsmtFinType2.BLQ                  0.087923 .  
## BsmtFinType2.GLQ                  0.764891    
## BsmtFinType2.LwQ                  0.055093 .  
## BsmtFinType2.Rec                  0.156055    
## BsmtFinType2.Unf                  0.281182    
## BsmtFinType2.Xd                   0.254032    
## Heating.GasA                      0.734183    
## Heating.GasW                      0.817888    
## Heating.Grav                      0.977942    
## Heating.OthW                      0.694392    
## Heating.Wall                      0.453486    
## HeatingQC.Fa                      0.919560    
## HeatingQC.Gd                      0.048353 *  
## HeatingQC.Po                      0.932373    
## HeatingQC.TA                      0.111447    
## Y                                 0.986823    
## Electrical.FuseF                  0.999060    
## Electrical.FuseP                  0.638419    
## Electrical.Mix                    0.340473    
## Electrical.None                   0.648683    
## Electrical.SBrkr                  0.490068    
## KitchenQual.Fa                    0.001458 ** 
## KitchenQual.Gd        0.000000000015617754 ***
## KitchenQual.TA        0.000000008536782491 ***
## Functional.Maj2                   0.926453    
## Functional.Min1                   0.418674    
## Functional.Min2                   0.311525    
## Functional.Mod                    0.615514    
## Functional.Sev                    0.165720    
## Functional.Typ                    0.014533 *  
## FireplaceQu.Fa                    0.900514    
## FireplaceQu.Gd                    0.594082    
## FireplaceQu.None                  0.159484    
## FireplaceQu.Po                    0.117112    
## FireplaceQu.TA                    0.488749    
## GarageType.Attchd                 0.066560 .  
## GarageType.Basment                0.053366 .  
## GarageType.BuiltIn                0.081616 .  
## GarageType.CarPort                0.084396 .  
## GarageType.Detchd                 0.034511 *  
## GarageType.Xe                     0.272426    
## GarageFinish.RFn                  0.225629    
## GarageFinish.Unf                  0.801352    
## GarageFinish.Xf                         NA    
## GarageQual.Fa         0.000029367740820109 ***
## GarageQual.Gd         0.000097592545543651 ***
## GarageQual.Po                     0.000191 ***
## GarageQual.TA         0.000061227032228221 ***
## GarageQual.Xg                           NA    
## GarageCond.Fa                     0.001227 ** 
## GarageCond.Gd                     0.002034 ** 
## GarageCond.Po                     0.001504 ** 
## GarageCond.TA                     0.000935 ***
## GarageCond.Xh                           NA    
## PavedDrive.P                      0.572377    
## PavedDrive.Y                      0.975139    
## PoolQC.Fa                         0.000112 ***
## PoolQC.Gd                         0.000484 ***
## PoolQC.None                       0.040658 *  
## Fence.GdWo                        0.102577    
## Fence.MnPrv                       0.017969 *  
## Fence.MnWw                        0.686632    
## Fence.None                        0.015439 *  
## MiscFeature.None                  0.959497    
## MiscFeature.Othr                  0.834159    
## MiscFeature.Shed                  0.938875    
## MiscFeature.TenC                  0.711616    
## SaleType.Con                      0.147487    
## SaleType.ConLD                    0.084970 .  
## SaleType.ConLI                    0.693345    
## SaleType.ConLw                    0.956371    
## SaleType.CWD                      0.238314    
## SaleType.New                      0.157381    
## SaleType.None                           NA    
## SaleType.Oth                      0.574159    
## SaleType.WD                       0.933909    
## SaleCondition.AdjLand             0.521393    
## SaleCondition.Alloca              0.935951    
## SaleCondition.Family              0.891920    
## SaleCondition.Normal              0.020496 *  
## SaleCondition.Partial             0.960724    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22570 on 1207 degrees of freedom
## Multiple R-squared:  0.9332, Adjusted R-squared:  0.9193 
## F-statistic: 66.94 on 252 and 1207 DF,  p-value: < 0.00000000000000022

R-squared is 0.93, which means that 93% of the variance in sale price is explained by the predictor variables in the model (adjusted R-squared: 0.92). The F-statistic is 66.94 on 252 and 1207 degrees of freedom, and its p-value is very small (< 2.2e-16).

Model 2: Taking only the variables whose p-value is < .05.

# Model 2: ordinary least squares fit of SalePrice on a reduced set of 57
# predictors, trained on data_train. Terms are grouped by variable family
# below; their order is kept as-is so the coefficient table order is stable.
model2 <- lm(
  SalePrice ~
    # numeric size, age and quality measures
    LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea +
    BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF +
    X1stFlrSF + X2ndFlrSF +
    BedroomAbvGr + KitchenAbvGr + Fireplaces +
    GarageArea + ScreenPorch + PoolArea +
    # zoning, street and lot indicators
    MSZoning.FV + MSZoning.RL + Pave + LotConfig.CulDSac + LandSlope.Sev +
    # neighborhood indicators
    Neighborhood.Edwards + Neighborhood.Mitchel + Neighborhood.NAmes +
    Neighborhood.NoRidge + Neighborhood.NridgHt + Neighborhood.NWAmes +
    Neighborhood.StoneBr +
    # proximity conditions
    Condition1.Norm + Condition2.PosN + Condition2.RRAe +
    # roof style and material
    RoofStyle.Shed + RoofMatl.CompShg + RoofMatl.Tar.Grv + RoofMatl.WdShake +
    # exterior, basement and kitchen quality
    ExterQual.Gd + ExterQual.TA +
    BsmtQual.Gd + BsmtQual.TA +
    BsmtExposure.Gd + BsmtExposure.No +
    KitchenQual.Fa + KitchenQual.Gd + KitchenQual.TA +
    # home functionality
    Functional.Typ +
    # garage quality and condition
    GarageQual.Fa + GarageQual.Gd + GarageQual.Po + GarageQual.TA +
    GarageCond.Fa + GarageCond.Gd + GarageCond.Po + GarageCond.TA +
    # pool quality
    PoolQC.Fa + PoolQC.Gd + PoolQC.None,
  data = data_train
)
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + OverallQual + OverallCond + 
##     YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + 
##     TotalBsmtSF + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + 
##     Fireplaces + GarageArea + ScreenPorch + PoolArea + MSZoning.FV + 
##     MSZoning.RL + Pave + LotConfig.CulDSac + LandSlope.Sev + 
##     Neighborhood.Edwards + Neighborhood.Mitchel + Neighborhood.NAmes + 
##     Neighborhood.NoRidge + Neighborhood.NridgHt + Neighborhood.NWAmes + 
##     Neighborhood.StoneBr + Condition1.Norm + Condition2.PosN + 
##     Condition2.RRAe + RoofStyle.Shed + RoofMatl.CompShg + RoofMatl.Tar.Grv + 
##     RoofMatl.WdShake + ExterQual.Gd + ExterQual.TA + BsmtQual.Gd + 
##     BsmtQual.TA + BsmtExposure.Gd + BsmtExposure.No + KitchenQual.Fa + 
##     KitchenQual.Gd + KitchenQual.TA + Functional.Typ + GarageQual.Fa + 
##     GarageQual.Gd + GarageQual.Po + GarageQual.TA + GarageCond.Fa + 
##     GarageCond.Gd + GarageCond.Po + GarageCond.TA + PoolQC.Fa + 
##     PoolQC.Gd + PoolQC.None, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189512  -11179       0   11300  189512 
## 
## Coefficients:
##                           Estimate    Std. Error t value
## (Intercept)          -938190.68680  124186.41803  -7.555
## LotArea                    0.73554       0.08809   8.350
## OverallQual             7595.54490     905.78430   8.386
## OverallCond             7000.29721     689.52510  10.152
## YearBuilt                378.70965      42.18476   8.977
## MasVnrArea                 9.35417       4.46110   2.097
## BsmtFinSF1               -49.99432       8.59135  -5.819
## BsmtFinSF2               -62.47200       9.58936  -6.515
## BsmtUnfSF                -64.99445       8.76936  -7.412
## TotalBsmtSF               89.82543       9.09925   9.872
## X1stFlrSF                 57.99102       4.01495  14.444
## X2ndFlrSF                 60.90316       2.58524  23.558
## BedroomAbvGr           -2987.13098    1114.72709  -2.680
## KitchenAbvGr          -23415.19408    3354.71928  -6.980
## Fireplaces              3571.10089    1257.95527   2.839
## GarageArea                32.79753       4.79670   6.838
## ScreenPorch               30.48516      12.28064   2.482
## PoolArea                 567.08748     170.86035   3.319
## MSZoning.FV            23137.87498    3874.44977   5.972
## MSZoning.RL            12134.39489    2168.61810   5.595
## Pave                   47703.24966   10609.97969   4.496
## LotConfig.CulDSac       7224.96443    2751.28865   2.626
## LandSlope.Sev         -48862.33442    9040.62587  -5.405
## Neighborhood.Edwards  -12037.74394    2819.69870  -4.269
## Neighborhood.Mitchel  -16232.09807    3818.56820  -4.251
## Neighborhood.NAmes    -10573.24544    2198.59669  -4.809
## Neighborhood.NoRidge   31073.51663    4479.96403   6.936
## Neighborhood.NridgHt   19984.90501    3616.65653   5.526
## Neighborhood.NWAmes   -14416.30363    3247.89442  -4.439
## Neighborhood.StoneBr   34315.91779    5265.91340   6.517
## Condition1.Norm         8586.78138    1974.78021   4.348
## Condition2.PosN      -224536.76857   18115.51178 -12.395
## Condition2.RRAe       -90279.30221   37925.02755  -2.380
## RoofStyle.Shed         65340.78938   28694.28331   2.277
## RoofMatl.CompShg      -40099.69136    9054.74520  -4.429
## RoofMatl.Tar.Grv      -60414.85471   11756.19954  -5.139
## RoofMatl.WdShake      -49513.75006   15416.32738  -3.212
## ExterQual.Gd          -25348.96717    4024.72647  -6.298
## ExterQual.TA          -27049.54576    4224.18382  -6.403
## BsmtQual.Gd           -22890.78640    2549.24314  -8.979
## BsmtQual.TA           -18593.85046    2732.45006  -6.805
## BsmtExposure.Gd        16233.52733    2758.07699   5.886
## BsmtExposure.No        -4733.71014    1638.95501  -2.888
## KitchenQual.Fa        -23677.02118    5571.32885  -4.250
## KitchenQual.Gd        -25199.44588    3401.10654  -7.409
## KitchenQual.TA        -27154.87080    3794.09727  -7.157
## Functional.Typ         14567.61731    2787.61949   5.226
## GarageQual.Fa         -99047.72202   27266.19186  -3.633
## GarageQual.Gd         -94081.71264   27875.22260  -3.375
## GarageQual.Po        -104831.42807   33276.15816  -3.150
## GarageQual.TA         -99518.81798   26984.43691  -3.688
## GarageCond.Fa          87558.61820   27775.92974   3.152
## GarageCond.Gd          74122.37368   28700.60498   2.583
## GarageCond.Po          86241.02267   30269.59928   2.849
## GarageCond.TA          89485.97371   27301.79633   3.278
## PoolQC.Fa            -117774.30077   26623.72670  -4.424
## PoolQC.Gd            -100054.08168   32196.77108  -3.108
## PoolQC.None           213222.69770   92957.18077   2.294
##                                  Pr(>|t|)    
## (Intercept)            0.0000000000000754 ***
## LotArea              < 0.0000000000000002 ***
## OverallQual          < 0.0000000000000002 ***
## OverallCond          < 0.0000000000000002 ***
## YearBuilt            < 0.0000000000000002 ***
## MasVnrArea                       0.036187 *  
## BsmtFinSF1             0.0000000073193130 ***
## BsmtFinSF2             0.0000000001012574 ***
## BsmtUnfSF              0.0000000000002150 ***
## TotalBsmtSF          < 0.0000000000000002 ***
## X1stFlrSF            < 0.0000000000000002 ***
## X2ndFlrSF            < 0.0000000000000002 ***
## BedroomAbvGr                     0.007455 ** 
## KitchenAbvGr           0.0000000000045475 ***
## Fireplaces                       0.004593 ** 
## GarageArea             0.0000000000119914 ***
## ScreenPorch                      0.013167 *  
## PoolArea                         0.000927 ***
## MSZoning.FV            0.0000000029673728 ***
## MSZoning.RL            0.0000000264275476 ***
## Pave                   0.0000074925231460 ***
## LotConfig.CulDSac                0.008733 ** 
## LandSlope.Sev          0.0000000761778431 ***
## Neighborhood.Edwards   0.0000209400557431 ***
## Neighborhood.Mitchel   0.0000227054093716 ***
## Neighborhood.NAmes     0.0000016796722710 ***
## Neighborhood.NoRidge   0.0000000000061357 ***
## Neighborhood.NridgHt   0.0000000390532310 ***
## Neighborhood.NWAmes    0.0000097611614729 ***
## Neighborhood.StoneBr   0.0000000001000278 ***
## Condition1.Norm        0.0000147160750638 ***
## Condition2.PosN      < 0.0000000000000002 ***
## Condition2.RRAe                  0.017424 *  
## RoofStyle.Shed                   0.022928 *  
## RoofMatl.CompShg       0.0000102218882677 ***
## RoofMatl.Tar.Grv       0.0000003152216592 ***
## RoofMatl.WdShake                 0.001349 ** 
## ExterQual.Gd           0.0000000004018333 ***
## ExterQual.TA           0.0000000002067092 ***
## BsmtQual.Gd          < 0.0000000000000002 ***
## BsmtQual.TA            0.0000000000149461 ***
## BsmtExposure.Gd        0.0000000049485673 ***
## BsmtExposure.No                  0.003933 ** 
## KitchenQual.Fa         0.0000228093086633 ***
## KitchenQual.Gd         0.0000000000002187 ***
## KitchenQual.TA         0.0000000000013241 ***
## Functional.Typ         0.0000001995910146 ***
## GarageQual.Fa                    0.000291 ***
## GarageQual.Gd                    0.000758 ***
## GarageQual.Po                    0.001665 ** 
## GarageQual.TA                    0.000235 ***
## GarageCond.Fa                    0.001654 ** 
## GarageCond.Gd                    0.009906 ** 
## GarageCond.Po                    0.004448 ** 
## GarageCond.TA                    0.001072 ** 
## PoolQC.Fa              0.0000104544586912 ***
## PoolQC.Gd                        0.001924 ** 
## PoolQC.None                      0.021951 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24290 on 1402 degrees of freedom
## Multiple R-squared:  0.9102, Adjusted R-squared:  0.9065 
## F-statistic: 249.2 on 57 and 1402 DF,  p-value: < 0.00000000000000022

This model did not improve performance. The R-squared is 0.91, which means that 91% of the variance in sale price is explained by the predictor variables in the model. The F-statistic is 249.2 on 57 and 1402 degrees of freedom, and the p-value remains the same.

Prediction:

# Score the Kaggle test set with both fitted models and write one submission
# file per model. `data_test`, `model1` and `model2` are created earlier in
# this document.
model1_data <- data_test
model2_data <- data_test
# modelColumns <- colnames(HouseDF) testDF_model <-
# testData[,colnames(testData) %in% modelColumns]

model1_data$salePrice <- predict(model1, data_test)
## Warning in predict.lm(model1, data_test): prediction from a rank-deficient
## fit may be misleading
model2_data$salePrice <- predict(model2, data_test)

# Build a two-column submission frame (Id, salePrice) and replace any missing
# entry with `na_fill`.
#   ids     vector of test-set identifiers
#   prices  vector of predicted sale prices, same length as `ids`
#   na_fill value substituted for NA entries (defaults to 0, as before)
# Returns a data.frame with columns `Id` and `salePrice`.
# NOTE(review): 0 is not a plausible sale price; confirm whether a fallback
# such as the median prediction would be a better choice for missing values.
build_submission <- function(ids, prices, na_fill = 0) {
  # Build the frame directly instead of via cbind(), which would force both
  # columns through a single-typed matrix.
  submission <- data.frame(Id = ids, salePrice = prices)
  submission[is.na(submission)] <- na_fill
  submission
}

Id <- data_test$Id
# Kaggle dataset for model1
salePrice <- model1_data$salePrice
kaggleData1 <- build_submission(Id, salePrice)
# write.csv(kaggleData_modelDF,'kaggleData_model.csv')

# Kaggle dataset for model2
salePrice <- model2_data$salePrice
kaggleData2 <- build_submission(Id, salePrice)

# Single place to change the output location; file.path() joins with "/",
# so the resulting paths are the same as the previously hard-coded ones.
out_dir <- "C:/Users/rites/Documents/GitHub/Data605_Final_Project"
write.csv(
  kaggleData1,
  file.path(out_dir, "kaggleout1.csv"),
  row.names = FALSE
)
write.csv(
  kaggleData2,
  file.path(out_dir, "kaggleout2.csv"),
  row.names = FALSE
)

Kaggle.com user name : riteshlohiya999

Kaggle score:

Model1 : 0.47336

Model2: 0.50297

For my 1st model: the R-squared is 0.93, which means that 93% of the variance in sale price is explained by the predictor variables in the model. The F-statistic is 66.94 with 1207 degrees of freedom, and the p-value is also very small. The Kaggle score is 0.47336.

For my 2nd model: this model did not improve performance. The R-squared is 0.91, which means that 91% of the variance in sale price is explained by the predictor variables in the model. The F-statistic is 249.2 on 57 and 1402 degrees of freedom, and the p-value remains the same. The Kaggle score is 0.50297.