DATA605_Final Project

DATA LOADING

library(MASS)

## Warning: package 'MASS' was built under R version 3.4.3

library(ggplot2)
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.4.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.4.2

## corrplot 0.84 loaded

data <- read.csv("Data/train.csv")
head(data)

##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000

Defining variables x and y i m choosing x as Grlivarea and y as sale price

X <- data$LotArea
Y <- data$SalePrice

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities

x <- quantile(X, .25)
x

##    25% 
## 7553.5

y<- quantile(Y,.50)
y

##    50% 
## 163000

a)P(X>x|Y>y) Probability of X greater than first quartile of X (7553.5 ) given that Y is greater than 2nd quartile of Y (163,000).

total<-nrow(data)
px_py<-nrow(subset(data,LotArea>x & SalePrice>y))
p_y<-nrow(subset(data,SalePrice>y))
px <- nrow(subset(data,SalePrice>y))/total
p1<-px_py/total
p2<-p_y/total
a<-p1/p2
a

## [1] 0.8653846

P(X>x & Y>y) Probability that X is greater than first qurtile of X (7553.5 ) and Y (sale price) is greater than second quartile of Y (163,000)

p1

## [1] 0.4315068

c.P(Xy) Probability of X less than first quartile of X given Y is greater than second quartile of Y.

xl<-nrow(subset(data,LotArea<x & SalePrice>y))
yl<-nrow(subset(data,SalePrice>y))
p3 <-xl/total
p4<-yl/total
c<-p3/p4
c

## [1] 0.1346154

# X<=x, Y<=y
a1 <- subset(data, LotArea <= x & SalePrice <= y)
row1_col1 <- nrow(a1)

# X>x, Y<=y
a2 <- subset(data, LotArea > x & SalePrice <= y)
row1_col2 <- nrow(a1) 

# X<=x, Y>y
a3 <- subset(data, LotArea <= x & SalePrice > y)
row2_col1 <- nrow(a3) 

# X>x, Y>y
a4 <- subset(data, LotArea > x & SalePrice > y)
row2_col2 <- nrow(a4)

creating table

tab <- data.frame(c(row1_col1, row2_col1), c(row1_col2, row2_col2))

tab[3,] = tab[1,] + tab[2,]
tab[,3] = tab[,1] + tab[,2]

names(tab) <- c('X<=x', 'X>x', 'Total')
rownames(tab) <- c('Y<=y', 'Y>y', 'Total')

# Calculate the probabilities by dividing by the total number of rows:
prob <- tab / nrow(data)
round(prob, 4)

##         X<=x    X>x  Total
## Y<=y  0.1829 0.1829 0.3658
## Y>y   0.0671 0.4315 0.4986
## Total 0.2500 0.6144 0.8644

tab

##       X<=x X>x Total
## Y<=y   267 267   534
## Y>y     98 630   728
## Total  365 897  1262

Independence

Does splitting the training data in this fashion make them independent? In other words, does P(X|Y)=P(X)P(Y))? Check mathematically, and then evaluate by running a Chi Square test for association.

For independence :

P(XY)= P(X)P(Y) or

P(X|Y) = P(X)

px_py == px * p2

## [1] FALSE

The variables are independent mathematically. We will perform chi-square test to check the assumption. Chi-Square Test H0 : Sale Price and Lot Area are independent

Ha : Sale Price and Lot Area are not independent

chisq.test(X,Y)

## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  X and Y
## X-squared = 735090, df = 709660, p-value < 2.2e-16

p-value is very small, we will reject null hypothesis. The variables are dependent. Lot area does effects the sale price of house.

Descriptive and inferential stats

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

plot(X,Y, col="#4caf50", main="Scatterplot of Lot Area and Sale Price", xlab = "Lot ARea", ylab="Sale Price")
abline(lm(Y~X), col="yellow", lwd=3) # regression line (y~x)

hist(X, col="green", main="Histogram of Lot Area", xlab = "Lot Area")

hist(Y, col="#80cbc4", main="Histogram of Sale Price", xlab = "Sale Price")

summary(X)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245

boxplot(X)

summary(Y)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

boxplot(Y)

Derive a correlation matrix for any three quantitative variables in the dataset

myvars<-data.frame(data$OverallCond, data$YearBuilt,data$SalePrice)
head(myvars)

##   data.OverallCond data.YearBuilt data.SalePrice
## 1                5           2003         208500
## 2                8           1976         181500
## 3                5           2001         223500
## 4                5           1915         140000
## 5                5           2000         250000
## 6                5           1993         143000

myvars <- cor(myvars)
corrplot(myvars, method = "square")

cor.test(data$OverallCond + data$YearBuilt, data$SalePrice, conf.level = 0.92)

## 
##  Pearson's product-moment correlation
## 
## data:  data$OverallCond + data$YearBuilt and data$SalePrice
## t = 23.679, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.4930961 0.5593400
## sample estimates:
##       cor 
## 0.5270181

t.test(data$OverallCond + data$YearBuilt, data$SalePrice, conf.level = 0.92)

## 
##  Welch Two Sample t-test
## 
## data:  data$OverallCond + data$YearBuilt and data$SalePrice
## t = -86.068, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
##  -182586.8 -175302.0
## sample estimates:
##  mean of x  mean of y 
##   1976.843 180921.196

mymx<-as.matrix(cor(myvars))
#correlation matrix
mymx

##                  data.OverallCond data.YearBuilt data.SalePrice
## data.OverallCond        1.0000000     -0.9900717     -0.7869968
## data.YearBuilt         -0.9900717      1.0000000      0.6924618
## data.SalePrice         -0.7869968      0.6924618      1.0000000

ginvmymx<-ginv(mymx)
ginvmymx

##            [,1]       [,2]       [,3]
## [1,]  0.3305170 -0.4994715  0.4958592
## [2,] -0.4994715  0.8252836 -1.0587311
## [3,]  0.4958592 -1.0587311  2.1019152

#corr mat * precision mat
mymxginv<-mymx%*%ginvmymx
round(mymxginv,2)

##                   [,1]  [,2]  [,3]
## data.OverallCond  0.43 -0.48 -0.11
## data.YearBuilt   -0.48  0.59 -0.09
## data.SalePrice   -0.11 -0.09  0.98

ginvmymx<-ginvmymx%*%mymx
round(ginvmymx,2)

##      data.OverallCond data.YearBuilt data.SalePrice
## [1,]             0.43          -0.48          -0.11
## [2,]            -0.48           0.59          -0.09
## [3,]            -0.11          -0.09           0.98

With a 92 percent confidence level, the correlation between Overall condition plus Year Built and Sale Price is estimated to be between 0.49 and 0.55.

Calculus-Based Probability and Statistics

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function.

Checking minium value is above 0

min(data$LotArea)

## [1] 1300

fit <- fitdistr(data$LotArea, 'normal')
mean_fit <- fit$estimate[1]
sd_fit <- fit$estimate[2]
mean_fit

##     mean 
## 10516.83

sd_fit

##       sd 
## 9977.846

sample <-rnorm(1000,mean_fit, sd_fit)
hist(sample)

Modelling

After analyzing the data,we identified following independent variables could be used to generate our model.

Response Variable	SalePrice
MSSubClass	The building class
LotArea	Lot size in square feet
LotFrontage	Linear feet of street connected to property
OverallCond	Overall condition rating
OverallQual	Overall material and finish quality
YearBuilt	Original construction date
FullBath	Full bathrooms above grade
GarageArea	Size of garage in square feet
GarageCars	Size of garage in car capacity
PoolArea	Pool area in square feet
BsmtFullBath	Basement full bathrooms

Once we generated the model we found from the above only below variables were statistically significant. MSSubClass, LotArea, OverallCond, OverallQual, FullBath, GarageCars, BsmtFullBath

house <- data[,c("SalePrice", "MSSubClass", "LotArea",  "OverallCond",                                       "OverallQual", "FullBath", "GarageCars","BsmtFullBath")]
model <- lm(SalePrice ~ .,data=house)

summary(model)

## 
## Call:
## lm(formula = SalePrice ~ ., data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -242258  -23763   -3830   16971  384660 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.171e+05  8.025e+03 -14.589  < 2e-16 ***
## MSSubClass   -1.879e+02  2.635e+01  -7.132 1.56e-12 ***
## LotArea       9.354e-01  1.127e-01   8.297 2.40e-16 ***
## OverallCond   3.270e+03  1.004e+03   3.257  0.00115 ** 
## OverallQual   3.205e+04  1.069e+03  29.996  < 2e-16 ***
## FullBath      2.608e+04  2.517e+03  10.363  < 2e-16 ***
## GarageCars    2.005e+04  1.897e+03  10.567  < 2e-16 ***
## BsmtFullBath  2.085e+04  2.172e+03   9.601  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41350 on 1452 degrees of freedom
## Multiple R-squared:  0.7304, Adjusted R-squared:  0.7291 
## F-statistic: 561.9 on 7 and 1452 DF,  p-value: < 2.2e-16

reduced.model<- step (model, direction = "backward")

## Start:  AIC=31047.05
## SalePrice ~ MSSubClass + LotArea + OverallCond + OverallQual + 
##     FullBath + GarageCars + BsmtFullBath
## 
##                Df  Sum of Sq        RSS   AIC
## <none>                       2.4826e+12 31047
## - OverallCond   1 1.8132e+10 2.5007e+12 31056
## - MSSubClass    1 8.6963e+10 2.5696e+12 31095
## - LotArea       1 1.1772e+11 2.6003e+12 31113
## - BsmtFullBath  1 1.5762e+11 2.6402e+12 31135
## - FullBath      1 1.8360e+11 2.6662e+12 31149
## - GarageCars    1 1.9092e+11 2.6735e+12 31153
## - OverallQual   1 1.5384e+12 4.0210e+12 31749

summary(reduced.model)

## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallCond + 
##     OverallQual + FullBath + GarageCars + BsmtFullBath, data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -242258  -23763   -3830   16971  384660 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.171e+05  8.025e+03 -14.589  < 2e-16 ***
## MSSubClass   -1.879e+02  2.635e+01  -7.132 1.56e-12 ***
## LotArea       9.354e-01  1.127e-01   8.297 2.40e-16 ***
## OverallCond   3.270e+03  1.004e+03   3.257  0.00115 ** 
## OverallQual   3.205e+04  1.069e+03  29.996  < 2e-16 ***
## FullBath      2.608e+04  2.517e+03  10.363  < 2e-16 ***
## GarageCars    2.005e+04  1.897e+03  10.567  < 2e-16 ***
## BsmtFullBath  2.085e+04  2.172e+03   9.601  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41350 on 1452 degrees of freedom
## Multiple R-squared:  0.7304, Adjusted R-squared:  0.7291 
## F-statistic: 561.9 on 7 and 1452 DF,  p-value: < 2.2e-16

Based on the statistics, this is a good model. R^2 is close to 100 and F significate is less than 0.05. Also all the dependent variables are statistically significant (P stat is less than 0.05)

plot(reduced.model)

Plot are radomly scattered with no pattern for both model. The points in the Normal Q-Q plot are more or less on the line, indicating that residuals follow a normal distribution. In both Scale-Location plot and Residual Vs Leverage plots, the points are in the a group with none too far from center. Diagostic plots for the both the models are show below. Based on the plots we can say these are pretty good regression models.

test <- read.csv("Data/test.csv")
predicts <- predict(reduced.model, test)
predicts_df<-data.frame(cbind(test$Id,predicts))
names(predicts_df) <- c("Id", "SalePrice")
predicts_df[is.na(predicts_df)] <- median(house$SalePrice)
write.csv(predicts_df, "predict.csv", row.names = FALSE)

Kaggle score:- is 0.29801 username - harpreet18

knitr::include_graphics("Data/result.png")