library(MASS)## Warning: package 'MASS' was built under R version 3.4.3
library(ggplot2)
library(tidyr)## Warning: package 'tidyr' was built under R version 3.4.3
library(corrplot)## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
data <- read.csv("Data/train.csv")
head(data) ## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
Defining variables x and y i m choosing x as Grlivarea and y as sale price
X <- data$LotArea
Y <- data$SalePriceCalculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities
x <- quantile(X, .25)
x## 25%
## 7553.5
y<- quantile(Y,.50)
y## 50%
## 163000
a)P(X>x|Y>y) Probability of X greater than first quartile of X (7553.5 ) given that Y is greater than 2nd quartile of Y (163,000).
total<-nrow(data)
px_py<-nrow(subset(data,LotArea>x & SalePrice>y))
p_y<-nrow(subset(data,SalePrice>y))
px <- nrow(subset(data,SalePrice>y))/total
p1<-px_py/total
p2<-p_y/total
a<-p1/p2
a## [1] 0.8653846
p1## [1] 0.4315068
c.P(Xy) Probability of X less than first quartile of X given Y is greater than second quartile of Y.
xl<-nrow(subset(data,LotArea<x & SalePrice>y))
yl<-nrow(subset(data,SalePrice>y))
p3 <-xl/total
p4<-yl/total
c<-p3/p4
c## [1] 0.1346154
# X<=x, Y<=y
a1 <- subset(data, LotArea <= x & SalePrice <= y)
row1_col1 <- nrow(a1)
# X>x, Y<=y
a2 <- subset(data, LotArea > x & SalePrice <= y)
row1_col2 <- nrow(a1)
# X<=x, Y>y
a3 <- subset(data, LotArea <= x & SalePrice > y)
row2_col1 <- nrow(a3)
# X>x, Y>y
a4 <- subset(data, LotArea > x & SalePrice > y)
row2_col2 <- nrow(a4)creating table
tab <- data.frame(c(row1_col1, row2_col1), c(row1_col2, row2_col2))
tab[3,] = tab[1,] + tab[2,]
tab[,3] = tab[,1] + tab[,2]
names(tab) <- c('X<=x', 'X>x', 'Total')
rownames(tab) <- c('Y<=y', 'Y>y', 'Total')
# Calculate the probabilities by dividing by the total number of rows:
prob <- tab / nrow(data)
round(prob, 4)## X<=x X>x Total
## Y<=y 0.1829 0.1829 0.3658
## Y>y 0.0671 0.4315 0.4986
## Total 0.2500 0.6144 0.8644
tab## X<=x X>x Total
## Y<=y 267 267 534
## Y>y 98 630 728
## Total 365 897 1262
Does splitting the training data in this fashion make them independent? In other words, does P(X|Y)=P(X)P(Y))? Check mathematically, and then evaluate by running a Chi Square test for association.
For independence :
P(XY)= P(X)P(Y) or
P(X|Y) = P(X)
px_py == px * p2## [1] FALSE
The variables are independent mathematically. We will perform chi-square test to check the assumption. Chi-Square Test H0 : Sale Price and Lot Area are independent
Ha : Sale Price and Lot Area are not independent
chisq.test(X,Y)## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: X and Y
## X-squared = 735090, df = 709660, p-value < 2.2e-16
p-value is very small, we will reject null hypothesis. The variables are dependent. Lot area does effects the sale price of house.
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
plot(X,Y, col="#4caf50", main="Scatterplot of Lot Area and Sale Price", xlab = "Lot ARea", ylab="Sale Price")
abline(lm(Y~X), col="yellow", lwd=3) # regression line (y~x) hist(X, col="green", main="Histogram of Lot Area", xlab = "Lot Area")hist(Y, col="#80cbc4", main="Histogram of Sale Price", xlab = "Sale Price")summary(X)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
boxplot(X)summary(Y)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
boxplot(Y)Derive a correlation matrix for any three quantitative variables in the dataset
myvars<-data.frame(data$OverallCond, data$YearBuilt,data$SalePrice)
head(myvars)## data.OverallCond data.YearBuilt data.SalePrice
## 1 5 2003 208500
## 2 8 1976 181500
## 3 5 2001 223500
## 4 5 1915 140000
## 5 5 2000 250000
## 6 5 1993 143000
myvars <- cor(myvars)
corrplot(myvars, method = "square")cor.test(data$OverallCond + data$YearBuilt, data$SalePrice, conf.level = 0.92)##
## Pearson's product-moment correlation
##
## data: data$OverallCond + data$YearBuilt and data$SalePrice
## t = 23.679, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.4930961 0.5593400
## sample estimates:
## cor
## 0.5270181
t.test(data$OverallCond + data$YearBuilt, data$SalePrice, conf.level = 0.92)##
## Welch Two Sample t-test
##
## data: data$OverallCond + data$YearBuilt and data$SalePrice
## t = -86.068, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## -182586.8 -175302.0
## sample estimates:
## mean of x mean of y
## 1976.843 180921.196
mymx<-as.matrix(cor(myvars))
#correlation matrix
mymx## data.OverallCond data.YearBuilt data.SalePrice
## data.OverallCond 1.0000000 -0.9900717 -0.7869968
## data.YearBuilt -0.9900717 1.0000000 0.6924618
## data.SalePrice -0.7869968 0.6924618 1.0000000
ginvmymx<-ginv(mymx)
ginvmymx## [,1] [,2] [,3]
## [1,] 0.3305170 -0.4994715 0.4958592
## [2,] -0.4994715 0.8252836 -1.0587311
## [3,] 0.4958592 -1.0587311 2.1019152
#corr mat * precision mat
mymxginv<-mymx%*%ginvmymx
round(mymxginv,2)## [,1] [,2] [,3]
## data.OverallCond 0.43 -0.48 -0.11
## data.YearBuilt -0.48 0.59 -0.09
## data.SalePrice -0.11 -0.09 0.98
ginvmymx<-ginvmymx%*%mymx
round(ginvmymx,2)## data.OverallCond data.YearBuilt data.SalePrice
## [1,] 0.43 -0.48 -0.11
## [2,] -0.48 0.59 -0.09
## [3,] -0.11 -0.09 0.98
With a 92 percent confidence level, the correlation between Overall condition plus Year Built and Sale Price is estimated to be between 0.49 and 0.55.
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function.
Checking minium value is above 0
min(data$LotArea)## [1] 1300
fit <- fitdistr(data$LotArea, 'normal')
mean_fit <- fit$estimate[1]
sd_fit <- fit$estimate[2]
mean_fit## mean
## 10516.83
sd_fit## sd
## 9977.846
sample <-rnorm(1000,mean_fit, sd_fit)
hist(sample)After analyzing the data,we identified following independent variables could be used to generate our model.
| Response Variable | SalePrice |
|---|---|
| MSSubClass | The building class |
| LotArea | Lot size in square feet |
| LotFrontage | Linear feet of street connected to property |
| OverallCond | Overall condition rating |
| OverallQual | Overall material and finish quality |
| YearBuilt | Original construction date |
| FullBath | Full bathrooms above grade |
| GarageArea | Size of garage in square feet |
| GarageCars | Size of garage in car capacity |
| PoolArea | Pool area in square feet |
| BsmtFullBath | Basement full bathrooms |
Once we generated the model we found from the above only below variables were statistically significant. MSSubClass, LotArea, OverallCond, OverallQual, FullBath, GarageCars, BsmtFullBath
house <- data[,c("SalePrice", "MSSubClass", "LotArea", "OverallCond", "OverallQual", "FullBath", "GarageCars","BsmtFullBath")]
model <- lm(SalePrice ~ .,data=house)
summary(model)##
## Call:
## lm(formula = SalePrice ~ ., data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -242258 -23763 -3830 16971 384660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.171e+05 8.025e+03 -14.589 < 2e-16 ***
## MSSubClass -1.879e+02 2.635e+01 -7.132 1.56e-12 ***
## LotArea 9.354e-01 1.127e-01 8.297 2.40e-16 ***
## OverallCond 3.270e+03 1.004e+03 3.257 0.00115 **
## OverallQual 3.205e+04 1.069e+03 29.996 < 2e-16 ***
## FullBath 2.608e+04 2.517e+03 10.363 < 2e-16 ***
## GarageCars 2.005e+04 1.897e+03 10.567 < 2e-16 ***
## BsmtFullBath 2.085e+04 2.172e+03 9.601 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41350 on 1452 degrees of freedom
## Multiple R-squared: 0.7304, Adjusted R-squared: 0.7291
## F-statistic: 561.9 on 7 and 1452 DF, p-value: < 2.2e-16
reduced.model<- step (model, direction = "backward")## Start: AIC=31047.05
## SalePrice ~ MSSubClass + LotArea + OverallCond + OverallQual +
## FullBath + GarageCars + BsmtFullBath
##
## Df Sum of Sq RSS AIC
## <none> 2.4826e+12 31047
## - OverallCond 1 1.8132e+10 2.5007e+12 31056
## - MSSubClass 1 8.6963e+10 2.5696e+12 31095
## - LotArea 1 1.1772e+11 2.6003e+12 31113
## - BsmtFullBath 1 1.5762e+11 2.6402e+12 31135
## - FullBath 1 1.8360e+11 2.6662e+12 31149
## - GarageCars 1 1.9092e+11 2.6735e+12 31153
## - OverallQual 1 1.5384e+12 4.0210e+12 31749
summary(reduced.model)##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallCond +
## OverallQual + FullBath + GarageCars + BsmtFullBath, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -242258 -23763 -3830 16971 384660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.171e+05 8.025e+03 -14.589 < 2e-16 ***
## MSSubClass -1.879e+02 2.635e+01 -7.132 1.56e-12 ***
## LotArea 9.354e-01 1.127e-01 8.297 2.40e-16 ***
## OverallCond 3.270e+03 1.004e+03 3.257 0.00115 **
## OverallQual 3.205e+04 1.069e+03 29.996 < 2e-16 ***
## FullBath 2.608e+04 2.517e+03 10.363 < 2e-16 ***
## GarageCars 2.005e+04 1.897e+03 10.567 < 2e-16 ***
## BsmtFullBath 2.085e+04 2.172e+03 9.601 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41350 on 1452 degrees of freedom
## Multiple R-squared: 0.7304, Adjusted R-squared: 0.7291
## F-statistic: 561.9 on 7 and 1452 DF, p-value: < 2.2e-16
Based on the statistics, this is a good model. R^2 is close to 100 and F significate is less than 0.05. Also all the dependent variables are statistically significant (P stat is less than 0.05)
plot(reduced.model)Plot are radomly scattered with no pattern for both model. The points in the Normal Q-Q plot are more or less on the line, indicating that residuals follow a normal distribution. In both Scale-Location plot and Residual Vs Leverage plots, the points are in the a group with none too far from center. Diagostic plots for the both the models are show below. Based on the plots we can say these are pretty good regression models.
test <- read.csv("Data/test.csv")
predicts <- predict(reduced.model, test)
predicts_df<-data.frame(cbind(test$Id,predicts))
names(predicts_df) <- c("Id", "SalePrice")
predicts_df[is.na(predicts_df)] <- median(house$SalePrice)
write.csv(predicts_df, "predict.csv", row.names = FALSE)Kaggle score:- is 0.29801 username - harpreet18
knitr::include_graphics("Data/result.png")