# Load the assignment data set and pull out the two variables under study.
data <- read.csv(
  file = "/Users/bchand005c/CUNY/DATA-605/assignment/final/data.csv",
  header = TRUE
)
X <- data$X1
Y <- data$Y1
# x is the third quartile of X, y is the first quartile of Y.
(x <- quantile(X, probs = 0.75))
## 75%
## 15.825
(y <- quantile(Y, probs = 0.25))
## 25%
## 18.55
# Number of observations in total, and number with Y above its threshold.
(n <- nrow(data))
## [1] 20
(ny <- sum(Y > y))
## [1] 15
# P(X > x | Y > y): joint count divided by the count of the conditioning event.
(pa <- sum(X > x & Y > y) / ny)
## [1] 0.2
# P(X > x, Y > y): joint count divided by all observations.
(pb <- sum(X > x & Y > y) / n)
## [1] 0.15
# P(X < x | Y > y)
(pc <- sum(X < x & Y > y) / ny)
## [1] 0.8
# Contingency table of counts with row/column totals.
# Rows split the observations on X relative to x (the 3rd quartile of X);
# columns split them on Y relative to y (the 1st quartile of Y). The labels
# below follow that layout (they were previously attached to the wrong
# dimension).
t <- rbind(
  c(sum(X <= x & Y <= y), sum(X <= x & Y > y)),
  c(sum(X > x & Y <= y), sum(X > x & Y > y))
)
# Append the row totals as a third column, then the column totals as a
# third row (the bottom-right cell becomes the grand total).
t <- cbind(t, t[, 1] + t[, 2])
t <- rbind(t, t[1, ] + t[2, ])
rownames(t) <- c("X <= 3rd quartile", "X > 3rd quartile", "Total")
colnames(t) <- c("Y <= 1st quartile", "Y > 1st quartile", "Total")
knitr::kable(t)
| | <=3d quartile | >3d quartile | Total |
|---|---|---|---|
| <=1st quartile | 3 | 12 | 15 |
| >1 st quartile | 2 | 3 | 5 |
| Total | 5 | 15 | 20 |
# Re-derive the variables; here both thresholds are first quartiles.
X <- data$X1
Y <- data$Y1
(x <- quantile(X, probs = 0.25))
## 25%
## 6.85
(y <- quantile(Y, probs = 0.25))
## 25%
## 18.55
# Event indicators: A = {X > x}, B = {Y > y}.
A <- X > x
B <- Y > y
# Joint probability P(AB): share of observations where both events hold.
(P_AB <- sum(A & B) / length(Y))
## [1] 0.55
# Product of the marginal probabilities, P(A) * P(B).
P_A <- sum(A) / length(Y)
P_B <- sum(B) / length(Y)
(P_A * P_B)
## [1] 0.5625
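The above shows that P(AB) = 0.55 ≠ P(A)P(B) = 0.5625, i.e. the sample proportions suggest that A and B are not independent; a formal test is needed to decide whether the difference is statistically significant.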
# Chi-squared test of independence between A = {X > x} and B = {Y > y}.
# Each variable must be compared with its own threshold. The table was
# previously built as table(X > x, Y > x), i.e. Y was compared with X's
# quartile, and the result printed as a "Chi-squared test for given
# probabilities", which is R's heading for a one-way goodness-of-fit test
# rather than a test of independence.
(ptest <- chisq.test(table(X > x, Y > y)))
## NOTE(review): the output recorded here before this fix
## (X-squared = 5, df = 1, p-value = 0.02535) came from table(X > x, Y > x)
## and is stale; re-run the chunk to refresh it and re-check the conclusion.
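Since the reported p-value (0.02535) is less than the significance level of alpha = 0.05, the variables were judged to be statistically dependent. Caveat: that p-value came from a "Chi-squared test for given probabilities" on table(X > x, Y > x), which is a one-way goodness-of-fit test rather than a 2x2 test of independence, so the conclusion should be re-checked against table(X > x, Y > y).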
# Load the training data and print per-column summaries.
# stringsAsFactors is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, whereas the summary recorded below shows
# level counts, i.e. it was produced with those columns read as factors.
train <- read.csv(
  "/Users/bchand005c/CUNY/DATA-605/assignment/final/train.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000
## PosA : 1 1.5Unf : 14 Max. :10.000
## (Other): 2 (Other): 19
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 None :864 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 Stone :128 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 NA's : 8 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45 Av :221
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65 Gd :134
## Gd:488 Gd: 146 PConc :647 Gd :618 Po : 2 Mn :114
## TA:906 Po: 1 Slab : 24 TA :649 TA :1311 No :953
## TA:1282 Stone : 6 NA's: 37 NA's: 37 NA's: 38
## Wood : 3
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## ALQ :220 Min. : 0.0 ALQ : 19 Min. : 0.00
## BLQ :148 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00
## GLQ :418 Median : 383.5 GLQ : 14 Median : 0.00
## LwQ : 74 Mean : 443.6 LwQ : 46 Mean : 46.55
## Rec :133 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00
## Unf :430 Max. :5644.0 Unf :1256 Max. :1474.00
## NA's: 37 NA's: 38
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741 N: 95
## 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365
## Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :2336.0 Max. :6110.0 Wall : 4
##
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39
## Median :0.0000 Median :3.000 Median :1.000 Gd:586
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1: 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690 Detchd :387
## Typ :1360 NA's : 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. :1900 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1961 RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1980 Unf :605 Median :2.000 Median : 480.0 Gd : 14
## Mean :1979 NA's: 81 Mean :1.767 Mean : 473.0 Po : 3
## 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311
## Max. :2010 Max. :4.000 Max. :1418.0 NA's: 81
## NA's :81
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## Po : 7 Mean : 94.24 Mean : 46.66 Mean : 21.95
## TA :1326 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## NA's: 81 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 NA's:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 59 Gar2: 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 Othr: 2 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv: 157 Shed: 49 Median : 0.00 Median : 6.000
## MnWw : 11 TenC: 1 Mean : 43.49 Mean : 6.322
## NA's :1179 NA's:1406 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Univariate summaries and plots for the predictor (TotalBsmtSF) and the
# response (SalePrice), followed by scatterplots against SalePrice.
X <- train$TotalBsmtSF
Y <- train$SalePrice
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 795.8 991.5 1057.4 1298.2 6110.0
hist(X, xlab = "TotalBsmtSF", main = "Histogram of TotalBsmtSF")
boxplot(X)
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
hist(Y, xlab = "SalePrice", main = "Histogram of Frequencies")
boxplot(Y)
plot(X, Y,
     xlab = "TotalBsmtSF", ylab = "SalePrice",
     main = "TotalBsmtSF vs SalePrice")
Z <- train$LotFrontage
# Plot LotFrontage (Z) against SalePrice (Y) so the data match the axis
# labels and title; the call previously plotted X (TotalBsmtSF) against Z.
plot(Z, Y,
     xlab = "LotFrontage", ylab = "SalePrice",
     main = "LotFrontage vs SalePrice")
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1.00 0.45 0.61
## GrLivArea 0.45 1.00 0.71
## SalePrice 0.61 0.71 1.00
The above plot shows the correlation coefficients in colors (blue for positive correlation and red for negative correlation).
# Pairwise Pearson correlation tests (H0: true correlation is 0), each
# reported with an 80 percent confidence interval.

# TotalBsmtSF vs GrLivArea
cor.test(x = train$TotalBsmtSF, y = train$GrLivArea,
         method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$TotalBsmtSF and train$GrLivArea
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4278380 0.4810855
## sample estimates:
## cor
## 0.4548682

# GrLivArea vs SalePrice
cor.test(x = train$GrLivArea, y = train$SalePrice,
         method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245

# TotalBsmtSF vs SalePrice
cor.test(x = train$TotalBsmtSF, y = train$SalePrice,
         method = "pearson", conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
Conclusion: In all three of the above tests, the null hypothesis is rejected in favor of the alternative hypothesis.
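The familywise error rate is the probability of making at least one Type 1 error (a false rejection) across a set of tests; with three tests it is larger than the per-test rate (for example 1 − (1 − 0.05)^3 ≈ 0.14 at alpha = 0.05). We should be worried when p-values are close to the significance threshold, because a multiple-testing correction could then change the conclusion. Here it is not the case: all three p-values are below 2.2e-16 and would remain significant even after a Bonferroni correction (alpha / 3). So I am not worried about it.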
Correlation matrix
# Display the correlation matrix.
print(corr_matrix)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1.0000000 0.4548682 0.6135806
## GrLivArea 0.4548682 1.0000000 0.7086245
## SalePrice 0.6135806 0.7086245 1.0000000
Invert the correlation matrix
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1.60588442 -0.06473842 -0.9394642
## GrLivArea -0.06473842 2.01124151 -1.3854927
## SalePrice -0.93946422 -1.38549273 2.5582310
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
# Correlation matrix times precision matrix, rounded to whole numbers.
round(corr_matrix %*% precision_matrix, digits = 0)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
# Same product with the factors in the opposite order.
round(precision_matrix %*% corr_matrix, digits = 0)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
In both cases the identity matrix is obtained.
# LU-decompose the precision matrix, keep the result, and display both
# factors ($L and $U).
LU <- lu.decomposition(precision_matrix)
LU
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] -0.04031325 1.0000000 0
## [3,] -0.58501360 -0.7086245 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1.605884 -0.06473842 -0.9394642
## [2,] 0.000000 2.00863169 -1.4233656
## [3,] 0.000000 0.00000000 1.0000000
# Inspect the shape of TotalBsmtSF: histogram, skewness and five-number
# summary.
X <- train[["TotalBsmtSF"]]
hist(X, main = "TotalBsmtSF")
skewness(X)
## [1] 1.522688
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 795.8 991.5 1057.4 1298.2 6110.0
TotalBsmtSF is skewed to the right.
Shift the values above 0, since the minimum value is 0.
# Shift every value up by one, then fit an exponential distribution to the
# shifted values and show the estimated rate.
X <- 1 + X
lamda <- fitdistr(X, densfun = "exponential")
lamda[["estimate"]]
## rate
## 0.0009447961
The optimum of lambda is 0.0009447961
# Draw 1000 values from an exponential distribution with the fitted rate.
sample <- rexp(n = 1000, rate = lamda$estimate)
# Show the observed data and the simulated draws next to each other.
par(mfrow = c(1, 2))
hist(train$TotalBsmtSF, main = "X (TotalBsmtSF)")
hist(sample, main = "Exponential Distr for X")
# 5th and 95th percentiles of the observed data
quantile(train$TotalBsmtSF, probs = c(0.05, 0.95))
## 5% 95%
## 519.3 1753.0
# 5th and 95th percentiles of the simulated draws
quantile(sample, probs = c(0.05, 0.95))
## 5% 95%
## 65.01865 3054.38949
# Generate a 95% confidence interval for the mean of the empirical data,
# using the normal quantile. The interval is computed from the original
# TotalBsmtSF column: X was shifted by +1 before the exponential fit, so
# using X here would move both bounds up by 1.
sd <- sd(train$TotalBsmtSF)
mean <- mean(train$TotalBsmtSF)
n <- length(train$TotalBsmtSF)
# Half-width of the interval: z(0.975) * standard error of the mean.
err <- qnorm(0.975) * sd / sqrt(n)
left <- mean - err
right <- mean + err
cat("A 95% confidence interval for TotalBsmtSF is [", left, ",", right, "]")
## A 95% confidence interval for TotalBsmtSF is [ 1034.926 , 1079.933 ]
Conclusion: The data follow a normal distribution more closely than an exponential distribution, so a normal distribution explains the data better than an exponential distribution.
# Reload the training data for modelling and print per-column summaries.
# stringsAsFactors is set explicitly: the later mutate_if(is.factor,
# as.numeric) step only converts factor columns, and since R 4.0 read.csv()
# keeps text columns as character by default.
train <- read.csv(
  "/Users/bchand005c/CUNY/DATA-605/assignment/final/train.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
df.train <- as.data.frame(train)
summary(df.train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000
## PosA : 1 1.5Unf : 14 Max. :10.000
## (Other): 2 (Other): 19
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 None :864 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 Stone :128 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 NA's : 8 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45 Av :221
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65 Gd :134
## Gd:488 Gd: 146 PConc :647 Gd :618 Po : 2 Mn :114
## TA:906 Po: 1 Slab : 24 TA :649 TA :1311 No :953
## TA:1282 Stone : 6 NA's: 37 NA's: 37 NA's: 38
## Wood : 3
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## ALQ :220 Min. : 0.0 ALQ : 19 Min. : 0.00
## BLQ :148 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00
## GLQ :418 Median : 383.5 GLQ : 14 Median : 0.00
## LwQ : 74 Mean : 443.6 LwQ : 46 Mean : 46.55
## Rec :133 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00
## Unf :430 Max. :5644.0 Unf :1256 Max. :1474.00
## NA's: 37 NA's: 38
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741 N: 95
## 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365
## Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :2336.0 Max. :6110.0 Wall : 4
##
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39
## Median :0.0000 Median :3.000 Median :1.000 Gd:586
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1: 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690 Detchd :387
## Typ :1360 NA's : 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. :1900 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1961 RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1980 Unf :605 Median :2.000 Median : 480.0 Gd : 14
## Mean :1979 NA's: 81 Mean :1.767 Mean : 473.0 Po : 3
## 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311
## Max. :2010 Max. :4.000 Max. :1418.0 NA's: 81
## NA's :81
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## Po : 7 Mean : 94.24 Mean : 46.66 Mean : 21.95
## TA :1326 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## NA's: 81 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 NA's:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 59 Gar2: 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 Othr: 2 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv: 157 Shed: 49 Median : 0.00 Median : 6.000
## MnWw : 11 TenC: 1 Mean : 43.49 Mean : 6.322
## NA's :1179 NA's:1406 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
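Based on the summary statistics, remove a hand-picked set of fields: several are mostly missing (e.g. Alley, PoolQC, Fence, MiscFeature) and others are nearly constant (e.g. Street, Utilities).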
# Drop the columns that are excluded from modelling.
df.train <- dplyr::select(
  df.train,
  -c(
    Street, Alley, LandContour, Utilities,
    LandSlope, Condition2, MasVnrArea, Heating,
    BsmtFinSF2, X2ndFlrSF, LowQualFinSF, BsmtFullBath,
    BsmtHalfBath, HalfBath, PoolQC, PoolArea, MiscVal,
    MiscFeature, Fence, ScreenPorch, Fireplaces,
    EnclosedPorch, MoSold, YrSold
  )
)
Convert categorical variables to numerical values
# Replace every factor column by its integer level codes.
df.train <- mutate_if(df.train, is.factor, as.numeric)
Replace NA with 0
# Overwrite every missing cell with 0.
df.train[is.na(df.train)] <- 0
Convert sales price to log values as feature engineering
# Model the natural log of the sale price instead of the raw price.
df.train[["SalePrice"]] <- log(df.train[["SalePrice"]])
Build initial model with all selected features
# Fit the starting model: log SalePrice regressed on every remaining column.
# NOTE(review): `.` also includes Id as a predictor (it appears in the
# coefficient table below).
sale_price_lm <- lm(formula = SalePrice ~ ., data = df.train)
summary(sale_price_lm)
##
## Call:
## lm(formula = SalePrice ~ ., data = df.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98665 -0.06503 0.00333 0.07143 0.57305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.263e+00 6.923e-01 9.046 < 2e-16 ***
## Id -1.212e-05 9.085e-06 -1.334 0.182571
## MSSubClass -1.327e-04 2.020e-04 -0.657 0.511138
## MSZoning -1.829e-02 6.722e-03 -2.720 0.006601 **
## LotFrontage -2.952e-04 1.239e-04 -2.382 0.017346 *
## LotArea 1.757e-06 4.240e-07 4.143 3.63e-05 ***
## LotShape -4.811e-03 3.027e-03 -1.589 0.112256
## LotConfig -7.497e-04 2.426e-03 -0.309 0.757349
## Neighborhood 6.277e-04 6.990e-04 0.898 0.369385
## Condition1 8.491e-04 4.511e-03 0.188 0.850728
## BldgType -1.298e-02 6.647e-03 -1.953 0.050999 .
## HouseStyle -4.242e-03 2.859e-03 -1.484 0.138073
## OverallQual 7.419e-02 5.253e-03 14.124 < 2e-16 ***
## OverallCond 3.754e-02 4.607e-03 8.150 8.00e-16 ***
## YearBuilt 1.250e-03 3.085e-04 4.050 5.39e-05 ***
## YearRemodAdd 8.181e-04 2.969e-04 2.755 0.005941 **
## RoofStyle 5.350e-03 4.969e-03 1.077 0.281800
## RoofMatl 1.606e-02 6.649e-03 2.416 0.015827 *
## Exterior1st -4.906e-03 2.325e-03 -2.110 0.035024 *
## Exterior2nd 4.302e-03 2.108e-03 2.041 0.041393 *
## MasVnrType 4.479e-03 6.208e-03 0.721 0.470734
## ExterQual -3.391e-03 8.703e-03 -0.390 0.696846
## ExterCond 1.204e-02 5.615e-03 2.145 0.032156 *
## Foundation 1.030e-02 7.501e-03 1.374 0.169710
## BsmtQual -1.575e-02 6.061e-03 -2.599 0.009455 **
## BsmtCond 1.041e-02 5.840e-03 1.783 0.074761 .
## BsmtExposure -9.066e-03 3.883e-03 -2.335 0.019680 *
## BsmtFinType1 -8.410e-03 2.769e-03 -3.037 0.002431 **
## BsmtFinSF1 -9.490e-05 3.327e-05 -2.852 0.004402 **
## BsmtFinType2 1.922e-02 4.996e-03 3.846 0.000125 ***
## BsmtUnfSF -1.467e-04 3.292e-05 -4.457 8.95e-06 ***
## TotalBsmtSF 1.859e-04 3.453e-05 5.384 8.54e-08 ***
## HeatingQC -9.455e-03 2.726e-03 -3.468 0.000540 ***
## CentralAir 8.755e-02 1.898e-02 4.612 4.35e-06 ***
## Electrical -6.020e-04 4.056e-03 -0.148 0.882027
## X1stFlrSF 2.421e-05 2.519e-05 0.961 0.336604
## GrLivArea 1.782e-04 1.860e-05 9.579 < 2e-16 ***
## FullBath 2.011e-02 1.093e-02 1.840 0.065985 .
## BedroomAbvGr 7.432e-03 7.427e-03 1.001 0.317202
## KitchenAbvGr -3.372e-02 2.230e-02 -1.512 0.130779
## KitchenQual -2.754e-02 6.449e-03 -4.271 2.07e-05 ***
## TotRmsAbvGrd 1.395e-02 5.206e-03 2.680 0.007453 **
## Functional 1.605e-02 4.211e-03 3.811 0.000144 ***
## FireplaceQu 1.019e-02 2.255e-03 4.520 6.71e-06 ***
## GarageType -5.463e-03 2.844e-03 -1.921 0.054921 .
## GarageYrBlt 6.746e-06 2.613e-05 0.258 0.796319
## GarageFinish -7.150e-03 6.611e-03 -1.082 0.279606
## GarageCars 7.285e-02 1.257e-02 5.797 8.31e-09 ***
## GarageArea -1.137e-05 4.134e-05 -0.275 0.783365
## GarageQual -1.656e-03 7.914e-03 -0.209 0.834318
## GarageCond 1.083e-02 8.967e-03 1.208 0.227274
## PavedDrive 2.571e-02 9.251e-03 2.780 0.005511 **
## WoodDeckSF 8.216e-05 3.309e-05 2.482 0.013163 *
## OpenPorchSF -2.759e-05 6.341e-05 -0.435 0.663584
## X3SsnPorch 1.274e-04 1.302e-04 0.978 0.328043
## SaleType -1.191e-03 2.555e-03 -0.466 0.641310
## SaleCondition 2.281e-02 3.681e-03 6.196 7.62e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.143 on 1403 degrees of freedom
## Multiple R-squared: 0.8768, Adjusted R-squared: 0.8719
## F-statistic: 178.3 on 56 and 1403 DF, p-value: < 2.2e-16
Performs stepwise model selection by AIC.
# Stepwise selection by AIC starting from the full model; per-step tracing
# is switched off.
aic_lm <- stepAIC(object = sale_price_lm, trace = FALSE)
summary(aic_lm)
##
## Call:
## lm(formula = SalePrice ~ MSZoning + LotFrontage + LotArea + LotShape +
## BldgType + HouseStyle + OverallQual + OverallCond + YearBuilt +
## YearRemodAdd + RoofMatl + Exterior1st + Exterior2nd + ExterCond +
## BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + BsmtFinSF1 +
## BsmtFinType2 + BsmtUnfSF + TotalBsmtSF + HeatingQC + CentralAir +
## GrLivArea + FullBath + KitchenAbvGr + KitchenQual + TotRmsAbvGrd +
## Functional + FireplaceQu + GarageType + GarageCars + GarageCond +
## PavedDrive + WoodDeckSF + SaleCondition, data = df.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02454 -0.06521 0.00175 0.07304 0.57445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.949e+00 6.284e-01 9.467 < 2e-16 ***
## MSZoning -1.981e-02 6.398e-03 -3.097 0.001995 **
## LotFrontage -2.688e-04 1.222e-04 -2.199 0.028038 *
## LotArea 1.751e-06 4.198e-07 4.170 3.23e-05 ***
## LotShape -5.663e-03 2.933e-03 -1.931 0.053712 .
## BldgType -1.738e-02 3.801e-03 -4.572 5.24e-06 ***
## HouseStyle -5.973e-03 2.472e-03 -2.417 0.015794 *
## OverallQual 7.514e-02 5.068e-03 14.826 < 2e-16 ***
## OverallCond 3.708e-02 4.550e-03 8.151 7.84e-16 ***
## YearBuilt 1.394e-03 2.790e-04 4.996 6.59e-07 ***
## YearRemodAdd 8.705e-04 2.882e-04 3.020 0.002570 **
## RoofMatl 1.630e-02 6.507e-03 2.505 0.012360 *
## Exterior1st -5.157e-03 2.294e-03 -2.248 0.024728 *
## Exterior2nd 4.236e-03 2.071e-03 2.045 0.041013 *
## ExterCond 1.207e-02 5.505e-03 2.192 0.028571 *
## BsmtQual -1.890e-02 5.743e-03 -3.290 0.001026 **
## BsmtCond 9.121e-03 5.697e-03 1.601 0.109590
## BsmtExposure -8.670e-03 3.674e-03 -2.360 0.018401 *
## BsmtFinType1 -8.934e-03 2.724e-03 -3.280 0.001065 **
## BsmtFinSF1 -8.910e-05 3.274e-05 -2.721 0.006586 **
## BsmtFinType2 1.757e-02 4.878e-03 3.602 0.000326 ***
## BsmtUnfSF -1.389e-04 3.227e-05 -4.304 1.80e-05 ***
## TotalBsmtSF 1.967e-04 3.140e-05 6.264 4.95e-10 ***
## HeatingQC -9.599e-03 2.651e-03 -3.621 0.000304 ***
## CentralAir 8.869e-02 1.852e-02 4.789 1.85e-06 ***
## GrLivArea 1.818e-04 1.704e-05 10.666 < 2e-16 ***
## FullBath 2.032e-02 1.062e-02 1.913 0.055926 .
## KitchenAbvGr -3.143e-02 2.128e-02 -1.476 0.140033
## KitchenQual -2.830e-02 5.971e-03 -4.739 2.36e-06 ***
## TotRmsAbvGrd 1.649e-02 4.605e-03 3.582 0.000353 ***
## Functional 1.551e-02 4.073e-03 3.809 0.000146 ***
## FireplaceQu 1.011e-02 2.171e-03 4.657 3.51e-06 ***
## GarageType -7.093e-03 2.551e-03 -2.780 0.005503 **
## GarageCars 7.160e-02 8.293e-03 8.634 < 2e-16 ***
## GarageCond 9.459e-03 4.389e-03 2.155 0.031316 *
## PavedDrive 2.481e-02 9.140e-03 2.714 0.006724 **
## WoodDeckSF 8.289e-05 3.265e-05 2.539 0.011225 *
## SaleCondition 2.234e-02 3.562e-03 6.271 4.74e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1426 on 1422 degrees of freedom
## Multiple R-squared: 0.8758, Adjusted R-squared: 0.8725
## F-statistic: 270.9 on 37 and 1422 DF, p-value: < 2.2e-16
Check R squared value
# Multiple R-squared of the AIC-selected model.
summary(aic_lm)[["r.squared"]]
## [1] 0.8757536
The high R-squared value (about 0.876) indicates that the model is a good fit to the training data.
Checking the validity of model
# Normal Q-Q plot of the residuals, with its reference line.
qqnorm(aic_lm$residuals)
qqline(aic_lm$residuals)
# Residuals against fitted values, with a horizontal line at zero.
plot(
  x = aic_lm$fitted.values,
  y = aic_lm$residuals,
  xlab = "Values",
  ylab = "Residuals"
)
abline(h = 0)
Residuals don’t show any patterns so the model is valid.
Prediction
# Score the test set with the AIC-selected model and write the submission
# file.
# stringsAsFactors is explicit so text columns are read as factors on
# R >= 4.0 as well (the conversion below only touches factor columns).
test <- read.csv(
  "/Users/bchand005c/CUNY/DATA-605/assignment/final/test.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
df.test <- as.data.frame(test)
# Drop the same columns that were removed from the training data.
df.test <- df.test %>% dplyr::select(-c(Street, Alley, LandContour, Utilities,
                                        LandSlope, Condition2, MasVnrArea, Heating,
                                        BsmtFinSF2, X2ndFlrSF, LowQualFinSF, BsmtFullBath,
                                        BsmtHalfBath, HalfBath, PoolQC, PoolArea, MiscVal,
                                        MiscFeature, Fence, ScreenPorch, Fireplaces,
                                        EnclosedPorch, MoSold, YrSold))
# Re-level each factor with the training column's levels before taking
# integer codes. as.numeric() on a factor returns positions in its own level
# set, so a category absent from one file would otherwise shift the codes
# relative to the ones the model was fitted on. Values unseen in training
# become NA and are set to 0 by the NA replacement below.
for (col in names(df.test)) {
  if (is.factor(df.test[[col]]) && col %in% names(train)) {
    df.test[[col]] <- factor(
      df.test[[col]],
      levels = levels(factor(train[[col]]))
    )
  }
}
df.test <- df.test %>% mutate_if(is.factor, as.numeric)
df.test <- df.test %>% replace(is.na(.), 0)
# The model was fitted on log(SalePrice), so predict() returns values on the
# log scale; exp() converts them back to prices for the submission.
pred_saleprice <- predict(aic_lm, df.test)
kaggle <- data.frame(Id = test$Id, SalePrice = exp(pred_saleprice))
write.csv(kaggle, file = "submission.csv", row.names = FALSE)
Kaggle Submission
username : astilavista score : 9.45492