library(kableExtra)
library(pastecs)
library(psych)
library(e1071)
library(fBasics)
## Loading required package: timeDate
##
## Attaching package: 'timeDate'
## The following objects are masked from 'package:e1071':
##
## kurtosis, skewness
## Loading required package: timeSeries
##
## Attaching package: 'timeSeries'
## The following object is masked from 'package:psych':
##
## outlier
##
## Attaching package: 'fBasics'
## The following object is masked from 'package:psych':
##
## tr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:timeSeries':
##
## filter, lag
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(pracma)
##
## Attaching package: 'pracma'
## The following objects are masked from 'package:fBasics':
##
## akimaInterp, inv, kron, pascal
## The following object is masked from 'package:e1071':
##
## sigmoid
## The following objects are masked from 'package:psych':
##
## logit, polar
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(survival)
N <- 10
set.seed(123)
X <- runif(10000, 1, N)
set.seed(123)
Y <- rnorm(10000, (N+1)/2, (N+1)/2)
x <- median(X)
print(x)
## [1] 5.451109
y <- quantile(Y, 0.25)
print(y)
## 25%
## 1.82617
X_Y_df <- as.data.frame(cbind(X,Y))
We will use the dataframe we created above from the 2 vectors.
Using the formula: P(X>x | X>y) = P(X>x and X>y) / P(X>y)
p_1_a <- (length(X[X>x & X>y])/length(X)) /(length(X[X>y])/length(X))
print(p_1_a)
## [1] 0.5501761
As both are independent events, we will use the formula: P(A and B) = P(A).P(B)
p_X_gt_x <- length(X[X>x]) / length(X)
p_Y_gt_y <- length(Y[Y>y]) / length(Y)
p_1_b <- p_X_gt_x * p_Y_gt_y
print(p_1_b)
## [1] 0.375
p_X_lt_x_and_X_gt_y = length(X[X<x & X>y])/length(X)
p_X_gt_y <- length(X[X>y])/length(X)
p_1_c <- p_X_lt_x_and_X_gt_y / p_X_gt_y
print(p_1_c)
## [1] 0.4498239
Xgtx_Ygty_count <- length(X[X>x & Y>y])
Xgtx_Ylty_count <- length(X[X>x & Y<y])
Xltx_Ygty_count <- length(X[X<x & Y>y])
Xltx_Ylty_count <- length(X[X<x & Y<y])
contingency_table <- matrix(c(Xgtx_Ygty_count, Xgtx_Ylty_count, Xltx_Ygty_count, Xltx_Ylty_count), nrow = 2, ncol = 2)
rownames(contingency_table) <- c('(Y>y)','(Y<y)')
kable(contingency_table, digits=4, col.names = c('(X>x)', '(X<x)'), align = 'l')
| (X>x) | (X<x) | |
|---|---|---|
| (Y>y) | 3749 | 3751 |
| (Y<y) | 1251 | 1249 |
p_X_gt_x_and_Y_gt_y <- length(X[X>x & Y>y]) / 10000
print(p_X_gt_x_and_Y_gt_y)
## [1] 0.3749
p_X_gt_x_into_p_Y_gt_y <- p_X_gt_x * p_Y_gt_y
print(p_X_gt_x_into_p_Y_gt_y)
## [1] 0.375
As we see from above, both these are nearly equal.
fisher.test(contingency_table)
##
## Fisher's Exact Test for Count Data
##
## data: contingency_table
## p-value = 0.9816
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9105431 1.0935740
## sample estimates:
## odds ratio
## 0.9978691
As we see here, the p-value is very high as compared to the general significance level of 0.05, hence we cannot reject the null hypothesis, that both are independent. Hence using Fisher test, we can say that both of these events are independent.
chisq.test(contingency_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency_table
## X-squared = 0.00053333, df = 1, p-value = 0.9816
We see here too using Chi Square test, that p-value is very high as compared to the significance level of 0.05. Hence we can say again using this test that both of these are independent.
In this case, both are giving us the same p-value.
train_raw_path <- "https://raw.githubusercontent.com/deepakmongia/Data605-Spring2019/master/train.csv"
train_df <- read.csv(train_raw_path, header = TRUE, sep = ",")
head(train_df)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
nrow(train_df)
## [1] 1460
ncol(train_df)
## [1] 81
#summary(train_df)
Building a table for univariate statistics for all the numeric columns.
train_df_numeric <- train_df[c(2,4,5, 18:21, 27,35, 37:39, 44:53, 55, 57, 60, 62, 63, 67, 72, 76:78, 81)]
train_df_univariate_stats_df <- basicStats(train_df_numeric)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
"Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()
kable(train_df_univariate_stats_df)
| Minimum | Maximum |
|
|
Mean | Median | Variance | Stdev | Skewness | Kurtosis | |
|---|---|---|---|---|---|---|---|---|---|---|
| MSSubClass | 20 | 190 | 20.00 | 70.00 | 5.689726e+01 | 50.0 | 1.789338e+03 | 42.300571 | 1.404766 | 1.564416 |
| LotFrontage | 21 | 313 | 59.00 | 80.00 | 7.004996e+01 | 69.0 | 5.897492e+02 | 24.284752 | 2.158168 | 17.341384 |
| LotArea | 1300 | 215245 | 7553.50 | 11601.50 | 1.051683e+04 | 9478.5 | 9.962565e+07 | 9981.264932 | 12.182615 | 202.262322 |
| OverallQual | 1 | 10 | 5.00 | 7.00 | 6.099315e+00 | 6.0 | 1.912679e+00 | 1.382997 | 0.216498 | 0.087623 |
| OverallCond | 1 | 9 | 5.00 | 6.00 | 5.575342e+00 | 5.0 | 1.238322e+00 | 1.112799 | 0.691644 | 1.092909 |
| YearBuilt | 1872 | 2010 | 1954.00 | 2000.00 | 1.971268e+03 | 1973.0 | 9.122154e+02 | 30.202904 | -0.612201 | -0.445658 |
| YearRemodAdd | 1950 | 2010 | 1967.00 | 2004.00 | 1.984866e+03 | 1994.0 | 4.262328e+02 | 20.645407 | -0.502528 | -1.274365 |
| MasVnrArea | 0 | 1600 | 0.00 | 166.00 | 1.036853e+02 | 0.0 | 3.278497e+04 | 181.066207 | 2.663572 | 10.025642 |
| BsmtFinSF1 | 0 | 5644 | 0.00 | 712.25 | 4.436397e+02 | 383.5 | 2.080255e+05 | 456.098091 | 1.682041 | 11.056814 |
| BsmtFinSF2 | 0 | 1474 | 0.00 | 0.00 | 4.654931e+01 | 0.0 | 2.602391e+04 | 161.319273 | 4.246521 | 20.008864 |
| BsmtUnfSF | 0 | 2336 | 223.00 | 808.00 | 5.672404e+02 | 477.5 | 1.952464e+05 | 441.866955 | 0.918378 | 0.464511 |
| TotalBsmtSF | 0 | 6110 | 795.75 | 1298.25 | 1.057429e+03 | 991.5 | 1.924624e+05 | 438.705324 | 1.521124 | 13.178856 |
| X1stFlrSF | 334 | 4692 | 882.00 | 1391.25 | 1.162627e+03 | 1087.0 | 1.494501e+05 | 386.587738 | 1.373929 | 5.710132 |
| X2ndFlrSF | 0 | 2065 | 0.00 | 728.00 | 3.469925e+02 | 0.0 | 1.905571e+05 | 436.528436 | 0.811360 | -0.559024 |
| LowQualFinSF | 0 | 572 | 0.00 | 0.00 | 5.844521e+00 | 0.0 | 2.364204e+03 | 48.623081 | 8.992833 | 82.828239 |
| GrLivArea | 334 | 5642 | 1129.50 | 1776.75 | 1.515464e+03 | 1464.0 | 2.761296e+05 | 525.480383 | 1.363754 | 4.863483 |
| BsmtFullBath | 0 | 3 | 0.00 | 1.00 | 4.253420e-01 | 0.0 | 2.692680e-01 | 0.518911 | 0.594842 | -0.843292 |
| BsmtHalfBath | 0 | 2 | 0.00 | 0.00 | 5.753400e-02 | 0.0 | 5.700300e-02 | 0.238753 | 4.094975 | 16.309957 |
| FullBath | 0 | 3 | 1.00 | 2.00 | 1.565068e+00 | 2.0 | 3.035080e-01 | 0.550916 | 0.036486 | -0.861150 |
| HalfBath | 0 | 2 | 0.00 | 1.00 | 3.828770e-01 | 0.0 | 2.528940e-01 | 0.502885 | 0.674509 | -1.079982 |
| BedroomAbvGr | 0 | 8 | 2.00 | 3.00 | 2.866438e+00 | 3.0 | 6.654940e-01 | 0.815778 | 0.211355 | 2.211988 |
| KitchenAbvGr | 0 | 3 | 1.00 | 1.00 | 1.046575e+00 | 1.0 | 4.854900e-02 | 0.220338 | 4.479178 | 21.421139 |
| TotRmsAbvGrd | 2 | 14 | 5.00 | 7.00 | 6.517808e+00 | 6.0 | 2.641903e+00 | 1.625393 | 0.674952 | 0.868337 |
| Fireplaces | 0 | 3 | 0.00 | 1.00 | 6.130140e-01 | 1.0 | 4.155950e-01 | 0.644666 | 0.648231 | -0.224407 |
| GarageYrBlt | 1900 | 2010 | 1961.00 | 2002.00 | 1.978506e+03 | 1980.0 | 6.095825e+02 | 24.689725 | -0.648003 | -0.424912 |
| GarageCars | 0 | 4 | 1.00 | 2.00 | 1.767123e+00 | 2.0 | 5.584800e-01 | 0.747315 | -0.341845 | 0.211731 |
| GarageArea | 0 | 1418 | 334.50 | 576.00 | 4.729801e+02 | 480.0 | 4.571251e+04 | 213.804841 | 0.179611 | 0.904469 |
| WoodDeckSF | 0 | 857 | 0.00 | 168.00 | 9.424452e+01 | 0.0 | 1.570981e+04 | 125.338794 | 1.538210 | 2.970417 |
| PoolArea | 0 | 738 | 0.00 | 0.00 | 2.758904e+00 | 0.0 | 1.614216e+03 | 40.177307 | 14.797918 | 222.191708 |
| MiscVal | 0 | 15500 | 0.00 | 0.00 | 4.348904e+01 | 0.0 | 2.461381e+05 | 496.123024 | 24.426522 | 697.640072 |
| MoSold | 1 | 12 | 5.00 | 8.00 | 6.321918e+00 | 6.0 | 7.309595e+00 | 2.703626 | 0.211617 | -0.410385 |
| YrSold | 2006 | 2010 | 2007.00 | 2009.00 | 2.007816e+03 | 2008.0 | 1.763837e+00 | 1.328095 | 0.096071 | -1.193112 |
| SalePrice | 34900 | 755000 | 129975.00 | 214000.00 | 1.809212e+05 | 163000.0 | 6.311111e+09 | 79442.502883 | 1.879009 | 6.496789 |
Plotting each of the variables for their ranges:
par(mfrow=c(2,3))
boxplot(train_df$MSSubClass, main='MSSubClass')
boxplot(train_df$LotFrontage, main='LotFrontage')
boxplot(train_df$LotArea, main='LotArea')
boxplot(train_df$OverallQual, main='OverallQual')
boxplot(train_df$OverallCond, main='OverallCond')
boxplot(train_df$YearBuilt, main='YearBuilt')
par(mfrow=c(2,3))
boxplot(train_df$YearRemodAdd, main='YearRemodAdd')
boxplot(train_df$MasVnrArea, main='MasVnrArea')
boxplot(train_df$BsmtFinSF1, main='BsmtFinSF1')
boxplot(train_df$BsmtFinSF2, main='BsmtFinSF2')
boxplot(train_df$BsmtUnfSF, main='BsmtUnfSF')
boxplot(train_df$TotalBsmtSF, main='TotalBsmtSF')
par(mfrow=c(2,3))
boxplot(train_df$X1stFlrSF, main='X1stFlrSF')
boxplot(train_df$X2ndFlrSF, main='X2ndFlrSF')
boxplot(train_df$LowQualFinSF, main='LowQualFinSF')
boxplot(train_df$GrLivArea, main='GrLivArea')
boxplot(train_df$BsmtFullBath, main='BsmtFullBath')
boxplot(train_df$BsmtHalfBath, main='BsmtHalfBath')
par(mfrow=c(2,3))
boxplot(train_df$FullBath, main='FullBath')
boxplot(train_df$HalfBath, main='HalfBath')
boxplot(train_df$BedroomAbvGr, main='BedroomAbvGr')
boxplot(train_df$KitchenAbvGr, main='KitchenAbvGr')
boxplot(train_df$TotRmsAbvGrd, main='TotRmsAbvGrd')
boxplot(train_df$Fireplaces, main='Fireplaces')
par(mfrow=c(2,3))
boxplot(train_df$GarageYrBlt, main='GarageYrBlt')
boxplot(train_df$GarageCars, main='GarageCars')
boxplot(train_df$GarageArea, main='GarageArea')
boxplot(train_df$WoodDeckSF, main='WoodDeckSF')
boxplot(train_df$PoolArea, main='PoolArea')
boxplot(train_df$MiscVal, main='MiscVal')
par(mfrow=c(1,3))
boxplot(train_df$MoSold, main='MoSold')
boxplot(train_df$YrSold, main='YrSold')
boxplot(train_df$SalePrice, main='SalePrice')
par(mfrow=c(2,3))
ggplot(train_df, aes(x=LotArea, y=SalePrice)) +
geom_point(shape=1)
ggplot(train_df, aes(x=BsmtFinSF1, y=SalePrice)) +
geom_point(shape=1)
ggplot(train_df, aes(x=TotalBsmtSF, y=SalePrice)) +
geom_point(shape=1)
ggplot(train_df, aes(x=GrLivArea, y=SalePrice)) +
geom_point(shape=1)
ggplot(train_df, aes(x=GarageArea, y=SalePrice)) +
geom_point(shape=1)
The GrLiving area - which is area above ground has a linear relationship with the sale price of the apartment. Also the garage area also looks to have a good relationship, even though there are some houses available with no garage area.
X <- train_df$LotArea
Y <- train_df$GrLivArea
Z <- train_df$SalePrice
cor(Y,Z)
## [1] 0.7086245
cor(X,Z)
## [1] 0.2638434
Working on Living area (Y) and sale price (Z) first: H0: correlation between Y and Z = 0 –> Null Hypothesis HA: correlation between Y and Z > 0 –> Alternate Hypothesis
Using the R function for T-testing to get 80% confidence level:
t.test(Y, Z, conf.level = 0.8)
##
## Welch Two Sample t-test
##
## data: Y and Z
## t = -86.288, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
## -182071.5 -176740.0
## sample estimates:
## mean of x mean of y
## 1515.464 180921.196
From this result, we see that there is a 80% confidence level that the difference in the means of the 2 variables is between -182071.5 and -176740.0. And the p-value is 2.2e-16 which is way less than the significance value of 0.05. Hence we can reject the null hypothesis and say that the correlation between Living area and sale price is not 0, and these are related to each other.
Similarly, for the other pair: Lot area (X) and sale price (Z) H0: correlation between X and Z = 0 –> Null Hypothesis HA: correlation between X and Z > 0 –> Alternate Hypothesis
t.test(X, Z, conf.level = 0.8)
##
## Welch Two Sample t-test
##
## data: X and Z
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 80 percent confidence interval:
## -173091.0 -167717.8
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
From this result, we see that there is a 80% confidence level that the difference in the means of the 2 variables is between -173091.0 and -167717.8. And the p-value is 2.2e-16 for this one too, which is way less than the significance value of 0.05. Hence we can reject the null hypothesis and say that the correlation between Living area and sale price is not 0, and these are related to each other.
We will build the correlation matrix for X (LotArea) and Z (SalePrice)
matrix1 <- data.frame(X,Z)
head(matrix1)
## X Z
## 1 8450 208500
## 2 9600 181500
## 3 11250 223500
## 4 9550 140000
## 5 14260 250000
## 6 14115 143000
Building the correlation matrix for this:
matrix1_corr <- cor(matrix1)
matrix1_corr
## X Z
## X 1.0000000 0.2638434
## Z 0.2638434 1.0000000
Taking the inverse of the correlation matrix:
matrix1_prec <- solve(matrix1_corr)
matrix1_prec
## X Z
## X 1.0748219 -0.2835846
## Z -0.2835846 1.0748219
Multiplying the correlation and precision matrices in both directions
matrix1_corr %*% matrix1_prec
## X Z
## X 1 0
## Z 0 1
matrix1_prec %*% matrix1_corr
## X Z
## X 1 0
## Z 0 1
As the precision matrix is an inverse of the correlation matrix, multiplying them in either direction gives us an identity matrix.
matrix1_corr_L <- lu(matrix1_corr)$L
matrix1_corr_L
## X Z
## X 1.0000000 0
## Z 0.2638434 1
matrix1_corr_U <- lu(matrix1_corr)$U
matrix1_corr_U
## X Z
## X 1 0.2638434
## Z 0 0.9303867
As per the LU decomposition, LU = A Here A is the correlation matrix we created above. So, if we multiply L and U above, it should give correlation matrix
matrix1_corr_L %*% matrix1_corr_U
## X Z
## X 1.0000000 0.2638434
## Z 0.2638434 1.0000000
identical(matrix1_corr, matrix1_corr_L %*% matrix1_corr_U)
## [1] TRUE
hist(train_df$LotArea, col = 'purple', main = 'Lot Area variable', breaks = 30)
As we see above in the histogram, this variable is highly skewed to the right.
###Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of lamda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lamda)). Plot a histogram and compare it with a histogram of your original variable.
Now we are going to fit this variable to an exponential distribution.
LotArea_fitted <- fitdistr(train_df$LotArea, "exponential")
Getting the value of lambda for this exponential distribution
lamda <- LotArea_fitted$estimate
lamda
## rate
## 9.50857e-05
exp_sample <- rexp(1000, lamda)
summary(exp_sample)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.14 3479.93 7536.22 11012.09 14741.94 82115.69
hist(exp_sample, col = 'purple', breaks = 20)
As we see from this new histogram of the new variable which was generated by fitting the LotArea variable to the exponential distribution, the plot looks a very good exponential one, and we can use it for estimation, in case required further.
Generating the 5th and 95th percentiles
qexp(c(0.05,0.95), lamda)
## [1] 539.4428 31505.6013
Generating 95% confidence level from the data, assuming that the distribution is normal
qnorm(c(0.025, 0.975), mean = mean(train_df$LotArea), sd = sd(train_df$LotArea))
## [1] -9046.092 30079.748
Now using the actual data to get 5th and 95th percentile
quantile(train_df$LotArea, c(0.05, 0.95))
## 5% 95%
## 3311.70 17401.15
This indicates that the lowest 5% of the observations are below 3311 sq. ft. of Lot Area, and the upper 5% values are above 17401 sq. ft. So the 90% fall under this range.
saleprice_lm2 <- lm(SalePrice ~ ., data = train_df_numeric)
summary(saleprice_lm2)
##
## Call:
## lm(formula = SalePrice ~ ., data = train_df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -448886 -17213 -2076 15074 314113
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.376e+05 1.701e+06 -0.198 0.842703
## MSSubClass -2.003e+02 3.455e+01 -5.798 8.80e-09 ***
## LotFrontage -1.210e+02 6.104e+01 -1.983 0.047624 *
## LotArea 5.516e-01 1.573e-01 3.508 0.000471 ***
## OverallQual 1.878e+04 1.473e+03 12.751 < 2e-16 ***
## OverallCond 5.397e+03 1.349e+03 4.001 6.73e-05 ***
## YearBuilt 3.027e+02 8.440e+01 3.586 0.000351 ***
## YearRemodAdd 1.149e+02 8.644e+01 1.329 0.183977
## MasVnrArea 3.221e+01 6.989e+00 4.609 4.52e-06 ***
## BsmtFinSF1 1.767e+01 5.821e+00 3.036 0.002457 **
## BsmtFinSF2 9.486e+00 8.723e+00 1.087 0.277077
## BsmtUnfSF 5.093e+00 5.247e+00 0.971 0.331968
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.676e+01 7.358e+00 6.355 3.05e-10 ***
## X2ndFlrSF 4.658e+01 6.052e+00 7.696 3.13e-14 ***
## LowQualFinSF 3.730e+01 2.788e+01 1.338 0.181150
## GrLivArea NA NA NA NA
## BsmtFullBath 8.899e+03 3.194e+03 2.786 0.005425 **
## BsmtHalfBath 2.485e+03 5.067e+03 0.490 0.623912
## FullBath 5.615e+03 3.526e+03 1.592 0.111563
## HalfBath -3.962e+02 3.304e+03 -0.120 0.904560
## BedroomAbvGr -1.001e+04 2.152e+03 -4.650 3.72e-06 ***
## KitchenAbvGr -2.294e+04 6.703e+03 -3.423 0.000643 ***
## TotRmsAbvGrd 5.234e+03 1.482e+03 3.531 0.000431 ***
## Fireplaces 5.057e+03 2.177e+03 2.323 0.020346 *
## GarageYrBlt -5.455e+01 9.105e+01 -0.599 0.549223
## GarageCars 1.721e+04 3.478e+03 4.948 8.69e-07 ***
## GarageArea 6.216e+00 1.206e+01 0.515 0.606398
## WoodDeckSF 1.697e+01 9.879e+00 1.718 0.086035 .
## PoolArea -5.956e+01 2.977e+01 -2.000 0.045711 *
## MiscVal -5.263e-01 6.869e+00 -0.077 0.938939
## MoSold -2.135e+02 4.213e+02 -0.507 0.612320
## YrSold -2.220e+02 8.456e+02 -0.262 0.792989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36870 on 1090 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.808, Adjusted R-squared: 0.8027
## F-statistic: 152.9 on 30 and 1090 DF, p-value: < 2.2e-16
From the above summary information, we will remove the independent variables which gave NA in the results, and also the ones which have P-value significantly greater than 0.05. Hence generating this new model.
saleprice_lm3 <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual +
OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF +
BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr +
TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
PoolArea, data = train_df_numeric)
summary(saleprice_lm3)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
## BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF +
## LowQualFinSF + BsmtFullBath + FullBath + BedroomAbvGr + KitchenAbvGr +
## TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + PoolArea,
## data = train_df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -458575 -17710 -2478 14079 319298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.599e+05 1.417e+05 -6.772 2.01e-11 ***
## MSSubClass -1.894e+02 3.208e+01 -5.903 4.66e-09 ***
## LotFrontage -9.584e+01 5.771e+01 -1.661 0.097040 .
## LotArea 5.622e-01 1.550e-01 3.627 0.000299 ***
## OverallQual 1.754e+04 1.377e+03 12.737 < 2e-16 ***
## OverallCond 4.452e+03 1.215e+03 3.665 0.000259 ***
## YearBuilt 2.842e+02 6.182e+01 4.597 4.75e-06 ***
## YearRemodAdd 1.728e+02 7.565e+01 2.285 0.022511 *
## MasVnrArea 3.593e+01 6.845e+00 5.249 1.82e-07 ***
## BsmtFinSF1 1.909e+01 5.439e+00 3.509 0.000467 ***
## BsmtFinSF2 9.341e+00 8.347e+00 1.119 0.263309
## BsmtUnfSF 7.781e+00 4.876e+00 1.596 0.110786
## X1stFlrSF 4.706e+01 6.920e+00 6.801 1.66e-11 ***
## X2ndFlrSF 4.666e+01 5.051e+00 9.236 < 2e-16 ***
## LowQualFinSF 3.711e+01 2.180e+01 1.702 0.088936 .
## BsmtFullBath 9.736e+03 2.887e+03 3.372 0.000770 ***
## FullBath 7.073e+03 3.000e+03 2.358 0.018543 *
## BedroomAbvGr -9.862e+03 1.943e+03 -5.077 4.45e-07 ***
## KitchenAbvGr -1.338e+04 5.778e+03 -2.315 0.020776 *
## TotRmsAbvGrd 4.923e+03 1.417e+03 3.473 0.000533 ***
## Fireplaces 5.458e+03 2.070e+03 2.637 0.008477 **
## GarageCars 1.129e+04 1.914e+03 5.901 4.73e-09 ***
## WoodDeckSF 1.995e+01 9.538e+00 2.092 0.036649 *
## PoolArea -6.132e+01 2.889e+01 -2.123 0.033965 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36570 on 1171 degrees of freedom
## (265 observations deleted due to missingness)
## Multiple R-squared: 0.8104, Adjusted R-squared: 0.8067
## F-statistic: 217.7 on 23 and 1171 DF, p-value: < 2.2e-16
Removing the independent variables further which have P-value greater than 0.05.
saleprice_lm4 <- lm(SalePrice ~ MSSubClass + LotArea + OverallQual +
OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath +
FullBath + BedroomAbvGr + KitchenAbvGr +
TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
PoolArea, data = train_df_numeric)
summary(saleprice_lm4)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## X1stFlrSF + X2ndFlrSF + BsmtFullBath + FullBath + BedroomAbvGr +
## KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF +
## PoolArea, data = train_df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -472084 -16329 -1999 13742 298359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.587e+05 1.237e+05 -7.747 1.77e-14 ***
## MSSubClass -1.690e+02 2.592e+01 -6.518 9.83e-11 ***
## LotArea 4.167e-01 1.002e-01 4.160 3.37e-05 ***
## OverallQual 1.801e+04 1.143e+03 15.763 < 2e-16 ***
## OverallCond 4.126e+03 1.002e+03 4.118 4.04e-05 ***
## YearBuilt 2.817e+02 5.347e+01 5.268 1.59e-07 ***
## YearRemodAdd 1.741e+02 6.544e+01 2.660 0.007895 **
## MasVnrArea 3.256e+01 5.915e+00 5.504 4.39e-08 ***
## BsmtFinSF1 1.131e+01 2.993e+00 3.779 0.000164 ***
## X1stFlrSF 5.446e+01 4.825e+00 11.288 < 2e-16 ***
## X2ndFlrSF 4.582e+01 4.264e+00 10.746 < 2e-16 ***
## BsmtFullBath 9.130e+03 2.401e+03 3.802 0.000149 ***
## FullBath 4.787e+03 2.598e+03 1.843 0.065601 .
## BedroomAbvGr -9.755e+03 1.679e+03 -5.809 7.72e-09 ***
## KitchenAbvGr -1.485e+04 5.144e+03 -2.887 0.003947 **
## TotRmsAbvGrd 5.051e+03 1.216e+03 4.153 3.47e-05 ***
## Fireplaces 4.038e+03 1.731e+03 2.332 0.019823 *
## GarageCars 1.036e+04 1.695e+03 6.115 1.24e-09 ***
## WoodDeckSF 2.113e+01 7.852e+00 2.690 0.007219 **
## PoolArea -2.541e+01 2.340e+01 -1.086 0.277699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34790 on 1432 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8099, Adjusted R-squared: 0.8074
## F-statistic: 321.2 on 19 and 1432 DF, p-value: < 2.2e-16
Removing further some variables, as we still have some variables with P-value > 0.05
saleprice_lm5 <- lm(SalePrice ~ MSSubClass + LotArea + OverallQual +
OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BsmtFullBath +
BedroomAbvGr + KitchenAbvGr +
TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF,
data = train_df_numeric)
summary(saleprice_lm5)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## X1stFlrSF + X2ndFlrSF + BsmtFullBath + BedroomAbvGr + KitchenAbvGr +
## TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF, data = train_df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -487250 -16180 -2016 13330 285692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.043e+06 1.157e+05 -9.016 < 2e-16 ***
## MSSubClass -1.651e+02 2.583e+01 -6.393 2.20e-10 ***
## LotArea 4.223e-01 1.002e-01 4.215 2.66e-05 ***
## OverallQual 1.821e+04 1.139e+03 15.977 < 2e-16 ***
## OverallCond 4.017e+03 1.001e+03 4.012 6.32e-05 ***
## YearBuilt 3.062e+02 5.184e+01 5.907 4.34e-09 ***
## YearRemodAdd 1.929e+02 6.482e+01 2.976 0.002973 **
## MasVnrArea 3.232e+01 5.892e+00 5.486 4.87e-08 ***
## BsmtFinSF1 1.080e+01 2.983e+00 3.620 0.000304 ***
## X1stFlrSF 5.593e+01 4.632e+00 12.075 < 2e-16 ***
## X2ndFlrSF 4.715e+01 4.100e+00 11.501 < 2e-16 ***
## BsmtFullBath 8.545e+03 2.379e+03 3.591 0.000340 ***
## BedroomAbvGr -9.399e+03 1.665e+03 -5.645 1.99e-08 ***
## KitchenAbvGr -1.339e+04 5.097e+03 -2.627 0.008694 **
## TotRmsAbvGrd 5.166e+03 1.214e+03 4.255 2.22e-05 ***
## Fireplaces 3.938e+03 1.732e+03 2.273 0.023147 *
## GarageCars 1.058e+04 1.693e+03 6.253 5.31e-10 ***
## WoodDeckSF 2.099e+01 7.857e+00 2.672 0.007630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34830 on 1434 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8093, Adjusted R-squared: 0.807
## F-statistic: 358 on 17 and 1434 DF, p-value: < 2.2e-16
Now as we see above, in the newest model, there are no variables with P-value > 0.05. Hence we will use this as our final model here.
Now we will load the test data into a dataframe and then predict the results using this model.
test_raw_path <- "https://raw.githubusercontent.com/deepakmongia/Data605-Spring2019/master/test.csv"
test_df <- read.csv(test_raw_path, header = TRUE, sep = ",")
head(test_df)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl NAmes Feedr
## 2 Lvl AllPub Corner Gtl NAmes Norm
## 3 Lvl AllPub Inside Gtl Gilbert Norm
## 4 Lvl AllPub Inside Gtl Gilbert Norm
## 5 HLS AllPub Inside Gtl StoneBr Norm
## 6 Lvl AllPub Corner Gtl Gilbert Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 1Story 5 6 1961
## 2 Norm 1Fam 1Story 6 6 1958
## 3 Norm 1Fam 2Story 5 5 1997
## 4 Norm 1Fam 2Story 6 6 1998
## 5 Norm TwnhsE 1Story 8 5 1992
## 6 Norm 1Fam 2Story 6 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1961 Gable CompShg VinylSd VinylSd None
## 2 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 3 1998 Gable CompShg VinylSd VinylSd None
## 4 1998 Gable CompShg VinylSd VinylSd BrkFace
## 5 1992 Gable CompShg HdBoard HdBoard None
## 6 1994 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA CBlock TA TA No
## 2 108 TA TA CBlock TA TA No
## 3 0 TA TA PConc Gd TA No
## 4 20 TA TA PConc TA TA No
## 5 0 Gd TA PConc Gd TA No
## 6 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Rec 468 LwQ 144 270 882
## 2 ALQ 923 Unf 0 406 1329
## 3 GLQ 791 Unf 0 137 928
## 4 GLQ 602 Unf 0 324 926
## 5 ALQ 263 Unf 0 1017 1280
## 6 Unf 0 Unf 0 763 763
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA TA Y SBrkr 896 0 0
## 2 GasA TA Y SBrkr 1329 0 0
## 3 GasA Gd Y SBrkr 928 701 0
## 4 GasA Ex Y SBrkr 926 678 0
## 5 GasA Ex Y SBrkr 1280 0 0
## 6 GasA Gd Y SBrkr 763 892 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 896 0 0 1 0 2
## 2 1329 0 0 1 1 3
## 3 1629 0 0 2 1 3
## 4 1604 0 0 2 1 3
## 5 1280 0 0 2 0 2
## 6 1655 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 5 Typ 0 <NA>
## 2 1 Gd 6 Typ 0 <NA>
## 3 1 TA 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 5 Typ 0 <NA>
## 6 1 TA 7 Typ 1 TA
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1961 Unf 1 730 TA
## 2 Attchd 1958 Unf 1 312 TA
## 3 Attchd 1997 Fin 2 482 TA
## 4 Attchd 1998 Fin 2 470 TA
## 5 Attchd 1992 RFn 2 506 TA
## 6 Attchd 1993 Fin 2 440 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 140 0 0 0
## 2 TA Y 393 36 0 0
## 3 TA Y 212 34 0 0
## 4 TA Y 360 36 0 0
## 5 TA Y 0 82 0 0
## 6 TA Y 157 84 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 120 0 <NA> MnPrv <NA> 0 6 2010
## 2 0 0 <NA> <NA> Gar2 12500 6 2010
## 3 0 0 <NA> MnPrv <NA> 0 3 2010
## 4 0 0 <NA> <NA> <NA> 0 6 2010
## 5 144 0 <NA> <NA> <NA> 0 1 2010
## 6 0 0 <NA> <NA> <NA> 0 4 2010
## SaleType SaleCondition
## 1 WD Normal
## 2 WD Normal
## 3 WD Normal
## 4 WD Normal
## 5 WD Normal
## 6 WD Normal
Based on a few iterations, we see that there are some variables which have value NA due to which the predicted values are coming as NA. To avoid this so that the predicted values can be submitted on Kaggle, we will replace NA with 0.
test_df$MasVnrArea[is.na(test_df$MasVnrArea)] <- 0
test_df$BsmtFinSF1[is.na(test_df$BsmtFinSF1)] <- 0
test_df$BsmtFullBath[is.na(test_df$BsmtFullBath)] <- 0
prediction_results <- predict(saleprice_lm5, test_df)
prediction_results_df <- data.frame(cbind(test_df$Id, prediction_results))
colnames(prediction_results_df) = c('Id', 'SalePrice')
head(prediction_results_df, 10)
## Id SalePrice
## 1 1461 114786.8
## 2 1462 166311.5
## 3 1463 173391.9
## 4 1464 199975.7
## 5 1465 188458.3
## 6 1466 183230.9
## 7 1467 197252.5
## 8 1468 172831.8
## 9 1469 211915.6
## 10 1470 114604.4
Now I submitted the final sheet of the predicted results to Kaggle. My Kaggle score: 0.51444 My Kaggle user name: deepakmongia