# Reproducible simulation: fix the RNG seed before drawing anything.
set.seed(225)

sampleSize <- 10000
n <- 10
# Shared value used as both the mean and the standard deviation of Y.
mu_alpha <- (n + 1) / 2

# X: uniform draws on [1, n].
ran_X <- runif(sampleSize, min = 1, max = n)
# Y: normal draws with mean = sd = mu_alpha.
ran_Y <- rnorm(sampleSize, mean = mu_alpha, sd = mu_alpha)

# Keep the two samples side by side, one simulated pair per row.
xyData <- data.frame(ran_X = ran_X, ran_Y = ran_Y)
# Preview the first rows of the simulated pairs as a styled table.
xyData %>%
  head() %>%
  kable() %>%
  kable_styling()
| ran_X | ran_Y |
|---|---|
| 1.482280 | 3.1512752 |
| 2.638616 | 0.9932506 |
| 6.792097 | 0.3025562 |
| 1.410870 | 1.0387861 |
| 2.624869 | -0.2050548 |
| 4.534426 | -1.1253649 |
# Inspect the shape of each simulated sample.
hist(ran_X)
hist(ran_Y)

# Cut points used in the probability questions:
# x is the sample median of X, y is the sample 25th percentile of Y
# (y keeps the "25%" name that quantile() attaches).
x <- median(ran_X)
y <- quantile(ran_Y, probs = 0.25)
\(P(X>x \mid X>y) = \frac{P(X>x,\ X>y)}{P(X>y)}\)
##### The probability is:
# Empirical P(X > x | X > y): relative frequency of rows with X above both
# cut points, divided by the relative frequency of rows with X above y.
# Each piece is a one-cell data frame (column `n`) because it comes from count().
X_gt_x_y <- (
  xyData %>%
    dplyr::select(ran_X) %>%
    filter(ran_X > x, ran_X > y) %>%
    count()
) / sampleSize

X_gt_y <- (
  xyData %>%
    dplyr::select(ran_X) %>%
    filter(ran_X > y) %>%
    count()
) / sampleSize

X_gt_x_y / X_gt_y
## n
## 1 0.5417705
There is a probability of about 54% that a value of X is greater than its own median, given that this value of X is greater than the first quartile (25th percentile) of Y.
# Empirical joint probability P(X > x, Y > y): share of simulated pairs in
# which X exceeds x and, in the same row, Y exceeds y.
X_gt_x_Y_gt_y <- (
  xyData %>%
    filter(ran_X > x & ran_Y > y) %>%
    count()
) / sampleSize

X_gt_x_Y_gt_y
## n
## 1 0.3745
There is a probability of about 37.4% that a value of X is greater than the median of X and, at the same time, the paired value of Y is greater than the first quartile (25th percentile) of Y.
# Empirical P(X < x | X > y): relative frequency of rows with y < X < x,
# divided by the relative frequency of rows with X above y.
X_less_x_X_gt_y <- (
  xyData %>%
    dplyr::select(ran_X) %>%
    filter(ran_X < x, ran_X > y) %>%
    count()
) / sampleSize

# Recomputed here so this step does not depend on an earlier value of X_gt_y.
X_gt_y <- (
  xyData %>%
    dplyr::select(ran_X) %>%
    filter(ran_X > y) %>%
    count()
) / sampleSize

X_less_x_X_gt_y / X_gt_y
## n
## 1 0.4582295
There is a probability of about 45.8% that a value of X is less than its own median, given that this value of X is greater than the first quartile (25th percentile) of Y.
# Joint relative-frequency table of {X below/above x} by {Y below/above y},
# with row and column totals.
#
# The columns are passed to data.frame() as named arguments (`=`). Writing
# `A <- c(...)` inside the call is an assignment: it creates stray globals and
# leaves the columns named after the deparsed expressions.
prob.table <- data.frame(
  A = c(
    sum(xyData$ran_X < x & xyData$ran_Y < y),
    sum(xyData$ran_X < x & xyData$ran_Y > y)
  ) / sampleSize,
  B = c(
    sum(xyData$ran_X > x & xyData$ran_Y < y),
    sum(xyData$ran_X > x & xyData$ran_Y > y)
  ) / sampleSize
)

# Append the margins: row totals as a third column, then column totals as a
# third row (computed after the Total column exists, so the corner is the
# grand total).
prob.table <- cbind(prob.table, Total = rowSums(prob.table))
prob.table <- rbind(prob.table, unname(colSums(prob.table)))

names(prob.table) <- c("X<x", "X>x", "Total")
rownames(prob.table) <- c("Y<y", "Y>y", "Total")

prob.table %>%
  kable() %>%
  kable_styling()
| X<x | X>x | Total | |
|---|---|---|---|
| Y<y | 0.1245 | 0.1255 | 0.25 |
| Y>y | 0.3755 | 0.3745 | 0.75 |
| Total | 0.5000 | 0.5000 | 1.00 |
# Contingency table of raw counts for {X below/above x} by {Y below/above y},
# with row and column totals.
#
# The columns are passed to data.frame() as named arguments (`=`). Writing
# `A <- c(...)` inside the call is an assignment: it creates stray globals and
# leaves the columns named after the deparsed expressions.
ct.table <- data.frame(
  A = c(
    sum(xyData$ran_X < x & xyData$ran_Y < y),
    sum(xyData$ran_X < x & xyData$ran_Y > y)
  ),
  B = c(
    sum(xyData$ran_X > x & xyData$ran_Y < y),
    sum(xyData$ran_X > x & xyData$ran_Y > y)
  )
)

# Append the margins: row totals as a third column, then column totals as a
# third row (computed after the Total column exists, so the corner is the
# grand total).
ct.table <- cbind(ct.table, Total = rowSums(ct.table))
ct.table <- rbind(ct.table, unname(colSums(ct.table)))

names(ct.table) <- c("X<x", "X>x", "Total")
rownames(ct.table) <- c("Y<y", "Y>y", "Total")

ct.table %>%
  kable() %>%
  kable_styling()
| X<x | X>x | Total | |
|---|---|---|---|
| Y<y | 1245 | 1255 | 2500 |
| Y>y | 3755 | 3745 | 7500 |
| Total | 5000 | 5000 | 10000 |
# Fisher's exact test of independence on the 2x2 cell counts only:
# rows 1-2 ("Y<y", "Y>y") by columns 1-2 ("X<x", "X>x"). Column 3 is the
# "Total" margin and must not be passed to the test.
# NOTE: output rendered before this fix was computed from columns 2:3
# ("X>x" and "Total") and needs to be regenerated.
ct.table[c(1, 2), 1:2] %>% fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.9045
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9287373 1.0880068
## sample estimates:
## odds ratio
## 1.005341
# Pearson chi-squared test of independence on the 2x2 cell counts only:
# rows 1-2 ("Y<y", "Y>y") by columns 1-2 ("X<x", "X>x"). Column 3 is the
# "Total" margin and must not be passed to the test.
# NOTE: output rendered before this fix was computed from columns 2:3
# ("X>x" and "Total") and needs to be regenerated.
ct.table[c(1, 2), 1:2] %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 0.012833, df = 1, p-value = 0.9098
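Both tests return p-values far above 0.05, so we fail to reject the null hypothesis of independence: there is no evidence that the events are dependent.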
The difference between Fisher's exact test and the chi-square test is that Fisher's test computes an exact p-value and is usually the better choice when sample sizes (or expected cell counts) are small, whereas the chi-square test is a large-sample approximation that is mostly used when the sample is large.
Based on the output, both tests give almost the same p-value, so either is adequate here; however, the chi-square test is preferred because the sample size (10,000) is large.
# Per-column summary of the training data (five-number summaries for numeric
# columns, length/class for character columns, NA counts where present).
summary(house.train.df)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Preview the first rows of the training data as a styled table.
house.train.df %>%
  head() %>%
  kable() %>%
  kable_styling()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 60 | RL | 65 | 8450 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NA | Attchd | 2003 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80 | 9600 | Pave | NA | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68 | 11250 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60 | 9550 | Pave | NA | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84 | 14260 | Pave | NA | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 12 | 2008 | WD | Normal | 250000 |
| 6 | 50 | RL | 85 | 14115 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | 1.5Fin | 5 | 5 | 1993 | 1995 | Gable | CompShg | VinylSd | VinylSd | None | 0 | TA | TA | Wood | Gd | TA | No | GLQ | 732 | Unf | 0 | 64 | 796 | GasA | Ex | Y | SBrkr | 796 | 566 | 0 | 1362 | 1 | 0 | 1 | 1 | 1 | 1 | TA | 5 | Typ | 0 | NA | Attchd | 1993 | Unf | 2 | 480 | TA | TA | Y | 40 | 30 | 0 | 320 | 0 | 0 | NA | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
# Distribution of sale price and of living area.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them
# against the plot data, instead of extracting vectors with `.$`.
house.train.df %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram() +
  xlab("Sale Price in Dollars")

house.train.df %>%
  ggplot(aes(x = GrLivArea)) +
  geom_histogram() +
  xlab("Living Area in Square Feet")
# Sale price distribution per neighborhood: one box per fill level, all drawn
# at a single (blank) x position.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them
# against the plot data, instead of extracting vectors with `.$`.
house.train.df %>%
  ggplot(aes(x = "", y = SalePrice, fill = Neighborhood)) +
  geom_boxplot() +
  ylab("Sale Price") +
  labs(fill = "Neighborhood") +
  theme(axis.title.x = element_blank())
As shown in the plot, the variability of home sale prices is higher or lower depending on the neighborhood in which the home is located.
# Scatter plot of sale price against above-grade living area with a fitted
# linear trend line.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them
# against the plot data; the x-axis label typos are also corrected.
house.train.df %>%
  ggplot(aes(x = GrLivArea, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Above Grade Living Area in Square Feet") +
  ylab("Sale Price in dollars") +
  ggtitle("Sale Price vs Above Grade Living Area")
# Scatter plot of sale price against first-floor area with a fitted linear
# trend line.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them
# against the plot data; the unit in the x-axis label is also corrected.
house.train.df %>%
  ggplot(aes(x = `1stFlrSF`, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("First Floor Measure in Square Feet") +
  ylab("Sale Price in dollars") +
  ggtitle("Sale Price vs First Floor Measure")
# Pairwise Pearson correlations among first-floor area, above-grade living
# area and sale price, then a colour-coded plot of that matrix.
cor.matrix <- cor(
  dplyr::select(house.train.df, `1stFlrSF`, GrLivArea, SalePrice)
)
corrplot(cor.matrix, method = "color")
# Test whether the correlation between sale price and above-grade living area
# is zero, reporting an 80% confidence interval.
cor.test(
  x = house.train.df$SalePrice,
  y = house.train.df$GrLivArea,
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: house.train.df$SalePrice and house.train.df$GrLivArea
## t = 38.348, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Test whether the correlation between sale price and first-floor area is
# zero, reporting an 80% confidence interval.
cor.test(
  x = house.train.df$SalePrice,
  y = house.train.df$`1stFlrSF`,
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: house.train.df$SalePrice and house.train.df$`1stFlrSF`
## t = 29.078, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5841687 0.6266715
## sample estimates:
## cor
## 0.6058522
Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# Precision matrix: the inverse of the correlation matrix.
pre.matrix <- solve(a = cor.matrix)
pre.matrix
## 1stFlrSF GrLivArea SalePrice
## 1stFlrSF 1.6795240 -0.4611713 -0.690746
## GrLivArea -0.4611713 2.1352622 -1.233697
## SalePrice -0.6907460 -1.2336974 2.292718
# Precision matrix times correlation matrix, rounded to 4 decimals so that
# floating-point noise does not obscure the result.
prod.matrix <- round(pre.matrix %*% cor.matrix, digits = 4)
prod.matrix
## 1stFlrSF GrLivArea SalePrice
## 1stFlrSF 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
# LU-decompose the rounded product matrix into its L and U factors.
prod.matrix %>% lu.decomposition()
## $L
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Fit an exponential distribution to sale price and keep the estimated rate.
fit <- fitdistr(house.train.df$SalePrice, densfun = "exponential")
lambda <- fit$estimate

# Draw 1000 values from the fitted exponential distribution.
dist <- rexp(n = 1000, rate = lambda)

# Show the simulated sample and the observed prices in the same plot grid.
par(mfrow = c(2, 2))
hist(dist, breaks = 100, xlab = "Values")
house.train.df$SalePrice %>% hist(xlab = "Sales' Price")
# 5th percentile of the fitted exponential distribution.
qexp(p = 0.05, rate = lambda)
## [1] 9280.044
# 95th percentile of the fitted exponential distribution.
qexp(p = 0.95, rate = lambda)
## [1] 541991.5
# 95% confidence interval for the mean sale price, assuming normality.
# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is
# qnorm(0.975); qnorm(0.95) would produce a 90% interval instead.
# NOTE: output rendered before this fix reflects the 90% interval and needs
# to be regenerated.
z <- qnorm(0.975)

# These names shadow the base functions sd() and mean() as variables only;
# they are kept as-is so anything that reads them afterwards still works.
sd <- sd(house.train.df$SalePrice)
mean <- mean(house.train.df$SalePrice)
n <- length(house.train.df$SalePrice)

# Half-width of the interval: critical value times the standard error.
margin <- z * (sd / sqrt(n))
lower_interval <- mean - margin
upper_interval <- mean + margin
c(lower_interval, upper_interval)
## [1] 177501.4 184341.0
# Empirical 5th and 95th percentiles of the observed sale prices.
quantile(house.train.df$SalePrice, probs = c(0.05, 0.95))
## 5% 95%
## 88000 326100
# Convert zoning and neighborhood codes to factors: both columns in the
# training data, and the zoning column in the test data.
house.train.df[["MSZoning"]] <- as.factor(house.train.df[["MSZoning"]])
house.train.df[["Neighborhood"]] <- as.factor(house.train.df[["Neighborhood"]])
house.test.df[["MSZoning"]] <- as.factor(house.test.df[["MSZoning"]])
# Linear model of sale price on living area, first-floor area, neighborhood
# and building type, followed by its coefficient and fit summary.
house.lm <- lm(
  formula = SalePrice ~ GrLivArea + `1stFlrSF` + Neighborhood + BldgType,
  data = house.train.df
)
summary(house.lm)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + `1stFlrSF` + Neighborhood +
## BldgType, data = house.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -377034 -17429 -160 14621 272958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77420.590 11555.212 6.700 0.000000000029881 ***
## GrLivArea 62.062 2.655 23.373 < 0.0000000000000002 ***
## `1stFlrSF` 37.634 3.566 10.553 < 0.0000000000000002 ***
## NeighborhoodBlueste -15315.444 28867.102 -0.531 0.595813
## NeighborhoodBrDale -26498.386 14602.485 -1.815 0.069787 .
## NeighborhoodBrkSide -60987.560 11625.116 -5.246 0.000000178655612 ***
## NeighborhoodClearCr -25026.666 12583.539 -1.989 0.046909 *
## NeighborhoodCollgCr -13402.409 10679.390 -1.255 0.209692
## NeighborhoodCrawfor -22057.228 11528.132 -1.913 0.055904 .
## NeighborhoodEdwards -67117.850 10972.841 -6.117 0.000000001230756 ***
## NeighborhoodGilbert -21646.801 11337.351 -1.909 0.056419 .
## NeighborhoodIDOTRR -77454.163 12274.913 -6.310 0.000000000371486 ***
## NeighborhoodMeadowV -38279.969 13575.143 -2.820 0.004871 **
## NeighborhoodMitchel -36647.403 11612.829 -3.156 0.001634 **
## NeighborhoodNAmes -53660.893 10658.184 -5.035 0.000000539451765 ***
## NeighborhoodNoRidge 44785.282 12036.332 3.721 0.000206 ***
## NeighborhoodNPkVill -14912.028 15981.641 -0.933 0.350940
## NeighborhoodNridgHt 69613.491 10778.484 6.459 0.000000000144530 ***
## NeighborhoodNWAmes -44042.484 11229.101 -3.922 0.000091916487098 ***
## NeighborhoodOldTown -74029.736 11085.695 -6.678 0.000000000034594 ***
## NeighborhoodSawyer -54409.405 11311.790 -4.810 0.000001669173472 ***
## NeighborhoodSawyerW -28330.288 11270.612 -2.514 0.012058 *
## NeighborhoodSomerst 16296.009 10732.458 1.518 0.129138
## NeighborhoodStoneBr 70893.681 12206.833 5.808 0.000000007793838 ***
## NeighborhoodSWISU -83518.114 12969.419 -6.440 0.000000000163190 ***
## NeighborhoodTimber 1698.983 11989.408 0.142 0.887331
## NeighborhoodVeenker 19905.848 15102.947 1.318 0.187711
## BldgType2fmCon -19960.660 7160.065 -2.788 0.005377 **
## BldgTypeDuplex -40225.884 5567.711 -7.225 0.000000000000813 ***
## BldgTypeTwnhs -43645.069 7646.061 -5.708 0.000000013865383 ***
## BldgTypeTwnhsE -26466.132 4788.493 -5.527 0.000000038660042 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38160 on 1429 degrees of freedom
## Multiple R-squared: 0.774, Adjusted R-squared: 0.7693
## F-statistic: 163.2 on 30 and 1429 DF, p-value: < 0.00000000000000022
# Keep only the predictors used by house.lm from the test data.
testData <- dplyr::select(
  house.test.df,
  GrLivArea, `1stFlrSF`, Neighborhood, BldgType
)

# Predict sale prices for the test rows and pair each one with its Id.
prediction <- as.data.frame(predict(house.lm, newdata = testData))
submit <- cbind(house.test.df$Id, prediction)
names(submit) <- c("Id", "SalePrice")

# Write the two-column submission file.
write_csv(submit, "housePrediction.csv")