# Sample X: 20 observations.
X2 <- c(
  7.4, 6.4, 8.5, 9.5, 11.8, 8.8, 8.4, 5.1, 11.4, 15.1,
  12.6, 8.0, 10.3, 10.4, 9.5, 9.5, 15.1, 6.6, 15.4, 8.2
)
# x is the 3rd quartile (75th percentile) of X2.
x <- quantile(X2, probs = 0.75)
x
## 75%
## 11.5
# Sample Y: 20 observations, paired by position with X2.
Y2 <- c(
  20.8, 14.6, 18.0, 7.3, 19.4, 13.5, 14.7, 15.3, 12.6, 13.0,
  13.1, 10.3, 14.9, 14.8, 16.2, 15.7, 16.3, 11.5, 12.2, 11.8
)
# y is the 1st quartile (25th percentile) of Y2.
y <- quantile(Y2, probs = 0.25)
y
## 25%
## 12.5
# Joint and conditional probabilities for the events {X > x} and {Y > y},
# using the quantile cut-offs x and y computed above.
ty = Y2 > y  # indicator of {Y > y}; reused by the sections that follow
# Number of observations with both Y > y and X > x.
count = sum(ty & X2 > x)
count
## [1] 4
# Joint probability: joint count over the sample size.
Pab = count/length(X2)
Pab
## [1] 0.2
#conditional probability of P(X>x | Y>y) = P(X>x, Y>y) / P(Y>y)
# Divide by the probability of the conditioning event (the share of
# observations with Y > y), not by the quantile value y itself.
Pc = Pab/mean(ty)
Pc
## [1] 0.2666667
#joint probability of P(X>x, Y>y)
Pab
## [1] 0.2
# Number of observations with both Y > y (indicator ty) and X < x.
count2 = sum(ty & X2 < x)
count2
## [1] 11
# Joint probability P(X<x, Y>y).
Pab2 = count2/length(X2)
Pab2
## [1] 0.55
#conditional probability of P(X<x | Y>y) = P(X<x, Y>y) / P(Y>y)
# Divide by P(Y>y) (the share of TRUE values in ty), not by the quantile value y.
Pc2 = Pab2/mean(ty)
Pc2
## [1] 0.7333333
# 2x2 table of counts: Y split at its 1st quartile (y) against X split at its
# 3rd quartile (x). Each cell is the number of positions where both
# conditions hold.

# Row Y <= y
ty = Y2 <= y
cnt1 = sum(ty & X2 <= x)
cnt1
## [1] 4
cnt2 = sum(ty & X2 > x)
cnt2
## [1] 1

# Row Y > y
ty = Y2 > y
cnt3 = sum(ty & X2 <= x)
cnt3
## [1] 11
cnt4 = sum(ty & X2 > x)
cnt4
## [1] 4
# Independence check: does P(A and B) equal P(A) * P(B)?
# A = {X above its 1st quartile}, B = {Y above its 1st quartile}.
x = quantile(X2, 0.25)
y = quantile(Y2, 0.25)
A = sum(X2>x)
A
## [1] 15
B = sum(Y2>y)
B
## [1] 15
y
## 25%
## 12.5
ty = Y2 > y
# Compute the joint count from scratch. Accumulating into the existing
# `count` would carry over the joint count obtained for the earlier cut-offs
# and inflate the result.
count = sum(ty & X2 > x)
count
## [1] 12
Pab = count/length(X2)
Pab
## [1] 0.6
# Compare with a numeric tolerance rather than exact floating-point equality.
isTRUE(all.equal(Pab, (A/length(X2))*(B/length(X2))))
## [1] FALSE
#perform the Chi Square Test for association
# The test is run on the 2x2 table of counts for the events A = {X > x} and
# B = {Y > y} (current cut-offs x and y). Passing the raw measurements to
# chisq.test() would treat every measurement as a cell count, which does not
# test association between the two events.
Matriz = table(A = X2 > x, B = Y2 > y)
Matriz
# With only 20 observations some expected cell counts are small, so
# chisq.test() may warn that the chi-squared approximation is unreliable.
chisq.test(Matriz)
# (printed output not recorded here; re-run to regenerate)
#getwd()
library(stats)
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Load the training data set from a hard-coded absolute path; text columns are
# kept as character vectors (stringsAsFactors = FALSE).
Traindata <- read.csv(file="C:/Users/ajb2/Documents/training.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
# Matrix of selected numeric columns.
# NOTE(review): train_subset is reassigned further down before any visible use of this value.
train_subset <- cbind(Traindata$MSSubClass, Traindata$LotArea, Traindata$OverallQual, Traindata$OverallCond, Traindata$YearBuilt, Traindata$MasVnrArea, Traindata$BsmtFinSF1, Traindata$BsmtFinSF2, Traindata$BsmtUnfSF, Traindata$X1stFlrSF, Traindata$X2ndFlrSF, Traindata$BedroomAbvGr, Traindata$KitchenAbvGr, Traindata$GarageYrBlt, Traindata$PoolArea)
# Preview the first 10 rows.
head(Traindata,10)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## 7 7 20 RL 75 10084 Pave <NA> Reg
## 8 8 60 RL NA 10382 Pave <NA> IR1
## 9 9 50 RM 51 6120 Pave <NA> Reg
## 10 10 190 RL 50 7420 Pave <NA> Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## 7 Lvl AllPub Inside Gtl Somerst Norm
## 8 Lvl AllPub Corner Gtl NWAmes PosN
## 9 Lvl AllPub Inside Gtl OldTown Artery
## 10 Lvl AllPub Corner Gtl BrkSide Artery
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## 7 Norm 1Fam 1Story 8 5 2004
## 8 Norm 1Fam 2Story 7 6 1973
## 9 Norm 1Fam 1.5Fin 7 5 1931
## 10 Artery 2fmCon 1.5Unf 5 6 1939
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## 7 2005 Gable CompShg VinylSd VinylSd Stone
## 8 1973 Gable CompShg HdBoard HdBoard Stone
## 9 1950 Gable CompShg BrkFace Wd Shng None
## 10 1950 Gable CompShg MetalSd MetalSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 1 196 Gd TA PConc Gd TA
## 2 0 TA TA CBlock Gd TA
## 3 162 Gd TA PConc Gd TA
## 4 0 TA TA BrkTil TA Gd
## 5 350 Gd TA PConc Gd TA
## 6 0 TA TA Wood Gd TA
## 7 186 Gd TA PConc Ex TA
## 8 240 TA TA CBlock Gd TA
## 9 0 TA TA BrkTil TA TA
## 10 0 TA TA BrkTil TA TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 1 No GLQ 706 Unf 0 150
## 2 Gd ALQ 978 Unf 0 284
## 3 Mn GLQ 486 Unf 0 434
## 4 No ALQ 216 Unf 0 540
## 5 Av GLQ 655 Unf 0 490
## 6 No GLQ 732 Unf 0 64
## 7 Av GLQ 1369 Unf 0 317
## 8 Mn ALQ 859 BLQ 32 216
## 9 No Unf 0 Unf 0 952
## 10 No GLQ 851 Unf 0 140
## TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 1 856 GasA Ex Y SBrkr 856 854
## 2 1262 GasA Ex Y SBrkr 1262 0
## 3 920 GasA Ex Y SBrkr 920 866
## 4 756 GasA Gd Y SBrkr 961 756
## 5 1145 GasA Ex Y SBrkr 1145 1053
## 6 796 GasA Ex Y SBrkr 796 566
## 7 1686 GasA Ex Y SBrkr 1694 0
## 8 1107 GasA Ex Y SBrkr 1107 983
## 9 952 GasA Gd Y FuseF 1022 752
## 10 991 GasA Ex Y SBrkr 1077 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1 0 1710 1 0 2 1
## 2 0 1262 0 1 2 0
## 3 0 1786 1 0 2 1
## 4 0 1717 1 0 1 0
## 5 0 2198 1 0 2 1
## 6 0 1362 1 0 1 1
## 7 0 1694 1 0 2 0
## 8 0 2090 1 0 2 1
## 9 0 1774 0 0 2 0
## 10 0 1077 1 0 1 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 3 1 Gd 8 Typ
## 2 3 1 TA 6 Typ
## 3 3 1 Gd 6 Typ
## 4 3 1 Gd 7 Typ
## 5 4 1 Gd 9 Typ
## 6 1 1 TA 5 Typ
## 7 3 1 Gd 7 Typ
## 8 3 1 TA 7 Typ
## 9 2 2 TA 8 Min1
## 10 2 2 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## 7 1 Gd Attchd 2004 RFn 2
## 8 2 TA Attchd 1973 RFn 2
## 9 2 TA Detchd 1931 Unf 2
## 10 2 TA Attchd 1939 RFn 1
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## 7 636 TA TA Y 255 57
## 8 484 TA TA Y 235 204
## 9 468 Fa TA Y 90 0
## 10 205 Gd TA Y 0 4
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## 7 0 0 0 0 <NA> <NA> <NA>
## 8 228 0 0 0 <NA> <NA> Shed
## 9 205 0 0 0 <NA> <NA> <NA>
## 10 0 0 0 0 <NA> <NA> <NA>
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
## 7 0 8 2007 WD Normal 307000
## 8 350 11 2009 WD Normal 200000
## 9 0 4 2008 WD Abnorml 129900
## 10 0 1 2008 WD Normal 118000
# Descriptive statistics for every column (presumably psych::describe, since
# psych is attached above). The coercion warnings and the NaN/Inf rows in the
# output below correspond to the character columns, which have no numeric values.
describe(Traindata)
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in describe(Traindata): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd median trimmed mad
## Id 1 1460 730.50 421.61 730.5 730.50 541.15
## MSSubClass 2 1460 56.90 42.30 50.0 49.15 44.48
## MSZoning* 3 1460 NaN NA NA NaN NA
## LotFrontage 4 1201 70.05 24.28 69.0 68.94 16.31
## LotArea 5 1460 10516.83 9981.26 9478.5 9563.28 2962.23
## Street* 6 1460 NaN NA NA NaN NA
## Alley* 7 91 NaN NA NA NaN NA
## LotShape* 8 1460 NaN NA NA NaN NA
## LandContour* 9 1460 NaN NA NA NaN NA
## Utilities* 10 1460 NaN NA NA NaN NA
## LotConfig* 11 1460 NaN NA NA NaN NA
## LandSlope* 12 1460 NaN NA NA NaN NA
## Neighborhood* 13 1460 NaN NA NA NaN NA
## Condition1* 14 1460 NaN NA NA NaN NA
## Condition2* 15 1460 NaN NA NA NaN NA
## BldgType* 16 1460 NaN NA NA NaN NA
## HouseStyle* 17 1460 NaN NA NA NaN NA
## OverallQual 18 1460 6.10 1.38 6.0 6.08 1.48
## OverallCond 19 1460 5.58 1.11 5.0 5.48 0.00
## YearBuilt 20 1460 1971.27 30.20 1973.0 1974.13 37.06
## YearRemodAdd 21 1460 1984.87 20.65 1994.0 1986.37 19.27
## RoofStyle* 22 1460 NaN NA NA NaN NA
## RoofMatl* 23 1460 NaN NA NA NaN NA
## Exterior1st* 24 1460 NaN NA NA NaN NA
## Exterior2nd* 25 1460 NaN NA NA NaN NA
## MasVnrType* 26 1452 NaN NA NA NaN NA
## MasVnrArea 27 1452 103.69 181.07 0.0 63.15 0.00
## ExterQual* 28 1460 NaN NA NA NaN NA
## ExterCond* 29 1460 NaN NA NA NaN NA
## Foundation* 30 1460 NaN NA NA NaN NA
## BsmtQual* 31 1423 NaN NA NA NaN NA
## BsmtCond* 32 1423 NaN NA NA NaN NA
## BsmtExposure* 33 1422 NaN NA NA NaN NA
## BsmtFinType1* 34 1423 NaN NA NA NaN NA
## BsmtFinSF1 35 1460 443.64 456.10 383.5 386.08 568.58
## BsmtFinType2* 36 1422 NaN NA NA NaN NA
## BsmtFinSF2 37 1460 46.55 161.32 0.0 1.38 0.00
## BsmtUnfSF 38 1460 567.24 441.87 477.5 519.29 426.99
## TotalBsmtSF 39 1460 1057.43 438.71 991.5 1036.70 347.67
## Heating* 40 1460 NaN NA NA NaN NA
## HeatingQC* 41 1460 NaN NA NA NaN NA
## CentralAir* 42 1460 NaN NA NA NaN NA
## Electrical* 43 1459 NaN NA NA NaN NA
## X1stFlrSF 44 1460 1162.63 386.59 1087.0 1129.99 347.67
## X2ndFlrSF 45 1460 346.99 436.53 0.0 285.36 0.00
## LowQualFinSF 46 1460 5.84 48.62 0.0 0.00 0.00
## GrLivArea 47 1460 1515.46 525.48 1464.0 1467.67 483.33
## BsmtFullBath 48 1460 0.43 0.52 0.0 0.39 0.00
## BsmtHalfBath 49 1460 0.06 0.24 0.0 0.00 0.00
## FullBath 50 1460 1.57 0.55 2.0 1.56 0.00
## HalfBath 51 1460 0.38 0.50 0.0 0.34 0.00
## BedroomAbvGr 52 1460 2.87 0.82 3.0 2.85 0.00
## KitchenAbvGr 53 1460 1.05 0.22 1.0 1.00 0.00
## KitchenQual* 54 1460 NaN NA NA NaN NA
## TotRmsAbvGrd 55 1460 6.52 1.63 6.0 6.41 1.48
## Functional* 56 1460 NaN NA NA NaN NA
## Fireplaces 57 1460 0.61 0.64 1.0 0.53 1.48
## FireplaceQu* 58 770 NaN NA NA NaN NA
## GarageType* 59 1379 NaN NA NA NaN NA
## GarageYrBlt 60 1379 1978.51 24.69 1980.0 1981.07 31.13
## GarageFinish* 61 1379 NaN NA NA NaN NA
## GarageCars 62 1460 1.77 0.75 2.0 1.77 0.00
## GarageArea 63 1460 472.98 213.80 480.0 469.81 177.91
## GarageQual* 64 1379 NaN NA NA NaN NA
## GarageCond* 65 1379 NaN NA NA NaN NA
## PavedDrive* 66 1460 NaN NA NA NaN NA
## WoodDeckSF 67 1460 94.24 125.34 0.0 71.76 0.00
## OpenPorchSF 68 1460 46.66 66.26 25.0 33.23 37.06
## EnclosedPorch 69 1460 21.95 61.12 0.0 3.87 0.00
## X3SsnPorch 70 1460 3.41 29.32 0.0 0.00 0.00
## ScreenPorch 71 1460 15.06 55.76 0.0 0.00 0.00
## PoolArea 72 1460 2.76 40.18 0.0 0.00 0.00
## PoolQC* 73 7 NaN NA NA NaN NA
## Fence* 74 281 NaN NA NA NaN NA
## MiscFeature* 75 54 NaN NA NA NaN NA
## MiscVal 76 1460 43.49 496.12 0.0 0.00 0.00
## MoSold 77 1460 6.32 2.70 6.0 6.25 2.97
## YrSold 78 1460 2007.82 1.33 2008.0 2007.77 1.48
## SaleType* 79 1460 NaN NA NA NaN NA
## SaleCondition* 80 1460 NaN NA NA NaN NA
## SalePrice 81 1460 180921.20 79442.50 163000.0 170783.29 56338.80
## min max range skew kurtosis se
## Id 1 1460 1459 0.00 -1.20 11.03
## MSSubClass 20 190 170 1.40 1.56 1.11
## MSZoning* Inf -Inf -Inf NA NA NA
## LotFrontage 21 313 292 2.16 17.34 0.70
## LotArea 1300 215245 213945 12.18 202.26 261.22
## Street* Inf -Inf -Inf NA NA NA
## Alley* Inf -Inf -Inf NA NA NA
## LotShape* Inf -Inf -Inf NA NA NA
## LandContour* Inf -Inf -Inf NA NA NA
## Utilities* Inf -Inf -Inf NA NA NA
## LotConfig* Inf -Inf -Inf NA NA NA
## LandSlope* Inf -Inf -Inf NA NA NA
## Neighborhood* Inf -Inf -Inf NA NA NA
## Condition1* Inf -Inf -Inf NA NA NA
## Condition2* Inf -Inf -Inf NA NA NA
## BldgType* Inf -Inf -Inf NA NA NA
## HouseStyle* Inf -Inf -Inf NA NA NA
## OverallQual 1 10 9 0.22 0.09 0.04
## OverallCond 1 9 8 0.69 1.09 0.03
## YearBuilt 1872 2010 138 -0.61 -0.45 0.79
## YearRemodAdd 1950 2010 60 -0.50 -1.27 0.54
## RoofStyle* Inf -Inf -Inf NA NA NA
## RoofMatl* Inf -Inf -Inf NA NA NA
## Exterior1st* Inf -Inf -Inf NA NA NA
## Exterior2nd* Inf -Inf -Inf NA NA NA
## MasVnrType* Inf -Inf -Inf NA NA NA
## MasVnrArea 0 1600 1600 2.66 10.03 4.75
## ExterQual* Inf -Inf -Inf NA NA NA
## ExterCond* Inf -Inf -Inf NA NA NA
## Foundation* Inf -Inf -Inf NA NA NA
## BsmtQual* Inf -Inf -Inf NA NA NA
## BsmtCond* Inf -Inf -Inf NA NA NA
## BsmtExposure* Inf -Inf -Inf NA NA NA
## BsmtFinType1* Inf -Inf -Inf NA NA NA
## BsmtFinSF1 0 5644 5644 1.68 11.06 11.94
## BsmtFinType2* Inf -Inf -Inf NA NA NA
## BsmtFinSF2 0 1474 1474 4.25 20.01 4.22
## BsmtUnfSF 0 2336 2336 0.92 0.46 11.56
## TotalBsmtSF 0 6110 6110 1.52 13.18 11.48
## Heating* Inf -Inf -Inf NA NA NA
## HeatingQC* Inf -Inf -Inf NA NA NA
## CentralAir* Inf -Inf -Inf NA NA NA
## Electrical* Inf -Inf -Inf NA NA NA
## X1stFlrSF 334 4692 4358 1.37 5.71 10.12
## X2ndFlrSF 0 2065 2065 0.81 -0.56 11.42
## LowQualFinSF 0 572 572 8.99 82.83 1.27
## GrLivArea 334 5642 5308 1.36 4.86 13.75
## BsmtFullBath 0 3 3 0.59 -0.84 0.01
## BsmtHalfBath 0 2 2 4.09 16.31 0.01
## FullBath 0 3 3 0.04 -0.86 0.01
## HalfBath 0 2 2 0.67 -1.08 0.01
## BedroomAbvGr 0 8 8 0.21 2.21 0.02
## KitchenAbvGr 0 3 3 4.48 21.42 0.01
## KitchenQual* Inf -Inf -Inf NA NA NA
## TotRmsAbvGrd 2 14 12 0.67 0.87 0.04
## Functional* Inf -Inf -Inf NA NA NA
## Fireplaces 0 3 3 0.65 -0.22 0.02
## FireplaceQu* Inf -Inf -Inf NA NA NA
## GarageType* Inf -Inf -Inf NA NA NA
## GarageYrBlt 1900 2010 110 -0.65 -0.42 0.66
## GarageFinish* Inf -Inf -Inf NA NA NA
## GarageCars 0 4 4 -0.34 0.21 0.02
## GarageArea 0 1418 1418 0.18 0.90 5.60
## GarageQual* Inf -Inf -Inf NA NA NA
## GarageCond* Inf -Inf -Inf NA NA NA
## PavedDrive* Inf -Inf -Inf NA NA NA
## WoodDeckSF 0 857 857 1.54 2.97 3.28
## OpenPorchSF 0 547 547 2.36 8.44 1.73
## EnclosedPorch 0 552 552 3.08 10.37 1.60
## X3SsnPorch 0 508 508 10.28 123.06 0.77
## ScreenPorch 0 480 480 4.11 18.34 1.46
## PoolArea 0 738 738 14.80 222.19 1.05
## PoolQC* Inf -Inf -Inf NA NA NA
## Fence* Inf -Inf -Inf NA NA NA
## MiscFeature* Inf -Inf -Inf NA NA NA
## MiscVal 0 15500 15500 24.43 697.64 12.98
## MoSold 1 12 11 0.21 -0.41 0.07
## YrSold 2006 2010 4 0.10 -1.19 0.03
## SaleType* Inf -Inf -Inf NA NA NA
## SaleCondition* Inf -Inf -Inf NA NA NA
## SalePrice 34900 755000 720100 1.88 6.50 2079.11
# Histogram of SalePrice: geom_bar with stat="bin" bins the values; the bin
# count is left at the default (30, per the message echoed below).
ggplot(Traindata, aes(x=SalePrice))+
geom_bar(stat="bin", fill="steelblue")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram for log(SalePrice).
ggplot(Traindata, aes(x=log(SalePrice)))+
geom_bar(stat="bin", fill="steelblue")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Numeric matrix of four candidate predictors plus the response. Columns are in
# the order LotArea, TotRmsAbvGrd, OverallQual, X1stFlrSF, SalePrice; they are
# still unnamed here, which is why summary() labels them V1..V5.
housevars <- cbind(Traindata$LotArea, Traindata$TotRmsAbvGrd, Traindata$OverallQual, Traindata$X1stFlrSF, Traindata$SalePrice)
summary(housevars)
## V1 V2 V3 V4
## Min. : 1300 Min. : 2.000 Min. : 1.000 Min. : 334
## 1st Qu.: 7554 1st Qu.: 5.000 1st Qu.: 5.000 1st Qu.: 882
## Median : 9478 Median : 6.000 Median : 6.000 Median :1087
## Mean : 10517 Mean : 6.518 Mean : 6.099 Mean :1163
## 3rd Qu.: 11602 3rd Qu.: 7.000 3rd Qu.: 7.000 3rd Qu.:1391
## Max. :215245 Max. :14.000 Max. :10.000 Max. :4692
## V5
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Label the columns of housevars so later output (e.g. cor) is readable.
colnames(housevars) = c("LotArea", "TotRmsAbvGrd", "OverallQual", "X1stFlrSF" , "SalePrice")
# Scatterplots with a least-squares line. plot(x, y) puts its first argument on
# the horizontal axis, so the line passed to abline() must come from regressing
# the vertical-axis variable on the horizontal-axis variable (y ~ x); the
# reverse regression yields an intercept and slope in the wrong coordinates.
plot(Traindata$TotRmsAbvGrd,Traindata$LotArea,main='Scatterplot', xlab="Total Rooms Above Ground", ylab="Lot Area", col=2)
abline(lm(LotArea ~ TotRmsAbvGrd, data = Traindata),col=4)
plot(Traindata$TotRmsAbvGrd,Traindata$SalePrice, main='Scatterplot', xlab="Total Rooms Above Ground", ylab="Sale Price", col=6)
abline(lm(SalePrice ~ TotRmsAbvGrd, data = Traindata),col=3)
plot(Traindata$LotArea,Traindata$SalePrice, main='Scatterplot', xlab="Lot Area", ylab="Sale Price", col=5, xlim=c(0,20000))
abline(lm(SalePrice ~ LotArea, data = Traindata),col=1)
plot(Traindata$OverallQual,Traindata$SalePrice, main='Scatterplot', xlab="Overall Quality", ylab="Sale Price", col=3)
abline(lm(SalePrice ~ OverallQual, data = Traindata),col=1)
# Pairwise Pearson correlations between the selected variables.
cor(housevars)
## LotArea TotRmsAbvGrd OverallQual X1stFlrSF SalePrice
## LotArea 1.0000000 0.1900148 0.1058057 0.2994746 0.2638434
## TotRmsAbvGrd 0.1900148 1.0000000 0.4274523 0.4095160 0.5337232
## OverallQual 0.1058057 0.4274523 1.0000000 0.4762238 0.7909816
## X1stFlrSF 0.2994746 0.4095160 0.4762238 1.0000000 0.6058522
## SalePrice 0.2638434 0.5337232 0.7909816 0.6058522 1.0000000
# Test H0: zero correlation between LotArea and SalePrice, reporting an 80% confidence interval.
cor.test(Traindata$LotArea, Traindata$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: Traindata$LotArea and Traindata$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
# Test H0: zero correlation between TotRmsAbvGrd and SalePrice, reporting an 80% confidence interval.
cor.test(Traindata$TotRmsAbvGrd, Traindata$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: Traindata$TotRmsAbvGrd and Traindata$SalePrice
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5092841 0.5573021
## sample estimates:
## cor
## 0.5337232
# Test H0: zero correlation between OverallQual and SalePrice, reporting an 80% confidence interval.
cor.test(Traindata$OverallQual, Traindata$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: Traindata$OverallQual and Traindata$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780752 0.8032204
## sample estimates:
## cor
## 0.7909816
Yes, I would be. Running several tests increases the chance of erroneously rejecting at least one true null hypothesis (a Type I error), so to be more confident in the conclusions I would use a stricter confidence level of 90% or even 95%.
library(Matrix)
# Correlation matrix of three predictors; rows/columns are in the order
# TotRmsAbvGrd, OverallQual, X1stFlrSF (the cbind leaves them unnamed).
chosen_vars <- cbind(Traindata$TotRmsAbvGrd, Traindata$OverallQual, Traindata$X1stFlrSF)
corr_mat = cor(chosen_vars)
corr_mat
## [,1] [,2] [,3]
## [1,] 1.0000000 0.4274523 0.4095160
## [2,] 0.4274523 1.0000000 0.4762238
## [3,] 0.4095160 0.4762238 1.0000000
# Determinant of the correlation matrix; it is non-zero, so the matrix can be inverted.
det(corr_mat)
## [1] 0.5895166
# Inverse of the correlation matrix (the precision matrix).
precision_mat = solve(corr_mat)
precision_mat
## [,1] [,2] [,3]
## [1,] 1.3116015 -0.3942740 -0.3493591
## [2,] -0.3942740 1.4118290 -0.5108851
## [3,] -0.3493591 -0.5108851 1.3863638
# Multiply the correlation matrix by its inverse in both orders; each product
# is the identity matrix up to floating-point rounding.
id_mat = corr_mat %*% precision_mat
id_mat
## [,1] [,2] [,3]
## [1,] 1 2.775558e-17 0
## [2,] 0 1.000000e+00 0
## [3,] 0 -1.110223e-16 1
id2_mat = precision_mat %*% corr_mat
id2_mat
## [,1] [,2] [,3]
## [1,] 1.000000e+00 1.110223e-16 0.000000e+00
## [2,] -8.326673e-17 1.000000e+00 -1.110223e-16
## [3,] 0.000000e+00 0.000000e+00 1.000000e+00
# LU decomposition of the (transposed) correlation matrix; expand() exposes the
# factors as components L, U and P.
ex <- expand(lu(t(corr_mat)))
L <- ex$L
P <- ex$P  # extracted but not used again in the visible code
# Pack both factors into one matrix: start from U, then fill the strictly
# lower triangle with the corresponding entries of L.
C <- U <- ex$U
C[lower.tri(U)] <- L[lower.tri(L)]
print(C)
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.4274523 0.4095160
## [2,] 0.4274523 0.8172845 0.3011753
## [3,] 0.4095160 0.3685073 0.7213114
require(MASS)
## Loading required package: MASS
# Distribution of sale price rescaled to units of $100,000: density histogram and boxplot.
hist(Traindata$SalePrice/100000, main="histogram",xlab="Sales Price (in $100,000)",freq=FALSE)
boxplot(Traindata$SalePrice/100000, col='blue',main="Sale Price")
# Fit an exponential distribution to the rescaled prices; the printed value is
# the estimated rate with its standard error in parentheses.
fitdistr(Traindata$SalePrice/100000,"exponential")
## rate
## 0.55272684
## (0.01446552)
# data generation, $\lambda$ = 0.55
ex <- rexp(1000, rate = 0.55) # generate some exponential distribution
hist(ex, main="histogram",xlab="Sales Price (in $100,000)",freq=FALSE)
# Percentiles of the fitted exponential come from the quantile function qexp()
# (probability -> value). pexp() goes the other way (value -> cumulative
# probability), so it can only locate a percentile by trial and error.
fiftile = qexp(0.05, rate = 0.55)
fiftile
## [1] 0.09326054
#5th percentile is about 0.093 (~$9,300 houses)
Ninefiftile = qexp(0.95, rate = 0.55)
Ninefiftile
## [1] 5.446786
#95th percentile is about 5.45 (~$545,000 houses)
# 95% confidence interval for the mean sale price (units of $100,000) using the
# normal approximation: mean +/- z * sd / sqrt(n).
n = length(Traindata$SalePrice)  # sample size taken from the data rather than hard-coded
me <- qnorm(0.975)*(sd(Traindata$SalePrice/100000)/sqrt(n))
mean(Traindata$SalePrice/100000) - me
## [1] 1.768462
mean(Traindata$SalePrice/100000) + me
## [1] 1.849962
# Empirical 5th and 95th percentiles of the observed sale prices, for
# comparison with the percentiles of the fitted exponential.
emp_fif = quantile(Traindata$SalePrice/100000,0.05)
emp_fif
## 5%
## 0.88
emp_Ninefif = quantile(Traindata$SalePrice/100000,0.95)
emp_Ninefif
## 95%
## 3.261
#To build a simplified and more realistic model, keep only the predictors (independent variables) that are statistically significant (p < 0.05)
#attach(Traindata)
train_subset <- cbind(Traindata$MSSubClass, Traindata$LotArea, Traindata$OverallQual, Traindata$OverallCond, Traindata$YearBuilt, Traindata$MasVnrArea, Traindata$BsmtFinSF1, Traindata$X1stFlrSF, Traindata$X2ndFlrSF, Traindata$BedroomAbvGr, Traindata$GarageYrBlt)
head(train_subset)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] 60 8450 7 5 2003 196 706 856 854 3 2003
## [2,] 20 9600 6 8 1976 0 978 1262 0 3 1976
## [3,] 60 11250 7 5 2001 162 486 920 866 3 2001
## [4,] 70 9550 7 5 1915 0 216 961 756 3 1998
## [5,] 60 14260 8 5 2000 350 655 1145 1053 4 2000
## [6,] 50 14115 5 5 1993 0 732 796 566 1 1993
# Fit with bare column names and `data =` so that predict() can substitute the
# test-set columns through `newdata`. A formula written with `Traindata$...`
# always re-reads the training columns, so predict() ignores `newdata` and
# returns fitted values for the training rows instead.
train_model <- lm(SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF + X2ndFlrSF + BedroomAbvGr + GarageYrBlt, data = Traindata)
summary(train_model)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + X1stFlrSF +
## X2ndFlrSF + BedroomAbvGr + GarageYrBlt, data = Traindata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -549022 -16701 -2070 13787 254449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.174e+06 1.036e+05 -11.332 < 2e-16 ***
## MSSubClass -2.128e+02 2.667e+01 -7.978 3.12e-15 ***
## LotArea 5.480e-01 1.030e-01 5.322 1.20e-07 ***
## OverallQual 2.217e+04 1.178e+03 18.817 < 2e-16 ***
## OverallCond 5.851e+03 1.019e+03 5.740 1.17e-08 ***
## YearBuilt 3.591e+02 6.609e+01 5.433 6.56e-08 ***
## MasVnrArea 3.100e+01 6.103e+00 5.080 4.30e-07 ***
## BsmtFinSF1 1.672e+01 2.508e+00 6.667 3.77e-11 ***
## X1stFlrSF 7.152e+01 3.963e+00 18.049 < 2e-16 ***
## X2ndFlrSF 6.236e+01 3.473e+00 17.953 < 2e-16 ***
## BedroomAbvGr -7.713e+03 1.631e+03 -4.729 2.49e-06 ***
## GarageYrBlt 1.983e+02 7.194e+01 2.756 0.00594 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36310 on 1359 degrees of freedom
## (89 observations deleted due to missingness)
## Multiple R-squared: 0.7898, Adjusted R-squared: 0.7881
## F-statistic: 464.2 on 11 and 1359 DF, p-value: < 2.2e-16
#Load the test dataset
Testdata <- read.csv(file="C:/Users/ajb2/Documents/test.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
test_subset <- data.frame(Testdata$MSSubClass, Testdata$LotArea, Testdata$OverallQual, Testdata$OverallCond, Testdata$YearBuilt, Testdata$MasVnrArea, Testdata$BsmtFinSF1, Testdata$X1stFlrSF, Testdata$X2ndFlrSF, Testdata$BedroomAbvGr, Testdata$GarageYrBlt)
head(test_subset)
## Testdata.MSSubClass Testdata.LotArea Testdata.OverallQual
## 1 20 11622 5
## 2 20 14267 6
## 3 60 13830 5
## 4 60 9978 6
## 5 120 5005 8
## 6 60 10000 6
## Testdata.OverallCond Testdata.YearBuilt Testdata.MasVnrArea
## 1 6 1961 0
## 2 6 1958 108
## 3 5 1997 0
## 4 6 1998 20
## 5 5 1992 0
## 6 5 1993 0
## Testdata.BsmtFinSF1 Testdata.X1stFlrSF Testdata.X2ndFlrSF
## 1 468 896 0
## 2 923 1329 0
## 3 791 928 701
## 4 602 926 678
## 5 263 1280 0
## 6 0 763 892
## Testdata.BedroomAbvGr Testdata.GarageYrBlt
## 1 2 1961
## 2 3 1958
## 3 3 1997
## 4 3 1998
## 5 2 1992
## 6 3 1993
#predict home prices based on the multiple regression model built above
# Strip the "Testdata." prefix so the column names match the model's predictors.
names(test_subset) <- sub("^Testdata\\.", "", names(test_subset))
test_results = predict(train_model, newdata=test_subset, type="response")
summary(test_results)
# (the previously recorded summary is removed: it described predictions for the
# 1460 training rows, as the row-count warning showed; re-run to regenerate)
#compare the distribution of the model's predicted sale prices (test_results) with the observed sale prices in the training data
hist(test_results, col='green', main="Predicted Test Data Sale Price")
hist(Traindata$SalePrice, col='green', main="Training Data Sale Price")
boxplot(test_results, col='pink',main="Predicted Test Data Sale Price")
boxplot(Traindata$SalePrice, col='pink',main="Training Sale Price")
# Export of the predictions is left disabled.
#write.csv(test_results, file = "test_results.csv")