if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, mice, e1071, Metrics, Matrix, MASS, skimr, corrplot, DataExplorer)
# Download the training data and print a column-by-column overview
# (name, type, and first few values of each variable).
house_data <- read.csv('https://raw.githubusercontent.com/mjdacs/data605/master/train.csv', header = TRUE)
glimpse(house_data)
## Observations: 1,460
## Variables: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60,...
## $ MSZoning <fctr> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL,...
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, ...
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10...
## $ Street <fctr> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave,...
## $ Alley <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ LotShape <fctr> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Re...
## $ LandContour <fctr> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lv...
## $ Utilities <fctr> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub,...
## $ LotConfig <fctr> Inside, FR2, Inside, Corner, FR2, Inside, Insid...
## $ LandSlope <fctr> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gt...
## $ Neighborhood <fctr> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mi...
## $ Condition1 <fctr> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN...
## $ Condition2 <fctr> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm,...
## $ BldgType <fctr> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam,...
## $ HouseStyle <fctr> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin,...
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, ...
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, ...
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, ...
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, ...
## $ RoofStyle <fctr> Gable, Gable, Gable, Gable, Gable, Gable, Gable...
## $ RoofMatl <fctr> CompShg, CompShg, CompShg, CompShg, CompShg, Co...
## $ Exterior1st <fctr> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, Vi...
## $ Exterior2nd <fctr> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, Vi...
## $ MasVnrType <fctr> BrkFace, None, BrkFace, None, BrkFace, None, St...
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, ...
## $ ExterQual <fctr> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex,...
## $ ExterCond <fctr> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ Foundation <fctr> PConc, CBlock, PConc, BrkTil, PConc, Wood, PCon...
## $ BsmtQual <fctr> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex,...
## $ BsmtCond <fctr> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ BsmtExposure <fctr> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No,...
## $ BsmtFinType1 <fctr> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GL...
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851,...
## $ BsmtFinType2 <fctr> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Un...
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140,...
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952,...
## $ Heating <fctr> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA,...
## $ HeatingQC <fctr> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex,...
## $ CentralAir <fctr> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ Electrical <fctr> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr...
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022...
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, ...
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, ...
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, ...
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, ...
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, ...
## $ KitchenQual <fctr> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex,...
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5,...
## $ Functional <fctr> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, T...
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, ...
## $ FireplaceQu <fctr> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd,...
## $ GarageType <fctr> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd,...
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, ...
## $ GarageFinish <fctr> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RF...
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, ...
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205...
## $ GarageQual <fctr> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA,...
## $ GarageCond <fctr> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ PavedDrive <fctr> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, ...
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, ...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, ...
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0...
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PoolQC <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Fence <fctr> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, ...
## $ MiscFeature <fctr> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA,...
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0,...
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, ...
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, ...
## $ SaleType <fctr> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New...
## $ SaleCondition <fctr> Normal, Normal, Normal, Abnorml, Normal, Normal...
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, ...
Pick one of the quantitative variables from the training set, and define that variable x. Make sure this variable is skewed to the right!
# Histogram of unfinished basement area, used to check the direction of skew.
hist(house_data$BsmtUnfSF)
We see that BsmtUnfSF (unfinished basement area, in square feet) has a right skew. Additionally, SalePrice is our chosen dependent variable.
# X is the right-skewed independent variable; Y is the dependent variable.
X <- house_data$BsmtUnfSF
Y <- house_data$SalePrice
Calculate as a minimum the below probabilities a through c. Assume the small letter ‘x’ is estimated as the 1st quartile of the X variable, and the small letter ‘y’ is the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
We run a summary of BsmtUnfSF to confirm the right skew.
# Minimum, quartiles, mean, and maximum of X (BsmtUnfSF).
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 223.0 477.5 567.2 808.0 2336.0
Since the Mean > Median, it confirms the right skew.
We also run a summary of the dependent variable SalePrice, or Y.
# Minimum, quartiles, mean, and maximum of Y (SalePrice).
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
We first ask: what is the probability that X is greater than its first quartile, given that Y is greater than its first quartile?
# Quartiles for BsmtUnfSF (X).
# quantile() is called directly instead of indexing summary() by position:
# the thresholds are then exact on every R version (summary() rounded its
# values to 4 significant digits before R 3.4.0) and do not depend on the
# layout of the summary vector. na.rm = TRUE mirrors summary(), which ignores
# missing values; names = FALSE returns plain numeric thresholds.
q1_x <- quantile(X, probs = 0.25, na.rm = TRUE, names = FALSE)
q2_x <- quantile(X, probs = 0.50, na.rm = TRUE, names = FALSE)
q3_x <- quantile(X, probs = 0.75, na.rm = TRUE, names = FALSE)
# Quartiles for SalePrice (Y).
q1_y <- quantile(Y, probs = 0.25, na.rm = TRUE, names = FALSE)
q2_y <- quantile(Y, probs = 0.50, na.rm = TRUE, names = FALSE)
q3_y <- quantile(Y, probs = 0.75, na.rm = TRUE, names = FALSE)
# Number of homes strictly above the first quartile of each variable.
# Note that x and y hold row counts, not the quartile values themselves.
x <- house_data %>%
  filter(BsmtUnfSF > q1_x) %>%
  nrow()
y <- house_data %>%
  filter(SalePrice > q1_y) %>%
  nrow()
Here we start to construct the probability table.
# Joint counts of BsmtUnfSF and SalePrice relative to their first quartiles.
# Each count is the number of rows whose condition is TRUE; rows where the
# comparison is NA are excluded (na.rm = TRUE), as filter() would drop them.
#
# Naming note: as.data.frame() turns each vector of `t1` into a COLUMN, so
# the "top" vector (SalePrice <= Q1) is displayed as the first column, the
# "mid" vector (SalePrice > Q1) as the second, and the "bot" vector (totals
# over SalePrice) as the third. The displayed rows are therefore the
# BsmtUnfSF categories.
top_left <- with(
  house_data,
  sum(BsmtUnfSF <= q1_x & SalePrice <= q1_y, na.rm = TRUE)
)
top_mid <- with(
  house_data,
  sum(BsmtUnfSF > q1_x & SalePrice <= q1_y, na.rm = TRUE)
)
mid_left <- with(
  house_data,
  sum(BsmtUnfSF <= q1_x & SalePrice > q1_y, na.rm = TRUE)
)
mid_mid <- with(
  house_data,
  sum(BsmtUnfSF > q1_x & SalePrice > q1_y, na.rm = TRUE)
)

# Marginal totals.
top_right <- top_left + top_mid
mid_right <- mid_left + mid_mid
bot_left <- top_left + mid_left
bot_mid <- top_mid + mid_mid
bot_right <- top_right + mid_right

t1 <- list(
  c(top_left, top_mid, top_right),
  c(mid_left, mid_mid, mid_right),
  c(bot_left, bot_mid, bot_right)
)
df <- as.data.frame(
  t1,
  row.names = c("<=1st q", ">1st q", "Total"),
  col.names = c("leq_1st quartile", "grtr_1st quartile", "Total")
)
df
## leq_1st.quartile grtr_1st.quartile Total
## <=1st q 112 254 366
## >1st q 253 841 1094
## Total 365 1095 1460
\(P(X>x~~ | ~~Y>y) = 841/1095 = 0.7680\)
\(P(X>x, Y>y) = 841/1460 = 0.5760\)
\(P(X \le x~~ | ~~Y>y) = 254/1095 = 0.2320\)
Does splitting the training data in this fashion make them independent?
Let \(A\) be the new variable counting those observations above the 1st quartile for \(X\), and let \(B\) be the new variable counting those observations above the 1st quartile for \(Y\). Does \(P(AB)=P(A)P(B)\)? Check mathematically, and then evaluate by running a Chi Square test for association.
# P(A): share of homes whose BsmtUnfSF is above its first quartile.
A <- x / nrow(house_data)
# P(B): share of homes whose SalePrice is above its first quartile.
B <- y / nrow(house_data)
# P(AB): share of homes above BOTH first quartiles. The denominator must be
# the number of homes (nrow(house_data)); `df` is the 3-row summary table, so
# dividing by nrow(df) does not yield a probability.
AB <- (house_data %>% filter(BsmtUnfSF > q1_x, SalePrice > q1_y) %>% nrow()) /
  nrow(house_data)
# Independence would require P(AB) == P(A) * P(B); print both for comparison.
AB
## [1] 0.5760274
A * B
## [1] 0.5619863
Because \(P(AB) \neq P(A)P(B)\), the results suggest the variables are not independent.
# Chi-square test of association between the two binary indicators A and B
# (above / not above the first quartile). The test must run on this 2 x 2
# table: cross-tabulating the raw BsmtUnfSF and SalePrice values produces a
# huge, sparse table with almost every cell count at 0 or 1, for which the
# chi-square approximation is not valid.
chi_tab <- table(
  X_above_q1 = house_data$BsmtUnfSF > quantile(house_data$BsmtUnfSF, 0.25),
  Y_above_q1 = house_data$SalePrice > quantile(house_data$SalePrice, 0.25)
)
chisq.test(chi_tab)
##
## Pearson's Chi-squared test
##
## data: chi_tab
## X-squared = 532040, df = 515700, p-value < 2.2e-16
With a chi-squared statistic of 532040 and a p-value below 2.2e-16, we reject the null hypothesis that the variables are independent.
Provide univariate descriptive statistics and appropriate plots for the training data set.
Lets run a summary on the dataframe
# Per-variable summary (missing counts, distribution statistics) of the
# whole training data frame.
skim(house_data)
## Skim summary statistics
## n obs: 1460
## n variables: 81
##
## Variable type: factor
## variable missing complete n n_unique
## Alley 1369 91 1460 2
## BldgType 0 1460 1460 5
## BsmtCond 37 1423 1460 4
## BsmtExposure 38 1422 1460 4
## BsmtFinType1 37 1423 1460 6
## BsmtFinType2 38 1422 1460 6
## BsmtQual 37 1423 1460 4
## CentralAir 0 1460 1460 2
## Condition1 0 1460 1460 9
## Condition2 0 1460 1460 8
## Electrical 1 1459 1460 5
## ExterCond 0 1460 1460 5
## Exterior1st 0 1460 1460 15
## Exterior2nd 0 1460 1460 16
## ExterQual 0 1460 1460 4
## Fence 1179 281 1460 4
## FireplaceQu 690 770 1460 5
## Foundation 0 1460 1460 6
## Functional 0 1460 1460 7
## GarageCond 81 1379 1460 5
## GarageFinish 81 1379 1460 3
## GarageQual 81 1379 1460 5
## GarageType 81 1379 1460 6
## Heating 0 1460 1460 6
## HeatingQC 0 1460 1460 5
## HouseStyle 0 1460 1460 8
## KitchenQual 0 1460 1460 4
## LandContour 0 1460 1460 4
## LandSlope 0 1460 1460 3
## LotConfig 0 1460 1460 5
## LotShape 0 1460 1460 4
## MasVnrType 8 1452 1460 4
## MiscFeature 1406 54 1460 4
## MSZoning 0 1460 1460 5
## Neighborhood 0 1460 1460 25
## PavedDrive 0 1460 1460 3
## PoolQC 1453 7 1460 3
## RoofMatl 0 1460 1460 8
## RoofStyle 0 1460 1460 6
## SaleCondition 0 1460 1460 6
## SaleType 0 1460 1460 9
## Street 0 1460 1460 2
## Utilities 0 1460 1460 2
## top_counts ordered
## NA: 1369, Grv: 50, Pav: 41 FALSE
## 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43 FALSE
## TA: 1311, Gd: 65, Fa: 45, NA: 37 FALSE
## No: 953, Av: 221, Gd: 134, Mn: 114 FALSE
## Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148 FALSE
## Unf: 1256, Rec: 54, LwQ: 46, NA: 38 FALSE
## TA: 649, Gd: 618, Ex: 121, NA: 37 FALSE
## Y: 1365, N: 95, NA: 0 FALSE
## Nor: 1260, Fee: 81, Art: 48, RRA: 26 FALSE
## Nor: 1445, Fee: 6, Art: 2, Pos: 2 FALSE
## SBr: 1334, Fus: 94, Fus: 27, Fus: 3 FALSE
## TA: 1282, Gd: 146, Fa: 28, Ex: 3 FALSE
## Vin: 515, HdB: 222, Met: 220, Wd : 206 FALSE
## Vin: 504, Met: 214, HdB: 207, Wd : 197 FALSE
## TA: 906, Gd: 488, Ex: 52, Fa: 14 FALSE
## NA: 1179, MnP: 157, GdP: 59, GdW: 54 FALSE
## NA: 690, Gd: 380, TA: 313, Fa: 33 FALSE
## PCo: 647, CBl: 634, Brk: 146, Sla: 24 FALSE
## Typ: 1360, Min: 34, Min: 31, Mod: 15 FALSE
## TA: 1326, NA: 81, Fa: 35, Gd: 9 FALSE
## Unf: 605, RFn: 422, Fin: 352, NA: 81 FALSE
## TA: 1311, NA: 81, Fa: 48, Gd: 14 FALSE
## Att: 870, Det: 387, Bui: 88, NA: 81 FALSE
## Gas: 1428, Gas: 18, Gra: 7, Wal: 4 FALSE
## Ex: 741, TA: 428, Gd: 241, Fa: 49 FALSE
## 1St: 726, 2St: 445, 1.5: 154, SLv: 65 FALSE
## TA: 735, Gd: 586, Ex: 100, Fa: 39 FALSE
## Lvl: 1311, Bnk: 63, HLS: 50, Low: 36 FALSE
## Gtl: 1382, Mod: 65, Sev: 13, NA: 0 FALSE
## Ins: 1052, Cor: 263, Cul: 94, FR2: 47 FALSE
## Reg: 925, IR1: 484, IR2: 41, IR3: 10 FALSE
## Non: 864, Brk: 445, Sto: 128, Brk: 15 FALSE
## NA: 1406, She: 49, Gar: 2, Oth: 2 FALSE
## RL: 1151, RM: 218, FV: 65, RH: 16 FALSE
## NAm: 225, Col: 150, Old: 113, Edw: 100 FALSE
## Y: 1340, N: 90, P: 30, NA: 0 FALSE
## NA: 1453, Gd: 3, Ex: 2, Fa: 2 FALSE
## Com: 1434, Tar: 11, WdS: 6, WdS: 5 FALSE
## Gab: 1141, Hip: 286, Fla: 13, Gam: 11 FALSE
## Nor: 1198, Par: 125, Abn: 101, Fam: 20 FALSE
## WD: 1267, New: 122, COD: 43, Con: 9 FALSE
## Pav: 1454, Grv: 6, NA: 0 FALSE
## All: 1459, NoS: 1, NA: 0 FALSE
##
## Variable type: integer
## variable missing complete n mean sd p0 p25
## BedroomAbvGr 0 1460 1460 2.87 0.82 0 2
## BsmtFinSF1 0 1460 1460 443.64 456.1 0 0
## BsmtFinSF2 0 1460 1460 46.55 161.32 0 0
## BsmtFullBath 0 1460 1460 0.43 0.52 0 0
## BsmtHalfBath 0 1460 1460 0.058 0.24 0 0
## BsmtUnfSF 0 1460 1460 567.24 441.87 0 223
## EnclosedPorch 0 1460 1460 21.95 61.12 0 0
## Fireplaces 0 1460 1460 0.61 0.64 0 0
## FullBath 0 1460 1460 1.57 0.55 0 1
## GarageArea 0 1460 1460 472.98 213.8 0 334.5
## GarageCars 0 1460 1460 1.77 0.75 0 1
## GarageYrBlt 81 1379 1460 1978.51 24.69 1900 1961
## GrLivArea 0 1460 1460 1515.46 525.48 334 1129.5
## HalfBath 0 1460 1460 0.38 0.5 0 0
## Id 0 1460 1460 730.5 421.61 1 365.75
## KitchenAbvGr 0 1460 1460 1.05 0.22 0 1
## LotArea 0 1460 1460 10516.83 9981.26 1300 7553.5
## LotFrontage 259 1201 1460 70.05 24.28 21 59
## LowQualFinSF 0 1460 1460 5.84 48.62 0 0
## MasVnrArea 8 1452 1460 103.69 181.07 0 0
## MiscVal 0 1460 1460 43.49 496.12 0 0
## MoSold 0 1460 1460 6.32 2.7 1 5
## MSSubClass 0 1460 1460 56.9 42.3 20 20
## OpenPorchSF 0 1460 1460 46.66 66.26 0 0
## OverallCond 0 1460 1460 5.58 1.11 1 5
## OverallQual 0 1460 1460 6.1 1.38 1 5
## PoolArea 0 1460 1460 2.76 40.18 0 0
## SalePrice 0 1460 1460 180921.2 79442.5 34900 129975
## ScreenPorch 0 1460 1460 15.06 55.76 0 0
## TotalBsmtSF 0 1460 1460 1057.43 438.71 0 795.75
## TotRmsAbvGrd 0 1460 1460 6.52 1.63 2 5
## WoodDeckSF 0 1460 1460 94.24 125.34 0 0
## X1stFlrSF 0 1460 1460 1162.63 386.59 334 882
## X2ndFlrSF 0 1460 1460 346.99 436.53 0 0
## X3SsnPorch 0 1460 1460 3.41 29.32 0 0
## YearBuilt 0 1460 1460 1971.27 30.2 1872 1954
## YearRemodAdd 0 1460 1460 1984.87 20.65 1950 1967
## YrSold 0 1460 1460 2007.82 1.33 2006 2007
## p50 p75 p100 hist
## 3 3 8 ▁▃▇▂▁▁▁▁
## 383.5 712.25 5644 ▇▂▁▁▁▁▁▁
## 0 0 1474 ▇▁▁▁▁▁▁▁
## 0 1 3 ▇▁▆▁▁▁▁▁
## 0 0 2 ▇▁▁▁▁▁▁▁
## 477.5 808 2336 ▇▆▅▂▂▁▁▁
## 0 0 552 ▇▁▁▁▁▁▁▁
## 1 1 3 ▇▁▇▁▁▁▁▁
## 2 2 3 ▁▁▇▁▁▇▁▁
## 480 576 1418 ▁▅▇▅▂▁▁▁
## 2 2 4 ▁▃▁▇▁▂▁▁
## 1980 2002 2010 ▁▁▁▂▅▃▃▇
## 1464 1776.75 5642 ▂▇▅▁▁▁▁▁
## 0 1 2 ▇▁▁▅▁▁▁▁
## 730.5 1095.25 1460 ▇▇▇▇▇▇▇▇
## 1 1 3 ▁▁▇▁▁▁▁▁
## 9478.5 11601.5 215245 ▇▁▁▁▁▁▁▁
## 69 80 313 ▃▇▁▁▁▁▁▁
## 0 0 572 ▇▁▁▁▁▁▁▁
## 0 166 1600 ▇▂▁▁▁▁▁▁
## 0 0 15500 ▇▁▁▁▁▁▁▁
## 6 8 12 ▂▂▇▆▆▅▂▃
## 50 70 190 ▇▆▂▁▁▁▁▁
## 25 68 547 ▇▂▁▁▁▁▁▁
## 5 6 9 ▁▁▁▇▂▂▁▁
## 6 7 10 ▁▁▂▇▇▆▃▁
## 0 0 738 ▇▁▁▁▁▁▁▁
## 163000 214000 755000 ▃▇▂▁▁▁▁▁
## 0 0 480 ▇▁▁▁▁▁▁▁
## 991.5 1298.25 6110 ▂▇▂▁▁▁▁▁
## 6 7 14 ▁▆▆▇▁▁▁▁
## 0 168 857 ▇▃▁▁▁▁▁▁
## 1087 1391.25 4692 ▃▇▃▁▁▁▁▁
## 0 728 2065 ▇▁▂▂▁▁▁▁
## 0 0 508 ▇▁▁▁▁▁▁▁
## 1973 2000 2010 ▁▁▂▂▃▅▃▇
## 1994 2004 2010 ▅▂▂▂▁▂▅▇
## 2008 2009 2010 ▇▇▁▇▁▇▁▅
Provide a scatterplot of X and Y.
# Scatterplot of sale price (vertical axis) against unfinished basement
# area (horizontal axis).
plot(
  x = house_data$BsmtUnfSF,
  y = house_data$SalePrice,
  xlab = 'Unfinished Basement SF',
  ylab = ' Sales Price',
  main = 'Sales Price with respect to Unfinished Basement Sqft'
)
Derive a correlation matrix for any THREE quantitative variables in the dataset.
Here we will choose Sales Price, Lot Area, and Garage Area for our correlation matrix.
# Keep the three quantitative variables of interest, compute their pairwise
# Pearson correlations, and display the matrix with the coefficients printed
# in each cell.
df <- house_data[, c("SalePrice", "LotArea", "GarageArea")]
M <- cor(df)
corrplot(M, method = 'number')
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval.
# Test H0: correlation between SalePrice and LotArea is 0; report a 92% CI.
cor.test(df$SalePrice, df$LotArea, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: df$SalePrice and df$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2206794 0.3059759
## sample estimates:
## cor
## 0.2638434
# Test H0: correlation between SalePrice and GarageArea is 0; report a 92% CI.
cor.test(df$SalePrice, df$GarageArea, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: df$SalePrice and df$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.5945883 0.6506720
## sample estimates:
## cor
## 0.6234314
# Test H0: correlation between GarageArea and LotArea is 0; report a 92% CI.
cor.test(df$GarageArea, df$LotArea, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: df$GarageArea and df$LotArea
## t = 7.0034, df = 1458, p-value = 3.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.1356921 0.2243801
## sample estimates:
## cor
## 0.1804028
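Discuss the meaning of your analysis. In all three tests the p-value is far below 0.08 and the 92% confidence interval excludes zero, so we reject the null hypothesis of zero correlation for each pair. SalePrice and GarageArea are moderately correlated (0.62), while the correlations involving LotArea are weak (0.26 and 0.18).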
Would you be worried about familywise error? Why or why not?
# Familywise error rate: the probability of at least one false rejection
# across the three correlation tests, each run at the 8% significance level
# that corresponds to a 92% confidence level.
FWE <- local({
  per_test_alpha <- .08
  n_tests <- 3
  1 - (1 - per_test_alpha)^n_tests
})
print(
  paste(
    "The probabilty of a type I error is",
    round(FWE * 100, 2),
    "%"
  )
)
## [1] "The probabilty of a type I error is 22.13 %"
Yes we should probably be worried about familywise error. This seems like a high probability considering only three tests were performed.
Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# The precision matrix is the INVERSE of the correlation matrix. solve(M)
# computes that inverse; t(M) is only the transpose, which for a symmetric
# correlation matrix is M itself. The diagonal of the inverse holds the
# variance inflation factors.
prec_mat <- solve(M)
prec_mat
## SalePrice LotArea GarageArea
## SalePrice 1.0000000 0.2638434 0.6234314
## LotArea 0.2638434 1.0000000 0.1804028
## GarageArea 0.6234314 0.1804028 1.0000000
# Matrix product with the correlation matrix M on the left.
M %*% prec_mat
## SalePrice LotArea GarageArea
## SalePrice 1.4582801 0.6401555 1.2944609
## LotArea 0.6401555 1.1021585 0.5252938
## GarageArea 1.2944609 0.5252938 1.4212119
# Matrix product with the correlation matrix M on the right.
prec_mat %*% M
## SalePrice LotArea GarageArea
## SalePrice 1.4582801 0.6401555 1.2944609
## LotArea 0.6401555 1.1021585 0.5252938
## GarageArea 1.2944609 0.5252938 1.4212119
# LU factorization of the correlation matrix. Printing the result shows the
# factorization object rather than the individual factors.
# NOTE(review): Matrix::expand(lu.M) should return the L, U and P factors
# separately - confirm against the Matrix package documentation.
lu.M <- lu(M)
lu.M
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
## ..@ x : num [1:9] 1 0.264 0.623 0.264 0.93 ...
## ..@ perm : int [1:3] 1 2 3
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:3] "SalePrice" "LotArea" "GarageArea"
## .. ..$ : chr [1:3] "SalePrice" "LotArea" "GarageArea"
## ..@ Dim : int [1:2] 3 3
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary.
# Smallest observed value of BsmtUnfSF.
min(house_data$BsmtUnfSF)
## [1] 0
# Shift the variable so its minimum is strictly positive: subtract the
# minimum, then add a small offset of 0.0001.
b <- house_data$BsmtUnfSF - min(house_data$BsmtUnfSF) + 0.0001
min(b)
## [1] 1e-04
Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).
# Fit an exponential distribution to the shifted variable with
# MASS::fitdistr and extract the estimated rate parameter (lambda).
fit <- fitdistr(b, "exponential")
lambda <- fit$estimate
lambda
## rate
## 0.001762921
# Draw 1000 random values from an exponential distribution with the fitted
# rate. No seed is set here, so the draws differ from run to run.
samples <- rexp(1000, lambda)
Plot a histogram and compare it with a histogram of your original variable.
# Histogram of the simulated exponential draws, followed by a histogram of
# the observed (shifted) variable for visual comparison.
hist(samples, breaks = 15, main = 'Simulated')
hist(b, main = "Observed")
Using the exponential pdf, find the 5th and 95 th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
\[ x_P = F^{-1}(P) = \frac{\ln(1 - P)}{-\lambda} \]
# 5th and 95th percentiles of the fitted exponential distribution, obtained
# by inverting its CDF: x_P = ln(1 - P) / (-lambda).
cdf_5 <- log(0.95) / (-lambda)
cdf_95 <- log(0.05) / (-lambda)
# Empirical 5th and 95th percentiles of the shifted data.
emp_5 <- quantile(b, 0.05)
emp_95 <- quantile(b, 0.95)
# Lower and upper bounds of the 95% confidence interval for the mean of b,
# from a one-sample t-test (i.e. assuming normality).
t <- t.test(b)$conf.int[1:2]
# Collect the three pairs in one table. tibble() replaces data_frame(),
# which is deprecated in the tibble package.
x <- tibble(
  Type = c('Simulated', "Observed", "CI"),
  P5 = c(cdf_5, emp_5, t[1]),
  P95 = c(cdf_95, emp_95, t[2])
)
x
## # A tibble: 3 x 3
## Type P5 P95
## <chr> <dbl> <dbl>
## 1 Simulated 29.09563 1699.3007
## 2 Observed 0.00010 1468.0001
## 3 CI 544.55630 589.9247
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Load Datasets
# Download the Kaggle training and test sets, keeping text columns as
# character vectors. FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
train <- read.csv(
  "https://raw.githubusercontent.com/mjdacs/data605/master/train.csv",
  stringsAsFactors = FALSE
)
test <- read.csv(
  "https://raw.githubusercontent.com/mjdacs/data605/master/test.csv",
  stringsAsFactors = FALSE
)
# Stack the two sets so cleaning and imputation are applied to both at once.
# Columns missing from one input (SalePrice in the test set) are filled
# with NA.
full <- bind_rows(train, test)
Obtain summary stats on the full set
We’re going to skim the data again, this time with the training and test sets combined. Skim is a great tool that tells us which variables are factors and which are integers. We are going to fill the NA in Character column with ‘Not Available’ (Can be changed to literally anything).
# Per-variable summary of the combined train + test data, grouped by
# variable type.
skim(full)
## Skim summary statistics
## n obs: 2919
## n variables: 81
##
## Variable type: character
## variable missing complete n min max empty n_unique
## Alley 2721 198 2919 4 4 0 2
## BldgType 0 2919 2919 4 6 0 5
## BsmtCond 82 2837 2919 2 2 0 4
## BsmtExposure 82 2837 2919 2 2 0 4
## BsmtFinType1 79 2840 2919 3 3 0 6
## BsmtFinType2 80 2839 2919 3 3 0 6
## BsmtQual 81 2838 2919 2 2 0 4
## CentralAir 0 2919 2919 1 1 0 2
## Condition1 0 2919 2919 4 6 0 9
## Condition2 0 2919 2919 4 6 0 8
## Electrical 1 2918 2919 3 5 0 5
## ExterCond 0 2919 2919 2 2 0 5
## Exterior1st 1 2918 2919 5 7 0 15
## Exterior2nd 1 2918 2919 5 7 0 16
## ExterQual 0 2919 2919 2 2 0 4
## Fence 2348 571 2919 4 5 0 4
## FireplaceQu 1420 1499 2919 2 2 0 5
## Foundation 0 2919 2919 4 6 0 6
## Functional 2 2917 2919 3 4 0 7
## GarageCond 159 2760 2919 2 2 0 5
## GarageFinish 159 2760 2919 3 3 0 3
## GarageQual 159 2760 2919 2 2 0 5
## GarageType 157 2762 2919 6 7 0 6
## Heating 0 2919 2919 4 5 0 6
## HeatingQC 0 2919 2919 2 2 0 5
## HouseStyle 0 2919 2919 4 6 0 8
## KitchenQual 1 2918 2919 2 2 0 4
## LandContour 0 2919 2919 3 3 0 4
## LandSlope 0 2919 2919 3 3 0 3
## LotConfig 0 2919 2919 3 7 0 5
## LotShape 0 2919 2919 3 3 0 4
## MasVnrType 24 2895 2919 4 7 0 4
## MiscFeature 2814 105 2919 4 4 0 4
## MSZoning 4 2915 2919 2 7 0 5
## Neighborhood 0 2919 2919 5 7 0 25
## PavedDrive 0 2919 2919 1 1 0 3
## PoolQC 2909 10 2919 2 2 0 3
## RoofMatl 0 2919 2919 4 7 0 8
## RoofStyle 0 2919 2919 3 7 0 6
## SaleCondition 0 2919 2919 6 7 0 6
## SaleType 1 2918 2919 2 5 0 9
## Street 0 2919 2919 4 4 0 2
## Utilities 2 2917 2919 6 6 0 2
##
## Variable type: integer
## variable missing complete n mean sd p0 p25
## BedroomAbvGr 0 2919 2919 2.86 0.82 0 2
## BsmtFinSF1 1 2918 2919 441.42 455.61 0 0
## BsmtFinSF2 1 2918 2919 49.58 169.21 0 0
## BsmtFullBath 2 2917 2919 0.43 0.52 0 0
## BsmtHalfBath 2 2917 2919 0.061 0.25 0 0
## BsmtUnfSF 1 2918 2919 560.77 439.54 0 220
## EnclosedPorch 0 2919 2919 23.1 64.24 0 0
## Fireplaces 0 2919 2919 0.6 0.65 0 0
## FullBath 0 2919 2919 1.57 0.55 0 1
## GarageArea 1 2918 2919 472.87 215.39 0 320
## GarageCars 1 2918 2919 1.77 0.76 0 1
## GarageYrBlt 159 2760 2919 1978.11 25.57 1895 1960
## GrLivArea 0 2919 2919 1500.76 506.05 334 1126
## HalfBath 0 2919 2919 0.38 0.5 0 0
## Id 0 2919 2919 1460 842.79 1 730.5
## KitchenAbvGr 0 2919 2919 1.04 0.21 0 1
## LotArea 0 2919 2919 10168.11 7887 1300 7478
## LotFrontage 486 2433 2919 69.31 23.34 21 59
## LowQualFinSF 0 2919 2919 4.69 46.4 0 0
## MasVnrArea 23 2896 2919 102.2 179.33 0 0
## MiscVal 0 2919 2919 50.83 567.4 0 0
## MoSold 0 2919 2919 6.21 2.71 1 4
## MSSubClass 0 2919 2919 57.14 42.52 20 20
## OpenPorchSF 0 2919 2919 47.49 67.58 0 0
## OverallCond 0 2919 2919 5.56 1.11 1 5
## OverallQual 0 2919 2919 6.09 1.41 1 5
## PoolArea 0 2919 2919 2.25 35.66 0 0
## SalePrice 1459 1460 2919 180921.2 79442.5 34900 129975
## ScreenPorch 0 2919 2919 16.06 56.18 0 0
## TotalBsmtSF 1 2918 2919 1051.78 440.77 0 793
## TotRmsAbvGrd 0 2919 2919 6.45 1.57 2 5
## WoodDeckSF 0 2919 2919 93.71 126.53 0 0
## X1stFlrSF 0 2919 2919 1159.58 392.36 334 876
## X2ndFlrSF 0 2919 2919 336.48 428.7 0 0
## X3SsnPorch 0 2919 2919 2.6 25.19 0 0
## YearBuilt 0 2919 2919 1971.31 30.29 1872 1953.5
## YearRemodAdd 0 2919 2919 1984.26 20.89 1950 1965
## YrSold 0 2919 2919 2007.79 1.31 2006 2007
## p50 p75 p100 hist
## 3 3 8 ▁▃▇▂▁▁▁▁
## 368.5 733 5644 ▇▂▁▁▁▁▁▁
## 0 0 1526 ▇▁▁▁▁▁▁▁
## 0 1 3 ▇▁▆▁▁▁▁▁
## 0 0 2 ▇▁▁▁▁▁▁▁
## 467 805.5 2336 ▇▆▅▂▂▁▁▁
## 0 0 1012 ▇▁▁▁▁▁▁▁
## 1 1 4 ▇▇▁▁▁▁▁▁
## 2 2 4 ▁▇▁▇▁▁▁▁
## 480 576 1488 ▁▅▇▃▂▁▁▁
## 2 2 5 ▁▃▁▇▂▁▁▁
## 1979 2002 2207 ▁▅▇▁▁▁▁▁
## 1444 1743.5 5642 ▂▇▃▁▁▁▁▁
## 0 1 2 ▇▁▁▅▁▁▁▁
## 1460 2189.5 2919 ▇▇▇▇▇▇▇▇
## 1 1 3 ▁▁▇▁▁▁▁▁
## 9453 11570 215245 ▇▁▁▁▁▁▁▁
## 68 80 313 ▃▇▁▁▁▁▁▁
## 0 0 1064 ▇▁▁▁▁▁▁▁
## 0 164 1600 ▇▁▁▁▁▁▁▁
## 0 0 17000 ▇▁▁▁▁▁▁▁
## 6 8 12 ▃▃▇▆▅▅▂▃
## 50 70 190 ▇▆▂▁▁▁▁▁
## 26 70 742 ▇▁▁▁▁▁▁▁
## 5 6 9 ▁▁▁▇▂▂▁▁
## 6 7 10 ▁▁▂▇▇▆▃▂
## 0 0 800 ▇▁▁▁▁▁▁▁
## 163000 214000 755000 ▃▇▂▁▁▁▁▁
## 0 0 576 ▇▁▁▁▁▁▁▁
## 989.5 1302 6110 ▃▇▂▁▁▁▁▁
## 6 7 15 ▁▆▇▇▂▁▁▁
## 0 168 1424 ▇▂▁▁▁▁▁▁
## 1082 1387.5 5095 ▅▇▂▁▁▁▁▁
## 0 704 2065 ▇▁▂▂▁▁▁▁
## 0 0 508 ▇▁▁▁▁▁▁▁
## 1973 2001 2010 ▁▁▂▂▃▅▂▇
## 1993 2004 2010 ▅▂▂▂▁▂▅▇
## 2008 2009 2010 ▇▇▁▇▁▇▁▃
# Plot the share of missing values per variable and keep the returned object.
missing_data <- plot_missing(full, title="Housing Data - Missing Values (%)")
Separate Id and SalePrice
We save the Id of the test dataset so we can submit the results to Kaggle. SalePrice is the dependent variable we will train the model on. As a general rule of thumb, imputing the target variable is not recommended, and if we don’t set it aside now it will be imputed when we run mice().
# Set aside the test-set Ids (needed for the submission file) and the
# training response (which must not be imputed), then remove both columns
# from the combined feature table.
Id <- test$Id
SalePrice <- train$SalePrice
full <- full[, setdiff(names(full), c("Id", "SalePrice"))]
# The separate train/test frames are no longer needed.
rm(test, train)
Separate the dataset into character variables and integer variables
We use different strategies to fill NA values for character variables and integer variables, so we separate the dataset into two groups.
# Split the features by storage type. vapply() guarantees a logical mask
# whatever the input, and drop = FALSE keeps a data frame even if only one
# column matches. is.numeric() is used for the numeric group so that a column
# stored as double is not silently left out of both groups (is.integer()
# would skip it).
chr <- full[, vapply(full, is.character, logical(1)), drop = FALSE]
int <- full[, vapply(full, is.numeric, logical(1)), drop = FALSE]
# Missing text values become their own explicit category.
chr[is.na(chr)] <- "Not Available"
# Convert every character column to a factor.
fac <- chr %>%
  lapply(as.factor) %>%
  as.data.frame()
# Correlation matrix of the numeric features, computed only on rows with no
# missing values, drawn as a circle plot.
cm<- cor(int, use='complete.obs')
corrplot(cm, method='circle')
# One bar chart per numeric feature: stack() reshapes the data frame to long
# form (columns `values` and `ind`), and each facet gets its own axis scales.
ggplot(stack(int), aes(values))+
facet_wrap(~ind, scales = "free") +
geom_bar(fill = "light blue", colour="black") +
theme(legend.position="none")
Fill Character variable’s NA with “Not available” and turn it into factor
The mice package (“Multivariate Imputation by Chained Equations”) was built for imputing missing data. We will use predictive mean matching as the imputation method. First we combine the fac and int datasets into one data frame: the factor columns derived from the character variables, followed by the integer columns.
# Recombine the factor and numeric features into one data frame.
full <- bind_cols(fac, int)
# Impute the remaining missing values with predictive mean matching. The
# seed fixes mice's random draws so the imputed data, and therefore the
# submitted predictions, are reproducible.
imputed <- mice(full, method = "pmm", seed = 605)
# Extract a completed data set. The call is namespace-qualified because
# tidyr also exports a function named complete().
full <- mice::complete(imputed)
rm(chr, fac, int, imputed)
Separate Train and Test Data
# The first length(SalePrice) rows of `full` are the training homes; the
# rest are the test homes. seq_along()/seq_len() are used instead of 1:n so
# the indices stay correct even for a zero-length vector.
train <- full[seq_along(SalePrice), ]
test <- full[!(seq_len(nrow(full)) %in% seq_along(SalePrice)), ]
Run the Support Vector Machines (SVM) model and save to .csv
We fit the SVM model on the training set, then use it to predict SalePrice for the test set. Lastly, we create a data frame for our submission that includes the Id column and the predicted SalePrice values.
# Fit a support vector machine with SalePrice as the response and every
# column of `train` as a predictor. SalePrice is not a column of `train` (it
# was removed from `full` earlier), so the formula picks up the standalone
# SalePrice vector from the surrounding environment; its length must equal
# nrow(train).
svm_model <- svm(SalePrice ~ ., data = train)
# Predict sale prices for the test homes.
svm_pred <- predict(svm_model, newdata = test)
# Pair each prediction with its test-set Id and write the submission file.
solution <- data.frame(Id = Id, SalePrice = svm_pred)
write.csv(solution,"svm_dacampora.csv",row.names = FALSE)
Conclusion
This was my first time using SVM and, as always, it’s working with the data that is the hardest part. There are still parts of the model I would like to understand further. The model ranked 649th on Kaggle with an RMSE of 0.11816.