if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, mice, e1071, Metrics, Matrix, MASS, skimr, corrplot, DataExplorer)

# Load the Kaggle training set. stringsAsFactors is set explicitly because its
# default became FALSE in R 4.0.0; the factor (<fctr>) columns shown in the
# output below are only reproduced when text columns are read as factors.
house_data <- read.csv(
  "https://raw.githubusercontent.com/mjdacs/data605/master/train.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
glimpse(house_data)
## Observations: 1,460
## Variables: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60,...
## $ MSZoning      <fctr> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL,...
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, ...
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10...
## $ Street        <fctr> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave,...
## $ Alley         <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ LotShape      <fctr> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Re...
## $ LandContour   <fctr> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lv...
## $ Utilities     <fctr> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub,...
## $ LotConfig     <fctr> Inside, FR2, Inside, Corner, FR2, Inside, Insid...
## $ LandSlope     <fctr> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gt...
## $ Neighborhood  <fctr> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mi...
## $ Condition1    <fctr> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN...
## $ Condition2    <fctr> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm,...
## $ BldgType      <fctr> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam,...
## $ HouseStyle    <fctr> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin,...
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, ...
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, ...
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, ...
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, ...
## $ RoofStyle     <fctr> Gable, Gable, Gable, Gable, Gable, Gable, Gable...
## $ RoofMatl      <fctr> CompShg, CompShg, CompShg, CompShg, CompShg, Co...
## $ Exterior1st   <fctr> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, Vi...
## $ Exterior2nd   <fctr> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, Vi...
## $ MasVnrType    <fctr> BrkFace, None, BrkFace, None, BrkFace, None, St...
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, ...
## $ ExterQual     <fctr> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex,...
## $ ExterCond     <fctr> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ Foundation    <fctr> PConc, CBlock, PConc, BrkTil, PConc, Wood, PCon...
## $ BsmtQual      <fctr> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex,...
## $ BsmtCond      <fctr> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ BsmtExposure  <fctr> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No,...
## $ BsmtFinType1  <fctr> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GL...
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851,...
## $ BsmtFinType2  <fctr> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Un...
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140,...
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952,...
## $ Heating       <fctr> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA,...
## $ HeatingQC     <fctr> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex,...
## $ CentralAir    <fctr> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ Electrical    <fctr> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr...
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022...
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, ...
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, ...
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, ...
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, ...
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, ...
## $ KitchenQual   <fctr> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex,...
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5,...
## $ Functional    <fctr> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, T...
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, ...
## $ FireplaceQu   <fctr> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd,...
## $ GarageType    <fctr> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd,...
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, ...
## $ GarageFinish  <fctr> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RF...
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, ...
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205...
## $ GarageQual    <fctr> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA,...
## $ GarageCond    <fctr> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,...
## $ PavedDrive    <fctr> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,...
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, ...
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, ...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, ...
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0...
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PoolQC        <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Fence         <fctr> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, ...
## $ MiscFeature   <fctr> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA,...
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0,...
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, ...
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, ...
## $ SaleType      <fctr> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New...
## $ SaleCondition <fctr> Normal, Normal, Normal, Abnorml, Normal, Normal...
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, ...

Pick one of the quantitative variables from the training set, and define that variable x. Make sure this variable is skewed to the right!

# Histogram of unfinished basement square footage, used to eyeball the skew.
hist(house_data$BsmtUnfSF)

We see that BsmtUnfSF (unfinished basement area, in square feet) has a right skew. Additionally, SalePrice is our chosen dependent variable.

# X: the chosen right-skewed predictor (unfinished basement square footage).
X <- house_data[["BsmtUnfSF"]]

# Y: the dependent variable (sale price).
Y <- house_data[["SalePrice"]]

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter ‘x’ is estimated as the 1st quartile of the X variable, and the small letter ‘y’ is the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

We run a summary of BsmtUnfSF to confirm the right skew.

# Minimum, quartiles, mean and maximum of X (BsmtUnfSF).
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   223.0   477.5   567.2   808.0  2336.0

Since the Mean > Median, it confirms the right skew.

We also run a summary of the dependent variable SalesPrice, or Y

# Minimum, quartiles, mean and maximum of Y (SalePrice).
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

We are asking: what is the probability that X is greater than its first quartile, given that Y is greater than its first quartile?

# Quartiles are taken from quantile() rather than by position from summary():
# summary() is a display helper (it rounded its values to four significant
# digits before R 3.4.0), so indexing into it by position is fragile.
# na.rm = TRUE mirrors summary(), which also ignores missing values.

# quartiles for BsmtUnfSF
q1_x <- quantile(X, probs = 0.25, na.rm = TRUE, names = FALSE)
q2_x <- quantile(X, probs = 0.50, na.rm = TRUE, names = FALSE)
q3_x <- quantile(X, probs = 0.75, na.rm = TRUE, names = FALSE)

# quartiles for SalePrice
q1_y <- quantile(Y, probs = 0.25, na.rm = TRUE, names = FALSE)
q2_y <- quantile(Y, probs = 0.50, na.rm = TRUE, names = FALSE)
q3_y <- quantile(Y, probs = 0.75, na.rm = TRUE, names = FALSE)

# Number of houses whose BsmtUnfSF is strictly above its first quartile.
x <- house_data %>%
  filter(BsmtUnfSF > q1_x) %>%
  nrow()

# Number of houses whose SalePrice is strictly above its first quartile.
y <- house_data %>%
  filter(SalePrice > q1_y) %>%
  nrow()

Here we start to construct the probability table.

# Cell counts for the 2 x 2 split at the first quartiles. Each count is the
# number of rows for which the condition is TRUE (rows with NA are ignored).
# In the printed table the ROWS are the BsmtUnfSF groups and the COLUMNS are
# the SalePrice groups, because each vector below becomes one column.

# SalePrice <= q1_y (first printed column)
top_left <- sum(
  with(house_data, BsmtUnfSF <= q1_x & SalePrice <= q1_y),
  na.rm = TRUE
)
top_mid <- sum(
  with(house_data, BsmtUnfSF > q1_x & SalePrice <= q1_y),
  na.rm = TRUE
)
top_right <- top_left + top_mid

# SalePrice > q1_y (second printed column)
mid_left <- sum(
  with(house_data, BsmtUnfSF <= q1_x & SalePrice > q1_y),
  na.rm = TRUE
)
mid_mid <- sum(
  with(house_data, BsmtUnfSF > q1_x & SalePrice > q1_y),
  na.rm = TRUE
)
mid_right <- mid_left + mid_mid

# Totals over both SalePrice groups (third printed column)
bot_left <- top_left + mid_left
bot_mid <- top_mid + mid_mid
bot_right <- top_right + mid_right

t1 <- list(
  c(top_left, top_mid, top_right),
  c(mid_left, mid_mid, mid_right),
  c(bot_left, bot_mid, bot_right)
)

# One column per vector of t1; the column names are the syntactic forms of
# "leq_1st quartile" and "grtr_1st quartile".
df <- data.frame(
  leq_1st.quartile = t1[[1]],
  grtr_1st.quartile = t1[[2]],
  Total = t1[[3]],
  row.names = c("<=1st q", ">1st q", "Total")
)
df
##         leq_1st.quartile grtr_1st.quartile Total
## <=1st q              112               254   366
## >1st q               253               841  1094
## Total                365              1095  1460
  1. \(P(X>x~~ | ~~Y>y) = 841/1095 = 0.7680\)

  2. \(P(X>x, Y>y) = 841/1460 = 0.5760\)

  3. \(P(X<x~~ | ~~Y>y) = 254/1095 = 0.2320\)

Does splitting the training data in this fashion make them independent?

Let \(A\) be the new variable counting those observations above the 1st quartile for \(X\), and let \(B\) be the new variable counting those observations above the 1st quartile for \(Y\). Does \(P(AB)=P(A)P(B)\)? Check mathematically, and then evaluate by running a Chi Square test for association.

# Marginal probabilities of the two "above the first quartile" events.
A <- x / nrow(house_data)

B <- y / nrow(house_data)

# Joint probability P(AB). The denominator must be the number of houses;
# nrow(df) is the 3-row count table and produced a "probability" far above 1.
AB <- (house_data %>%
  filter(BsmtUnfSF > q1_x, SalePrice > q1_y) %>%
  nrow()) / nrow(house_data)

# Independence requires P(AB) = P(A) * P(B), so these two values are compared.
AB
## [1] 0.5760274
A * B
## [1] 0.5619863

The differing results suggest the variables are not independent.

# Test association between the two indicator events (above / not above the
# first quartile), i.e. on the 2 x 2 table of counts. Tabulating the raw
# BsmtUnfSF and SalePrice values instead gives a huge, sparse table of
# near-unique values (df = 515700 in the earlier run), for which the
# chi-squared approximation is not meaningful.
chi_tab <- table(
  BsmtUnfSF_gt_q1 = house_data$BsmtUnfSF > q1_x,
  SalePrice_gt_q1 = house_data$SalePrice > q1_y
)
chisq.test(chi_tab)
## NOTE(review): re-knit to refresh this output. Worked by hand from the cell
## counts 112, 254, 253, 841 with the default continuity correction:
## X-squared ~ 7.78, df = 1, p-value ~ 0.005.

Because the p-value of the chi-squared test is below 0.05, we reject the null hypothesis that the two variables are independent; the data indicate an association between them.

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set.

Lets run a summary on the dataframe

# Per-column summary of the training data, grouped by variable type.
skim(house_data)
## Skim summary statistics
##  n obs: 1460 
##  n variables: 81 
## 
## Variable type: factor 
##       variable missing complete    n n_unique
##          Alley    1369       91 1460        2
##       BldgType       0     1460 1460        5
##       BsmtCond      37     1423 1460        4
##   BsmtExposure      38     1422 1460        4
##   BsmtFinType1      37     1423 1460        6
##   BsmtFinType2      38     1422 1460        6
##       BsmtQual      37     1423 1460        4
##     CentralAir       0     1460 1460        2
##     Condition1       0     1460 1460        9
##     Condition2       0     1460 1460        8
##     Electrical       1     1459 1460        5
##      ExterCond       0     1460 1460        5
##    Exterior1st       0     1460 1460       15
##    Exterior2nd       0     1460 1460       16
##      ExterQual       0     1460 1460        4
##          Fence    1179      281 1460        4
##    FireplaceQu     690      770 1460        5
##     Foundation       0     1460 1460        6
##     Functional       0     1460 1460        7
##     GarageCond      81     1379 1460        5
##   GarageFinish      81     1379 1460        3
##     GarageQual      81     1379 1460        5
##     GarageType      81     1379 1460        6
##        Heating       0     1460 1460        6
##      HeatingQC       0     1460 1460        5
##     HouseStyle       0     1460 1460        8
##    KitchenQual       0     1460 1460        4
##    LandContour       0     1460 1460        4
##      LandSlope       0     1460 1460        3
##      LotConfig       0     1460 1460        5
##       LotShape       0     1460 1460        4
##     MasVnrType       8     1452 1460        4
##    MiscFeature    1406       54 1460        4
##       MSZoning       0     1460 1460        5
##   Neighborhood       0     1460 1460       25
##     PavedDrive       0     1460 1460        3
##         PoolQC    1453        7 1460        3
##       RoofMatl       0     1460 1460        8
##      RoofStyle       0     1460 1460        6
##  SaleCondition       0     1460 1460        6
##       SaleType       0     1460 1460        9
##         Street       0     1460 1460        2
##      Utilities       0     1460 1460        2
##                                              top_counts ordered
##              NA: 1369, Grv: 50, Pav: 41                   FALSE
##   1Fa: 1220, Twn: 114, Dup: 52, Twn: 43                   FALSE
##        TA: 1311, Gd: 65, Fa: 45, NA: 37                   FALSE
##      No: 953, Av: 221, Gd: 134, Mn: 114                   FALSE
##  Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148                   FALSE
##     Unf: 1256, Rec: 54, LwQ: 46, NA: 38                   FALSE
##       TA: 649, Gd: 618, Ex: 121, NA: 37                   FALSE
##                   Y: 1365, N: 95, NA: 0                   FALSE
##    Nor: 1260, Fee: 81, Art: 48, RRA: 26                   FALSE
##       Nor: 1445, Fee: 6, Art: 2, Pos: 2                   FALSE
##     SBr: 1334, Fus: 94, Fus: 27, Fus: 3                   FALSE
##        TA: 1282, Gd: 146, Fa: 28, Ex: 3                   FALSE
##  Vin: 515, HdB: 222, Met: 220, Wd : 206                   FALSE
##  Vin: 504, Met: 214, HdB: 207, Wd : 197                   FALSE
##        TA: 906, Gd: 488, Ex: 52, Fa: 14                   FALSE
##    NA: 1179, MnP: 157, GdP: 59, GdW: 54                   FALSE
##       NA: 690, Gd: 380, TA: 313, Fa: 33                   FALSE
##   PCo: 647, CBl: 634, Brk: 146, Sla: 24                   FALSE
##    Typ: 1360, Min: 34, Min: 31, Mod: 15                   FALSE
##         TA: 1326, NA: 81, Fa: 35, Gd: 9                   FALSE
##    Unf: 605, RFn: 422, Fin: 352, NA: 81                   FALSE
##        TA: 1311, NA: 81, Fa: 48, Gd: 14                   FALSE
##     Att: 870, Det: 387, Bui: 88, NA: 81                   FALSE
##      Gas: 1428, Gas: 18, Gra: 7, Wal: 4                   FALSE
##       Ex: 741, TA: 428, Gd: 241, Fa: 49                   FALSE
##                   1St: 726, 2St: 445, 1.5: 154, SLv: 65   FALSE
##       TA: 735, Gd: 586, Ex: 100, Fa: 39                   FALSE
##    Lvl: 1311, Bnk: 63, HLS: 50, Low: 36                   FALSE
##      Gtl: 1382, Mod: 65, Sev: 13, NA: 0                   FALSE
##   Ins: 1052, Cor: 263, Cul: 94, FR2: 47                   FALSE
##    Reg: 925, IR1: 484, IR2: 41, IR3: 10                   FALSE
##   Non: 864, Brk: 445, Sto: 128, Brk: 15                   FALSE
##       NA: 1406, She: 49, Gar: 2, Oth: 2                   FALSE
##       RL: 1151, RM: 218, FV: 65, RH: 16                   FALSE
##  NAm: 225, Col: 150, Old: 113, Edw: 100                   FALSE
##            Y: 1340, N: 90, P: 30, NA: 0                   FALSE
##           NA: 1453, Gd: 3, Ex: 2, Fa: 2                   FALSE
##      Com: 1434, Tar: 11, WdS: 6, WdS: 5                   FALSE
##   Gab: 1141, Hip: 286, Fla: 13, Gam: 11                   FALSE
##  Nor: 1198, Par: 125, Abn: 101, Fam: 20                   FALSE
##     WD: 1267, New: 122, COD: 43, Con: 9                   FALSE
##                Pav: 1454, Grv: 6, NA: 0                   FALSE
##                All: 1459, NoS: 1, NA: 0                   FALSE
## 
## Variable type: integer 
##       variable missing complete    n       mean       sd    p0       p25
##   BedroomAbvGr       0     1460 1460      2.87      0.82     0      2   
##     BsmtFinSF1       0     1460 1460    443.64    456.1      0      0   
##     BsmtFinSF2       0     1460 1460     46.55    161.32     0      0   
##   BsmtFullBath       0     1460 1460      0.43      0.52     0      0   
##   BsmtHalfBath       0     1460 1460      0.058     0.24     0      0   
##      BsmtUnfSF       0     1460 1460    567.24    441.87     0    223   
##  EnclosedPorch       0     1460 1460     21.95     61.12     0      0   
##     Fireplaces       0     1460 1460      0.61      0.64     0      0   
##       FullBath       0     1460 1460      1.57      0.55     0      1   
##     GarageArea       0     1460 1460    472.98    213.8      0    334.5 
##     GarageCars       0     1460 1460      1.77      0.75     0      1   
##    GarageYrBlt      81     1379 1460   1978.51     24.69  1900   1961   
##      GrLivArea       0     1460 1460   1515.46    525.48   334   1129.5 
##       HalfBath       0     1460 1460      0.38      0.5      0      0   
##             Id       0     1460 1460    730.5     421.61     1    365.75
##   KitchenAbvGr       0     1460 1460      1.05      0.22     0      1   
##        LotArea       0     1460 1460  10516.83   9981.26  1300   7553.5 
##    LotFrontage     259     1201 1460     70.05     24.28    21     59   
##   LowQualFinSF       0     1460 1460      5.84     48.62     0      0   
##     MasVnrArea       8     1452 1460    103.69    181.07     0      0   
##        MiscVal       0     1460 1460     43.49    496.12     0      0   
##         MoSold       0     1460 1460      6.32      2.7      1      5   
##     MSSubClass       0     1460 1460     56.9      42.3     20     20   
##    OpenPorchSF       0     1460 1460     46.66     66.26     0      0   
##    OverallCond       0     1460 1460      5.58      1.11     1      5   
##    OverallQual       0     1460 1460      6.1       1.38     1      5   
##       PoolArea       0     1460 1460      2.76     40.18     0      0   
##      SalePrice       0     1460 1460 180921.2   79442.5  34900 129975   
##    ScreenPorch       0     1460 1460     15.06     55.76     0      0   
##    TotalBsmtSF       0     1460 1460   1057.43    438.71     0    795.75
##   TotRmsAbvGrd       0     1460 1460      6.52      1.63     2      5   
##     WoodDeckSF       0     1460 1460     94.24    125.34     0      0   
##      X1stFlrSF       0     1460 1460   1162.63    386.59   334    882   
##      X2ndFlrSF       0     1460 1460    346.99    436.53     0      0   
##     X3SsnPorch       0     1460 1460      3.41     29.32     0      0   
##      YearBuilt       0     1460 1460   1971.27     30.2   1872   1954   
##   YearRemodAdd       0     1460 1460   1984.87     20.65  1950   1967   
##         YrSold       0     1460 1460   2007.82      1.33  2006   2007   
##       p50       p75   p100     hist
##       3        3         8 ▁▃▇▂▁▁▁▁
##     383.5    712.25   5644 ▇▂▁▁▁▁▁▁
##       0        0      1474 ▇▁▁▁▁▁▁▁
##       0        1         3 ▇▁▆▁▁▁▁▁
##       0        0         2 ▇▁▁▁▁▁▁▁
##     477.5    808      2336 ▇▆▅▂▂▁▁▁
##       0        0       552 ▇▁▁▁▁▁▁▁
##       1        1         3 ▇▁▇▁▁▁▁▁
##       2        2         3 ▁▁▇▁▁▇▁▁
##     480      576      1418 ▁▅▇▅▂▁▁▁
##       2        2         4 ▁▃▁▇▁▂▁▁
##    1980     2002      2010 ▁▁▁▂▅▃▃▇
##    1464     1776.75   5642 ▂▇▅▁▁▁▁▁
##       0        1         2 ▇▁▁▅▁▁▁▁
##     730.5   1095.25   1460 ▇▇▇▇▇▇▇▇
##       1        1         3 ▁▁▇▁▁▁▁▁
##    9478.5  11601.5  215245 ▇▁▁▁▁▁▁▁
##      69       80       313 ▃▇▁▁▁▁▁▁
##       0        0       572 ▇▁▁▁▁▁▁▁
##       0      166      1600 ▇▂▁▁▁▁▁▁
##       0        0     15500 ▇▁▁▁▁▁▁▁
##       6        8        12 ▂▂▇▆▆▅▂▃
##      50       70       190 ▇▆▂▁▁▁▁▁
##      25       68       547 ▇▂▁▁▁▁▁▁
##       5        6         9 ▁▁▁▇▂▂▁▁
##       6        7        10 ▁▁▂▇▇▆▃▁
##       0        0       738 ▇▁▁▁▁▁▁▁
##  163000   214000    755000 ▃▇▂▁▁▁▁▁
##       0        0       480 ▇▁▁▁▁▁▁▁
##     991.5   1298.25   6110 ▂▇▂▁▁▁▁▁
##       6        7        14 ▁▆▆▇▁▁▁▁
##       0      168       857 ▇▃▁▁▁▁▁▁
##    1087     1391.25   4692 ▃▇▃▁▁▁▁▁
##       0      728      2065 ▇▁▂▂▁▁▁▁
##       0        0       508 ▇▁▁▁▁▁▁▁
##    1973     2000      2010 ▁▁▂▂▃▅▃▇
##    1994     2004      2010 ▅▂▂▂▁▂▅▇
##    2008     2009      2010 ▇▇▁▇▁▇▁▅

Provide a scatterplot of X and Y.

# Scatterplot of sale price (vertical axis) against unfinished basement
# square footage (horizontal axis).
plot(
  x = house_data$BsmtUnfSF,
  y = house_data$SalePrice,
  main = "Sales Price with respect to Unfinished Basement Sqft",
  xlab = "Unfinished Basement SF",
  ylab = " Sales Price"
)

Derive a correlation matrix for any THREE quantitative variables in the dataset.

Here we will choose Sales Price, Lot Area, and Garage Area for our correlation matrix.

#library(corrplot)

# Keep only the three quantitative variables used for the correlation study.
df <- house_data[, c("SalePrice", "LotArea", "GarageArea")]

# Pairwise correlations between the three columns.
M <- cor(df)

# Draw the matrix with the coefficient printed in each cell.
corrplot(M, method = "number")

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval.

# Test H0: correlation = 0 for SalePrice vs LotArea, with a 92% confidence
# interval for the correlation.
cor.test(df$SalePrice, df$LotArea, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  df$SalePrice and df$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.2206794 0.3059759
## sample estimates:
##       cor 
## 0.2638434
# Test H0: correlation = 0 for SalePrice vs GarageArea, with a 92% confidence
# interval for the correlation.
cor.test(df$SalePrice, df$GarageArea, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  df$SalePrice and df$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5945883 0.6506720
## sample estimates:
##       cor 
## 0.6234314
# Test H0: correlation = 0 for GarageArea vs LotArea, with a 92% confidence
# interval for the correlation.
cor.test(df$GarageArea, df$LotArea, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  df$GarageArea and df$LotArea
## t = 7.0034, df = 1458, p-value = 3.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.1356921 0.2243801
## sample estimates:
##       cor 
## 0.1804028

Discuss the meaning of your analysis. We can see the correlation is not equal to zero, rejecting the null hypothesis.

Would you be worried about familywise error? Why or why not?

# Familywise error rate: the chance of at least one false rejection across
# three tests that are each run at a significance level of 0.08.
FWE <- 1 - (1 - 0.08)^3
print(
  paste(
    "The probabilty of a type I error is",
    round(100 * FWE, digits = 2),
    "%"
  )
)
## [1] "The probabilty of a type I error is 22.13 %"

Yes we should probably be worried about familywise error. This seems like a high probability considering only three tests were performed.

Linear Algebra and Correlation

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# The precision matrix is the INVERSE of the correlation matrix. solve(M)
# computes that inverse; t(M) only transposes, and because M is symmetric it
# simply returned M again.
prec_mat <- solve(M)
prec_mat
## NOTE(review): re-knit to refresh this output. Worked by hand from M, the
## diagonal (the variance inflation factors) is approximately
## 1.70 (SalePrice), 1.08 (LotArea), 1.64 (GarageArea).

# Both products should print the 3 x 3 identity matrix, up to floating-point
# rounding, which confirms that prec_mat is the inverse of M.
M %*% prec_mat
prec_mat %*% M
# LU decomposition of the correlation matrix; the printed object below is a
# 'denseLU' factorization from the Matrix package.
lu.M <- lu(M)
lu.M
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
##   ..@ x       : num [1:9] 1 0.264 0.623 0.264 0.93 ...
##   ..@ perm    : int [1:3] 1 2 3
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:3] "SalePrice" "LotArea" "GarageArea"
##   .. ..$ : chr [1:3] "SalePrice" "LotArea" "GarageArea"
##   ..@ Dim     : int [1:2] 3 3

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary.

# The smallest observed value is 0, so the variable must be shifted before an
# exponential distribution can be fitted.
min(house_data[["BsmtUnfSF"]])
## [1] 0
# Re-base the variable at its minimum and add a small offset so that every
# value is strictly positive.
b <- 0.0001 + (house_data[["BsmtUnfSF"]] - min(house_data[["BsmtUnfSF"]]))
min(b)
## [1] 1e-04

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

# Fit an exponential distribution to the shifted data; its single estimated
# parameter is the rate (lambda).
fit <- fitdistr(b, "exponential")
lambda <- fit$estimate
lambda
##        rate 
## 0.001762921
# Fix the RNG seed so that the simulated sample, and the histogram drawn from
# it, can be reproduced on every run.
set.seed(605)
samples <- rexp(1000, lambda)

Plot a histogram and compare it with a histogram of your original variable.

# Histogram of the 1000 values simulated from the fitted exponential.
hist(samples, breaks = 15, main = 'Simulated') 

# Histogram of the observed (shifted) BsmtUnfSF values, for comparison.
hist(b, main = "Observed")

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

\[ F(x) = 1 - e^{-\lambda x} \quad\Longrightarrow\quad x_P = F^{-1}(P) = \frac{\ln(1 - P)}{-\lambda} \]

# 5th and 95th percentiles of the fitted exponential, obtained by inverting
# its CDF: F(q) = 1 - exp(-lambda * q)  =>  q = log(1 - p) / (-lambda).
cdf_5 <- log(0.95) / (-lambda)
cdf_95 <- log(0.05) / (-lambda)

# Empirical 5th and 95th percentiles of the observed (shifted) data.
emp_5 <- quantile(b, 0.05)
emp_95 <- quantile(b, 0.95)

# 95% confidence interval from a one-sample t-test on b. Note that this
# interval brackets the MEAN of b, not its 5th and 95th percentiles.
t <- t.test(b)$conf.int[1:2]

# tibble() replaces data_frame(), which is deprecated in the tibble package.
x <- tibble(
  Type = c("Simulated", "Observed", "CI"),
  P5 = c(cdf_5, emp_5, t[1]),
  P95 = c(cdf_95, emp_95, t[2])
)
x
## # A tibble: 3 x 3
##        Type        P5       P95
##       <chr>     <dbl>     <dbl>
## 1 Simulated  29.09563 1699.3007
## 2  Observed   0.00010 1468.0001
## 3        CI 544.55630  589.9247

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Load Datasets

# Read both Kaggle files with text columns kept as character vectors.
# FALSE is spelled out: F is an ordinary variable that can be reassigned.
train <- read.csv(
  "https://raw.githubusercontent.com/mjdacs/data605/master/train.csv",
  stringsAsFactors = FALSE
)
test <- read.csv(
  "https://raw.githubusercontent.com/mjdacs/data605/master/test.csv",
  stringsAsFactors = FALSE
)
# Stack the training rows first, then the test rows.
full <- bind_rows(train, test)

Obtain summary stats on the full set

We’re going to skim the data again, this time with the training and test sets combined. Skim is a great tool that tells us which variables are factors and which are integers. We are going to fill the NA in Character column with ‘Not Available’ (Can be changed to literally anything).

# Per-column summary of the combined train + test data, grouped by type.
skim(full)
## Skim summary statistics
##  n obs: 2919 
##  n variables: 81 
## 
## Variable type: character 
##       variable missing complete    n min max empty n_unique
##          Alley    2721      198 2919   4   4     0        2
##       BldgType       0     2919 2919   4   6     0        5
##       BsmtCond      82     2837 2919   2   2     0        4
##   BsmtExposure      82     2837 2919   2   2     0        4
##   BsmtFinType1      79     2840 2919   3   3     0        6
##   BsmtFinType2      80     2839 2919   3   3     0        6
##       BsmtQual      81     2838 2919   2   2     0        4
##     CentralAir       0     2919 2919   1   1     0        2
##     Condition1       0     2919 2919   4   6     0        9
##     Condition2       0     2919 2919   4   6     0        8
##     Electrical       1     2918 2919   3   5     0        5
##      ExterCond       0     2919 2919   2   2     0        5
##    Exterior1st       1     2918 2919   5   7     0       15
##    Exterior2nd       1     2918 2919   5   7     0       16
##      ExterQual       0     2919 2919   2   2     0        4
##          Fence    2348      571 2919   4   5     0        4
##    FireplaceQu    1420     1499 2919   2   2     0        5
##     Foundation       0     2919 2919   4   6     0        6
##     Functional       2     2917 2919   3   4     0        7
##     GarageCond     159     2760 2919   2   2     0        5
##   GarageFinish     159     2760 2919   3   3     0        3
##     GarageQual     159     2760 2919   2   2     0        5
##     GarageType     157     2762 2919   6   7     0        6
##        Heating       0     2919 2919   4   5     0        6
##      HeatingQC       0     2919 2919   2   2     0        5
##     HouseStyle       0     2919 2919   4   6     0        8
##    KitchenQual       1     2918 2919   2   2     0        4
##    LandContour       0     2919 2919   3   3     0        4
##      LandSlope       0     2919 2919   3   3     0        3
##      LotConfig       0     2919 2919   3   7     0        5
##       LotShape       0     2919 2919   3   3     0        4
##     MasVnrType      24     2895 2919   4   7     0        4
##    MiscFeature    2814      105 2919   4   4     0        4
##       MSZoning       4     2915 2919   2   7     0        5
##   Neighborhood       0     2919 2919   5   7     0       25
##     PavedDrive       0     2919 2919   1   1     0        3
##         PoolQC    2909       10 2919   2   2     0        3
##       RoofMatl       0     2919 2919   4   7     0        8
##      RoofStyle       0     2919 2919   3   7     0        6
##  SaleCondition       0     2919 2919   6   7     0        6
##       SaleType       1     2918 2919   2   5     0        9
##         Street       0     2919 2919   4   4     0        2
##      Utilities       2     2917 2919   6   6     0        2
## 
## Variable type: integer 
##       variable missing complete    n       mean       sd    p0      p25
##   BedroomAbvGr       0     2919 2919      2.86      0.82     0      2  
##     BsmtFinSF1       1     2918 2919    441.42    455.61     0      0  
##     BsmtFinSF2       1     2918 2919     49.58    169.21     0      0  
##   BsmtFullBath       2     2917 2919      0.43      0.52     0      0  
##   BsmtHalfBath       2     2917 2919      0.061     0.25     0      0  
##      BsmtUnfSF       1     2918 2919    560.77    439.54     0    220  
##  EnclosedPorch       0     2919 2919     23.1      64.24     0      0  
##     Fireplaces       0     2919 2919      0.6       0.65     0      0  
##       FullBath       0     2919 2919      1.57      0.55     0      1  
##     GarageArea       1     2918 2919    472.87    215.39     0    320  
##     GarageCars       1     2918 2919      1.77      0.76     0      1  
##    GarageYrBlt     159     2760 2919   1978.11     25.57  1895   1960  
##      GrLivArea       0     2919 2919   1500.76    506.05   334   1126  
##       HalfBath       0     2919 2919      0.38      0.5      0      0  
##             Id       0     2919 2919   1460       842.79     1    730.5
##   KitchenAbvGr       0     2919 2919      1.04      0.21     0      1  
##        LotArea       0     2919 2919  10168.11   7887     1300   7478  
##    LotFrontage     486     2433 2919     69.31     23.34    21     59  
##   LowQualFinSF       0     2919 2919      4.69     46.4      0      0  
##     MasVnrArea      23     2896 2919    102.2     179.33     0      0  
##        MiscVal       0     2919 2919     50.83    567.4      0      0  
##         MoSold       0     2919 2919      6.21      2.71     1      4  
##     MSSubClass       0     2919 2919     57.14     42.52    20     20  
##    OpenPorchSF       0     2919 2919     47.49     67.58     0      0  
##    OverallCond       0     2919 2919      5.56      1.11     1      5  
##    OverallQual       0     2919 2919      6.09      1.41     1      5  
##       PoolArea       0     2919 2919      2.25     35.66     0      0  
##      SalePrice    1459     1460 2919 180921.2   79442.5  34900 129975  
##    ScreenPorch       0     2919 2919     16.06     56.18     0      0  
##    TotalBsmtSF       1     2918 2919   1051.78    440.77     0    793  
##   TotRmsAbvGrd       0     2919 2919      6.45      1.57     2      5  
##     WoodDeckSF       0     2919 2919     93.71    126.53     0      0  
##      X1stFlrSF       0     2919 2919   1159.58    392.36   334    876  
##      X2ndFlrSF       0     2919 2919    336.48    428.7      0      0  
##     X3SsnPorch       0     2919 2919      2.6      25.19     0      0  
##      YearBuilt       0     2919 2919   1971.31     30.29  1872   1953.5
##   YearRemodAdd       0     2919 2919   1984.26     20.89  1950   1965  
##         YrSold       0     2919 2919   2007.79      1.31  2006   2007  
##       p50      p75   p100     hist
##       3        3        8 ▁▃▇▂▁▁▁▁
##     368.5    733     5644 ▇▂▁▁▁▁▁▁
##       0        0     1526 ▇▁▁▁▁▁▁▁
##       0        1        3 ▇▁▆▁▁▁▁▁
##       0        0        2 ▇▁▁▁▁▁▁▁
##     467      805.5   2336 ▇▆▅▂▂▁▁▁
##       0        0     1012 ▇▁▁▁▁▁▁▁
##       1        1        4 ▇▇▁▁▁▁▁▁
##       2        2        4 ▁▇▁▇▁▁▁▁
##     480      576     1488 ▁▅▇▃▂▁▁▁
##       2        2        5 ▁▃▁▇▂▁▁▁
##    1979     2002     2207 ▁▅▇▁▁▁▁▁
##    1444     1743.5   5642 ▂▇▃▁▁▁▁▁
##       0        1        2 ▇▁▁▅▁▁▁▁
##    1460     2189.5   2919 ▇▇▇▇▇▇▇▇
##       1        1        3 ▁▁▇▁▁▁▁▁
##    9453    11570   215245 ▇▁▁▁▁▁▁▁
##      68       80      313 ▃▇▁▁▁▁▁▁
##       0        0     1064 ▇▁▁▁▁▁▁▁
##       0      164     1600 ▇▁▁▁▁▁▁▁
##       0        0    17000 ▇▁▁▁▁▁▁▁
##       6        8       12 ▃▃▇▆▅▅▂▃
##      50       70      190 ▇▆▂▁▁▁▁▁
##      26       70      742 ▇▁▁▁▁▁▁▁
##       5        6        9 ▁▁▁▇▂▂▁▁
##       6        7       10 ▁▁▂▇▇▆▃▂
##       0        0      800 ▇▁▁▁▁▁▁▁
##  163000   214000   755000 ▃▇▂▁▁▁▁▁
##       0        0      576 ▇▁▁▁▁▁▁▁
##     989.5   1302     6110 ▃▇▂▁▁▁▁▁
##       6        7       15 ▁▆▇▇▂▁▁▁
##       0      168     1424 ▇▂▁▁▁▁▁▁
##    1082     1387.5   5095 ▅▇▂▁▁▁▁▁
##       0      704     2065 ▇▁▂▂▁▁▁▁
##       0        0      508 ▇▁▁▁▁▁▁▁
##    1973     2001     2010 ▁▁▂▂▃▅▂▇
##    1993     2004     2010 ▅▂▂▂▁▂▅▇
##    2008     2009     2010 ▇▇▁▇▁▇▁▃
# Chart the share of missing values per column of the combined data and keep
# the object that plot_missing() returns.
missing_data <- plot_missing(
  full,
  title = "Housing Data - Missing Values (%)"
)

Separate Id and SalePrice

We save the Id for the test dataset so we can submit the result on Kaggle. SalePrice is the dependent variable we will train the model on. As a general rule of thumb, imputing the target variable is not recommended, and if we don’t set it aside now it will be imputed when we run mice().

# Set the response (training rows) and the test-set ids aside before both
# columns are dropped from the combined frame.
SalePrice <- train[["SalePrice"]]
Id <- test[["Id"]]

# Drop both columns from the combined data.
full[, c("Id", "SalePrice")] <- NULL
# The separate train and test frames are removed from the workspace.
rm(train, test)

Separate the dataset into character variables and integer variables

Character variables and integer variables need different strategies for filling NAs, so the dataset is separated into two groups.

# Split the columns by storage type. vapply() guarantees a logical selector
# whatever the input, and drop = FALSE keeps a data frame even if only one
# column matches.
chr <- full[, vapply(full, is.character, logical(1)), drop = FALSE]
int <- full[, vapply(full, is.integer, logical(1)), drop = FALSE]

# Missing text values become their own category, then every text column is
# converted to a factor.
chr[is.na(chr)] <- "Not Available"
fac <- chr %>%
  lapply(as.factor) %>%
  as.data.frame()

# Correlations between the integer columns, using only rows that have no
# missing values.
cm <- cor(int, use = "complete.obs")
corrplot(cm, method = "circle")

# One bar chart per integer column, each panel with its own axis scales.
ggplot(stack(int), aes(values)) +
  facet_wrap(~ind, scales = "free") +
  geom_bar(fill = "light blue", colour = "black") +
  theme(legend.position = "none")

Fill Character variable’s NA with “Not available” and turn it into factor

The mice package (“Multivariate Imputation by Chained Equations”) was built for imputing missing data. We will use predictive mean matching as the imputation method. First we combine the fac (factor-converted character) and int datasets into one dataframe; these are essentially one dataset of integer columns and one dataset of character-derived columns.

# Recombine the factor and integer columns into one frame.
full <- bind_cols(fac, int)

# Impute missing values by predictive mean matching. The seed fixes the
# random draws so that the imputed data, and therefore the submission, can be
# reproduced.
imputed <- mice(full, method = "pmm", seed = 605)

# complete() is namespaced because tidyr (attached through tidyverse) exports
# a function of the same name; the bare call only works while mice masks it.
full <- mice::complete(imputed)

rm(chr, fac, int, imputed)

Separate Train and Test Data

# The training rows were stacked first, so the first length(SalePrice) rows
# are the training set and the rest are the test set. A logical mask avoids
# 1:n and (n + 1):N, which count backwards when either part is empty.
is_train <- seq_len(nrow(full)) <= length(SalePrice)
train <- full[is_train, ]
test <- full[!is_train, ]

Run the Support Vector Machines (SVM) model and save to .csv

We run the SVM model on the training set, then use the fitted model to predict on the test set. Lastly, we create a data frame for our submission that includes the Id column and the predicted SalePrice values.

# Fit a support vector machine with SalePrice as the response and every
# column of `train` as a predictor.
svm_model <- svm(SalePrice ~ ., data = train)

# Predicted sale prices for the test rows.
svm_pred <- predict(svm_model, test)

# Submission table: one row per test-set Id with its predicted price.
solution <- data.frame(
  Id = Id,
  SalePrice = svm_pred
)

# Write the submission without a row-name column.
write.csv(solution, file = "svm_dacampora.csv", row.names = FALSE)

Conclusion

This was my first time using SVM and, as always, working with the data was the hardest part. There are still parts of the model I would like to understand further. The model scored 649th on Kaggle with an RMSE of 0.11816.

https://www.kaggle.com/dacdaddy