library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(purrr)
library(ggcorrplot)

1 Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \(\mu=\sigma=(N+1)/2\)

1.1 Generate Random Variable X

N <- runif(1,6,100)
n <- 10000
X <- runif(n, min=0, max=N)

hist(X)

1.2 Generate Random Variable Y

m <- (N+1)/2 # mean
sd <- (N+1)/2 # standard deviation
n <- 10000 # number of trials
Y <- rnorm(n, m, sd)
hist(Y)

1.3 Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

x is median of X

y is 1st quartile of Y

  1. \(P(X>x | X>y)\) => Probability of X is greater than median of X given that X is greater than first quartile of Y.
x <- median(X)
y <- quantile(Y, 0.25)[[1]]
x
## [1] 18.3506
y
## [1] 6.290243

\(P(X>x | X>y)=\frac{P(X>x, X>y)}{P(X>y)}\)

\(P(X>x, X>y)\) => sum of all the posibilities where X >x and X > y , divided by all the possibilities for X

\(P(X>y)\) => sum of all the possibilities where X>y, divided by all the possibilities for x

p_1 <- sum(X>x & X >y)/n
p_2 <- sum(X>y)/n
p_3 <- p_1/p_2
p_3
## [1] 0.6072383

This means the probability of X being greater than X’s median given X being greater than Y’s first quartile is 60%

  1. \(P(X>x, Y>y)\) => Probability of X is greater than all possible median of X and Y is greater than all possible quantile of Y.

\(P(X>x, Y>y)\) => Sum of all the possibilities where X>x and Y>y, divided by all the possibilities of X

p_4 <- sum(X>x & Y>y)/n
p_4
## [1] 0.3767

This means the probability of X being greater than median of X and Y being greater than first quartile of Y is 37%

  1. \(P(X<x | X>y)\) => Probability of X is smaller than median of X and X is greater than first quantile of Y.

\(P(X<x, X>y)\) => Sum of all the possibilities where X < x and X > y, divided by all the possibilities of X

p_5 <- sum(X<x & X>y)/n
p_5
## [1] 0.3234

This means the probability of X being smaller than median of X and X being greater than first quartile of Y is 34%

1.4 Investigate Condition.

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

a <- sum(X>x & Y<y) # all possibilities where X >x and Y>y
b <- sum(X>x & Y>y) # all possibilities where X>x and Y<y
c <- sum(X<x & Y<y) # all possibilities where X<x and Y<y
d <- sum(X<x & Y>y) # all possibilities where X<x and Y>y
total_1 <- a+c
total_2 <- b+d
total_3 <- a+b
total_4 <- c+d
total_5 <- total_3+total_4

p_a <- a/n
p_b <- b/n
p_c <- c/n
p_d <- d/n
p_total_1 <- total_1/n
p_total_2 <- total_2/n
p_total_3 <- total_3/n
p_total_4 <- total_4/n
p_total_5 <- total_5/n

conditions_table <- matrix(c(p_a, p_b, p_total_3, p_c, p_d, p_total_4, p_total_1, p_total_2, p_total_5), nrow = 3, ncol = 3)
conditions_df <- as.data.frame(conditions_table)
names(conditions_df) <- c("X>x", "X<x", "Total")
row.names(conditions_df) <- c("Y<y", "Y>y", "Total")

conditions_df
##          X>x    X<x Total
## Y<y   0.1233 0.1267  0.25
## Y>y   0.3767 0.3733  0.75
## Total 0.5000 0.5000  1.00

Based on the table \(P(X>x and Y>y)=0.372\) and \(P(X>x)P(Y>y)=0.5*0.75=0.375\) .

1.5 Independence Fisher’s Exact Test and Chi Square Test

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

matrix <- matrix(c(a,b,c,d), nrow = 2, ncol = 2)

fisher.test(matrix, simulate.p.value = TRUE)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix
## p-value = 0.446
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.879972 1.056878
## sample estimates:
## odds ratio 
##  0.9643714
chisq.test(matrix, correct=TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  matrix
## X-squared = 0.5808, df = 1, p-value = 0.446

We use Fisher’s Exact Test if the sample size is small. We use Chi-Squared test as it will be more accurate in this case.

2 Problem 2

Register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques

2.1 Load Data

df_train <- read.csv("https://raw.githubusercontent.com/anilak1978/house-prices/master/train.csv")
df_test <- read.csv("https://raw.githubusercontent.com/anilak1978/house-prices/master/test.csv")
head(df_train)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000
head(df_test)
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6 1466         60       RL          75   10000   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl        NAmes      Feedr
## 2         Lvl    AllPub    Corner       Gtl        NAmes       Norm
## 3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 4         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 5         HLS    AllPub    Inside       Gtl      StoneBr       Norm
## 6         Lvl    AllPub    Corner       Gtl      Gilbert       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           6      1961
## 2       Norm     1Fam     1Story           6           6      1958
## 3       Norm     1Fam     2Story           5           5      1997
## 4       Norm     1Fam     2Story           6           6      1998
## 5       Norm   TwnhsE     1Story           8           5      1992
## 6       Norm     1Fam     2Story           6           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1961     Gable  CompShg     VinylSd     VinylSd       None
## 2         1958       Hip  CompShg     Wd Sdng     Wd Sdng    BrkFace
## 3         1998     Gable  CompShg     VinylSd     VinylSd       None
## 4         1998     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 5         1992     Gable  CompShg     HdBoard     HdBoard       None
## 6         1994     Gable  CompShg     HdBoard     HdBoard       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA     CBlock       TA       TA           No
## 2        108        TA        TA     CBlock       TA       TA           No
## 3          0        TA        TA      PConc       Gd       TA           No
## 4         20        TA        TA      PConc       TA       TA           No
## 5          0        Gd        TA      PConc       Gd       TA           No
## 6          0        TA        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Rec        468          LwQ        144       270         882
## 2          ALQ        923          Unf          0       406        1329
## 3          GLQ        791          Unf          0       137         928
## 4          GLQ        602          Unf          0       324         926
## 5          ALQ        263          Unf          0      1017        1280
## 6          Unf          0          Unf          0       763         763
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        TA          Y      SBrkr       896         0            0
## 2    GasA        TA          Y      SBrkr      1329         0            0
## 3    GasA        Gd          Y      SBrkr       928       701            0
## 4    GasA        Ex          Y      SBrkr       926       678            0
## 5    GasA        Ex          Y      SBrkr      1280         0            0
## 6    GasA        Gd          Y      SBrkr       763       892            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1       896            0            0        1        0            2
## 2      1329            0            0        1        1            3
## 3      1629            0            0        2        1            3
## 4      1604            0            0        2        1            3
## 5      1280            0            0        2        0            2
## 6      1655            0            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            5        Typ          0        <NA>
## 2            1          Gd            6        Typ          0        <NA>
## 3            1          TA            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            5        Typ          0        <NA>
## 6            1          TA            7        Typ          1          TA
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1961          Unf          1        730         TA
## 2     Attchd        1958          Unf          1        312         TA
## 3     Attchd        1997          Fin          2        482         TA
## 4     Attchd        1998          Fin          2        470         TA
## 5     Attchd        1992          RFn          2        506         TA
## 6     Attchd        1993          Fin          2        440         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y        140           0             0          0
## 2         TA          Y        393          36             0          0
## 3         TA          Y        212          34             0          0
## 4         TA          Y        360          36             0          0
## 5         TA          Y          0          82             0          0
## 6         TA          Y        157          84             0          0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1         120        0   <NA> MnPrv        <NA>       0      6   2010
## 2           0        0   <NA>  <NA>        Gar2   12500      6   2010
## 3           0        0   <NA> MnPrv        <NA>       0      3   2010
## 4           0        0   <NA>  <NA>        <NA>       0      6   2010
## 5         144        0   <NA>  <NA>        <NA>       0      1   2010
## 6           0        0   <NA>  <NA>        <NA>       0      4   2010
##   SaleType SaleCondition
## 1       WD        Normal
## 2       WD        Normal
## 3       WD        Normal
## 4       WD        Normal
## 5       WD        Normal
## 6       WD        Normal

2.2 About the Data Sets

MSSubClass: Identifies the type of dwelling involved in the sale.

    20  1-STORY 1946 & NEWER ALL STYLES
    30  1-STORY 1945 & OLDER
    40  1-STORY W/FINISHED ATTIC ALL AGES
    45  1-1/2 STORY - UNFINISHED ALL AGES
    50  1-1/2 STORY FINISHED ALL AGES
    60  2-STORY 1946 & NEWER
    70  2-STORY 1945 & OLDER
    75  2-1/2 STORY ALL AGES
    80  SPLIT OR MULTI-LEVEL
    85  SPLIT FOYER
    90  DUPLEX - ALL STYLES AND AGES
   120  1-STORY PUD (Planned Unit Development) - 1946 & NEWER
   150  1-1/2 STORY PUD - ALL AGES
   160  2-STORY PUD - 1946 & NEWER
   180  PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
   190  2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.

   A    Agriculture
   C    Commercial
   FV   Floating Village Residential
   I    Industrial
   RH   Residential High Density
   RL   Residential Low Density
   RP   Residential Low Density Park 
   RM   Residential Medium Density

LotFrontage: Linear feet of street connected to property

LotArea: Lot size in square feet

Street: Type of road access to property

   Grvl Gravel  
   Pave Paved
    

Alley: Type of alley access to property

   Grvl Gravel
   Pave Paved
   NA   No alley access
    

LotShape: General shape of property

   Reg  Regular 
   IR1  Slightly irregular
   IR2  Moderately Irregular
   IR3  Irregular
   

LandContour: Flatness of the property

   Lvl  Near Flat/Level 
   Bnk  Banked - Quick and significant rise from street grade to building
   HLS  Hillside - Significant slope from side to side
   Low  Depression
    

Utilities: Type of utilities available

   AllPub   All public Utilities (E,G,W,& S)    
   NoSewr   Electricity, Gas, and Water (Septic Tank)
   NoSeWa   Electricity and Gas Only
   ELO  Electricity only    

LotConfig: Lot configuration

   Inside   Inside lot
   Corner   Corner lot
   CulDSac  Cul-de-sac
   FR2  Frontage on 2 sides of property
   FR3  Frontage on 3 sides of property

LandSlope: Slope of property

   Gtl  Gentle slope
   Mod  Moderate Slope  
   Sev  Severe Slope

Neighborhood: Physical locations within Ames city limits

   Blmngtn  Bloomington Heights
   Blueste  Bluestem
   BrDale   Briardale
   BrkSide  Brookside
   ClearCr  Clear Creek
   CollgCr  College Creek
   Crawfor  Crawford
   Edwards  Edwards
   Gilbert  Gilbert
   IDOTRR   Iowa DOT and Rail Road
   MeadowV  Meadow Village
   Mitchel  Mitchell
   Names    North Ames
   NoRidge  Northridge
   NPkVill  Northpark Villa
   NridgHt  Northridge Heights
   NWAmes   Northwest Ames
   OldTown  Old Town
   SWISU    South & West of Iowa State University
   Sawyer   Sawyer
   SawyerW  Sawyer West
   Somerst  Somerset
   StoneBr  Stone Brook
   Timber   Timberland
   Veenker  Veenker
        

Condition1: Proximity to various conditions

   Artery   Adjacent to arterial street
   Feedr    Adjacent to feeder street   
   Norm Normal  
   RRNn Within 200' of North-South Railroad
   RRAn Adjacent to North-South Railroad
   PosN Near positive off-site feature--park, greenbelt, etc.
   PosA Adjacent to postive off-site feature
   RRNe Within 200' of East-West Railroad
   RRAe Adjacent to East-West Railroad

Condition2: Proximity to various conditions (if more than one is present)

   Artery   Adjacent to arterial street
   Feedr    Adjacent to feeder street   
   Norm Normal  
   RRNn Within 200' of North-South Railroad
   RRAn Adjacent to North-South Railroad
   PosN Near positive off-site feature--park, greenbelt, etc.
   PosA Adjacent to postive off-site feature
   RRNe Within 200' of East-West Railroad
   RRAe Adjacent to East-West Railroad

BldgType: Type of dwelling

   1Fam Single-family Detached  
   2FmCon   Two-family Conversion; originally built as one-family dwelling
   Duplx    Duplex
   TwnhsE   Townhouse End Unit
   TwnhsI   Townhouse Inside Unit

HouseStyle: Style of dwelling

   1Story   One story
   1.5Fin   One and one-half story: 2nd level finished
   1.5Unf   One and one-half story: 2nd level unfinished
   2Story   Two story
   2.5Fin   Two and one-half story: 2nd level finished
   2.5Unf   Two and one-half story: 2nd level unfinished
   SFoyer   Split Foyer
   SLvl Split Level

OverallQual: Rates the overall material and finish of the house

   10   Very Excellent
   9    Excellent
   8    Very Good
   7    Good
   6    Above Average
   5    Average
   4    Below Average
   3    Fair
   2    Poor
   1    Very Poor

OverallCond: Rates the overall condition of the house

   10   Very Excellent
   9    Excellent
   8    Very Good
   7    Good
   6    Above Average   
   5    Average
   4    Below Average   
   3    Fair
   2    Poor
   1    Very Poor
    

YearBuilt: Original construction date

YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

RoofStyle: Type of roof

   Flat Flat
   Gable    Gable
   Gambrel  Gabrel (Barn)
   Hip  Hip
   Mansard  Mansard
   Shed Shed
    

RoofMatl: Roof material

   ClyTile  Clay or Tile
   CompShg  Standard (Composite) Shingle
   Membran  Membrane
   Metal    Metal
   Roll Roll
   Tar&Grv  Gravel & Tar
   WdShake  Wood Shakes
   WdShngl  Wood Shingles
    

Exterior1st: Exterior covering on house

   AsbShng  Asbestos Shingles
   AsphShn  Asphalt Shingles
   BrkComm  Brick Common
   BrkFace  Brick Face
   CBlock   Cinder Block
   CemntBd  Cement Board
   HdBoard  Hard Board
   ImStucc  Imitation Stucco
   MetalSd  Metal Siding
   Other    Other
   Plywood  Plywood
   PreCast  PreCast 
   Stone    Stone
   Stucco   Stucco
   VinylSd  Vinyl Siding
   Wd Sdng  Wood Siding
   WdShing  Wood Shingles

Exterior2nd: Exterior covering on house (if more than one material)

   AsbShng  Asbestos Shingles
   AsphShn  Asphalt Shingles
   BrkComm  Brick Common
   BrkFace  Brick Face
   CBlock   Cinder Block
   CemntBd  Cement Board
   HdBoard  Hard Board
   ImStucc  Imitation Stucco
   MetalSd  Metal Siding
   Other    Other
   Plywood  Plywood
   PreCast  PreCast
   Stone    Stone
   Stucco   Stucco
   VinylSd  Vinyl Siding
   Wd Sdng  Wood Siding
   WdShing  Wood Shingles

MasVnrType: Masonry veneer type

   BrkCmn   Brick Common
   BrkFace  Brick Face
   CBlock   Cinder Block
   None None
   Stone    Stone

MasVnrArea: Masonry veneer area in square feet

ExterQual: Evaluates the quality of the material on the exterior

   Ex   Excellent
   Gd   Good
   TA   Average/Typical
   Fa   Fair
   Po   Poor
    

ExterCond: Evaluates the present condition of the material on the exterior

   Ex   Excellent
   Gd   Good
   TA   Average/Typical
   Fa   Fair
   Po   Poor
    

Foundation: Type of foundation

   BrkTil   Brick & Tile
   CBlock   Cinder Block
   PConc    Poured Contrete 
   Slab Slab
   Stone    Stone
   Wood Wood
    

BsmtQual: Evaluates the height of the basement

   Ex   Excellent (100+ inches) 
   Gd   Good (90-99 inches)
   TA   Typical (80-89 inches)
   Fa   Fair (70-79 inches)
   Po   Poor (<70 inches
   NA   No Basement
    

BsmtCond: Evaluates the general condition of the basement

   Ex   Excellent
   Gd   Good
   TA   Typical - slight dampness allowed
   Fa   Fair - dampness or some cracking or settling
   Po   Poor - Severe cracking, settling, or wetness
   NA   No Basement

BsmtExposure: Refers to walkout or garden level walls

   Gd   Good Exposure
   Av   Average Exposure (split levels or foyers typically score average or above)  
   Mn   Mimimum Exposure
   No   No Exposure
   NA   No Basement

BsmtFinType1: Rating of basement finished area

   GLQ  Good Living Quarters
   ALQ  Average Living Quarters
   BLQ  Below Average Living Quarters   
   Rec  Average Rec Room
   LwQ  Low Quality
   Unf  Unfinshed
   NA   No Basement
    

BsmtFinSF1: Type 1 finished square feet

BsmtFinType2: Rating of basement finished area (if multiple types)

   GLQ  Good Living Quarters
   ALQ  Average Living Quarters
   BLQ  Below Average Living Quarters   
   Rec  Average Rec Room
   LwQ  Low Quality
   Unf  Unfinshed
   NA   No Basement

BsmtFinSF2: Type 2 finished square feet

BsmtUnfSF: Unfinished square feet of basement area

TotalBsmtSF: Total square feet of basement area

Heating: Type of heating

   Floor    Floor Furnace
   GasA Gas forced warm air furnace
   GasW Gas hot water or steam heat
   Grav Gravity furnace 
   OthW Hot water or steam heat other than gas
   Wall Wall furnace
    

HeatingQC: Heating quality and condition

   Ex   Excellent
   Gd   Good
   TA   Average/Typical
   Fa   Fair
   Po   Poor
    

CentralAir: Central air conditioning

   N    No
   Y    Yes
    

Electrical: Electrical system

   SBrkr    Standard Circuit Breakers & Romex
   FuseA    Fuse Box over 60 AMP and all Romex wiring (Average) 
   FuseF    60 AMP Fuse Box and mostly Romex wiring (Fair)
   FuseP    60 AMP Fuse Box and mostly knob & tube wiring (poor)
   Mix  Mixed
    

1stFlrSF: First Floor square feet

2ndFlrSF: Second floor square feet

LowQualFinSF: Low quality finished square feet (all floors)

GrLivArea: Above grade (ground) living area square feet

BsmtFullBath: Basement full bathrooms

BsmtHalfBath: Basement half bathrooms

FullBath: Full bathrooms above grade

HalfBath: Half baths above grade

Bedroom: Bedrooms above grade (does NOT include basement bedrooms)

Kitchen: Kitchens above grade

KitchenQual: Kitchen quality

   Ex   Excellent
   Gd   Good
   TA   Typical/Average
   Fa   Fair
   Po   Poor
    

TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

Functional: Home functionality (Assume typical unless deductions are warranted)

   Typ  Typical Functionality
   Min1 Minor Deductions 1
   Min2 Minor Deductions 2
   Mod  Moderate Deductions
   Maj1 Major Deductions 1
   Maj2 Major Deductions 2
   Sev  Severely Damaged
   Sal  Salvage only
    

Fireplaces: Number of fireplaces

FireplaceQu: Fireplace quality

   Ex   Excellent - Exceptional Masonry Fireplace
   Gd   Good - Masonry Fireplace in main level
   TA   Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
   Fa   Fair - Prefabricated Fireplace in basement
   Po   Poor - Ben Franklin Stove
   NA   No Fireplace
    

GarageType: Garage location

   2Types   More than one type of garage
   Attchd   Attached to home
   Basment  Basement Garage
   BuiltIn  Built-In (Garage part of house - typically has room above garage)
   CarPort  Car Port
   Detchd   Detached from home
   NA   No Garage
    

GarageYrBlt: Year garage was built

GarageFinish: Interior finish of the garage

   Fin  Finished
   RFn  Rough Finished  
   Unf  Unfinished
   NA   No Garage
    

GarageCars: Size of garage in car capacity

GarageArea: Size of garage in square feet

GarageQual: Garage quality

   Ex   Excellent
   Gd   Good
   TA   Typical/Average
   Fa   Fair
   Po   Poor
   NA   No Garage
    

GarageCond: Garage condition

   Ex   Excellent
   Gd   Good
   TA   Typical/Average
   Fa   Fair
   Po   Poor
   NA   No Garage
    

PavedDrive: Paved driveway

   Y    Paved 
   P    Partial Pavement
   N    Dirt/Gravel
    

WoodDeckSF: Wood deck area in square feet

OpenPorchSF: Open porch area in square feet

EnclosedPorch: Enclosed porch area in square feet

3SsnPorch: Three season porch area in square feet

ScreenPorch: Screen porch area in square feet

PoolArea: Pool area in square feet

PoolQC: Pool quality

   Ex   Excellent
   Gd   Good
   TA   Average/Typical
   Fa   Fair
   NA   No Pool
    

Fence: Fence quality

   GdPrv    Good Privacy
   MnPrv    Minimum Privacy
   GdWo Good Wood
   MnWw Minimum Wood/Wire
   NA   No Fence

MiscFeature: Miscellaneous feature not covered in other categories

   Elev Elevator
   Gar2 2nd Garage (if not described in garage section)
   Othr Other
   Shed Shed (over 100 SF)
   TenC Tennis Court
   NA   None
    

MiscVal: $Value of miscellaneous feature

MoSold: Month Sold (MM)

YrSold: Year Sold (YYYY)

SaleType: Type of sale

   WD   Warranty Deed - Conventional
   CWD  Warranty Deed - Cash
   VWD  Warranty Deed - VA Loan
   New  Home just constructed and sold
   COD  Court Officer Deed/Estate
   Con  Contract 15% Down payment regular terms
   ConLw    Contract Low Down payment and low interest
   ConLI    Contract Low Interest
   ConLD    Contract Low Down
   Oth  Other
    

SaleCondition: Condition of sale

   Normal   Normal Sale
   Abnorml  Abnormal Sale -  trade, foreclosure, short sale
   AdjLand  Adjoining Land Purchase
   Alloca   Allocation - two linked properties with separate deeds, typically condo with a garage unit  
   Family   Sale between family members
   Partial  Home was not completed when last assessed (associated with New Homes)

2.3 Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

2.3.1 Descriptive Statistics and Plots for Training Dataset

str(df_train)
## 'data.frame':    1459 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
summary(df_train)
##        Id           MSSubClass        MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.00   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.5   1st Qu.: 20.00   FV     :  65   1st Qu.: 59.00  
##  Median : 730.0   Median : 50.00   RH     :  16   Median : 69.00  
##  Mean   : 730.0   Mean   : 56.92   RL     :1150   Mean   : 70.05  
##  3rd Qu.:1094.5   3rd Qu.: 70.00   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1459.0   Max.   :190.00                  Max.   :313.00  
##                                                   NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7549   Pave:1453   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9477               NA's:1368   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:924   Lvl:1310   
##  3rd Qu.: 11603                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1458   Corner : 263   Gtl:1381   NAmes  :225   Norm   :1259  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards: 99   RRAn   :  26  
##                Inside :1051              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    OverallCond   
##  Norm   :1444   1Fam  :1219   1Story :725   Min.   : 1.0   Min.   :1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.0   1st Qu.:5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.0   Median :5.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.1   Mean   :5.575  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.0   3rd Qu.:6.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.0   Max.   :9.000  
##  (Other):   2                 (Other): 19                                 
##    YearBuilt     YearRemodAdd    RoofStyle       RoofMatl     Exterior1st 
##  Min.   :1872   Min.   :1950   Flat   :  13   CompShg:1433   VinylSd:515  
##  1st Qu.:1954   1st Qu.:1967   Gable  :1140   Tar&Grv:  11   HdBoard:221  
##  Median :1973   Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220  
##  Mean   :1971   Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206  
##  3rd Qu.:2000   3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108  
##  Max.   :2010   Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61  
##                                               (Other):   2   (Other):128  
##   Exterior2nd    MasVnrType    MasVnrArea     ExterQual ExterCond
##  VinylSd:504   BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3  
##  MetalSd:214   BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28  
##  HdBoard:206   None   :863   Median :   0.0   Gd:487    Gd: 146  
##  Wd Sdng:197   Stone  :128   Mean   : 103.8   TA:906    Po:   1  
##  Plywood:142   NA's   :  8   3rd Qu.: 166.0             TA:1281  
##  CmentBd: 60                 Max.   :1600.0                      
##  (Other):136                 NA's   :8                           
##   Foundation  BsmtQual   BsmtCond    BsmtExposure BsmtFinType1
##  BrkTil:146   Ex  :121   Fa  :  45   Av  :221     ALQ :220    
##  CBlock:633   Fa  : 35   Gd  :  65   Gd  :134     BLQ :147    
##  PConc :647   Gd  :618   Po  :   2   Mn  :114     GLQ :418    
##  Slab  : 24   TA  :648   TA  :1310   No  :952     LwQ : 74    
##  Stone :  6   NA's: 37   NA's:  37   NA's: 38     Rec :133    
##  Wood  :  3                                       Unf :430    
##                                                   NA's: 37    
##    BsmtFinSF1     BsmtFinType2   BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   ALQ :  19    Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00   1st Qu.: 223.5  
##  Median : 383.0   GLQ :  14    Median :   0.00   Median : 479.0  
##  Mean   : 443.4   LwQ :  45    Mean   :  46.38   Mean   : 567.5  
##  3rd Qu.: 712.0   Rec :  54    3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :5644.0   Unf :1256    Max.   :1474.00   Max.   :2336.0  
##                   NA's:  38                                      
##   TotalBsmtSF      Heating     HeatingQC CentralAir Electrical  
##  Min.   :   0.0   Floor:   1   Ex:741    N:  95     FuseA:  94  
##  1st Qu.: 795.5   GasA :1427   Fa: 49    Y:1364     FuseF:  27  
##  Median : 991.0   GasW :  18   Gd:240               FuseP:   3  
##  Mean   :1057.3   Grav :   7   Po:  1               Mix  :   1  
##  3rd Qu.:1298.5   OthW :   2   TA:428               SBrkr:1333  
##  Max.   :6110.0   Wall :   4                        NA's :   1  
##                                                                 
##    X1stFlrSF      X2ndFlrSF       LowQualFinSF       GrLivArea   
##  Min.   : 334   Min.   :   0.0   Min.   :  0.000   Min.   : 334  
##  1st Qu.: 882   1st Qu.:   0.0   1st Qu.:  0.000   1st Qu.:1129  
##  Median :1086   Median :   0.0   Median :  0.000   Median :1464  
##  Mean   :1163   Mean   : 347.2   Mean   :  5.848   Mean   :1516  
##  3rd Qu.:1392   3rd Qu.: 728.0   3rd Qu.:  0.000   3rd Qu.:1778  
##  Max.   :4692   Max.   :2065.0   Max.   :572.000   Max.   :5642  
##                                                                  
##   BsmtFullBath     BsmtHalfBath        FullBath        HalfBath     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :2.000   Median :0.0000  
##  Mean   :0.4249   Mean   :0.05757   Mean   :1.565   Mean   :0.3825  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :3.0000   Max.   :2.00000   Max.   :3.000   Max.   :2.0000  
##                                                                     
##   BedroomAbvGr    KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##  Min.   :0.000   Min.   :0.000   Ex:100      Min.   : 2.000   Maj1:  14  
##  1st Qu.:2.000   1st Qu.:1.000   Fa: 39      1st Qu.: 5.000   Maj2:   5  
##  Median :3.000   Median :1.000   Gd:586      Median : 6.000   Min1:  31  
##  Mean   :2.866   Mean   :1.047   TA:734      Mean   : 6.518   Min2:  34  
##  3rd Qu.:3.000   3rd Qu.:1.000               3rd Qu.: 7.000   Mod :  15  
##  Max.   :8.000   Max.   :3.000               Max.   :14.000   Sev :   1  
##                                                               Typ :1359  
##    Fireplaces     FireplaceQu   GarageType   GarageYrBlt   GarageFinish
##  Min.   :0.0000   Ex  : 24    2Types :  6   Min.   :1900   Fin :351    
##  1st Qu.:0.0000   Fa  : 33    Attchd :869   1st Qu.:1961   RFn :422    
##  Median :1.0000   Gd  :380    Basment: 19   Median :1980   Unf :605    
##  Mean   :0.6134   Po  : 20    BuiltIn: 88   Mean   :1979   NA's: 81    
##  3rd Qu.:1.0000   TA  :313    CarPort:  9   3rd Qu.:2002               
##  Max.   :3.0000   NA's:689    Detchd :387   Max.   :2010               
##                               NA's   : 81   NA's   :81                 
##    GarageCars      GarageArea     GarageQual  GarageCond  PavedDrive
##  Min.   :0.000   Min.   :   0.0   Ex  :   3   Ex  :   2   N:  90    
##  1st Qu.:1.000   1st Qu.: 336.0   Fa  :  48   Fa  :  35   P:  30    
##  Median :2.000   Median : 480.0   Gd  :  14   Gd  :   9   Y:1339    
##  Mean   :1.768   Mean   : 473.1   Po  :   3   Po  :   7             
##  3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1310   TA  :1325             
##  Max.   :4.000   Max.   :1418.0   NA's:  81   NA's:  81             
##                                                                     
##    WoodDeckSF     OpenPorchSF     EnclosedPorch      X3SsnPorch     
##  Min.   :  0.0   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.0   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.0   Median : 25.00   Median :  0.00   Median :  0.000  
##  Mean   : 93.8   Mean   : 46.65   Mean   : 21.97   Mean   :  3.412  
##  3rd Qu.:168.0   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :857.0   Max.   :547.00   Max.   :552.00   Max.   :508.000  
##                                                                     
##   ScreenPorch        PoolArea        PoolQC       Fence      MiscFeature
##  Min.   :  0.00   Min.   :  0.000   Ex  :   2   GdPrv:  59   Gar2:   2  
##  1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2   GdWo :  54   Othr:   2  
##  Median :  0.00   Median :  0.000   Gd  :   3   MnPrv: 157   Shed:  49  
##  Mean   : 15.07   Mean   :  2.761   NA's:1452   MnWw :  11   TenC:   1  
##  3rd Qu.:  0.00   3rd Qu.:  0.000               NA's :1178   NA's:1405  
##  Max.   :480.00   Max.   :738.000                                       
##                                                                         
##     MiscVal             MoSold           YrSold        SaleType   
##  Min.   :    0.00   Min.   : 1.000   Min.   :2006   WD     :1266  
##  1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   New    : 122  
##  Median :    0.00   Median : 6.000   Median :2008   COD    :  43  
##  Mean   :   43.52   Mean   : 6.322   Mean   :2008   ConLD  :   9  
##  3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009   ConLI  :   5  
##  Max.   :15500.00   Max.   :12.000   Max.   :2010   ConLw  :   5  
##                                                     (Other):   9  
##  SaleCondition    SalePrice     
##  Abnorml: 101   Min.   : 34900  
##  AdjLand:   4   1st Qu.:129950  
##  Alloca :  12   Median :163000  
##  Family :  20   Mean   :180944  
##  Normal :1197   3rd Qu.:214000  
##  Partial: 125   Max.   :755000  
## 

We can look at LotArea, Neighborhood, SalePrice, HouseStyle and OverallCondition variables in more detail.

summary(df_train$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7549    9477   10517   11603  215245

The average Lot Area in the datasaet is 10517 square feet. Maximum lot size is 215245 square feet.

summary(df_train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129950  163000  180944  214000  755000

The average Sale Price in the dataset is 180944. The meximum Sale Price is 755000.

hist(df_train$SalePrice)

theme_set(theme_classic())
ggplot(df_train, aes(HouseStyle))+
  geom_bar(aes(fill=Neighborhood))

Majority of the houses are 1 or 2 story style houses.

theme_set(theme_classic())
df_train$OverallCond_factor <- as.factor(as.character(df_train$OverallCond))
ggplot(df_train, aes(OverallCond, SalePrice))+
  geom_boxplot(aes(fill=OverallCond_factor))

Condition of the house has a poistive correlation with SalePrice.

theme_set(theme_classic())
ggplot(df_train, aes(Neighborhood, SalePrice))+
  geom_boxplot(varwidth = T, fill="plum")+
  coord_flip()

2.3.2 Scatter Matrix Correlation

df_train_1 <- select(df_train, Id, SalePrice, LotArea, YearBuilt, OverallCond, OverallQual)
head(df_train_1)
##   Id SalePrice LotArea YearBuilt OverallCond OverallQual
## 1  1    208500    8450      2003           5           7
## 2  2    181500    9600      1976           8           6
## 3  3    223500   11250      2001           5           7
## 4  4    140000    9550      1915           5           7
## 5  5    250000   14260      2000           5           8
## 6  6    143000   14115      1993           5           5
df_test_1 <- select(df_test, Id, LotArea, YearBuilt, OverallCond, OverallQual)
corr <- cor(df_train_1)

ggcorrplot(corr, hc.order=TRUE,
           type = "lower",
           lab=TRUE,
           lab_size = 3,
           method = "circle",
           colors = c("tomato2", "white", "springgreen3"),
           title = "Correlogram of House Train Data Set",
           ggtheme=theme_bw())

theme_set(theme_bw())
ggplot(df_train_1, aes(OverallQual, SalePrice))+
  geom_jitter(width=10, size=1)

2.4 Hypotheses

\(H_{0}\) : The correlations between each pairwise set of variables is 0.

Confidence interval is 80%.

1- Look at the p-value of each variable that is in df_train_1 for correlation 2- 0.80 is the confidence level.

(a): Sale Price Vs OverallQual

cor.test(df_train_1$SalePrice, df_train_1$OverallQual, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  df_train_1$SalePrice and df_train_1$OverallQual
## t = 49.345, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780602 0.8032151
## sample estimates:
##       cor 
## 0.7909716

(b): Sale Price VS YearBuilt

cor.test(df_train_1$SalePrice, df_train_1$YearBuilt , conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  df_train_1$SalePrice and df_train_1$YearBuilt
## t = 23.414, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980468 0.5468503
## sample estimates:
##       cor 
## 0.5228769

(c): Sale Price VS LotArea

cor.test(df_train_1$SalePrice, df_train_1$LotArea  , conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  df_train_1$SalePrice and df_train_1$LotArea
## t = 10.441, df = 1457, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323277 0.2948047
## sample estimates:
##       cor 
## 0.2638429

2.4.1 Discuss the Meaning

We are 80% confident that , with low P value for LotArea, YearBuilt and OverallQuall are somehow correlated to SalePrice.

2.5 Linear Algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

2.5.1 Precision Matrix

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
precision_matrix <- solve(corr)
precision_matrix
##                       Id  SalePrice     LotArea    YearBuilt OverallCond
## Id           1.001906559 -0.0156547  0.03327363 -0.006747185 -0.01217039
## SalePrice   -0.015654697  3.0205330 -0.56954917 -0.409888594 -0.11541100
## LotArea      0.033273628 -0.5695492  1.12331500  0.157586539  0.04347331
## YearBuilt   -0.006747185 -0.4098886  0.15758654  1.826006946  0.59289392
## OverallCond -0.012170394 -0.1154110  0.04347331  0.592893923  1.20121977
## OverallQual  0.039115176 -2.1053214  0.24636251 -0.683350579 -0.14277440
##             OverallQual
## Id           0.03911518
## SalePrice   -2.10532139
## LotArea      0.24636251
## YearBuilt   -0.68335058
## OverallCond -0.14277440
## OverallQual  3.01826995

2.5.2 Multiply correlation and precision Matrix

** Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.**

mult_1 <- corr %*% precision_matrix
mult_2 <- precision_matrix %*% corr
mult_1
##                        Id     SalePrice       LotArea     YearBuilt
## Id           1.000000e+00  0.000000e+00  2.602085e-18  0.000000e+00
## SalePrice    0.000000e+00  1.000000e+00 -8.326673e-17  0.000000e+00
## LotArea      0.000000e+00  8.326673e-17  1.000000e+00  0.000000e+00
## YearBuilt    0.000000e+00  2.220446e-16 -8.326673e-17  1.000000e+00
## OverallCond -8.673617e-19 -5.551115e-17  3.469447e-18  4.163336e-17
## OverallQual -6.938894e-18  4.440892e-16 -2.775558e-17 -2.220446e-16
##               OverallCond  OverallQual
## Id           0.000000e+00 0.000000e+00
## SalePrice    6.938894e-17 4.440892e-16
## LotArea      1.214306e-17 5.551115e-17
## YearBuilt   -4.163336e-17 4.440892e-16
## OverallCond  1.000000e+00 0.000000e+00
## OverallQual  2.775558e-17 1.000000e+00
mult_2
##                        Id    SalePrice       LotArea     YearBuilt
## Id           1.000000e+00 1.387779e-17  6.938894e-18  3.469447e-18
## SalePrice   -1.387779e-17 1.000000e+00 -1.387779e-16  4.440892e-16
## LotArea     -2.602085e-18 5.551115e-17  1.000000e+00 -5.551115e-17
## YearBuilt   -3.469447e-18 1.110223e-16  0.000000e+00  1.000000e+00
## OverallCond -3.469447e-18 1.387779e-17 -1.734723e-18  4.163336e-17
## OverallQual  1.387779e-17 4.440892e-16  1.110223e-16  2.220446e-16
##               OverallCond   OverallQual
## Id           8.673617e-19  6.938894e-18
## SalePrice   -5.551115e-17  4.440892e-16
## LotArea      0.000000e+00  5.551115e-17
## YearBuilt    1.387779e-17 -1.110223e-16
## OverallCond  1.000000e+00  5.551115e-17
## OverallQual  0.000000e+00  1.000000e+00

2.5.3 LU Decomposition on the Matrix

Conduct LU decomposition on the matrix.

library(matrixcalc)
lu_matrix <- lu.decomposition(precision_matrix)
lu_matrix
## $L
##              [,1]        [,2]        [,3]       [,4]       [,5] [,6]
## [1,]  1.000000000  0.00000000  0.00000000  0.0000000 0.00000000    0
## [2,] -0.015624907  1.00000000  0.00000000  0.0000000 0.00000000    0
## [3,]  0.033210310 -0.18840230  1.00000000  0.0000000 0.00000000    0
## [4,] -0.006734345 -0.13574665  0.07937588  1.0000000 0.00000000    0
## [5,] -0.012147234 -0.03827488  0.02177132  0.3261873 1.00000000    0
## [6,]  0.039040742 -0.69685738 -0.14922979 -0.5424148 0.09174876    1
## 
## $U
##          [,1]          [,2]        [,3]          [,4]        [,5]
## [1,] 1.001907 -1.565470e-02  0.03327363 -6.747185e-03 -0.01217039
## [2,] 0.000000  3.020288e+00 -0.56902927 -4.099940e-01 -0.11560116
## [3,] 0.000000  0.000000e+00  1.01500355  8.056680e-02  0.02209797
## [4,] 0.000000 -5.551115e-17  0.00000000  1.763911e+00  0.57536545
## [5,] 0.000000  1.810703e-17  0.00000000  0.000000e+00  1.00848930
## [6,] 0.000000 -3.177137e-17  0.00000000 -1.110223e-16  0.00000000
##             [,6]
## [1,]  0.03911518
## [2,] -2.10471022
## [3,] -0.15146876
## [4,] -0.95677155
## [5,]  0.09252764
## [6,]  1.00000000

2.6 Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function

lot_area <- df_train_1$LotArea
fit <- fitdistr(lot_area, "exponential")
fit
##        rate    
##   9.508211e-05 
##  (2.489265e-06)

Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable.

opt <- fit$estimate
dist <- rexp(1000, opt)
hist(dist, breaks=50)

hist(lot_area, breaks = 100)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

quantile(dist, probs = c(0.05, 0.95))
##         5%        95% 
##   480.3672 31822.0990

Also generate a 95% confidence interval from the empirical data, assuming normality.

mean(lot_area)
## [1] 10517.23
normal_dist <- rnorm(length(lot_area), mean(lot_area), sd(lot_area))
hist(normal_dist, breaks = 100)

quantile(normal_dist, probs = c(0.05, 0.95))
##        5%       95% 
## -6455.641 26260.829

Finally, provide the empirical 5th percentile and 95th percentile of the data

quantile(lot_area, probs = c(0.05, 0.095))
##      5%    9.5% 
## 3307.40 4921.53

2.7 Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

head(df_train_1)
##   Id SalePrice LotArea YearBuilt OverallCond OverallQual
## 1  1    208500    8450      2003           5           7
## 2  2    181500    9600      1976           8           6
## 3  3    223500   11250      2001           5           7
## 4  4    140000    9550      1915           5           7
## 5  5    250000   14260      2000           5           8
## 6  6    143000   14115      1993           5           5

2.8 Model Development

linear_model <- lm(df_train_1$SalePrice ~ df_train_1$LotArea + df_train_1$YearBuilt + df_train_1$OverallCond + df_train_1$OverallQual)
summary(linear_model)
## 
## Call:
## lm(formula = df_train_1$SalePrice ~ df_train_1$LotArea + df_train_1$YearBuilt + 
##     df_train_1$OverallCond + df_train_1$OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -268646  -26367   -3704   20043  393008 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -7.981e+05  1.033e+05  -7.727 2.03e-14 ***
## df_train_1$LotArea      1.499e+00  1.210e-01  12.392  < 2e-16 ***
## df_train_1$YearBuilt    3.570e+02  5.281e+01   6.761 1.98e-11 ***
## df_train_1$OverallCond  2.732e+03  1.178e+03   2.319   0.0206 *  
## df_train_1$OverallQual  4.004e+04  1.079e+03  37.092  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45790 on 1454 degrees of freedom
## Multiple R-squared:  0.6689, Adjusted R-squared:  0.668 
## F-statistic: 734.4 on 4 and 1454 DF,  p-value: < 2.2e-16

2.9 Model Evaluation

qqnorm(linear_model$residuals)
qqline(linear_model$residuals)

hist(linear_model$residuals, breaks = 100)

sale_price <- function (LotArea, YearBuilt, OverallCond, OverallQual){
  sale_price <- (-7.984e+05 + 1.500e+00*LotArea + 3.572e+02*YearBuilt + 2.736e+03 * OverallCond + 4.003e+04 * OverallQual)
  return(sale_price)
}

bckwrd <- sale_price(df_train_1$LotArea, df_train_1$YearBuilt, df_train_1$OverallCond, df_train_1$OverallQual)

comparison <-data.frame(df_train_1$SalePrice, bckwrd, bckwrd - df_train_1$SalePrice)
head(comparison)
##   df_train_1.SalePrice   bckwrd bckwrd...df_train_1.SalePrice
## 1               208500 223636.6                       15136.6
## 2               181500 183895.2                        2395.2
## 3               223500 227122.2                        3622.2
## 4               140000 193853.0                       53853.0
## 5               250000 271310.0                       21310.0
## 6               143000 148502.1                        5502.1
head(df_test_1)
##     Id LotArea YearBuilt OverallCond OverallQual
## 1 1461   11622      1961           6           5
## 2 1462   14267      1958           6           6
## 3 1463   13830      1997           5           5
## 4 1464    9978      1998           6           6
## 5 1465    5005      1992           5           8
## 6 1466   10000      1993           5           6
AIC(linear_model)
## [1] 35462.75
BIC(linear_model)
## [1] 35494.47
prediction <- predict(linear_model, df_test_1)

predict_df <- data.frame(Id=df_test_1$Id,SalePrice=prediction)

predict_df$SalePrice <- ifelse(predict_df$SalePrice < 0,0,predict_df$SalePrice)

#export to csv
#write.csv(predict_df, "prediction_results.csv", row.names = FALSE)

Prediction are submitted to Kaggle for results

