Probleme 1

set.seed(35)
N <- 10000
nran <- 100
mu <- sig <- (N + 1)/2

X <- runif(N, min = 1, max = N)
Y <- rnorm(nran, mean = mu, sd = sig)

Probability

1a. Let A be the event X>x and B be the event X>y, P(X>x|X>y) = P(A|B). That is the probability of event A given the event B has accured. \(P(A|B)=\frac{P(A\cap B)}{P(B)} = \frac{P(A)}{P(B)}\)

$ = $

sum(X>median(X) & X>quantile(Y, 0.25))/sum(X>quantile(Y, 0.25))
## [1] 0.6566851

\(\frac{P(A)}{P(B)}=\)

sum(X>median(X))/sum(X>quantile(Y, 0.25))
## [1] 0.6566851

1b. Let A be the event X>x and B the event Y>y. The two events are independent. \(P(X>x,Y>y)= P(A,B) = P(A)P(B)\)

sum(X>median(X))/length(X)*sum(Y>quantile(Y, 0.25))/length(Y)
## [1] 0.375

1c. Let A be the event Xy, \(P(X<x|X>y) = P(A|B) = \frac{P(A\cap B)}{P(B)}\)

sum(X<median(X) & X>quantile(Y, 0.25))/sum(X>quantile(Y, 0.25))
## [1] 0.3433149

Marginal and joint probabilities

x <- median(X)
y <- quantile(Y, 0.25)

A <- sum(X>x & Y>y)
B <- sum(X>x & Y<y)
C <- sum(X<x & Y>y)
D <- sum(X<x & Y<y)
marray <- matrix(c(A, B, A+B, C, D, C+D, A+C, B+D, A+B+C+D), byrow = FALSE, nrow = 3)
colnames(marray) <- c("X>x", "X<x", "Margin")
rownames(marray) <- c("Y>y", "Y<y", "Margin")
marray/N
##           X>x    X<x Margin
## Y>y    0.3719 0.3781   0.75
## Y<y    0.1281 0.1219   0.25
## Margin 0.5000 0.5000   1.00

We see that P(X>x and Y>y) = 0.3719 and P(X>x)P(Y>y) = .5(.75) = .375 The two probabilities are not equal ### Fischer’s Exact test and Chi Square Test

mat <- matrix(c(A, B, C, D), byrow = FALSE, ncol = 2)
colnames(mat) <- c("X>x", "X<x")
rownames(mat) <- c("Y>y", "Y<y")
mat
##      X>x  X<x
## Y>y 3719 3781
## Y<y 1281 1219
fisher.test(mat)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mat
## p-value = 0.1589
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8540543 1.0257819
## sample estimates:
## odds ratio 
##  0.9360223
chisq.test(mat)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mat
## X-squared = 1.9845, df = 1, p-value = 0.1589

The p-value of the two tests are aproximately the same. That value is greater than 0.05. It means that we do not reject the null hypothesis. There is a convincing evedence that the distribution of X is independent from the distribution of Y. The Fisher test is used for small sample sizes wher the Chi_Square test is not appropriate.

Problem 2

Reading the dataset

train.set <- read.csv('C:\\DATA605\\DATA605\\train.csv')
head(train.set)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000

Descriptive and Inferential Statistics

Structure of the dataset

str(train.set)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley        : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
##  $ LotShape     : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
##  $ LandContour  : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig    : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope    : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2   : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
##  $ Exterior2nd  : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
##  $ MasVnrType   : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageType   : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ PavedDrive   : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MiscFeature  : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

summary

summary(train.set)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   None   :864   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   Stone  :128   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   NA's   :  8   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation  BsmtQual   BsmtCond    BsmtExposure
##  Ex: 52    Ex:   3   BrkTil:146   Ex  :121   Fa  :  45   Av  :221    
##  Fa: 14    Fa:  28   CBlock:634   Fa  : 35   Gd  :  65   Gd  :134    
##  Gd:488    Gd: 146   PConc :647   Gd  :618   Po  :   2   Mn  :114    
##  TA:906    Po:   1   Slab  : 24   TA  :649   TA  :1311   No  :953    
##            TA:1282   Stone :  6   NA's: 37   NA's:  37   NA's: 38    
##                      Wood  :  3                                      
##                                                                      
##  BsmtFinType1   BsmtFinSF1     BsmtFinType2   BsmtFinSF2     
##  ALQ :220     Min.   :   0.0   ALQ :  19    Min.   :   0.00  
##  BLQ :148     1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00  
##  GLQ :418     Median : 383.5   GLQ :  14    Median :   0.00  
##  LwQ : 74     Mean   : 443.6   LwQ :  46    Mean   :  46.55  
##  Rec :133     3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00  
##  Unf :430     Max.   :5644.0   Unf :1256    Max.   :1474.00  
##  NA's: 37                      NA's:  38                     
##    BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC CentralAir
##  Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741    N:  95    
##  1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365    
##  Median : 477.5   Median : 991.5   GasW :  18   Gd:241              
##  Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1              
##  3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428              
##  Max.   :2336.0   Max.   :6110.0   Wall :   4                       
##                                                                     
##  Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##  Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                              
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100     
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39     
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586     
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000              
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000              
##                                                              
##   TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu   GarageType 
##  Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6  
##  1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870  
##  Median : 6.000   Min1:  31   Median :1.000   Gd  :380    Basment: 19  
##  Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88  
##  3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9  
##  Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690    Detchd :387  
##                   Typ :1360                               NA's   : 81  
##   GarageYrBlt   GarageFinish   GarageCars      GarageArea     GarageQual 
##  Min.   :1900   Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3  
##  1st Qu.:1961   RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48  
##  Median :1980   Unf :605     Median :2.000   Median : 480.0   Gd  :  14  
##  Mean   :1979   NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3  
##  3rd Qu.:2002                3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311  
##  Max.   :2010                Max.   :4.000   Max.   :1418.0   NA's:  81  
##  NA's   :81                                                              
##  GarageCond  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex  :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa  :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd  :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  Po  :   7              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  TA  :1326              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  NA's:  81              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                           
##    X3SsnPorch      ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex  :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd  :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   NA's:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.00   Max.   :480.00   Max.   :738.000              
##                                                                 
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv:  59   Gar2:   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo :  54   Othr:   2   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv: 157   Shed:  49   Median :    0.00   Median : 6.000  
##  MnWw :  11   TenC:   1   Mean   :   43.49   Mean   : 6.322  
##  NA's :1179   NA's:1406   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :15500.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9

Histograms Distribution of some variables

par(mfrow = c(4,3))
hist(train.set$MSSubClass, main = 'Type of Dwelling')
plot(train.set$Neighborhood)
plot(train.set$BldgType)
plot(train.set$HouseStyle)
hist(train.set$OverallQual, main = 'Overall Quality')
hist(train.set$OverallCond, main = 'Overall Condition')
hist(train.set$YearBuilt, main = 'Year Bulit')
hist(train.set$SalePrice, main = 'Sale Price')
hist(train.set$GarageCars, main = 'Garage Cars')
plot(train.set$ExterCond)
hist(train.set$FullBath, main = 'Full Bathroom')
hist(train.set$TotalBsmtSF, main = 'Total BasementSF')

Scatter plot Relationship between SalePrice and some independent varibles

par(mfrow = c(2,2))
scatter.smooth(train.set$Neighborhood, train.set$SalePrice)
plot(train.set$OverallCond, train.set$SalePrice)
plot(train.set$GarageArea, train.set$SalePrice)
plot(train.set$LotArea, train.set$SalePrice)

Correlation matrix

df <- data.frame(HouseType = train.set$MSSubClass, GarageArea = train.set$GarageArea, HouseQual = train.set$OverallQual)
cor_mat <- cor(df)
cor_mat
##              HouseType  GarageArea  HouseQual
## HouseType   1.00000000 -0.09867154 0.03262771
## GarageArea -0.09867154  1.00000000 0.56202176
## HouseQual   0.03262771  0.56202176 1.00000000

Correlation Tests Hypothesis Tests

cor.test(train.set$MSSubClass, train.set$GarageArea)
## 
##  Pearson's product-moment correlation
## 
## data:  train.set$MSSubClass and train.set$GarageArea
## t = -3.7861, df = 1458, p-value = 0.0001592
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.14921849 -0.04761025
## sample estimates:
##         cor 
## -0.09867154

The p-value is closed to zero. We reject the null hypothesis.There is convincing evidence that the correalation between MSSubClass and GarageArea is not null.

cor.test(train.set$MSSubClass, train.set$OverallQual)
## 
##  Pearson's product-moment correlation
## 
## data:  train.set$MSSubClass and train.set$OverallQual
## t = 1.2465, df = 1458, p-value = 0.2128
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01870591  0.08378976
## sample estimates:
##        cor 
## 0.03262771

We do not reject the null hypothesis. The correlation between MSSubClass and OverallQual is null.

cor.test(train.set$GarageArea, train.set$OverallQual)
## 
##  Pearson's product-moment correlation
## 
## data:  train.set$GarageArea and train.set$OverallQual
## t = 25.946, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5258822 0.5961357
## sample estimates:
##       cor 
## 0.5620218

We reject the null hypothesis. There is a convincing evidence that the correlation between GarageArea and OverallQual is not null.

Linear Algebra and Correlation

Precision Matrix

prec_mat <- solve(cor_mat)
prec_mat
##             HouseType GarageArea  HouseQual
## HouseType   1.0215308  0.1747154 -0.1315241
## GarageArea  0.1747154  1.4915893 -0.8440062
## HouseQual  -0.1315241 -0.8440062  1.4786412

Prodcut cor X prec

round(cor_mat%*%prec_mat, 1)
##            HouseType GarageArea HouseQual
## HouseType          1          0         0
## GarageArea         0          1         0
## HouseQual          0          0         1

Product of two matrices prec X cor

round(prec_mat%*%cor_mat,1)
##            HouseType GarageArea HouseQual
## HouseType          1          0         0
## GarageArea         0          1         0
## HouseQual          0          0         1

LU decomposition

library(pracma)
LU_M <- lu(cor_mat)
LU_M
## $L
##              HouseType GarageArea HouseQual
## HouseType   1.00000000  0.0000000         0
## GarageArea -0.09867154  1.0000000         0
## HouseQual   0.03262771  0.5707985         1
## 
## $U
##            HouseType  GarageArea  HouseQual
## HouseType          1 -0.09867154 0.03262771
## GarageArea         0  0.99026393 0.56524118
## HouseQual          0  0.00000000 0.67629660

Calulus-Based Probability & Statistics

hist(train.set$MSSubClass)

dwelling type

X <- train.set$MSSubClass
dt <- (X - min(X))

MASS Package

Exponential density function. Optimal \(\lambda\)

library(MASS)

expdfun <- fitdistr(dt,"exponential")
expdfun$estimate
##       rate 
## 0.02710228

1000 samples from this exponential distribution using this value

sexp <- rexp(1000, expdfun$estimate)
hist(sexp, breaks = 100, main = "Fitted Exponential PDF",xlim = c(1,quantile(rexp(1000,expdfun$estimate),0.99)))

The two histograms are rigth skewed.

hist(dt,main = "Type of dwelling",breaks = 100, xlim = c(1,quantile(dt,0.99)))

5th and 95th percentiles using CDF

qexp(0.05,rate = expdfun$estimate, lower.tail = TRUE,log.p = FALSE)
## [1] 1.892582
qexp(0.95, rate = expdfun$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 110.5343

95% confidence interval from empirical data

mean(dt)
## [1] 36.89726
sd.sexp <- sd(sexp)
margin.error <- qnorm(.975)*(sd.sexp/sqrt(length(sexp)-1))
margin.error
## [1] 2.246614
mean(sexp) - margin.error
## [1] 32.62263
mean(sexp) + margin.error
## [1] 37.11586
We are confident that 95% of the data fall beteewn 33.19 and 37.49

5% and 95% percentile

quantile(dt, c(0.05, 0.95))
##  5% 95% 
##   0 140

Modeling

reg <- lm(formula = SalePrice ~ MSSubClass + GarageQual + BsmtQual + OverallQual + Neighborhood, data = train.set)

Get the test data

test.set <- read.csv('C:\\DATA605\\DATA605\\test.csv')
pred <- predict(reg, test.set)

kaggle Score

dfk <- data.frame( Id = test.set[,"Id"],  SalePrice =pred)
dfk[dfk<0] <- 0
dfk <- replace(dfk,is.na(dfk),0)
write.csv(dfk, file="dfk.csv", row.names = FALSE)

Kaggle username: alainkuiete Kaggle score: 3.058