packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.84 loaded
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

part 1

# Draws 10000 random values from a uniform distribution between 1 and n.
#
# Args:
#   n: upper bound of the distribution; a single number >= 6.
# Returns:
#   A numeric vector of length 10000.
# Raises:
#   An error when n is not a single number >= 6. Failing loudly keeps an
#   invalid n from flowing into later calculations as a message string.
X <- function(n) {
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 6) {
    stop("Input a number greater or equal to 6", call. = FALSE)
  }
  runif(10000, min = 1, max = n)
}

# Draws 10000 values from a normal distribution whose mean and standard
# deviation are both (n + 1) / 2.
Y <- function(n) {
  mu <- (n + 1) / 2
  rnorm(10000, mean = mu, sd = mu)
}
# (a) P(X > x | X > y)
N <- 6
val.X <- X(N)
val.Y <- Y(N)

# threshold x: the median of the simulated X values
x <- median(val.X)

# threshold y: the first quartile of the simulated Y values
y <- quantile(val.Y, probs = 0.25)

X.greater_x <- val.X[val.X > x]
X.greater_y <- val.X[val.X > y]

# draws of X that satisfy both X > x and X > y
p.anb <- X.greater_y[X.greater_y > x]

# P(X > x | X > y) = #{X > x and X > y} / #{X > y}
length(p.anb) / length(X.greater_y)
## [1] 0.5138218
# (b) P(X > x and Y > y)

# P(X > x): share of X draws above x
p.x <- sum(val.X > x) / length(val.X)

# P(Y > y): share of Y draws above y
Y.greater_y <- val.Y[val.Y > y]
p.y <- length(Y.greater_y) / length(val.Y)

# X and Y are generated independently, so the joint probability is
# the product of the two marginal probabilities
p.x * p.y
## [1] 0.375
# (c) P(X < x | X > y)

# draws of X that satisfy both X < x and X > y
p <- X.greater_y[X.greater_y < x]

# P(X < x | X > y) = #{X < x and X > y} / #{X > y}
length(p) / length(X.greater_y)
## [1] 0.4861782
# Investigate whether or not P(X>x n Y>y) = P(X>x)*P(Y>y) (independent)

# Joint relative frequencies. Each vector is one row of the table
# (a fixed Y event), ordered as (X<=x, X>x).
row1 <- c(sum(val.X <= x & val.Y <= y) / 10000, sum(val.X > x & val.Y <= y) / 10000)
row2 <- c(sum(val.X <= x & val.Y > y) / 10000, sum(val.X > x & val.Y > y) / 10000)

# byrow = TRUE is required: matrix() fills column-wise by default, which
# would transpose the table and put the X marginals under the Y labels
# (and the Y marginals under the X labels).
table <- matrix(c(row1, row2), nrow = 2, byrow = TRUE)

# append the column totals (marginals of X), then the row totals (marginals of Y)
table1 <- rbind(table, colSums(table))
table1 <- cbind(table1, rowSums(table1))
row_names <- c('P(Y<=y)', 'P(Y>y)', 'Total')

marginal_prob <- data.frame(row_names, table1)
names(marginal_prob) <- c('X/Y', 'P(X<=x)', 'P(X>x)', 'Total')

marginal_prob
##       X/Y P(X<=x) P(X>x) Total
## 1 P(Y<=y)  0.1267 0.1233  0.25
## 2  P(Y>y)  0.3733 0.3767  0.75
## 3   Total  0.5000 0.5000  1.00
# P(X>x n Y>y): cell in row 'P(Y>y)', column 'P(X>x)'
marginal_prob[2, "P(X>x)"]
## [1] 0.3767
# P(X>x) * p(Y>y): total of column 'P(X>x)' times total of row 'P(Y>y)'
marginal_prob[3, "P(X>x)"] * marginal_prob[2, "Total"]
## [1] 0.375
# I conclude that X and Y are independent because P(X>x n Y>y) is approximately equal to P(X>x) * P(Y>y) (0.3767 vs 0.375)
# Fisher's exact test on the 2x2 table, with the relative frequencies
# scaled by 10000 to turn them back into counts
new_table <- table * 10000

fisher.test(x = new_table)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  new_table
## p-value = 0.446
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9461832 1.1363998
## sample estimates:
## odds ratio 
##   1.036945
# Pearson's chi-squared test on the same table of counts
chisq.test(x = new_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  new_table
## X-squared = 0.5808, df = 1, p-value = 0.446
# Fisher's exact test is intended for small samples, while the chi-squared test is suited to larger samples. Since our sample consists of 10000 observations, the chi-squared test is more appropriate.

# Both tests give a large p-value, so we fail to reject the null hypothesis that X and Y are independent.

part 2

# read the housing-price training and evaluation sets from GitHub
prices.data <- read.csv(
  file = 'https://raw.githubusercontent.com/schoolkidrich/CUNY_MSDS/main/DATA_605/housing_prices/train.csv'
)
prices.eval <- read.csv(
  file = 'https://raw.githubusercontent.com/schoolkidrich/CUNY_MSDS/main/DATA_605/housing_prices/test.csv'
)

# first rows of the training data
head(prices.data)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000
# per-column summary of the training data
summary(prices.data)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

scatterplot Living Area vs SalePrice

# sale price against living area (GrLivArea), with a fitted linear trend line
ggplot(prices.data, aes(x = GrLivArea, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x) +
  labs(title = 'Living Area vs Sale Price', x = 'Living Area', y = 'Sale Price')

scatterplot Garage size vs Sale Price

# sale price against garage size (GarageArea), with a fitted linear trend line
ggplot(prices.data, aes(x = GarageArea, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x) +
  labs(title = 'Garage Size vs Sale Price', x = 'Garage Size', y = 'Sale Price')

# columns to examine
variables <- c('GrLivArea', 'GarageArea', 'SalePrice')

# pairwise correlations between the selected columns
cor_matrix <- cor(prices.data[, variables])
cor_matrix
##            GrLivArea GarageArea SalePrice
## GrLivArea  1.0000000  0.4689975 0.7086245
## GarageArea 0.4689975  1.0000000 0.6234314
## SalePrice  0.7086245  0.6234314 1.0000000

testing correlation

# correlation test between Living Area and Garage Area, 80% confidence interval
cor.test(x = prices.data$GrLivArea, y = prices.data$GarageArea, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  prices.data$GrLivArea and prices.data$GarageArea
## t = 20.276, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4423993 0.4947713
## sample estimates:
##       cor 
## 0.4689975
# correlation test between Living Area and Sale Price, 80% confidence interval
cor.test(x = prices.data$GrLivArea, y = prices.data$SalePrice, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  prices.data$GrLivArea and prices.data$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
# correlation test between Garage Area and Sale Price, 80% confidence interval
cor.test(x = prices.data$GarageArea, y = prices.data$SalePrice, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  prices.data$GarageArea and prices.data$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
# Pairwise correlation tests of the null hypothesis that the true correlation between two variables is 0. In all three tests, 0 lies outside the 80% confidence interval (and the p-value is below 2.2e-16), so we reject the null hypothesis of zero correlation.

# We should be worried about family-wise error: there are many variables in this dataset and, if we conducted pairwise hypothesis tests for every pair of variables, the chance of at least one false positive would compound quickly.
# precision matrix: the inverse of the correlation matrix
inverse_cor <- solve(cor_matrix)

# inverse times matrix, rounded to whole numbers
identity_1 <- round(inverse_cor %*% cor_matrix, digits = 0)
# matrix times inverse, rounded to whole numbers
identity_2 <- round(cor_matrix %*% inverse_cor, digits = 0)

# element-wise comparison: both products are the same
identity_1 == identity_2
##            GrLivArea GarageArea SalePrice
## GrLivArea       TRUE       TRUE      TRUE
## GarageArea      TRUE       TRUE      TRUE
## SalePrice       TRUE       TRUE      TRUE
# multiplying a matrix by its inverse produces an identity matrix
# (identity_1 holds the product after rounding)
identity_1
##            GrLivArea GarageArea SalePrice
## GrLivArea          1          0         0
## GarageArea         0          1         0
## SalePrice          0          0         1
# Doolittle LU decomposition (no pivoting) of a square matrix.
#
# Args:
#   m: a square numeric matrix whose pivots are all non-zero.
# Returns:
#   A list with components L (unit lower-triangular) and U
#   (upper-triangular) such that L %*% U reproduces m.
# Raises:
#   An error if m is not square or if a zero pivot is met (this routine
#   does no row exchanges, so it cannot continue past one).
LU_decomp <- function(m) {
  m <- as.matrix(m)
  count <- nrow(m)
  if (count != ncol(m)) {
    stop("LU_decomp requires a square matrix", call. = FALSE)
  }

  L <- diag(count)
  U <- matrix(0, nrow = count, ncol = count)

  for (i in seq_len(count)) {
    pivot <- m[i, i]
    if (isTRUE(pivot == 0)) {
      stop("zero pivot encountered; LU decomposition without pivoting failed",
           call. = FALSE)
    }

    # row i of the reduced matrix is row i of U
    U[i, i:count] <- m[i, i:count]

    # rows/columns still to be eliminated: i+1 .. count (empty on the last step)
    rest <- seq_len(count)[-seq_len(i)]
    if (length(rest) > 0) {
      # multipliers come from column i BELOW the pivot (m[j, i], not m[i, j])
      L[rest, i] <- m[rest, i] / pivot
      # subtract the rank-one update from the trailing submatrix
      m[rest, rest] <- m[rest, rest] -
        L[rest, i, drop = FALSE] %*% U[i, rest, drop = FALSE]
    }
  }

  list(L = L, U = U)
}

# LU decomposition of the correlation matrix; prints the L and U factors
LU_decomp(cor_matrix)
## $L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.4689975 1.0000000    0
## [3,] 0.7086245 0.3731704    1
## 
## $U
##      [,1]      [,2]      [,3]
## [1,]    1 0.4689975 0.7086245
## [2,]    0 0.7800414 0.2910883
## [3,]    0 0.0000000 0.3892258

fitdistr() function from MASS package

price <- prices.data$SalePrice

# fit an exponential distribution to the sale prices, then simulate
# 1000 draws using the estimated rate
fit <- fitdistr(price, densfun = 'exponential')
sample_fit <- rexp(n = 1000, rate = fit$estimate)

# two plots side by side
par(mfrow = c(1, 2))

# sale price is somewhat skewed to the right
hist(price, main = 'Observation', xlab = 'Sale Price')
# simulated
hist(sample_fit, main = 'Simulation', xlab = 'Sale Price')

# 5% and 95% quantiles of the simulated sample
quantile(sample_fit, probs = c(0.05, 0.95))
##        5%       95% 
##  11128.84 537083.29
# 95% confidence interval for the mean sale price, assuming normality
z <- 1.96
m <- mean(price)
sd <- sd(price)
n <- length(price)

# mean minus / plus z standard errors
ci <- m + c(-1, 1) * z * (sd / sqrt(n))

ci
## [1] 176846.1 184996.2
# 5% and 95% quantiles of the observed sale prices
quantile(price, probs = c(0.05, 0.95))
##     5%    95% 
##  88000 326100
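# The exponential simulation gives a much wider 5%-95% range (about 11,129 to 537,083) than the observed prices (88,000 to 326,100), so it overstates the spread of sale prices, while the normal-theory interval (about 176,846 to 184,996) is far narrower than the observed range; note that it is a confidence interval for the mean rather than a range for individual prices. This suggests the data is right skewed, but not as heavily as an exponential distribution, and also not normal.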

modeling

# 70/30 train/test split, seeded so the split is repeatable
size <- nrow(prices.data)
set.seed(1111)
training <- sample(seq_len(size), size = round(0.7 * size))

prices.train <- prices.data[training, ]
prices.test <- prices.data[-training, ]

# linear model of sale price, fitted on the training rows only
prices.model <- lm(
  formula = SalePrice ~ Neighborhood + GrLivArea + GarageArea + MSSubClass + YearBuilt,
  data = prices.train
)
summary(prices.model)
## 
## Call:
## lm(formula = SalePrice ~ Neighborhood + GrLivArea + GarageArea + 
##     MSSubClass + YearBuilt, data = prices.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -364914  -17450   -1908   13842  281669 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.332e+06  1.696e+05  -7.855 1.03e-14 ***
## NeighborhoodBlueste -2.100e+04  3.947e+04  -0.532 0.594771    
## NeighborhoodBrDale  -2.667e+04  1.642e+04  -1.625 0.104543    
## NeighborhoodBrkSide -1.718e+04  1.431e+04  -1.200 0.230257    
## NeighborhoodClearCr -1.133e+03  1.472e+04  -0.077 0.938685    
## NeighborhoodCollgCr -2.008e+04  1.217e+04  -1.650 0.099172 .  
## NeighborhoodCrawfor  1.870e+04  1.444e+04   1.295 0.195584    
## NeighborhoodEdwards -3.718e+04  1.301e+04  -2.858 0.004352 ** 
## NeighborhoodGilbert -2.903e+04  1.254e+04  -2.315 0.020790 *  
## NeighborhoodIDOTRR  -2.840e+04  1.536e+04  -1.849 0.064790 .  
## NeighborhoodMeadowV -2.234e+04  1.534e+04  -1.456 0.145580    
## NeighborhoodMitchel -3.028e+04  1.320e+04  -2.294 0.022001 *  
## NeighborhoodNAmes   -3.093e+04  1.262e+04  -2.450 0.014443 *  
## NeighborhoodNoRidge  4.340e+04  1.376e+04   3.155 0.001656 ** 
## NeighborhoodNPkVill -6.915e+03  1.933e+04  -0.358 0.720542    
## NeighborhoodNridgHt  4.698e+04  1.283e+04   3.662 0.000263 ***
## NeighborhoodNWAmes  -3.237e+04  1.304e+04  -2.483 0.013176 *  
## NeighborhoodOldTown -2.042e+04  1.419e+04  -1.439 0.150418    
## NeighborhoodSawyer  -3.035e+04  1.320e+04  -2.299 0.021714 *  
## NeighborhoodSawyerW -2.702e+04  1.287e+04  -2.099 0.036071 *  
## NeighborhoodSomerst -9.761e+00  1.237e+04  -0.001 0.999371    
## NeighborhoodStoneBr  7.051e+04  1.438e+04   4.905 1.09e-06 ***
## NeighborhoodSWISU   -2.363e+04  1.583e+04  -1.493 0.135722    
## NeighborhoodTimber  -1.884e+03  1.411e+04  -0.134 0.893786    
## NeighborhoodVeenker  4.677e+04  1.712e+04   2.732 0.006406 ** 
## GrLivArea            7.312e+01  2.882e+00  25.371  < 2e-16 ***
## GarageArea           4.296e+01  7.427e+00   5.784 9.75e-09 ***
## MSSubClass          -2.976e+02  3.322e+01  -8.960  < 2e-16 ***
## YearBuilt            7.163e+02  8.450e+01   8.478  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37700 on 993 degrees of freedom
## Multiple R-squared:  0.7727, Adjusted R-squared:  0.7663 
## F-statistic: 120.6 on 28 and 993 DF,  p-value: < 2.2e-16
# predict sale prices for the held-out test rows
predictions <- predict(prices.model, newdata = prices.test)

# how well the model fits the test data (R^2): squared correlation
# between predicted and actual sale prices
cor(x = predictions, y = prices.test$SalePrice)^2
## [1] 0.795895
# replace NAs in the Garage Area variable with 0
prices.eval$GarageArea <- replace_na(prices.eval$GarageArea, 0)
# predictions on the evaluation set
prices.eval$SalePrice <- predict(prices.model, newdata = prices.eval)

# Id and predicted SalePrice columns for the kaggle submission
submission <- prices.eval[, c('Id', 'SalePrice')]

head(submission)
##     Id SalePrice
## 1 1461  132807.5
## 2 1462  144360.8
## 3 1463  191540.8
## 4 1464  189913.6
## 5 1465  245150.2
## 6 1466  188772.1
# write the submission to disk without row names
write.csv(x = submission, file = "submission.csv", row.names = FALSE)

#kaggle user: schoolboyrich
#kaggle score: 0.20458