#libraries
library(tidyr)
library(dplyr)
library(pander)
library(ggplot2)
library(corrplot)
library(matlib)
library(matrixcalc)
library(MASS)
library(mltools)
library(caTools)

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of $\mu = \sigma = (N+1)/2$.

Calculate as a minimum the below probabilities a through c. Assume the small letter "x" is estimated as the median of the X variable, and the small letter "y" is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

# N is the upper bound of the uniform draw; the problem statement requires
# N >= 6.
n <- 6
# X: 10,000 draws from Uniform(1, N).
X <- runif(10000, min = 1, max = n)
# Y: 10,000 draws from a normal distribution with mean = sd = (N + 1) / 2.
Y <- rnorm(10000, mean = (n + 1) / 2, sd = (n + 1) / 2)
# Five-number summary (plus mean) of X; the figures change on every run
# because no seed is set.
summary(X)
# As the problem defines them: x is the median of X and y is the first
# quartile of Y.
x <- median(X)
y <- quantile(Y, probs = 0.25, names = FALSE)

a. P(X>x | X>y)

# P(A | B) = P(A and B) / P(B), with A = {X > x} and B = {X > y}.
# Both probabilities are estimated as counts over the same 10,000 draws, so
# the common 1 / length(X) factor cancels and the ratio of counts suffices.
pA <- sum(X > x)           # draws in A (marginal count, kept for reference)
pB <- sum(X > y)           # draws in B, the conditioning event
pAB <- sum(X > x & X > y)  # draws in both A and B
pAB / pB

b. P(X>x | Y>y)

# P(A | B) = P(A and B) / P(B), with A = {X > x} and B = {Y > y}.
# X and Y are paired by draw index, so the joint event is counted
# element-wise; the common 1 / length(X) factor cancels in the ratio.
pA <- sum(X > x)           # draws in A (marginal count, kept for reference)
pB <- sum(Y > y)           # draws in B, the conditioning event
pAB <- sum(X > x & Y > y)  # draws in both A and B
pAB / pB

c. P(X<x | X>y)

# P(A | B) = P(A and B) / P(B), with A = {X < x} and B = {X > y}.
# Both probabilities are estimated as counts over the same 10,000 draws, so
# the common 1 / length(X) factor cancels and the ratio of counts suffices.
pA <- sum(X < x)           # draws in A (marginal count, kept for reference)
pB <- sum(X > y)           # draws in B, the conditioning event
pAB <- sum(X < x & X > y)  # draws in both A and B
pAB / pB

Problem 2

Load Data

# Read the training and test sets from CSV files in the working directory.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

# Column-by-column summary of the training set.
summary(object = train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# Column-by-column summary of the test set.
summary(object = test)
##        Id         MSSubClass       MSZoning          LotFrontage    
##  Min.   :1461   Min.   : 20.00   Length:1459        Min.   : 21.00  
##  1st Qu.:1826   1st Qu.: 20.00   Class :character   1st Qu.: 58.00  
##  Median :2190   Median : 50.00   Mode  :character   Median : 67.00  
##  Mean   :2190   Mean   : 57.38                      Mean   : 68.58  
##  3rd Qu.:2554   3rd Qu.: 70.00                      3rd Qu.: 80.00  
##  Max.   :2919   Max.   :190.00                      Max.   :200.00  
##                                                     NA's   :227     
##     LotArea         Street             Alley             LotShape        
##  Min.   : 1470   Length:1459        Length:1459        Length:1459       
##  1st Qu.: 7391   Class :character   Class :character   Class :character  
##  Median : 9399   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9819                                                           
##  3rd Qu.:11518                                                           
##  Max.   :56600                                                           
##                                                                          
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1459        Min.   : 1.000   Min.   :1.000   Min.   :1879  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1953  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.079   Mean   :5.554   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1459        Length:1459        Length:1459       
##  1st Qu.:1963   Class :character   Class :character   Class :character  
##  Median :1992   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 100.7                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1290.0                     
##                                        NA's   :15                         
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 350.5   Mode  :character  
##                                        Mean   : 439.2                     
##                                        3rd Qu.: 753.5                     
##                                        Max.   :4010.0                     
##                                        NA's   :1                          
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF     Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0   Length:1459       
##  1st Qu.:   0.00   1st Qu.: 219.2   1st Qu.: 784   Class :character  
##  Median :   0.00   Median : 460.0   Median : 988   Mode  :character  
##  Mean   :  52.62   Mean   : 554.3   Mean   :1046                     
##  3rd Qu.:   0.00   3rd Qu.: 797.8   3rd Qu.:1305                     
##  Max.   :1526.00   Max.   :2140.0   Max.   :5095                     
##  NA's   :1         NA's   :1        NA's   :1                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF     
##  Length:1459        Length:1459        Length:1459        Min.   : 407.0  
##  Class :character   Class :character   Class :character   1st Qu.: 873.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :1079.0  
##                                                           Mean   :1156.5  
##                                                           3rd Qu.:1382.5  
##                                                           Max.   :5095.0  
##                                                                           
##    X2ndFlrSF     LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :   0.000   Min.   : 407   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:   0.000   1st Qu.:1118   1st Qu.:0.0000  
##  Median :   0   Median :   0.000   Median :1432   Median :0.0000  
##  Mean   : 326   Mean   :   3.543   Mean   :1486   Mean   :0.4345  
##  3rd Qu.: 676   3rd Qu.:   0.000   3rd Qu.:1721   3rd Qu.:1.0000  
##  Max.   :1862   Max.   :1064.000   Max.   :5095   Max.   :3.0000  
##                                                   NA's   :2       
##   BsmtHalfBath       FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.0652   Mean   :1.571   Mean   :0.3777   Mean   :2.854  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.0000   Max.   :4.000   Max.   :2.0000   Max.   :6.000  
##  NA's   :2                                                        
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1459        Min.   : 3.000   Length:1459       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.042                      Mean   : 6.385                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :2.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:1459        Length:1459        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1959  
##  Median :0.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5812                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :78    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1459        Min.   :0.000   Min.   :   0.0   Length:1459       
##  Class :character   1st Qu.:1.000   1st Qu.: 318.0   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.766   Mean   : 472.8                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :5.000   Max.   :1488.0                     
##                     NA's   :1       NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF       OpenPorchSF    
##  Length:1459        Length:1459        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 28.00  
##                                        Mean   :  93.17   Mean   : 48.31  
##                                        3rd Qu.: 168.00   3rd Qu.: 72.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  EnclosedPorch       X3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  24.24   Mean   :  1.794   Mean   : 17.06   Mean   :  1.744  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.00   Max.   :360.000   Max.   :576.00   Max.   :800.000  
##                                                                        
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1459        Length:1459        Length:1459        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   58.17  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1459        Length:1459       
##  1st Qu.: 4.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.104   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
## 

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Univariate descriptive statistics for SalePrice (mean, standard deviation,
# quartiles, and range), rendered as a pander table.
pander(
  summarize(
    train,
    variable = "Sale Price",
    mean = mean(SalePrice),
    st_dev = sd(SalePrice),
    q0.25 = quantile(SalePrice, 0.25),
    q0.5 = quantile(SalePrice, 0.5),
    q0.75 = quantile(SalePrice, 0.75),
    min = min(SalePrice),
    max = max(SalePrice)
  )
)
variable mean st_dev q0.25 q0.5 q0.75 min max
Sale Price 180921 79443 129975 163000 214000 34900 755000
# Distribution of the response variable, SalePrice.
hist(
  x = train$SalePrice,
  col = "brown1",
  xlab = "Sale Price",
  main = "Sale Price Distribution"
)

The scatterplot matrix and correlation matrix compare the dependent variable, Sale Price, with four independent variables: Year Built, Year Remodeled, First Floor Square Feet, and Second Floor Square Feet.

Scatterplot Matrix

# Keep the response and the four predictors under study, then draw every
# pairwise scatterplot.
df_scatter <- train[c("SalePrice", "YearBuilt", "YearRemodAdd",
                      "X1stFlrSF", "X2ndFlrSF")]
pairs(x = df_scatter)

Correlation matrix

# Pairwise correlations between the selected columns.
c_matrix <- cor(x = df_scatter)
print(c_matrix)
##              SalePrice  YearBuilt YearRemodAdd  X1stFlrSF   X2ndFlrSF
## SalePrice    1.0000000 0.52289733    0.5071010  0.6058522  0.31933380
## YearBuilt    0.5228973 1.00000000    0.5928550  0.2819859  0.01030766
## YearRemodAdd 0.5071010 0.59285498    1.0000000  0.2403793  0.14002378
## X1stFlrSF    0.6058522 0.28198586    0.2403793  1.0000000 -0.20264618
## X2ndFlrSF    0.3193338 0.01030766    0.1400238 -0.2026462  1.00000000
# Visualise the correlation matrix, showing only the upper triangle.
corrplot(corr = c_matrix, type = "upper")

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Test H0: correlation = 0 between SalePrice and YearBuilt, with an 80%
# confidence interval for the correlation.
cor.test(df_scatter$SalePrice, df_scatter$YearBuilt, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  df_scatter$SalePrice and df_scatter$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980766 0.5468619
## sample estimates:
##       cor 
## 0.5228973
# Test H0: correlation = 0 between SalePrice and YearRemodAdd, with an 80%
# confidence interval for the correlation.
cor.test(df_scatter$SalePrice, df_scatter$YearRemodAdd, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  df_scatter$SalePrice and df_scatter$YearRemodAdd
## t = 22.466, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4817381 0.5316150
## sample estimates:
##      cor 
## 0.507101
# Test H0: correlation = 0 between SalePrice and X1stFlrSF, with an 80%
# confidence interval for the correlation.
cor.test(df_scatter$SalePrice, df_scatter$X1stFlrSF, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  df_scatter$SalePrice and df_scatter$X1stFlrSF
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5841687 0.6266715
## sample estimates:
##       cor 
## 0.6058522
# Test H0: correlation = 0 between SalePrice and X2ndFlrSF, with an 80%
# confidence interval for the correlation.
cor.test(df_scatter$SalePrice, df_scatter$X2ndFlrSF, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  df_scatter$SalePrice and df_scatter$X2ndFlrSF
## t = 12.867, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2888681 0.3491534
## sample estimates:
##       cor 
## 0.3193338

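The p-values for all 4 correlation tests are below 2.2e-16, so each correlation is statistically significant and the null hypothesis of zero correlation is rejected in every case. The strongest correlation is between the Sale Price and the First Floor square footage (0.6059), followed by the Year Built (0.5229). The Second Floor square footage has the lowest correlation of the group (0.3193). Regarding familywise error: running four tests at an 80% confidence level (alpha = 0.20 each) gives a familywise error rate of up to 1 - 0.8^4 ≈ 0.59, which would normally be a concern; here, however, the p-values are so small that the conclusions would survive even a Bonferroni correction.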

Linear Algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Invert the correlation matrix; the inverse is the precision matrix.
inv_matrix <- inv(c_matrix)
# Multiply the correlation matrix by the precision matrix.
c1 <- c_matrix %*% inv_matrix
# Multiply the precision matrix by the correlation matrix.
c2 <- inv_matrix %*% c_matrix
# LU decomposition of the first product (c1); the second product (c2) is
# decomposed further down.
# NOTE(review): c1 and c2 should both be numerically the identity matrix, so
# their LU factors carry little information. The assignment may have intended
# the decomposition of c_matrix itself - confirm.
lu.decomposition(c1)
## $L
##               [,1]          [,2]          [,3]          [,4] [,5]
## [1,]  1.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00    0
## [2,] -2.636570e-09  1.000000e+00  0.000000e+00  0.000000e+00    0
## [3,] -2.852093e-09 -6.981081e-09  1.000000e+00  0.000000e+00    0
## [4,] -5.222103e-09 -5.862071e-09 -2.092188e-09  1.000000e+00    0
## [5,] -4.574617e-09  1.460747e-09  1.804052e-12 -5.977337e-10    1
## 
## $U
##      [,1]          [,2]          [,3]          [,4]          [,5]
## [1,]    1 -5.542962e-09 -2.225343e-09 -9.510496e-09 -3.102032e-09
## [2,]    0  1.000000e+00 -3.742093e-09 -8.125003e-09 -1.300352e-09
## [3,]    0  0.000000e+00  1.000000e+00 -7.009452e-09 -1.728247e-09
## [4,]    0  0.000000e+00  0.000000e+00  1.000000e+00 -2.845748e-09
## [5,]    0  0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00
# LU decomposition of the second product (precision matrix %*% correlation
# matrix).
lu.decomposition(c2)
## $L
##               [,1]          [,2]          [,3]          [,4] [,5]
## [1,]  1.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00    0
## [2,] -5.542962e-09  1.000000e+00  0.000000e+00  0.000000e+00    0
## [3,] -2.225343e-09 -3.742093e-09  1.000000e+00  0.000000e+00    0
## [4,] -9.510496e-09 -8.125003e-09 -7.009452e-09  1.000000e+00    0
## [5,] -3.102032e-09 -1.300352e-09 -1.728247e-09 -2.845748e-09    1
## 
## $U
##      [,1]         [,2]          [,3]          [,4]          [,5]
## [1,]    1 -2.63657e-09 -2.852093e-09 -5.222102e-09 -4.574617e-09
## [2,]    0  1.00000e+00 -6.981081e-09 -5.862071e-09  1.460747e-09
## [3,]    0  0.00000e+00  1.000000e+00 -2.092188e-09  1.804052e-12
## [4,]    0  0.00000e+00  0.000000e+00  1.000000e+00 -5.977337e-10
## [5,]    0  0.00000e+00  0.000000e+00  0.000000e+00  1.000000e+00

Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of $\lambda$ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, $\lambda$)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Histogram of the observed sale prices, for comparison with the fitted model.
hist(train$SalePrice,
     main = "Sale Price Distribution",
     xlab = "Sale Price",
     col = "brown1")

# Fit an exponential density to SalePrice with MASS::fitdistr. The fit object
# is named exp_fit rather than exp so that it does not mask base::exp().
exp_fit <- fitdistr(train$SalePrice, densfun = "exponential")
# Estimated rate (lambda) of the fitted exponential distribution.
exp_est <- exp_fit$estimate
# Histogram of 1,000 draws simulated from the fitted exponential distribution.
hist(rexp(1000, rate = exp_est),
     main = "Exponential Distribution of Sales Prices",
     col = "brown1",
     xlab = "1,000 Samples of Sales Prices")

# 5th and 95th percentiles of the fitted exponential model, from its
# quantile function (the inverse CDF) at the estimated rate.
qexp(c(0.05, 0.95), rate = exp_est)
# 95% confidence interval for the mean sale price, assuming normality:
# mean +/- z * standard error.
# NOTE(review): this reads "confidence interval" as an interval for the mean;
# confirm that is the intended interpretation.
sale_mean <- mean(train$SalePrice)
sale_se <- sd(train$SalePrice) / sqrt(length(train$SalePrice))
sale_mean + c(-1, 1) * qnorm(0.975) * sale_se
# Empirical 5th and 95th percentiles of the observed sale prices.
quantile(train$SalePrice, c(0.05,0.95))
##     5%    95% 
##  88000 326100

Modeling

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

# Multiple regression of SalePrice on first-floor area and year built.
regressor <- lm(formula = SalePrice ~ X1stFlrSF + YearBuilt, data = train)
# Keep only the model's two predictor columns from the test set.
test_filtered <- test[c("X1stFlrSF", "YearBuilt")]
# Coefficients, residual summary, and fit statistics.
summary(regressor)
## 
## Call:
## lm(formula = SalePrice ~ X1stFlrSF + YearBuilt, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -419062  -33106  -10242   24623  420077 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.921e+06  9.875e+04  -19.45   <2e-16 ***
## X1stFlrSF    1.023e+02  3.961e+00   25.84   <2e-16 ***
## YearBuilt    1.006e+03  5.070e+01   19.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56120 on 1457 degrees of freedom
## Multiple R-squared:  0.5017, Adjusted R-squared:  0.501 
## F-statistic: 733.5 on 2 and 1457 DF,  p-value: < 2.2e-16
# Lay the lm diagnostic plots out on a 2 x 2 grid (cells filled column by
# column), then draw them.
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(regressor)

# Predict sale prices for the test set and pair each prediction with its Id.
pred <- predict(object = regressor, newdata = test_filtered)
pred_df <- data.frame(Id = test[["Id"]], SalePrice = pred)
head(pred_df)
##     Id SalePrice
## 1 1461  143305.8
## 2 1462  184600.0
## 3 1463  182796.7
## 4 1464  183598.0
## 5 1465  213789.6
## 6 1466  161887.0
# Write the Id / SalePrice predictions to a CSV file, without row names.
write.csv(
  x = pred_df,
  file = "LCancel_Kaggle_Submission.csv",
  row.names = FALSE
)

I used fields from the correlation matrix (First Floor Square Feet and Year Built) in this linear model, since I know those fields have a high correlation with Sale Price.

Residuals vs Fitted: The plots for the residuals are clustered together around the linear model.

Scale-Location: The points are also clustered together in this graph, but there are more spaced-out points, showing more outliers.

Normal Q-Q: Of the diagnostic plots, this one follows its reference line the most closely.

Kaggle Score = 0.285 Kaggle Username = ltcancel