library(readr)
library(tidyverse)
library(stringr)
library(readr)
library(purrr)
library(plyr)
library(corrplot)
library(Hmisc)
library(GGally)
library(matlib)
library(MASS)

Problem 2

Assignment

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following:

path = "https://raw.githubusercontent.com/kelloggjohnd/Data605/master/train.csv"
train.set<- read_csv(file = path)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Id = col_double(),
##   MSSubClass = col_double(),
##   LotFrontage = col_double(),
##   LotArea = col_double(),
##   OverallQual = col_double(),
##   OverallCond = col_double(),
##   YearBuilt = col_double(),
##   YearRemodAdd = col_double(),
##   MasVnrArea = col_double(),
##   BsmtFinSF1 = col_double(),
##   BsmtFinSF2 = col_double(),
##   BsmtUnfSF = col_double(),
##   TotalBsmtSF = col_double(),
##   `1stFlrSF` = col_double(),
##   `2ndFlrSF` = col_double(),
##   LowQualFinSF = col_double(),
##   GrLivArea = col_double(),
##   BsmtFullBath = col_double(),
##   BsmtHalfBath = col_double(),
##   FullBath = col_double()
##   # ... with 18 more columns
## )
## See spec(...) for full column specifications.
summary(train.set)
##        Id         MSSubClass      MSZoning          LotFrontage 
##  Min.   :   1   Min.   : 20.0   Length:1460        Min.   : 21  
##  1st Qu.: 366   1st Qu.: 20.0   Class :character   1st Qu.: 59  
##  Median : 730   Median : 50.0   Mode  :character   Median : 69  
##  Mean   : 730   Mean   : 56.9                      Mean   : 70  
##  3rd Qu.:1095   3rd Qu.: 70.0                      3rd Qu.: 80  
##  Max.   :1460   Max.   :190.0                      Max.   :313  
##                                                    NA's   :259  
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   LandSlope         Neighborhood        Condition1       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   Condition2          BldgType          HouseStyle         OverallQual  
##  Length:1460        Length:1460        Length:1460        Min.   : 1.0  
##  Class :character   Class :character   Class :character   1st Qu.: 5.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.0  
##                                                           Mean   : 6.1  
##                                                           3rd Qu.: 7.0  
##                                                           Max.   :10.0  
##                                                                         
##   OverallCond     YearBuilt     YearRemodAdd   RoofStyle        
##  Min.   :1.00   Min.   :1872   Min.   :1950   Length:1460       
##  1st Qu.:5.00   1st Qu.:1954   1st Qu.:1967   Class :character  
##  Median :5.00   Median :1973   Median :1994   Mode  :character  
##  Mean   :5.58   Mean   :1971   Mean   :1985                     
##  3rd Qu.:6.00   3rd Qu.:2000   3rd Qu.:2004                     
##  Max.   :9.00   Max.   :2010   Max.   :2010                     
##                                                                 
##    RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   MasVnrType          MasVnrArea    ExterQual          ExterCond        
##  Length:1460        Min.   :   0   Length:1460        Length:1460       
##  Class :character   1st Qu.:   0   Class :character   Class :character  
##  Mode  :character   Median :   0   Mode  :character   Mode  :character  
##                     Mean   : 104                                        
##                     3rd Qu.: 166                                        
##                     Max.   :1600                                        
##                     NA's   :8                                           
##   Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  BsmtExposure       BsmtFinType1         BsmtFinSF1   BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0   Class :character  
##  Mode  :character   Mode  :character   Median : 384   Mode  :character  
##                                        Mean   : 444                     
##                                        3rd Qu.: 712                     
##                                        Max.   :5644                     
##                                                                         
##    BsmtFinSF2       BsmtUnfSF     TotalBsmtSF     Heating         
##  Min.   :   0.0   Min.   :   0   Min.   :   0   Length:1460       
##  1st Qu.:   0.0   1st Qu.: 223   1st Qu.: 796   Class :character  
##  Median :   0.0   Median : 478   Median : 992   Mode  :character  
##  Mean   :  46.5   Mean   : 567   Mean   :1057                     
##  3rd Qu.:   0.0   3rd Qu.: 808   3rd Qu.:1298                     
##  Max.   :1474.0   Max.   :2336   Max.   :6110                     
##                                                                   
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF     GrLivArea     BsmtFullBath  
##  Min.   :   0   Min.   :  0.0   Min.   : 334   Min.   :0.000  
##  1st Qu.:   0   1st Qu.:  0.0   1st Qu.:1130   1st Qu.:0.000  
##  Median :   0   Median :  0.0   Median :1464   Median :0.000  
##  Mean   : 347   Mean   :  5.8   Mean   :1515   Mean   :0.425  
##  3rd Qu.: 728   3rd Qu.:  0.0   3rd Qu.:1777   3rd Qu.:1.000  
##  Max.   :2065   Max.   :572.0   Max.   :5642   Max.   :3.000  
##                                                               
##   BsmtHalfBath       FullBath       HalfBath      BedroomAbvGr 
##  Min.   :0.0000   Min.   :0.00   Min.   :0.000   Min.   :0.00  
##  1st Qu.:0.0000   1st Qu.:1.00   1st Qu.:0.000   1st Qu.:2.00  
##  Median :0.0000   Median :2.00   Median :0.000   Median :3.00  
##  Mean   :0.0575   Mean   :1.57   Mean   :0.383   Mean   :2.87  
##  3rd Qu.:0.0000   3rd Qu.:2.00   3rd Qu.:1.000   3rd Qu.:3.00  
##  Max.   :2.0000   Max.   :3.00   Max.   :2.000   Max.   :8.00  
##                                                                
##   KitchenAbvGr  KitchenQual         TotRmsAbvGrd    Functional       
##  Min.   :0.00   Length:1460        Min.   : 2.00   Length:1460       
##  1st Qu.:1.00   Class :character   1st Qu.: 5.00   Class :character  
##  Median :1.00   Mode  :character   Median : 6.00   Mode  :character  
##  Mean   :1.05                      Mean   : 6.52                     
##  3rd Qu.:1.00                      3rd Qu.: 7.00                     
##  Max.   :3.00                      Max.   :14.00                     
##                                                                      
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1978  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars     GarageArea    GarageQual       
##  Length:1460        Min.   :0.00   Min.   :   0   Length:1460       
##  Class :character   1st Qu.:1.00   1st Qu.: 334   Class :character  
##  Mode  :character   Median :2.00   Median : 480   Mode  :character  
##                     Mean   :1.77   Mean   : 473                     
##                     3rd Qu.:2.00   3rd Qu.: 576                     
##                     Max.   :4.00   Max.   :1418                     
##                                                                     
##   GarageCond         PavedDrive          WoodDeckSF     OpenPorchSF   
##  Length:1460        Length:1460        Min.   :  0.0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.:  0.0   1st Qu.:  0.0  
##  Mode  :character   Mode  :character   Median :  0.0   Median : 25.0  
##                                        Mean   : 94.2   Mean   : 46.7  
##                                        3rd Qu.:168.0   3rd Qu.: 68.0  
##                                        Max.   :857.0   Max.   :547.0  
##                                                                       
##  EnclosedPorch   3SsnPorch      ScreenPorch       PoolArea    
##  Min.   :  0   Min.   :  0.0   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:  0   1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  0.0  
##  Median :  0   Median :  0.0   Median :  0.0   Median :  0.0  
##  Mean   : 22   Mean   :  3.4   Mean   : 15.1   Mean   :  2.8  
##  3rd Qu.:  0   3rd Qu.:  0.0   3rd Qu.:  0.0   3rd Qu.:  0.0  
##  Max.   :552   Max.   :508.0   Max.   :480.0   Max.   :738.0  
##                                                               
##     PoolQC             Fence           MiscFeature           MiscVal     
##  Length:1460        Length:1460        Length:1460        Min.   :    0  
##  Class :character   Class :character   Class :character   1st Qu.:    0  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0  
##                                                           Mean   :   43  
##                                                           3rd Qu.:    0  
##                                                           Max.   :15500  
##                                                                          
##      MoSold          YrSold       SaleType         SaleCondition     
##  Min.   : 1.00   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.00   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.00   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.32   Mean   :2008                                        
##  3rd Qu.: 8.00   3rd Qu.:2009                                        
##  Max.   :12.00   Max.   :2010                                        
##                                                                      
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Part 1: Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Analysis

In order to ensure we have clean data to work from, I removed any column with heavy NA values, such as alley and lot frontage. Replacing the NA’s in these columns will not give any value to our purposes.

First I want to cast a very wide net to see on a simple correlation of non-categorical columns, if there is any stiking correlations I can work with later. Using this data, I can see what over varibles could be added

corr.df <- train.set%>%
  dplyr::select("MSSubClass","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF","LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageYrBlt","GarageCars","GarageArea","WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea","MiscVal","MoSold","YrSold","SalePrice")%>%
  replace(is.na(.),0)
corr.data <- cor(corr.df)
corrplot(corr.data, order = "hclust", tl.col = "black", tl.srt = 45, method = "ellipse", bg="black")  

The graph shows a lot of strong correlations on Above ground living area (GrLivArea). There is definate correlation between GrLivArea and total rooms above ground [TotRmsAbvGrd] (which makes sense). Another correlation which makes sense is SalePrice and GrLivArea.

The Graph also shows strong correlations on Overall Quality (OverallQual).

corr.data.df <- as.data.frame(corr.data)
rnames <- list("OverallQual", "SalePrice", "GrLivArea", "TotRmsAbvGrd","LotArea", "OverallCond")
corr.data.matrix <- corr.data.df %>%
  dplyr::select(OverallQual, SalePrice, GrLivArea, TotRmsAbvGrd, LotArea, OverallCond)
corr.data.matrix<-subset(corr.data.matrix, row.names(corr.data.matrix) %in% rnames)

train.set$OverallQual.f <-as.factor(as.character(train.set$OverallQual))
ggplot(train.set, aes(x=OverallQual, y=SalePrice, fill=OverallQual.f))+
         geom_boxplot()+ 
  ggtitle("Overall Quality & Sale Price")

ggplot(train.set, aes(x=GrLivArea, y=SalePrice, fill=OverallQual.f))+
         geom_boxplot()+
  ggtitle("Ground Living Area & Sale Price")

ggplot(train.set, aes(x=TotRmsAbvGrd, y=SalePrice, fill=OverallQual.f))+
         geom_boxplot()+
    ggtitle("Total Rooms above Ground & Sale Price")

corr.data.matrix
##              OverallQual SalePrice GrLivArea TotRmsAbvGrd   LotArea
## LotArea          0.10581   0.26384   0.26312      0.19001  1.000000
## OverallQual      1.00000   0.79098   0.59301      0.42745  0.105806
## OverallCond     -0.09193  -0.07786  -0.07969     -0.05758 -0.005636
## GrLivArea        0.59301   0.70862   1.00000      0.82549  0.263116
## TotRmsAbvGrd     0.42745   0.53372   0.82549      1.00000  0.190015
## SalePrice        0.79098   1.00000   0.70862      0.53372  0.263843
##              OverallCond
## LotArea        -0.005636
## OverallQual    -0.091932
## OverallCond     1.000000
## GrLivArea      -0.079686
## TotRmsAbvGrd   -0.057583
## SalePrice      -0.077856
Total rooms above ground vs Above Living Area
cor.test(corr.data.matrix$TotRmsAbvGrd,corr.data.matrix$GrLivArea, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  corr.data.matrix$TotRmsAbvGrd and corr.data.matrix$GrLivArea
## t = 5.3, df = 4, p-value = 0.006
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7470 0.9851
## sample estimates:
##    cor 
## 0.9362

This was a given to test the data. It would make logical sense for total Rooms above ground to be correlated to Above Ground Living area. The P value is acceptably low, the correlation is NOT zero and the interval is close.

OverAll Quality vs Sales Price
cor.test(corr.data.matrix$OverallQual,corr.data.matrix$SalePrice, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  corr.data.matrix$OverallQual and corr.data.matrix$SalePrice
## t = 5.2, df = 4, p-value = 0.007
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7331 0.9842
## sample estimates:
##    cor 
## 0.9323

The correlation is NOT zero. The P value is acceptible and the confidence interval is within a decent gap of each other to be confident in the model

Above Living Area vs Sales Price
cor.test(corr.data.matrix$GrLivArea,corr.data.matrix$SalePrice, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  corr.data.matrix$GrLivArea and corr.data.matrix$SalePrice
## t = 2.8, df = 4, p-value = 0.05
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.3705 0.9535
## sample estimates:
##    cor 
## 0.8106

The correlation is NOT zero and has very close to an acceptible P value (near enough to <0.05 to accept it). We have a large large confidence interval of between 0.37 and .95 leading us to not be as confident in the model

Total rooms above ground vs Sales Price
cor.test(corr.data.matrix$TotRmsAbvGrd,corr.data.matrix$SalePrice, conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  corr.data.matrix$TotRmsAbvGrd and corr.data.matrix$SalePrice
## t = 1.6, df = 4, p-value = 0.2
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.004588  0.900569
## sample estimates:
##    cor 
## 0.6263

The correlation is NOT zero. The p value is NOT acceptible and the confidence interval has a larger gap with each other than other models we will need run further work to be confident in the model.

Part 2: Linear Algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

corr.matrix <- as.matrix(corr.data.matrix)
(precisionmatrix <- solve(corr.matrix))
##               LotArea OverallQual OverallCond GrLivArea TotRmsAbvGrd
## OverallQual   0.33252     2.80353     0.07353  -0.43954      0.23295
## SalePrice    -0.41401    -2.11241    -0.01098  -1.35036      0.14210
## GrLivArea    -0.28038    -0.43954     0.05977   4.70614     -2.91955
## TotRmsAbvGrd  0.09491     0.23295    -0.01446  -2.91955      3.21577
## LotArea       1.12972     0.33252    -0.01217  -0.28038      0.09491
## OverallCond  -0.01217     0.07353     1.00977   0.05977     -0.01446
##              SalePrice
## OverallQual   -2.11241
## SalePrice      3.66031
## GrLivArea     -1.35036
## TotRmsAbvGrd   0.14210
## LotArea       -0.41401
## OverallCond   -0.01098
matrix1 <- as.matrix(corr.data.matrix) %*% as.matrix(precisionmatrix)
matrix1
##                               LotArea             OverallQual
## LotArea       1.000000000000000000000  0.00000000000000009259
## OverallQual   0.000000000000000008240  1.00000000000000044409
## OverallCond  -0.000000000000000003469 -0.00000000000000001388
## GrLivArea     0.000000000000000049006 -0.00000000000000013618
## TotRmsAbvGrd -0.000000000000000051608  0.00000000000000015786
## SalePrice    -0.000000000000000009541 -0.00000000000000003816
##                          OverallCond                GrLivArea
## LotArea      0.000000000000000000000 -0.000000000000000140133
## OverallQual  0.000000000000000000000  0.000000000000000001735
## OverallCond  1.000000000000000000000  0.000000000000000006939
## GrLivArea    0.000000000000000000000  1.000000000000000000000
## TotRmsAbvGrd 0.000000000000000006939  0.000000000000000385542
## SalePrice    0.000000000000000000000 -0.000000000000000272352
##                         TotRmsAbvGrd                SalePrice
## LotArea      -0.00000000000000012059 -0.000000000000000052530
## OverallQual   0.00000000000000002494  0.000000000000000022985
## OverallCond  -0.00000000000000001041  0.000000000000000019082
## GrLivArea     0.00000000000000004055 -0.000000000000000002711
## TotRmsAbvGrd  0.99999999999999977796 -0.000000000000000049440
## SalePrice    -0.00000000000000003903  1.000000000000000000000
matrix2 <-as.matrix(precisionmatrix) %*% as.matrix(corr.data.matrix)
matrix2
##                         OverallQual                SalePrice
## OverallQual  1.00000000000000044409  0.000000000000000444089
## SalePrice    0.00000000000000000000  0.999999999999999555911
## GrLivArea    0.00000000000000000000 -0.000000000000000222045
## TotRmsAbvGrd 0.00000000000000040246  0.000000000000000527356
## LotArea      0.00000000000000000000  0.000000000000000000000
## OverallCond  0.00000000000000001214  0.000000000000000008674
##                             GrLivArea             TotRmsAbvGrd
## OverallQual   0.000000000000000000000  0.000000000000000222045
## SalePrice    -0.000000000000000444089 -0.000000000000000444089
## GrLivArea     0.999999999999999555911  0.000000000000000333067
## TotRmsAbvGrd  0.000000000000000180411  1.000000000000000000000
## LotArea      -0.000000000000000055511 -0.000000000000000111022
## OverallCond   0.000000000000000007806  0.000000000000000008674
##                               LotArea              OverallCond
## OverallQual   0.000000000000000111022 -0.000000000000000027756
## SalePrice    -0.000000000000000222045  0.000000000000000055511
## GrLivArea    -0.000000000000000055511 -0.000000000000000013878
## TotRmsAbvGrd -0.000000000000000027756 -0.000000000000000078063
## LotArea       1.000000000000000222045  0.000000000000000006939
## OverallCond   0.000000000000000002168  1.000000000000000000000
b <- corr.data.matrix$SalePrice

LU(as.matrix(corr.data.matrix),b, verbose = FALSE)
## $P
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    0    0    0    0    0
## [2,]    0    1    0    0    0    0
## [3,]    0    0    1    0    0    0
## [4,]    0    0    0    1    0    0
## [5,]    0    0    0    0    1    0
## [6,]    0    0    0    0    0    1
## 
## $L
##         [,1]     [,2]    [,3]   [,4]   [,5] [,6]
## [1,]  1.0000  0.00000   0.000 0.0000  0.000    0
## [2,]  9.4513  1.00000   0.000 0.0000  0.000    0
## [3,] -0.8689 -0.08891   1.000 0.0000  0.000    0
## [4,]  5.6047  0.45231 -19.631 1.0000  0.000    0
## [5,]  4.0400  0.31256 -18.220 3.9593  1.000    0
## [6,]  7.4758  0.57112   9.088 0.2305 -1.317    1
## 
## $U
##      OverallQual SalePrice GrLivArea TotRmsAbvGrd LotArea OverallCond
## [1,]      0.1058    0.2638   0.26312      0.19001  1.0000   -0.005636
## [2,]      0.0000   -1.7027  -1.89378     -1.36843 -9.3455   -0.038662
## [3,]      0.0000    0.0000  -0.01945     -0.01416  0.0323    0.991665
## [4,]      0.0000    0.0000   0.00000      0.10157 -0.4805   19.436660
## [5,]      0.0000    0.0000   0.00000      0.00000  1.5620  -58.910675
## [6,]      0.0000    0.0000   0.00000      0.00000  0.0000  -91.095864
## 
## $d
##         [,1]
## [1,]  0.2638
## [2,] -1.7027
## [3,]  0.0000
## [4,]  0.0000
## [5,]  0.0000
## [6,]  0.0000
## 
## $x
##      [,1]
## [1,]    0
## [2,]    1
## [3,]    0
## [4,]    0
## [5,]    0
## [6,]    0

Part 3: Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data.

  • Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
  • Then, load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
  • Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000,\(\lambda\))).
  • Plot a histogram and compare it with a histogram of your original variable.
  • Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
  • Also, generate a 95% confidence interval from the empirical data, assuming normality.
  • Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Selecting Value and shifting to Zero
hist(train.set$LotArea,breaks = 50)

fitmodel<-train.set$LotArea
min(fitmodel)
## [1] 1300
fit <- fitdistr(fitmodel, "exponential")
fit
##       rate    
##   0.000095086 
##  (0.000002489)
Optimal Value and sample
optmodel<- fit$estimate
sim<- rexp(1000,optmodel)
hist(sim,breaks = 50)

5th and 95th percentiles (CDF)
quantile(sim, probs = c(0.05,0.95))
##    5%   95% 
##   644 28987
95% confidence (assume normality)
normality <- rnorm(length(fitmodel),mean(fitmodel),sd(fitmodel))
hist(normality)

quantile(normality, probs = c(0.05,0.95))
##    5%   95% 
## -6131 26138
normality.df <- data.frame(length = normality)
normality.df$from <- "Normality"
sim.df <- data.frame(length = sim)
sim.df$from <- "Sim"
fitmodel.df <- data.frame(length = fitmodel)
fitmodel.df$from <- "Model"

total.df <- rbind(normality.df, sim.df, fitmodel.df)

ggplot(total.df, aes(length, fill=from))+
  geom_density(alpha =0.5)

It seems the data had a lot of huge outlyers which through the model into a heavy right skew at first. When looked at when compared to the Normilization and simulation, the Model data stays very close to both of them.

Part 4:

10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

good.corr <- corr.data.df%>%
  mutate(Vname = row.names(corr.data.df))%>%
  filter(SalePrice >=.3)%>%
  dplyr::select(Vname,SalePrice) 

train.frame <- dplyr::select(corr.df, good.corr$Vname)%>%
  mutate(YearBuilt.m = 2017-YearBuilt)%>%
  dplyr::select(-YearBuilt)

train.model <- lm(SalePrice ~ .,data = train.frame)
summary(train.model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train.frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515923  -17443   -1977   14269  288868 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  -703322.67  122643.83   -5.73      0.0000000118802 ***
## OverallQual    19407.48    1173.62   16.54 < 0.0000000000000002 ***
## YearRemodAdd     323.37      62.26    5.19      0.0000002358310 ***
## BsmtFinSF1        18.32       2.60    7.04      0.0000000000029 ***
## TotalBsmtSF       12.42       4.33    2.87              0.00418 ** 
## `1stFlrSF`        32.00      20.96    1.53              0.12710    
## `2ndFlrSF`        23.65      20.61    1.15              0.25135    
## GrLivArea         17.73      20.52    0.86              0.38758    
## FullBath       -2423.25    2639.40   -0.92              0.35872    
## TotRmsAbvGrd    1668.36    1097.28    1.52              0.12862    
## Fireplaces      7697.14    1791.53    4.30      0.0000185196269 ***
## GarageCars     10216.91    2985.56    3.42              0.00064 ***
## GarageArea        12.89      10.12    1.27              0.20310    
## WoodDeckSF        31.75       8.18    3.88              0.00011 ***
## OpenPorchSF        6.31      15.77    0.40              0.68898    
## YearBuilt.m     -191.59      49.72   -3.85              0.00012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36700 on 1444 degrees of freedom
## Multiple R-squared:  0.789,  Adjusted R-squared:  0.786 
## F-statistic:  359 on 15 and 1444 DF,  p-value: <0.0000000000000002
plot(train.model)

par(mfrow = c(2,3))
X1 <- train.frame$OverallQual
X2 <- train.frame$GrLivArea
X3 <- train.frame$TotalBsmtSF
X4 <- train.frame$GarageArea
X5 <- train.frame$TotRmsAbvGrd

Y1 <- train.frame$SalePrice


plot(X1,Y1, main = "OverAll Quality", ylab = "Sale Price")
  abline  (lm(Y1~X1),col ="Red",lwd =3)

plot(X2,Y1, main = "Ground Living Area", ylab = "Sale Price")
  abline  (lm(Y1~X2),col ="Red",lwd =3)
  
plot(X3,Y1, main = "Total Basement SF", ylab = "Sale Price")
  abline  (lm(Y1~X3),col ="Red",lwd =3)  

plot(X4,Y1, main = "Garage Area", ylab = "Sale Price")
  abline  (lm(Y1~X4),col ="Red",lwd =3)  

plot(X5,Y1, main = "Total Rooms AbvGrd", ylab = "Sale Price")
  abline  (lm(Y1~X5),col ="Red",lwd =3) 
  
train.frame <- train.frame%>%
  dplyr::select(OverallQual,GrLivArea,TotalBsmtSF,GarageArea,TotRmsAbvGrd, SalePrice)
train.model.best <- lm(SalePrice ~ .,data = train.frame)

ggpairs(
  train.frame,
  lower = list(continuous = ggally_points, combo = ggally_dot_no_facet)
  )

path2 = "https://raw.githubusercontent.com/kelloggjohnd/Data605/master/test.csv"
test.set<- read_csv(file = path2)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Id = col_double(),
##   MSSubClass = col_double(),
##   LotFrontage = col_double(),
##   LotArea = col_double(),
##   OverallQual = col_double(),
##   OverallCond = col_double(),
##   YearBuilt = col_double(),
##   YearRemodAdd = col_double(),
##   MasVnrArea = col_double(),
##   BsmtFinSF1 = col_double(),
##   BsmtFinSF2 = col_double(),
##   BsmtUnfSF = col_double(),
##   TotalBsmtSF = col_double(),
##   `1stFlrSF` = col_double(),
##   `2ndFlrSF` = col_double(),
##   LowQualFinSF = col_double(),
##   GrLivArea = col_double(),
##   BsmtFullBath = col_double(),
##   BsmtHalfBath = col_double(),
##   FullBath = col_double()
##   # ... with 17 more columns
## )
## See spec(...) for full column specifications.
test.set <- test.set %>%
  dplyr::select("Id","MSSubClass","LotArea","OverallQual","OverallCond","YearBuilt","YearRemodAdd","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","1stFlrSF","2ndFlrSF","LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageYrBlt","GarageCars","GarageArea","WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch","PoolArea","MiscVal","MoSold","YrSold")%>%
  replace(is.na(.),0)
Testframe <- dplyr::select(test.set, good.corr$Vname[1:15])%>%
  mutate(YearBuilt.m = 2017-YearBuilt)%>%
  dplyr::select(-YearBuilt)
  
Test.model <- model.matrix(YearBuilt.m ~ OverallQual+ GrLivArea+ TotalBsmtSF+ GarageArea+ TotRmsAbvGrd, Testframe)%>%
  as.data.frame%>%
  dplyr::select(-'(Intercept)')

sale.prediction <- predict.lm(train.model.best,Test.model)
sale.prediction <- as.data.frame(sale.prediction)

Product <- data.frame(ID = test.set$Id, SalePrice = sale.prediction$sale.prediction)
head(Product)
##     ID SalePrice
## 1 1461    134881
## 2 1462    168721
## 3 1463    155533
## 4 1464    178303
## 5 1465    226404
## 6 1466    174023
write_csv(Product, 'submission.csv')

Kaggle User name: johnkellogg
Kaggle Score of: .65805