Prediction of House Sale Price

Loading required libraries
suppressWarnings(library(data.table))
suppressWarnings(library(corrplot))
## corrplot 0.84 loaded
suppressWarnings(library(ggplot2))
suppressWarnings(library(knitr))
suppressWarnings(library(Hmisc))
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
suppressWarnings(library(MASS))
suppressWarnings(library(forecast))

Data

## Read the training data straight from the project repository
train_url <- "https://raw.githubusercontent.com/gpsingh12/Data-605-Final-Project/master/train.csv"
houses <- fread(train_url)

## Normalise the column names: spaces become underscores, everything lower case
setnames(houses, tolower(gsub(" ", "_", names(houses))))

str(houses)
## Classes 'data.table' and 'data.frame':   1460 obs. of  81 variables:
##  $ id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ mssubclass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ mszoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ lotfrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ lotarea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ alley        : chr  NA NA NA NA ...
##  $ lotshape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ landcontour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ lotconfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ landslope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ bldgtype     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ housestyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ overallqual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ overallcond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ yearbuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ yearremodadd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ roofstyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ roofmatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ masvnrtype   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ masvnrarea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ exterqual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ extercond    : chr  "TA" "TA" "TA" "TA" ...
##  $ foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ bsmtqual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ bsmtcond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ bsmtexposure : chr  "No" "Gd" "Mn" "No" ...
##  $ bsmtfintype1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ bsmtfinsf1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ bsmtfintype2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ bsmtfinsf2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ bsmtunfsf    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ totalbsmtsf  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ heatingqc    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ centralair   : chr  "Y" "Y" "Y" "Y" ...
##  $ electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stflrsf     : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ 2ndflrsf     : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ lowqualfinsf : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ grlivarea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ bsmtfullbath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ bsmthalfbath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ fullbath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ halfbath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ bedroomabvgr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ kitchenabvgr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ kitchenqual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ totrmsabvgrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ fireplacequ  : chr  NA "TA" "TA" "Gd" ...
##  $ garagetype   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ garageyrblt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ garagefinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ garagecars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ garagearea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ garagequal   : chr  "TA" "TA" "TA" "TA" ...
##  $ garagecond   : chr  "TA" "TA" "TA" "TA" ...
##  $ paveddrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ wooddecksf   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ openporchsf  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ enclosedporch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ 3ssnporch    : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ screenporch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poolarea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poolqc       : chr  NA NA NA NA ...
##  $ fence        : chr  NA NA NA NA ...
##  $ miscfeature  : chr  NA NA NA NA ...
##  $ miscval      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ mosold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ yrsold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ saletype     : chr  "WD" "WD" "WD" "WD" ...
##  $ salecondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ saleprice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  - attr(*, ".internal.selfref")=<externalptr>
dim(houses)
## [1] 1460   81
kable(head(houses))
id mssubclass mszoning lotfrontage lotarea street alley lotshape landcontour utilities lotconfig landslope neighborhood condition1 condition2 bldgtype housestyle overallqual overallcond yearbuilt yearremodadd roofstyle roofmatl exterior1st exterior2nd masvnrtype masvnrarea exterqual extercond foundation bsmtqual bsmtcond bsmtexposure bsmtfintype1 bsmtfinsf1 bsmtfintype2 bsmtfinsf2 bsmtunfsf totalbsmtsf heating heatingqc centralair electrical 1stflrsf 2ndflrsf lowqualfinsf grlivarea bsmtfullbath bsmthalfbath fullbath halfbath bedroomabvgr kitchenabvgr kitchenqual totrmsabvgrd functional fireplaces fireplacequ garagetype garageyrblt garagefinish garagecars garagearea garagequal garagecond paveddrive wooddecksf openporchsf enclosedporch 3ssnporch screenporch poolarea poolqc fence miscfeature miscval mosold yrsold saletype salecondition saleprice
1 60 RL 65 8450 Pave NA Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NA Attchd 2003 RFn 2 548 TA TA Y 0 61 0 0 0 0 NA NA NA 0 2 2008 WD Normal 208500
2 20 RL 80 9600 Pave NA Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976 RFn 2 460 TA TA Y 298 0 0 0 0 0 NA NA NA 0 5 2007 WD Normal 181500
3 60 RL 68 11250 Pave NA IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001 RFn 2 608 TA TA Y 0 42 0 0 0 0 NA NA NA 0 9 2008 WD Normal 223500
4 70 RL 60 9550 Pave NA IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998 Unf 3 642 TA TA Y 0 35 272 0 0 0 NA NA NA 0 2 2006 WD Abnorml 140000
5 60 RL 84 14260 Pave NA IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000 RFn 3 836 TA TA Y 192 84 0 0 0 0 NA NA NA 0 12 2008 WD Normal 250000
6 50 RL 85 14115 Pave NA IR1 Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam 1.5Fin 5 5 1993 1995 Gable CompShg VinylSd VinylSd None 0 TA TA Wood Gd TA No GLQ 732 Unf 0 64 796 GasA Ex Y SBrkr 796 566 0 1362 1 0 1 1 1 1 TA 5 Typ 0 NA Attchd 1993 Unf 2 480 TA TA Y 40 30 0 320 0 0 NA MnPrv Shed 700 10 2009 WD Normal 143000
Missing Values and NA’s

Handling missing values at the beginning lets the later analysis run smoothly. We first build a data frame holding the number of NA’s in each column of houses, then cross-tabulate related columns against each other to decide whether each NA should be removed or replaced with a meaningful value. In our dataset garage type, garage finish, garage quality and garage condition are interlinked, so related columns like these are checked in pairs before their missing values are handled.

## One row per column of houses; the single column "na" holds its count of missing values
df_na <- data.frame(na = colSums(is.na(houses)))




kable(table(houses$poolqc,houses$poolarea, useNA = 'ifany'))
0 480 512 519 555 576 648 738
Ex 0 0 1 0 1 0 0 0
Fa 0 0 0 1 0 0 1 0
Gd 0 1 0 0 0 1 0 1
NA 1453 0 0 0 0 0 0 0
## All 1,453 missing poolqc values occur where poolarea is 0, so we assume
## there is no pool and record the category "None"
houses$poolqc <- replace(houses$poolqc, is.na(houses$poolqc), "None")

## Using similar technique for fire places
table(houses$fireplacequ,houses$fireplaces,useNA = 'ifany')
##       
##          0   1   2   3
##   Ex     0  19   4   1
##   Fa     0  28   4   1
##   Gd     0 324  54   2
##   Po     0  20   0   0
##   TA     0 259  53   1
##   <NA> 690   0   0   0
## Missing fireplace quality only occurs for houses with 0 fireplaces: label it "None"
houses$fireplacequ <- replace(houses$fireplacequ, is.na(houses$fireplacequ), "None")




## missing values for all columns related to the garage, assuming no garage at all
#table(houses$garagetype,houses$garageyrblt,useNA = 'ifany')


## Categorical garage descriptors: a missing value means "no garage",
## recorded as the explicit level "None"
houses$garagetype[is.na(houses$garagetype)] <- "None"
houses$garagefinish[is.na(houses$garagefinish)] <- "None"
houses$garagequal[is.na(houses$garagequal)] <- "None"
houses$garagecond[is.na(houses$garagecond)] <- "None"
## garageyrblt is an integer year. Filling it with "" would silently coerce the
## whole column to character, so an integer sentinel (0 = no garage) is used to
## keep the column numeric.
houses$garageyrblt[is.na(houses$garageyrblt)] <- 0L



## another category
table(is.na(houses$masvnrarea),is.na(houses$masvnrtype))
##        
##         FALSE TRUE
##   FALSE  1452    0
##   TRUE      0    8
## Veneer type and veneer area are missing on the same 8 rows:
## treat them as having no masonry veneer (type "None", area 0)
houses$masvnrarea <- replace(houses$masvnrarea, is.na(houses$masvnrarea), 0)
houses$masvnrtype <- replace(houses$masvnrtype, is.na(houses$masvnrtype), "None")


##for basement

table(houses$bsmtfintype1,houses$bsmtfintype2, useNA = 'ifany')
##       
##        ALQ BLQ GLQ LwQ Rec Unf <NA>
##   ALQ    0  15   2  15  22 166    0
##   BLQ    2   1   5  13  15 112    0
##   GLQ    4   2   0  10   9 392    1
##   LwQ    9   4   7   0   8  46    0
##   Rec    4  11   0   8   0 110    0
##   Unf    0   0   0   0   0 430    0
##   <NA>   0   0   0   0   0   0   37
## Missing basement finish types are recorded as the explicit level "None"
houses$bsmtfintype2 <- replace(houses$bsmtfintype2, is.na(houses$bsmtfintype2), "None")
houses$bsmtfintype1 <- replace(houses$bsmtfintype1, is.na(houses$bsmtfintype1), "None")

table(houses$bsmtqual,houses$bsmtcond, useNA = 'ifany')
##       
##         Fa  Gd  Po  TA <NA>
##   Ex     0  11   0 110    0
##   Fa     8   0   2  25    0
##   Gd     2  36   0 580    0
##   TA    35  18   0 596    0
##   <NA>   0   0   0   0   37
## Missing basement quality / condition / exposure are recorded as "None".
## bsmtexposure already contains the value "No" (a basement with no exposure);
## we assume that differs from having no basement at all, so the missing
## values get "None" rather than "No".
houses$bsmtexposure <- replace(houses$bsmtexposure, is.na(houses$bsmtexposure), "None")
houses$bsmtcond <- replace(houses$bsmtcond, is.na(houses$bsmtcond), "None")
houses$bsmtqual <- replace(houses$bsmtqual, is.na(houses$bsmtqual), "None")



### misc. columns
## Categorical columns: a missing value becomes the explicit level "None"
houses$alley <- replace(houses$alley, is.na(houses$alley), "None")
houses$fence <- replace(houses$fence, is.na(houses$fence), "None")
houses$miscfeature <- replace(houses$miscfeature, is.na(houses$miscfeature), "None")
houses$electrical <- replace(houses$electrical, is.na(houses$electrical), "None")
## Numeric column: a missing lot frontage is filled with 0
houses$lotfrontage <- replace(houses$lotfrontage, is.na(houses$lotfrontage), 0)




colSums(is.na(houses))
##            id    mssubclass      mszoning   lotfrontage       lotarea 
##             0             0             0             0             0 
##        street         alley      lotshape   landcontour     utilities 
##             0             0             0             0             0 
##     lotconfig     landslope  neighborhood    condition1    condition2 
##             0             0             0             0             0 
##      bldgtype    housestyle   overallqual   overallcond     yearbuilt 
##             0             0             0             0             0 
##  yearremodadd     roofstyle      roofmatl   exterior1st   exterior2nd 
##             0             0             0             0             0 
##    masvnrtype    masvnrarea     exterqual     extercond    foundation 
##             0             0             0             0             0 
##      bsmtqual      bsmtcond  bsmtexposure  bsmtfintype1    bsmtfinsf1 
##             0             0             0             0             0 
##  bsmtfintype2    bsmtfinsf2     bsmtunfsf   totalbsmtsf       heating 
##             0             0             0             0             0 
##     heatingqc    centralair    electrical      1stflrsf      2ndflrsf 
##             0             0             0             0             0 
##  lowqualfinsf     grlivarea  bsmtfullbath  bsmthalfbath      fullbath 
##             0             0             0             0             0 
##      halfbath  bedroomabvgr  kitchenabvgr   kitchenqual  totrmsabvgrd 
##             0             0             0             0             0 
##    functional    fireplaces   fireplacequ    garagetype   garageyrblt 
##             0             0             0             0             0 
##  garagefinish    garagecars    garagearea    garagequal    garagecond 
##             0             0             0             0             0 
##    paveddrive    wooddecksf   openporchsf enclosedporch     3ssnporch 
##             0             0             0             0             0 
##   screenporch      poolarea        poolqc         fence   miscfeature 
##             0             0             0             0             0 
##       miscval        mosold        yrsold      saletype salecondition 
##             0             0             0             0             0 
##     saleprice 
##             0
## no missing values, we can move forward with the analysis.

Another required check is to find the unique levels of the categorical variables. Categories may be entered inconsistently, for example in lower case in one row and with the first letter capitalized in another. E.g. the column misc feature has the categories

“None” “Shed” “Gar2” “Othr” “TenC”. We can check each variable to confirm the data is entered consistently: “Othr” and “othr” would correspond to the same category. A spot check was performed on a few categorical variables; the same check can be applied to any other variable required for the analysis.

## performing a random check at cat. variables.

unique(houses$extercond)
## [1] "TA" "Gd" "Fa" "Po" "Ex"
unique(houses$extercond)
## [1] "TA" "Gd" "Fa" "Po" "Ex"
unique(houses$miscval)
##  [1]     0   700   350   500   400   480   450 15500  1200   800  2000
## [12]   600  3500  1300    54   620   560  1400  8300  1150  2500
unique(houses$street)
## [1] "Pave" "Grvl"

Selecting Variables for Probability

Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X. Pick SalePrice as the dependent variable, and define it as Y for the next analysis.

The variable selected for the prediction is lot area. Variable X is the lot area and variable Y is the sale price.

## X: lot area (independent variable); Y: sale price (dependent variable)
X <- houses[["lotarea"]]
Y <- houses[["saleprice"]]

Plotting the variables.

## Histogram of sale price. Columns are mapped by bare name so that ggplot2
## looks them up in `data`; `houses$col` inside aes() bypasses `data` and is
## discouraged by ggplot2.
ggplot(data = houses, aes(x = saleprice)) +
  geom_histogram(bins = 20)

## Histogram of lot area, zoomed to 0-50,000 to keep the bulk of the data visible
ggplot(data = houses, aes(x = lotarea)) +
  geom_histogram(bins = 150) +
  coord_cartesian(xlim = c(-1000, 50000))

Probability :

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities

## x: first quartile (25th percentile) of the lot area X
x <- quantile(X, probs = 0.25)

x
##    25% 
## 7553.5
## y: second quartile (median) of the sale price Y
y <- quantile(Y, probs = 0.50)
y
##    50% 
## 163000

Probability

  a. P(X>x|Y>y) Probability of X greater than first quartile of X (7553.5 ) given that Y is greater than 2nd quartile of Y (163,000).
## Empirical probabilities as (number of matching rows) / (total rows)
## P(X > x)
p_X <- sum(houses$lotarea > x) / nrow(houses)
## P(Y > y)
p_Y <- sum(houses$saleprice > y) / nrow(houses)
## P(X > x and Y > y)
p_X_and_Y <- sum(houses$lotarea > x & houses$saleprice > y) / nrow(houses)

## Conditional probability P(X > x | Y > y) = P(X > x and Y > y) / P(Y > y)
p_X_given_Y <- p_X_and_Y / p_Y
p_X_given_Y
## [1] 0.8653846
  b. P(X>x & Y>y) Probability that X is greater than first quartile of X (7553.5 ) and Y (sale price) is greater than second quartile of Y (163,000)
p_X_and_Y
## [1] 0.4315068

c. P(X<x|Y>y) Probability of X less than first quartile of X given Y is greater than second quartile of Y.

## P(X < x and Y > y): small lot and above-median price
p_X_and_Y <- sum(houses$lotarea < x & houses$saleprice > y) / nrow(houses)
## P(Y > y)
p_Y <- sum(houses$saleprice > y) / nrow(houses)

## Conditional probability P(X < x | Y > y)
p_X_given_Y <- p_X_and_Y / p_Y
p_X_given_Y
## [1] 0.1346154

Does splitting the training data in this fashion make them independent? In other words, does P(X|Y)=P(X)P(Y))? Check mathematically, and then evaluate by running a Chi Square test for association.

For independence :

P(XY)= P(X)P(Y) or

P(X|Y) = P(X)

## Independence check for the events {X > x} and {Y > y}: P(X>x & Y>y) == P(X>x) * P(Y>y)?
## p_X_and_Y was reassigned above to P(X < x & Y > y), so the joint probability
## for {X > x} is recomputed here to match p_X = P(X > x).
p_X_and_Y <- sum(houses$lotarea > x & houses$saleprice > y) / nrow(houses)
## Compare with a numeric tolerance: `==` on floating-point values is unreliable.
isTRUE(all.equal(p_X_and_Y, p_X * p_Y))
## [1] FALSE

The variables are not independent mathematically, since P(XY) is not equal to P(X)P(Y). We will perform a chi-square test to check this conclusion.

Chi-Square Test

H0 : Sale Price and Lot Area are independent

Ha : Sale Price and Lot Area are not independent

## Test association on the quartile split defined above (X > x versus Y > y).
## Passing the raw saleprice and lotarea vectors cross-tabulates every distinct
## value, giving a huge sparse table (df = 709660) for which the chi-squared
## approximation is not valid.
split_table <- table(lot_above_q1 = houses$lotarea > x,
                     price_above_median = houses$saleprice > y)
chisq.test(split_table)
## (re-knit the document to regenerate the printed test result)

The p-value is very small, so we reject the null hypothesis. The variables are dependent: lot area does affect the sale price of a house.

Descriptive and Inferential Statistics :

Provide univariate descriptive statistics and appropriate plots for both variables. Provide a scatterplot of X and Y. Transform both variables simultaneously using Box-Cox transformations. You might have to research this.

describe(houses$saleprice)
## houses$saleprice 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        0      663        1   180921    81086    88000   106475 
##      .25      .50      .75      .90      .95 
##   129975   163000   214000   278000   326100 
## 
## lowest :  34900  35311  37900  39300  40000, highest: 582933 611657 625000 745000 755000
describe(houses$overallqual)
## houses$overallqual 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        0       10    0.951    6.099    1.522        4        5 
##      .25      .50      .75      .90      .95 
##        5        6        7        8        8 
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency      2     3    20   116   397   374   319   168    43    18
## Proportion 0.001 0.002 0.014 0.079 0.272 0.256 0.218 0.115 0.029 0.012
summary(houses$saleprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
summary(houses$lotarea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
## Box-Cox transformation of both variables towards normality.
## BoxCox.lambda() estimates the transformation parameter, BoxCox() applies it.
lambda_X <- BoxCox.lambda(X)
trans_X <- BoxCox(X, lambda_X)

lambda_Y <- BoxCox.lambda(Y)
trans_Y <- BoxCox(Y, lambda_Y)

## Keep both transformed variables together for plotting
trans_df <- data.frame(trans_X, trans_Y)

hist(trans_X)

## Columns are mapped by bare name so ggplot2 resolves them inside `data`
ggplot(data = trans_df, aes(x = trans_X)) +
  geom_histogram(bins = 30)

ggplot(data = trans_df, aes(x = trans_Y)) +
  geom_histogram(bins = 30)

## 

Link: boxcox

Linear Algebra and Correlation :

Using at least three untransformed variables, build a correlation matrix. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix

## Correlation matrix of three untransformed variables
houses1 <- houses[, c("lotarea", "overallqual", "saleprice"), with = FALSE]
cor_matrix <- cor(houses1)

## Visualise it twice: as coloured squares and as printed coefficients
corrplot(cor_matrix, method = "square")

corrplot(cor_matrix, method = "number")

## Precision matrix = inverse of the correlation matrix. It is kept at full
## precision and rounded only for display: rounding before multiplying
## prevents the products below from reproducing the identity matrix.
prec_matrix <- solve(cor_matrix)
round(prec_matrix, 1)
##             lotarea overallqual saleprice
## lotarea         1.1         0.3      -0.5
## overallqual     0.3         2.8      -2.3
## saleprice      -0.5        -2.3       2.9
## Correlation x precision and precision x correlation: both give the identity
prec_cor <- cor_matrix %*% prec_matrix
cor_prec <- prec_matrix %*% cor_matrix
round(prec_cor, 1)
##             lotarea overallqual saleprice
## lotarea           1           0         0
## overallqual       0           1         0
## saleprice         0           0         1
round(cor_prec, 1)
##             lotarea overallqual saleprice
## lotarea           1           0         0
## overallqual       0           1         0
## saleprice         0           0         1
Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For your non-transformed independent variable, location shift (if necessary) it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit a density function of your choice. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of the parameters for this distribution, and then take 1000 samples from this distribution (e.g., rexp(1000, ???) for an exponential). Plot a histogram and compare it with a histogram of your non-transformed original variable

min(houses$lotarea)
## [1] 1300
## minimum value above zero, no location shift required


## Fit a normal density to lot area and keep its two estimated parameters
houses_fit <- fitdistr(houses$lotarea, densfun = "normal")
mean_fit <- houses_fit$estimate["mean"]
sd_fit <- houses_fit$estimate["sd"]
mean_fit
##     mean 
## 10516.83
sd_fit
##       sd 
## 9977.846
## Draw 1000 values from the fitted normal distribution
houses_sample <- rnorm(n = 1000, mean = mean_fit, sd = sd_fit)


## Simulated sample versus the original lot areas
hist(houses_sample)

hist(houses$lotarea)

##nearly normal compared to original
Modeling:

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Multiple regression model for prediction: 14 variables were selected from the dataset for the model. In order to avoid noise, only the variables that (in general) influence the sale price of houses were selected.

## Response (saleprice) plus the 14 candidate predictors
houses_cols <- houses[
  ,
  c(
    "saleprice", "mssubclass", "lotarea", "lotfrontage", "overallcond",
    "overallqual", "grlivarea", "mosold", "yearbuilt", "yearremodadd",
    "bedroomabvgr", "fullbath", "garagearea", "garagecars", "poolarea"
  ),
  with = FALSE
]
## Full model: regress saleprice on every other column of houses_cols
model <- lm(saleprice ~ ., data = houses_cols)

summary(model)
## 
## Call:
## lm(formula = saleprice ~ ., data = houses_cols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -443561  -18954   -2210   15141  277561 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.182e+06  1.261e+05  -9.376  < 2e-16 ***
## mssubclass   -2.120e+02  2.445e+01  -8.671  < 2e-16 ***
## lotarea       6.576e-01  1.024e-01   6.421 1.83e-10 ***
## lotfrontage   1.537e+01  2.982e+01   0.515  0.60646    
## overallcond   5.087e+03  1.047e+03   4.859 1.31e-06 ***
## overallqual   1.997e+04  1.159e+03  17.235  < 2e-16 ***
## grlivarea     6.986e+01  3.331e+00  20.975  < 2e-16 ***
## mosold       -1.368e+02  3.586e+02  -0.381  0.70291    
## yearbuilt     4.720e+02  5.453e+01   8.656  < 2e-16 ***
## yearremodadd  9.751e+01  6.776e+01   1.439  0.15038    
## bedroomabvgr -1.097e+04  1.525e+03  -7.194 1.00e-12 ***
## fullbath     -1.999e+03  2.643e+03  -0.756  0.44966    
## garagearea    1.229e+01  9.985e+00   1.231  0.21863    
## garagecars    9.646e+03  2.947e+03   3.273  0.00109 ** 
## poolarea     -2.268e+01  2.465e+01  -0.920  0.35774    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36840 on 1445 degrees of freedom
## Multiple R-squared:  0.787,  Adjusted R-squared:  0.785 
## F-statistic: 381.5 on 14 and 1445 DF,  p-value: < 2.2e-16

R-squared value is 0.78. We will remove the variables with high p-value and update the model using backward elimination.

## Residuals against fitted values, with a horizontal reference line at zero
ggplot(data = model, mapping = aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))

## Normal Q-Q plot of the residuals
qqnorm(resid(model))
qqline(resid(model))

## Backward elimination: drop the six predictors with high p-values and refit
model <- update(
  model,
  . ~ . - mosold - lotfrontage - fullbath - poolarea - yearremodadd - garagearea,
  data = houses_cols
)

summary(model)
## 
## Call:
## lm(formula = saleprice ~ mssubclass + lotarea + overallcond + 
##     overallqual + grlivarea + yearbuilt + bedroomabvgr + garagecars, 
##     data = houses_cols)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -438879  -19210   -2140   15409  276785 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.045e+06  8.999e+04 -11.615  < 2e-16 ***
## mssubclass   -2.206e+02  2.342e+01  -9.418  < 2e-16 ***
## lotarea       6.536e-01  1.021e-01   6.400 2.10e-10 ***
## overallcond   5.686e+03  9.574e+02   5.940 3.57e-09 ***
## overallqual   2.018e+04  1.140e+03  17.713  < 2e-16 ***
## grlivarea     6.994e+01  3.029e+00  23.087  < 2e-16 ***
## yearbuilt     4.981e+02  4.604e+01  10.819  < 2e-16 ***
## bedroomabvgr -1.160e+04  1.466e+03  -7.912 4.99e-15 ***
## garagecars    1.264e+04  1.765e+03   7.164 1.24e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36830 on 1451 degrees of freedom
## Multiple R-squared:  0.7863, Adjusted R-squared:  0.7851 
## F-statistic: 667.3 on 8 and 1451 DF,  p-value: < 2.2e-16
## Residuals against fitted values for the reduced model, with a zero reference line
ggplot(data = model, mapping = aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))

## Normal Q-Q plot of the reduced model's residuals
qqnorm(resid(model))
qqline(resid(model))

Updating the model with backward elimination by removing the variables with high p-values does not improve the model: R-squared is still 0.78. The residuals do not follow any pattern; they are clustered around the line, which supports the fit of the model. A closer look at the Q-Q plot shows approximate normality, with only a few outliers at the ends.