suppressWarnings(library(data.table))
suppressWarnings(library(corrplot))
## corrplot 0.84 loaded
suppressWarnings(library(ggplot2))
suppressWarnings(library(knitr))
suppressWarnings(library(Hmisc))
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
suppressWarnings(library(MASS))
suppressWarnings(library(forecast))
# Load the Kaggle house-prices training set from the project repository
# (requires network access).
houses <- fread("https://raw.githubusercontent.com/gpsingh12/Data-605-Final-Project/master/train.csv")
# Normalise column names: lower case, spaces replaced by underscores.
# setnames() renames by reference, which is the data.table idiom; `names<-`
# goes through a copy of the table instead.
setnames(houses, tolower(gsub(" ", "_", names(houses))))
# Inspect the type and first values of every column.
str(houses)
## Classes 'data.table' and 'data.frame': 1460 obs. of 81 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ mssubclass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ mszoning : chr "RL" "RL" "RL" "RL" ...
## $ lotfrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ lotarea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ alley : chr NA NA NA NA ...
## $ lotshape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ landcontour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ lotconfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ landslope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ bldgtype : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ housestyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ overallqual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ overallcond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ yearbuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ yearremodadd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ roofstyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ roofmatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ masvnrtype : chr "BrkFace" "None" "BrkFace" "None" ...
## $ masvnrarea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ exterqual : chr "Gd" "TA" "Gd" "TA" ...
## $ extercond : chr "TA" "TA" "TA" "TA" ...
## $ foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ bsmtqual : chr "Gd" "Gd" "Gd" "TA" ...
## $ bsmtcond : chr "TA" "TA" "TA" "Gd" ...
## $ bsmtexposure : chr "No" "Gd" "Mn" "No" ...
## $ bsmtfintype1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ bsmtfinsf1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ bsmtfintype2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ bsmtfinsf2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ bsmtunfsf : int 150 284 434 540 490 64 317 216 952 140 ...
## $ totalbsmtsf : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ heatingqc : chr "Ex" "Ex" "Ex" "Gd" ...
## $ centralair : chr "Y" "Y" "Y" "Y" ...
## $ electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stflrsf : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ 2ndflrsf : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ lowqualfinsf : int 0 0 0 0 0 0 0 0 0 0 ...
## $ grlivarea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ bsmtfullbath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ bsmthalfbath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ fullbath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ halfbath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ bedroomabvgr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ kitchenabvgr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ kitchenqual : chr "Gd" "TA" "Gd" "Gd" ...
## $ totrmsabvgrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ fireplacequ : chr NA "TA" "TA" "Gd" ...
## $ garagetype : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ garageyrblt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ garagefinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ garagecars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ garagearea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ garagequal : chr "TA" "TA" "TA" "TA" ...
## $ garagecond : chr "TA" "TA" "TA" "TA" ...
## $ paveddrive : chr "Y" "Y" "Y" "Y" ...
## $ wooddecksf : int 0 298 0 0 192 40 255 235 90 0 ...
## $ openporchsf : int 61 0 42 35 84 30 57 204 0 4 ...
## $ enclosedporch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ 3ssnporch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ screenporch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poolarea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poolqc : chr NA NA NA NA ...
## $ fence : chr NA NA NA NA ...
## $ miscfeature : chr NA NA NA NA ...
## $ miscval : int 0 0 0 0 0 700 0 350 0 0 ...
## $ mosold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ yrsold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ saletype : chr "WD" "WD" "WD" "WD" ...
## $ salecondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ saleprice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Size of the training set as (rows, columns).
c(nrow(houses), ncol(houses))
## [1] 1460 81
# First six rows rendered as a formatted table.
kable(head(houses, n = 6L))
| id | mssubclass | mszoning | lotfrontage | lotarea | street | alley | lotshape | landcontour | utilities | lotconfig | landslope | neighborhood | condition1 | condition2 | bldgtype | housestyle | overallqual | overallcond | yearbuilt | yearremodadd | roofstyle | roofmatl | exterior1st | exterior2nd | masvnrtype | masvnrarea | exterqual | extercond | foundation | bsmtqual | bsmtcond | bsmtexposure | bsmtfintype1 | bsmtfinsf1 | bsmtfintype2 | bsmtfinsf2 | bsmtunfsf | totalbsmtsf | heating | heatingqc | centralair | electrical | 1stflrsf | 2ndflrsf | lowqualfinsf | grlivarea | bsmtfullbath | bsmthalfbath | fullbath | halfbath | bedroomabvgr | kitchenabvgr | kitchenqual | totrmsabvgrd | functional | fireplaces | fireplacequ | garagetype | garageyrblt | garagefinish | garagecars | garagearea | garagequal | garagecond | paveddrive | wooddecksf | openporchsf | enclosedporch | 3ssnporch | screenporch | poolarea | poolqc | fence | miscfeature | miscval | mosold | yrsold | saletype | salecondition | saleprice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 60 | RL | 65 | 8450 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NA | Attchd | 2003 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2008 | WD | Normal | 208500 |
| 2 | 20 | RL | 80 | 9600 | Pave | NA | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 5 | 2007 | WD | Normal | 181500 |
| 3 | 60 | RL | 68 | 11250 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 9 | 2008 | WD | Normal | 223500 |
| 4 | 70 | RL | 60 | 9550 | Pave | NA | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NA | NA | NA | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 5 | 60 | RL | 84 | 14260 | Pave | NA | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NA | NA | NA | 0 | 12 | 2008 | WD | Normal | 250000 |
| 6 | 50 | RL | 85 | 14115 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | 1.5Fin | 5 | 5 | 1993 | 1995 | Gable | CompShg | VinylSd | VinylSd | None | 0 | TA | TA | Wood | Gd | TA | No | GLQ | 732 | Unf | 0 | 64 | 796 | GasA | Ex | Y | SBrkr | 796 | 566 | 0 | 1362 | 1 | 0 | 1 | 1 | 1 | 1 | TA | 5 | Typ | 0 | NA | Attchd | 1993 | Unf | 2 | 480 | TA | TA | Y | 40 | 30 | 0 | 320 | 0 | 0 | NA | MnPrv | Shed | 700 | 10 | 2009 | WD | Normal | 143000 |
Missing values are handled at the beginning so that the analysis runs smoothly later on. A data frame of per-column NA counts is created for `houses`, and related columns are tabulated against each other to decide whether each NA should be removed or replaced with a suitable value. In our dataset the garage columns (for example garage type and garage condition) are interlinked, so such columns are paired up to check this and to handle their missing values together.
# Per-column count of missing values, kept as a one-column data frame named
# `na` (row names are the column names of `houses`).
df_na <- data.frame(na = colSums(is.na(houses)))
# Cross-tabulate pool quality against pool area, showing NA as its own level.
kable(table(houses[["poolqc"]], houses[["poolarea"]], useNA = "ifany"))
| 0 | 480 | 512 | 519 | 555 | 576 | 648 | 738 | |
|---|---|---|---|---|---|---|---|---|
| Ex | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| Fa | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| Gd | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| NA | 1453 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
## All 1,453 missing pool-quality values have poolarea 0, so they are taken to
## mean "no pool" and recoded to "None".
houses$poolqc <- replace(houses$poolqc, is.na(houses$poolqc), "None")
## Same cross-check for fireplaces: quality against number of fireplaces.
table(houses[["fireplacequ"]], houses[["fireplaces"]], useNA = "ifany")
##
## 0 1 2 3
## Ex 0 19 4 1
## Fa 0 28 4 1
## Gd 0 324 54 2
## Po 0 20 0 0
## TA 0 259 53 1
## <NA> 690 0 0 0
## Missing fireplace quality coincides with zero fireplaces (see the table
## above), so it is recoded to "None".
houses$fireplacequ[is.na(houses$fireplacequ)] <- "None"
## Missing values in the garage columns are assumed to mean "no garage at all".
# table(houses$garagetype, houses$garageyrblt, useNA = "ifany")
houses$garagetype[is.na(houses$garagetype)] <- "None"
## garageyrblt is an integer year. Assigning "" would silently coerce the whole
## column to character, so an integer 0 sentinel is used for "no garage"
## instead (the same convention this script uses for masvnrarea and
## lotfrontage).
houses$garageyrblt[is.na(houses$garageyrblt)] <- 0L
houses$garagecond[is.na(houses$garagecond)] <- "None"
houses$garagefinish[is.na(houses$garagefinish)] <- "None"
houses$garagequal[is.na(houses$garagequal)] <- "None"
## Masonry veneer: check whether area and type are missing on the same rows.
table(is.na(houses[["masvnrarea"]]), is.na(houses[["masvnrtype"]]))
##
## FALSE TRUE
## FALSE 1452 0
## TRUE 0 8
## The same 8 rows miss both values: treat them as "no veneer" (area 0).
houses$masvnrarea <- replace(houses$masvnrarea, is.na(houses$masvnrarea), 0)
houses$masvnrtype <- replace(houses$masvnrtype, is.na(houses$masvnrtype), "None")
## Basement: cross-tabulate the two finish-type columns, NA shown as a level.
table(houses[["bsmtfintype1"]], houses[["bsmtfintype2"]], useNA = "ifany")
##
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## ALQ 0 15 2 15 22 166 0
## BLQ 2 1 5 13 15 112 0
## GLQ 4 2 0 10 9 392 1
## LwQ 9 4 7 0 8 46 0
## Rec 4 11 0 8 0 110 0
## Unf 0 0 0 0 0 430 0
## <NA> 0 0 0 0 0 0 37
## Missing finish types are recoded to "None" (no basement).
houses$bsmtfintype1 <- replace(houses$bsmtfintype1, is.na(houses$bsmtfintype1), "None")
houses$bsmtfintype2 <- replace(houses$bsmtfintype2, is.na(houses$bsmtfintype2), "None")
## Basement quality against basement condition.
table(houses[["bsmtqual"]], houses[["bsmtcond"]], useNA = "ifany")
##
## Fa Gd Po TA <NA>
## Ex 0 11 0 110 0
## Fa 8 0 2 25 0
## Gd 2 36 0 580 0
## TA 35 18 0 596 0
## <NA> 0 0 0 0 37
houses$bsmtqual <- replace(houses$bsmtqual, is.na(houses$bsmtqual), "None")
houses$bsmtcond <- replace(houses$bsmtcond, is.na(houses$bsmtcond), "None")
## "No" is already a valid bsmtexposure level (a basement with no exposure), so
## missing values are recoded to "None" to keep "no basement at all" distinct.
houses$bsmtexposure <- replace(houses$bsmtexposure, is.na(houses$bsmtexposure), "None")
### Remaining columns: missing categories become "None", missing lot frontage 0.
houses$alley <- replace(houses$alley, is.na(houses$alley), "None")
houses$fence <- replace(houses$fence, is.na(houses$fence), "None")
houses$miscfeature <- replace(houses$miscfeature, is.na(houses$miscfeature), "None")
houses$electrical <- replace(houses$electrical, is.na(houses$electrical), "None")
houses$lotfrontage <- replace(houses$lotfrontage, is.na(houses$lotfrontage), 0)
### Re-count missing values per column to confirm that none remain.
colSums(is.na(houses))
## id mssubclass mszoning lotfrontage lotarea
## 0 0 0 0 0
## street alley lotshape landcontour utilities
## 0 0 0 0 0
## lotconfig landslope neighborhood condition1 condition2
## 0 0 0 0 0
## bldgtype housestyle overallqual overallcond yearbuilt
## 0 0 0 0 0
## yearremodadd roofstyle roofmatl exterior1st exterior2nd
## 0 0 0 0 0
## masvnrtype masvnrarea exterqual extercond foundation
## 0 0 0 0 0
## bsmtqual bsmtcond bsmtexposure bsmtfintype1 bsmtfinsf1
## 0 0 0 0 0
## bsmtfintype2 bsmtfinsf2 bsmtunfsf totalbsmtsf heating
## 0 0 0 0 0
## heatingqc centralair electrical 1stflrsf 2ndflrsf
## 0 0 0 0 0
## lowqualfinsf grlivarea bsmtfullbath bsmthalfbath fullbath
## 0 0 0 0 0
## halfbath bedroomabvgr kitchenabvgr kitchenqual totrmsabvgrd
## 0 0 0 0 0
## functional fireplaces fireplacequ garagetype garageyrblt
## 0 0 0 0 0
## garagefinish garagecars garagearea garagequal garagecond
## 0 0 0 0 0
## paveddrive wooddecksf openporchsf enclosedporch 3ssnporch
## 0 0 0 0 0
## screenporch poolarea poolqc fence miscfeature
## 0 0 0 0 0
## miscval mosold yrsold saletype salecondition
## 0 0 0 0 0
## saleprice
## 0
## no missing values, we can move forward with the analysis.
Another required check is to find the unique levels of the categorical variables. Categories are sometimes entered inconsistently, e.g. all in lower case or with only the first letter capitalized. For example, the column misc feature has the categories
“None” “Shed” “Gar2” “Othr” “TenC”. We can check each variable to see whether its data is consistent: “Othr” and “othr” might correspond to the same category. A random check was performed on a few categorical variables; the same check can be applied to whichever variables are required for the analysis.
## Spot-check the distinct values of a few columns for inconsistent spellings.
unique(houses[["extercond"]])
## [1] "TA" "Gd" "Fa" "Po" "Ex"
## NOTE(review): extercond is checked a second time here; possibly another
## column was intended. Confirm.
unique(houses[["extercond"]])
## [1] "TA" "Gd" "Fa" "Po" "Ex"
unique(houses[["miscval"]])
## [1] 0 700 350 500 400 480 450 15500 1200 800 2000
## [12] 600 3500 1300 54 620 560 1400 8300 1150 2500
unique(houses[["street"]])
## [1] "Pave" "Grvl"
Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X. Pick SalePrice as the dependent variable, and define it as Y for the next analysis.
The variable selected for the prediction is lot area. Variable X will be the lot area and variable Y is the sale price.
# Y: sale price (dependent variable); X: lot area (independent variable).
Y <- houses[["saleprice"]]
X <- houses[["lotarea"]]
Plotting the variables.
# Histograms of the two variables. Columns are mapped by bare name inside
# aes(); writing `houses$...` there bypasses the `data` argument, which
# ggplot2 discourages.
ggplot(data = houses, aes(x = saleprice)) +
  geom_histogram(bins = 20)
# Zoom the x-axis to -1000..50000 so the bulk of the lot areas is visible.
ggplot(data = houses, aes(x = lotarea)) +
  geom_histogram(bins = 150) +
  coord_cartesian(xlim = c(-1000, 50000))
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities
## x: first quartile (25th percentile) of X
x <- quantile(X, probs = 0.25)
x
## 25%
## 7553.5
## y: second quartile (median) of Y
y <- quantile(Y, probs = 0.50)
y
## 50%
## 163000
## Relative frequencies of the events X > x and Y > y over all houses.
n_houses <- nrow(houses)
above_x <- houses$lotarea > x
above_y <- houses$saleprice > y
p_X <- sum(above_x) / n_houses
p_X_and_Y <- sum(above_x & above_y) / n_houses
p_Y <- sum(above_y) / n_houses
## Conditional probability P(X > x | Y > y) = P(X > x, Y > y) / P(Y > y)
p_X_given_Y <- p_X_and_Y / p_Y
p_X_given_Y
## [1] 0.8653846
## Joint probability P(X > x, Y > y)
p_X_and_Y
## [1] 0.4315068
c. P(X < x | Y > y)
## P(Y > y) and the joint P(X < x, Y > y); these overwrite the variables of the
## same name computed earlier.
p_Y <- sum(houses$saleprice > y) / nrow(houses)
p_X_and_Y <- sum(houses$lotarea < x & houses$saleprice > y) / nrow(houses)
## Conditional probability P(X < x | Y > y)
p_X_given_Y <- p_X_and_Y / p_Y
p_X_given_Y
## [1] 0.1346154
Does splitting the training data in this fashion make them independent? In other words, does P(X|Y)=P(X)P(Y)? Check mathematically, and then evaluate by running a Chi Square test for association.
For independence :
P(XY)= P(X)P(Y) or
P(X|Y) = P(X)
# Independence requires P(X > x, Y > y) to equal P(X > x) * P(Y > y).
# Both sides are recomputed from the data here because `p_X_and_Y` was last
# assigned P(X < x, Y > y), which is a different event than the one `p_X`
# describes. The comparison uses all.equal() because exact `==` on
# floating-point values is unreliable.
p_joint <- mean(houses$lotarea > x & houses$saleprice > y)
p_product <- mean(houses$lotarea > x) * mean(houses$saleprice > y)
isTRUE(all.equal(p_joint, p_product))
## [1] FALSE
The variables are not independent mathematically, because P(XY) differs from P(X)P(Y). We will perform a chi-square test to confirm this.
H0 : Sale Price and Lot Area are independent
Ha : Sale Price and Lot Area are not independent
# Test association on the 2 x 2 split that the probabilities above are based on
# (lot area above its first quartile vs. sale price above its median).
# Passing the raw columns makes chisq.test() treat every distinct value as its
# own category (df = 709660), which is why R warns that the approximation may
# be incorrect.
# NOTE: the output recorded below came from the earlier raw-column call and
# needs to be regenerated.
chisq.test(table(
  lot_area_above_q1 = houses$lotarea > x,
  sale_price_above_median = houses$saleprice > y
))
## Warning in chisq.test(houses$saleprice, houses$lotarea): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: houses$saleprice and houses$lotarea
## X-squared = 735090, df = 709660, p-value < 2.2e-16
The p-value is very small, so we reject the null hypothesis. The variables are dependent: lot area does affect the sale price of a house.
Provide univariate descriptive statistics and appropriate plots for both variables. Provide a scatterplot of X and Y. Transform both variables simultaneously using Box-Cox transformations. You might have to research this.
# Univariate descriptive statistics for sale price (Y) via Hmisc::describe():
# counts of observations/missing/distinct values, mean and quantiles.
describe(houses$saleprice)
## houses$saleprice
## n missing distinct Info Mean Gmd .05 .10
## 1460 0 663 1 180921 81086 88000 106475
## .25 .50 .75 .90 .95
## 129975 163000 214000 278000 326100
##
## lowest : 34900 35311 37900 39300 40000, highest: 582933 611657 625000 745000 755000
# Univariate descriptive statistics for overall quality.
# NOTE(review): X was defined as lot area, not overallqual; confirm that this
# is the intended variable.
describe(houses$overallqual)
## houses$overallqual
## n missing distinct Info Mean Gmd .05 .10
## 1460 0 10 0.951 6.099 1.522 4 5
## .25 .50 .75 .90 .95
## 5 6 7 8 8
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 2 3 20 116 397 374 319 168 43 18
## Proportion 0.001 0.002 0.014 0.079 0.272 0.256 0.218 0.115 0.029 0.012
# Min, quartiles, median, mean and max of sale price (Y).
summary(houses[["saleprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Same summary for lot area (X).
summary(houses[["lotarea"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
## Box-Cox transform both variables towards normality: BoxCox.lambda() selects
## the transformation parameter and BoxCox() applies it.
lambda_X <- BoxCox.lambda(X)
trans_X <- BoxCox(X, lambda_X)
lambda_Y <- BoxCox.lambda(Y)
trans_Y <- BoxCox(Y, lambda_Y)
trans_df <- data.frame(trans_X, trans_Y)
hist(trans_X)
## Columns are mapped by bare name inside aes(); `trans_df$...` there would
## bypass the `data` argument.
ggplot(data = trans_df, aes(x = trans_X)) + geom_histogram(bins = 30)
ggplot(data = trans_df, aes(x = trans_Y)) + geom_histogram(bins = 30)
##
Link: boxcox
Using at least three untransformed variables, build a correlation matrix. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix
# Three untransformed variables used for the correlation matrix.
corr_vars <- c("lotarea", "overallqual", "saleprice")
houses1 <- houses[, corr_vars, with = FALSE]
cor_matrix <- cor(houses1)
# Draw the correlations first as coloured squares, then as numbers.
corrplot(cor_matrix, method = "square")
corrplot(cor_matrix, method = "number")
## Precision matrix = inverse of the correlation matrix. It is kept at full
## precision: rounding it to one decimal before multiplying makes the products
## drift away from the identity matrix. Rounding is applied only for display.
prec_matrix <- solve(cor_matrix)
round(prec_matrix, 1)
## lotarea overallqual saleprice
## lotarea 1.1 0.3 -0.5
## overallqual 0.3 2.8 -2.3
## saleprice -0.5 -2.3 2.9
## Correlation matrix times precision matrix: the identity matrix.
prec_cor <- round(cor_matrix %*% prec_matrix, 1)
prec_cor
## lotarea overallqual saleprice
## lotarea 1 0 0
## overallqual 0 1 0
## saleprice 0 0 1
## Precision matrix times correlation matrix: also the identity matrix.
cor_prec <- round(prec_matrix %*% cor_matrix, 1)
cor_prec
## lotarea overallqual saleprice
## lotarea 1 0 0
## overallqual 0 1 0
## saleprice 0 0 1
Many times, it makes sense to fit a closed form distribution to data. For your non-transformed independent variable, location shift (if necessary) it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit a density function of your choice. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of the parameters for this distribution, and then take 1000 samples from this distribution (e.g., rexp(1000, ???) for an exponential). Plot a histogram and compare it with a histogram of your non-transformed original variable
## Smallest lot area: a location shift is only needed if it is not above zero.
min(houses[["lotarea"]])
## [1] 1300
## The minimum is above zero, so no location shift is required.
## Fit a normal density to lot area with MASS::fitdistr and keep its estimates.
houses_fit <- fitdistr(houses[["lotarea"]], densfun = "normal")
fit_params <- houses_fit$estimate
mean_fit <- fit_params["mean"]
sd_fit <- fit_params["sd"]
mean_fit
## mean
## 10516.83
sd_fit
## sd
## 9977.846
## Fix the RNG seed so the simulated sample and its histogram are reproducible.
set.seed(605)
## Draw 1000 values from the fitted normal distribution.
houses_sample <- rnorm(1000, mean = mean_fit, sd = sd_fit)
hist(houses_sample)
hist(houses$lotarea)
## The simulated sample is roughly symmetric, as expected for normal draws,
## whereas the original lot areas are strongly right-skewed (median 9478,
## maximum 215245).
Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Multiple regression model for prediction: 14 variables were selected from the dataset for the model. In order to avoid noise, the variables that (in general) influence the sale price of houses were selected.
## Response (saleprice) plus the 14 candidate predictors.
model_vars <- c(
  "saleprice", "mssubclass", "lotarea", "lotfrontage", "overallcond",
  "overallqual", "grlivarea", "mosold", "yearbuilt", "yearremodadd",
  "bedroomabvgr", "fullbath", "garagearea", "garagecars", "poolarea"
)
houses_cols <- houses[, model_vars, with = FALSE]
## Linear regression of sale price on every other selected column.
model <- lm(saleprice ~ ., data = houses_cols)
summary(model)
##
## Call:
## lm(formula = saleprice ~ ., data = houses_cols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -443561 -18954 -2210 15141 277561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.182e+06 1.261e+05 -9.376 < 2e-16 ***
## mssubclass -2.120e+02 2.445e+01 -8.671 < 2e-16 ***
## lotarea 6.576e-01 1.024e-01 6.421 1.83e-10 ***
## lotfrontage 1.537e+01 2.982e+01 0.515 0.60646
## overallcond 5.087e+03 1.047e+03 4.859 1.31e-06 ***
## overallqual 1.997e+04 1.159e+03 17.235 < 2e-16 ***
## grlivarea 6.986e+01 3.331e+00 20.975 < 2e-16 ***
## mosold -1.368e+02 3.586e+02 -0.381 0.70291
## yearbuilt 4.720e+02 5.453e+01 8.656 < 2e-16 ***
## yearremodadd 9.751e+01 6.776e+01 1.439 0.15038
## bedroomabvgr -1.097e+04 1.525e+03 -7.194 1.00e-12 ***
## fullbath -1.999e+03 2.643e+03 -0.756 0.44966
## garagearea 1.229e+01 9.985e+00 1.231 0.21863
## garagecars 9.646e+03 2.947e+03 3.273 0.00109 **
## poolarea -2.268e+01 2.465e+01 -0.920 0.35774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36840 on 1445 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.785
## F-statistic: 381.5 on 14 and 1445 DF, p-value: < 2.2e-16
R-squared value is 0.78. We will remove the variables with high p-value and update the model using backward elimination.
## Residuals against fitted values, with a horizontal reference line at zero.
ggplot(model, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))
## Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model))
qqline(residuals(model))
## Refit on the same data without the six predictors whose p-values were high
## in the full model.
model <- update(
  model,
  . ~ . - mosold - lotfrontage - fullbath - poolarea - yearremodadd -
    garagearea,
  data = houses_cols
)
summary(model)
##
## Call:
## lm(formula = saleprice ~ mssubclass + lotarea + overallcond +
## overallqual + grlivarea + yearbuilt + bedroomabvgr + garagecars,
## data = houses_cols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -438879 -19210 -2140 15409 276785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.045e+06 8.999e+04 -11.615 < 2e-16 ***
## mssubclass -2.206e+02 2.342e+01 -9.418 < 2e-16 ***
## lotarea 6.536e-01 1.021e-01 6.400 2.10e-10 ***
## overallcond 5.686e+03 9.574e+02 5.940 3.57e-09 ***
## overallqual 2.018e+04 1.140e+03 17.713 < 2e-16 ***
## grlivarea 6.994e+01 3.029e+00 23.087 < 2e-16 ***
## yearbuilt 4.981e+02 4.604e+01 10.819 < 2e-16 ***
## bedroomabvgr -1.160e+04 1.466e+03 -7.912 4.99e-15 ***
## garagecars 1.264e+04 1.765e+03 7.164 1.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36830 on 1451 degrees of freedom
## Multiple R-squared: 0.7863, Adjusted R-squared: 0.7851
## F-statistic: 667.3 on 8 and 1451 DF, p-value: < 2.2e-16
## Residuals against fitted values for the reduced model, with a zero line.
ggplot(model, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))
## Normal Q-Q plot of the reduced model's residuals with its reference line.
qqnorm(residuals(model))
qqline(residuals(model))
Updating the model with backward elimination, by removing the variables with high p-values, does not improve the model: R-squared is still 0.78. The residuals do not follow any pattern; they are clustered around the zero line, which supports the fit of the model. A closer look at the Q-Q plot shows approximate normality, with only a few outliers at the ends.