This data set is from Kaggle, specifically the House Prices: Advanced Regression Techniques competition.
It is available here:
https://www.kaggle.com/c/house-prices-advanced-regression-techniques
Load libraries.
library(ggplot2)
library(tidyr)
library(dplyr)
library(gridExtra)
library(MASS)
Read in training data.
Look at shape and first few rows.
train <- read.csv("train.csv",
header=TRUE,
row.names=1,
check.names=FALSE,
stringsAsFactors=FALSE)
dim(train)
## [1] 1460 80
head(train)
## MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 60 RL 65 8450 Pave <NA> Reg
## 2 20 RL 80 9600 Pave <NA> Reg
## 3 60 RL 68 11250 Pave <NA> IR1
## 4 70 RL 60 9550 Pave <NA> IR1
## 5 60 RL 84 14260 Pave <NA> IR1
## 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
Next, use the data description file to convert MSSubClass from numeric codes to descriptive character labels.
train$MSSubClass <- plyr::mapvalues(train$MSSubClass,
from=c(20,30,40,45,50,60,70,75,80,85,90,120,150,160,180,190),
to=c("1-STORY 1946 & NEWER ALL STYLES","1-STORY 1945 & OLDER","1-STORY W/FINISHED ATTIC ALL AGES","1-1/2 STORY - UNFINISHED ALL AGES","1-1/2 STORY FINISHED ALL AGES","2-STORY 1946 & NEWER","2-STORY 1945 & OLDER","2-1/2 STORY ALL AGES","SPLIT OR MULTI-LEVEL","SPLIT FOYER","DUPLEX - ALL STYLES AND AGES","1-STORY PUD - 1946 & NEWER","1-1/2 STORY PUD - ALL AGES","2-STORY PUD - 1946 & NEWER","PUD - MULTILEVEL - INCL SPLIT LEV/FOYER","2 FAMILY CONVERSION - ALL STYLES AND AGES"))
Check type of each variable. How many variables are categorical vs. numeric?
class_per_variable <- sapply(train,class)
table(class_per_variable)
## class_per_variable
## character integer
## 44 36
Check number of unique values per variable for the numeric variables.
numeric_variables_train_data <- train[,class_per_variable == "integer"]
apply(numeric_variables_train_data,2,function(x)length(unique(x)))
## LotFrontage LotArea OverallQual OverallCond YearBuilt
## 111 1073 10 9 112
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 61 328 637 144 780
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
## 721 753 417 24 861
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 4 3 4 3 8
## KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 4 12 4 98 5
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 441 274 202 120 20
## ScreenPorch PoolArea MiscVal MoSold YrSold
## 76 8 21 12 5
## SalePrice
## 663
Some of these have only a few possible values. This makes sense. For example, FullBath counts the number of above-ground full bathrooms.
Let’s also count the number of 0’s per numeric variable.
apply(numeric_variables_train_data,2,function(x)length(which(x == 0)))
## LotFrontage LotArea OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 0 861 467 1293 118
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
## 37 0 829 1434 0
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 856 1378 9 913 6
## KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 1 0 690 0 81
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 81 761 656 1252 1436
## ScreenPorch PoolArea MiscVal MoSold YrSold
## 1344 1453 1408 0 0
## SalePrice
## 0
We find that a number of variables have a large proportion of 0’s. Most of these are expected: for example, if MasVnrType = None, it makes sense that MasVnrArea would equal 0.
Let’s also check number of levels for each of the categorical variables.
categorical_variables_train_data <- train[,class_per_variable == "character"]
apply(categorical_variables_train_data,2,function(x)length(unique(x)))
## MSSubClass MSZoning Street Alley LotShape
## 15 5 2 3 4
## LandContour Utilities LotConfig LandSlope Neighborhood
## 4 2 5 3 25
## Condition1 Condition2 BldgType HouseStyle RoofStyle
## 9 8 5 8 6
## RoofMatl Exterior1st Exterior2nd MasVnrType ExterQual
## 8 15 16 5 4
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 5 6 5 5 5
## BsmtFinType1 BsmtFinType2 Heating HeatingQC CentralAir
## 7 7 6 5 2
## Electrical KitchenQual Functional FireplaceQu GarageType
## 6 4 7 6 7
## GarageFinish GarageQual GarageCond PavedDrive PoolQC
## 4 6 6 3 4
## Fence MiscFeature SaleType SaleCondition
## 5 5 9 6
The max number of levels for a categorical variable is 25 (for Neighborhood).
Per the data description, a number of variables have NAs by design, where NA means the feature is absent (e.g. no alley access, no basement, no fireplace, no garage, no pool, no fence, no miscellaneous feature).
It would be good to look at the number of NAs for these, and to see whether any other variables have NAs.
First, get the table of possible values (including NA) for the variables with known NAs.
vars_with_known_NAs <- c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature","GarageYrBlt")
apply(train[,vars_with_known_NAs],2,function(x)table(x,useNA="ifany"))
## $Alley
## x
## Grvl Pave <NA>
## 50 41 1369
##
## $BsmtQual
## x
## Ex Fa Gd TA <NA>
## 121 35 618 649 37
##
## $BsmtCond
## x
## Fa Gd Po TA <NA>
## 45 65 2 1311 37
##
## $BsmtExposure
## x
## Av Gd Mn No <NA>
## 221 134 114 953 38
##
## $BsmtFinType1
## x
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## 220 148 418 74 133 430 37
##
## $BsmtFinType2
## x
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## 19 33 14 46 54 1256 38
##
## $FireplaceQu
## x
## Ex Fa Gd Po TA <NA>
## 24 33 380 20 313 690
##
## $GarageType
## x
## 2Types Attchd Basment BuiltIn CarPort Detchd <NA>
## 6 870 19 88 9 387 81
##
## $GarageFinish
## x
## Fin RFn Unf <NA>
## 352 422 605 81
##
## $GarageQual
## x
## Ex Fa Gd Po TA <NA>
## 3 48 14 3 1311 81
##
## $GarageCond
## x
## Ex Fa Gd Po TA <NA>
## 2 35 9 7 1326 81
##
## $PoolQC
## x
## Ex Fa Gd <NA>
## 2 2 3 1453
##
## $Fence
## x
## GdPrv GdWo MnPrv MnWw <NA>
## 59 54 157 11 1179
##
## $MiscFeature
## x
## Gar2 Othr Shed TenC <NA>
## 2 2 49 1 1406
##
## $GarageYrBlt
## x
## 1900 1906 1908 1910 1914 1915 1916 1918 1920 1921 1922 1923 1924 1925 1926
## 1 1 1 3 2 2 5 2 14 3 5 3 3 10 6
## 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941
## 1 4 2 8 4 3 1 2 4 5 2 3 9 14 10
## 1942 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
## 2 4 4 2 11 8 24 6 3 12 19 13 16 20 21
## 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
## 17 19 13 21 16 18 21 21 15 26 15 20 13 14 14
## 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## 18 9 29 35 19 15 15 10 4 7 8 10 6 11 14
## 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 10 16 9 13 22 18 18 20 19 31 30 27 20 26 50
## 2004 2005 2006 2007 2008 2009 2010 <NA>
## 53 65 59 49 29 21 3 81
Next, check other variables for NAs.
num_NAs_per_column_minus_vars_with_known_NAs <- apply(train[,setdiff(colnames(train),vars_with_known_NAs)],2,function(x)length(which(is.na(x) == TRUE)))
print("Number of NAs for other variables:")
## [1] "Number of NAs for other variables:"
num_NAs_per_column_minus_vars_with_known_NAs[num_NAs_per_column_minus_vars_with_known_NAs > 0]
## LotFrontage MasVnrType MasVnrArea Electrical
## 259 8 8 1
Examine these variables in more detail.
print("Distribution of values for MasVnrType and Electrical:")
## [1] "Distribution of values for MasVnrType and Electrical:"
apply(train[,c("MasVnrType","Electrical")],2,function(x)table(x,useNA="ifany"))
## $MasVnrType
## x
## BrkCmn BrkFace None Stone <NA>
## 15 445 864 128 8
##
## $Electrical
## x
## FuseA FuseF FuseP Mix SBrkr <NA>
## 94 27 3 1 1334 1
print("Number of observations where both MasVnrType and MasVnrArea are NA:")
## [1] "Number of observations where both MasVnrType and MasVnrArea are NA:"
length(which(is.na(train$MasVnrType) == TRUE & is.na(train$MasVnrArea) == TRUE))
## [1] 8
print("Table of MasVnrArea when MasVnrType == None")
## [1] "Table of MasVnrArea when MasVnrType == None"
table(train$MasVnrArea[train$MasVnrType == "None"])
##
## 0 1 288 312 344
## 859 2 1 1 1
We find that most values for “Alley” are NA. Only 1460 - 1369 = 91 homes, or a bit over 6% of homes, have alley access; of those, it is roughly half gravel and half paved. This variable may be best encoded as a three-level variable (no alley, gravel, paved).
Similar idea for “PoolQC”: it looks like just 7 homes in this data set (1460 - 1453) have a pool.
Most homes have basements. The similar NA counts across the basement-related variables suggest that 37 homes have no basement, plus one home with a basement where BsmtExposure specifically is missing (and likewise one where BsmtFinType2 is missing).
Similar idea for all the garage-related variables: we assume the 81 NAs correspond to 81 homes with no garage.
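As a quick consistency check (a minimal sketch), the zero counts computed earlier line up with these NA counts:
sum(train$TotalBsmtSF == 0) # 37, matching the 37 NAs for BsmtQual/BsmtCond/BsmtFinType1
sum(train$GarageArea == 0) # 81, matching the 81 NAs for the garage variables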
A bit under half of homes have an NA for FireplaceQu, meaning a bit over half of homes have a fireplace. Of those that do, the majority have either a “good” or “average” fireplace. This suggests we may want to convert this variable to a simple binary indicator of whether or not a home has a fireplace.
Finally, (1460 - 1179)/1460, or around 19%, of homes have a fence. Among homes that do have one, a “minimum privacy” fence is the most common type. We can probably combine “MnWw” (minimum wood/wire) into minimum privacy, and “GdWo” (good wood) into “GdPrv” (good privacy), so this variable can be reduced to three levels (no fence, minimal fence, good fence), if not a binary fence/no fence.
Lastly, 54 homes have some kind of miscellaneous feature. Of these, a shed is the most common.
We also find missing values for some other variables (LotFrontage, MasVnrType, MasVnrArea, and Electrical).
LotFrontage is “linear feet of street connected to property”. This seems like an important variable, and it is missing for quite a few homes. We’ll need to figure out a strategy for filling this in.
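One plausible strategy (a sketch only; not applied here) would be to impute each missing LotFrontage with the median LotFrontage of its Neighborhood, since lot dimensions tend to be similar within a neighborhood:
# Sketch: impute missing LotFrontage using the median of the home's Neighborhood.
neighborhood_medians <- tapply(train$LotFrontage, train$Neighborhood, median, na.rm = TRUE)
missing_frontage <- is.na(train$LotFrontage)
imputed_frontage <- train$LotFrontage
imputed_frontage[missing_frontage] <- neighborhood_medians[train$Neighborhood[missing_frontage]]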
For MasVnrType and MasVnrArea, it seems most logical to just assume that type=“None” and area=0 to fill in those 8 NA’s. For Electrical, it seems logical to just assume the NA is equal to “SBrkr”.
Let’s run some transformations to either convert NAs to their own factor level or fill them in, as appropriate.
Also switch some variables with intentional NAs to binary or three-level variables.
These transformations are: a “None” level for Alley, the basement and garage variables, Fence, and MasVnrType; filling the single remaining NA in each of BsmtExposure and BsmtFinType2 with the most common level; Y/N indicators for FireplaceQu, PoolQC, and MiscFeature; collapsing Fence to None/Minimal/Good; median-filling GarageYrBlt; setting missing MasVnrArea to 0; and setting the one missing Electrical value to “SBrkr”.
train$Alley <- ifelse(is.na(train$Alley) == TRUE,"None",train$Alley)
train$BsmtQual <- ifelse(is.na(train$BsmtQual) == TRUE,"None",train$BsmtQual)
for(var in c("BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2"))
{
train[,var] <- ifelse(train$BsmtQual == "None","None",train[,var])
}
train$BsmtExposure[is.na(train$BsmtExposure) == TRUE] <- "No"
train$BsmtFinType2[is.na(train$BsmtFinType2) == TRUE] <- "Unf"
train$FireplaceQu <- ifelse(is.na(train$FireplaceQu) == TRUE,"N","Y")
for(var in c("GarageType","GarageFinish","GarageQual","GarageCond"))
{
train[,var] <- ifelse(is.na(train[,var]) == TRUE,"None",train[,var])
}
train$GarageYrBlt[is.na(train$GarageYrBlt) == TRUE] <- median(train$GarageYrBlt,na.rm=TRUE)
train$PoolQC <- ifelse(is.na(train$PoolQC) == TRUE,"N","Y")
train$Fence <- ifelse(is.na(train$Fence) == TRUE,"None",train$Fence)
train$Fence <- plyr::mapvalues(train$Fence,from=c("MnPrv","MnWw","GdPrv","GdWo"),to=c("Minimal","Minimal","Good","Good"))
train$MiscFeature <- ifelse(is.na(train$MiscFeature) == TRUE,"N","Y")
train$MasVnrType <- ifelse(is.na(train$MasVnrType) == TRUE,"None",train$MasVnrType)
train$MasVnrArea <- ifelse(is.na(train$MasVnrArea) == TRUE,0,train$MasVnrArea)
train$Electrical[is.na(train$Electrical) == TRUE] <- "SBrkr"
Now only NAs left to fill in are for LotFrontage.
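A quick check (sketch) to confirm that LotFrontage is the only remaining variable with NAs:
remaining_NAs <- colSums(is.na(train))
remaining_NAs[remaining_NAs > 0] # should show only LotFrontage (259 NAs)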
Let’s come back to this question later on, after we’ve done some additional exploratory analysis.
Next, make barplots for the categorical variables.
Start with Neighborhood, since it has far more levels than the others.
Then do MSSubClass and Exterior1st, which each have 15-16 levels.
Exterior2nd is nearly always the same as Exterior1st, especially after correcting a few typos (“Wd Shng” to “WdShing”, “CmentBd” to “CemntBd”, “BrkCmn” to “BrkComm”), so we skip this variable.
Other variables we can remove include Condition2 (nearly always either identical to Condition1 or “Norm”) and BsmtFinType2 (nearly always either unfinished, identical to BsmtFinType1, or slightly lower quality than BsmtFinType1).
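A sketch of how one might apply the typo fixes and verify the agreement between Exterior1st and Exterior2nd (the “from” level names here are taken from the text above, so they should be double-checked against the actual data):
# Harmonize Exterior2nd level names with Exterior1st (level names assumed as described above).
exterior2nd_fixed <- plyr::mapvalues(train$Exterior2nd,
from = c("Wd Shng", "CmentBd", "BrkCmn"),
to = c("WdShing", "CemntBd", "BrkComm"))
# Proportion of homes where the two exterior coverings match:
mean(exterior2nd_fixed == train$Exterior1st)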
mycol <- c("#004949","#009292","#FF6DB6","#FFB677","#490092","#006DDB","#B66DFF","#6DB6FF","#B6DBFF","#920000","#924900","#DBD100","#24FF24","#FFFF6D","#000000")
barplot_single_var <- function(var,horizontal=FALSE){
count_per_level <- data.frame(table(train[,var]))
colnames(count_per_level) <- c("Level","Num.homes")
count_per_level$Level <- as.vector(count_per_level$Level)
count_per_level <- count_per_level[order(count_per_level$Num.homes,decreasing=TRUE),]
count_per_level$Level <- factor(count_per_level$Level,levels=count_per_level$Level)
my_barplot <- ggplot(count_per_level,
aes(x=Level,y=Num.homes,fill=Level)) +
geom_bar(stat="identity") +
scale_fill_manual(values=c(mycol,mycol)) +
ylab("Number of homes") +
guides(fill=FALSE) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=Num.homes),vjust=0) +
xlab(var)
if(horizontal == TRUE){my_barplot <- my_barplot + coord_flip()}
return(my_barplot)
}
print(barplot_single_var("Neighborhood"))
print(barplot_single_var("MSSubClass",horizontal=TRUE))
print(barplot_single_var("Exterior1st"))
Next, we find a number of variables that share most or all of the same quality-rating levels (Po, Fa, TA, Gd, Ex, plus “None” after our transformations). These include:
quality_vars <- c("ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","GarageQual","GarageCond")
quality_var_counts <- train[,quality_vars] %>% gather() %>% count(key,value)
quality_var_counts <- data.frame(quality_var_counts)
quality_var_counts$value <- factor(quality_var_counts$value,levels=c("None","Po","Fa","TA","Gd","Ex"))
ggplot(quality_var_counts,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot Street and Alley.
street_vs_alley_counts <- train[,c("Street","Alley")] %>% gather() %>% count(key,value)
street_vs_alley_counts <- data.frame(street_vs_alley_counts)
street_vs_alley_counts$value <- factor(street_vs_alley_counts$value,levels=c("None","Grvl","Pave"))
ggplot(street_vs_alley_counts,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("lightgrey","#E69F00","darkgrey")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot a number of yes/no variables.
yes_no_vars_data <- train[,c("CentralAir","FireplaceQu","PavedDrive","PoolQC","Utilities","MiscFeature")]
colnames(yes_no_vars_data) <- c("CentralAir","Fireplace","PavedDrive","Pool","Utilities.AllPub","MiscFeature")
yes_no_vars_data$Utilities.AllPub[yes_no_vars_data$Utilities.AllPub == "AllPub"] <- "Y"
yes_no_vars_data$Utilities.AllPub[yes_no_vars_data$Utilities.AllPub == "NoSeWa"] <- "N"
yes_no_vars_data$PavedDrive[yes_no_vars_data$PavedDrive == "P"] <- "Partial"
yes_no_vars_data <- yes_no_vars_data %>% gather() %>% count(key,value)
yes_no_vars_data <- data.frame(yes_no_vars_data)
yes_no_vars_data$value <- factor(yes_no_vars_data$value,levels=c("N","Y","Partial"))
ggplot(yes_no_vars_data,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("lightgrey","black","darkgrey")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot remaining assorted other variables.
pairs_of_vars <- data.frame(Var1 = c("MSZoning","Condition1","LotShape","LandContour","BldgType","BsmtExposure","Heating","GarageType","SaleType","RoofStyle","MasVnrType"),
Var2 = c("Fence","Functional","LotConfig","LandSlope","HouseStyle","BsmtFinType1","Electrical","GarageFinish","SaleCondition","RoofMatl","Foundation"),
stringsAsFactors=FALSE)
for(i in 1:nrow(pairs_of_vars))
{
plot1 <- barplot_single_var(pairs_of_vars$Var1[i])
plot2 <- barplot_single_var(pairs_of_vars$Var2[i])
grid.arrange(plot1,plot2,ncol=2)
}
Some variables are numeric but represent counts or years, taking only a handful of discrete values in this data set.
Let’s make barplots of these as well.
barplot_single_var_from_numeric <- function(var,new_levels=unique(train[,var])){
count_per_level <- data.frame(table(train[,var]))
colnames(count_per_level) <- c("Level","Num.homes")
count_per_level$Level <- as.vector(count_per_level$Level)
count_per_level <- count_per_level[order(count_per_level$Num.homes,decreasing=TRUE),]
count_per_level$Level <- factor(count_per_level$Level,levels=new_levels)
my_barplot <- ggplot(count_per_level,
aes(x=Level,y=Num.homes,fill=Level)) +
geom_bar(stat="identity") +
scale_fill_manual(values=c(mycol,mycol)) +
ylab("Number of homes") +
guides(fill=FALSE) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=Num.homes),vjust=0) +
xlab(var)
return(my_barplot)
}
barplot_single_var_from_numeric("KitchenAbvGr",new_levels=0:3)
plot1 <- barplot_single_var_from_numeric("BedroomAbvGr",new_levels=c(0:6,8))
plot2 <- barplot_single_var_from_numeric("TotRmsAbvGrd",new_levels=c(2:12,14))
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("Fireplaces",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("GarageCars",new_levels=0:4)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("MoSold",new_levels=1:12)
plot2 <- barplot_single_var_from_numeric("YrSold",new_levels=2006:2010)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("OverallQual",new_levels=1:10)
plot2 <- barplot_single_var_from_numeric("OverallCond",new_levels=1:9)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("BsmtFullBath",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("BsmtHalfBath",new_levels=0:3)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("FullBath",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("HalfBath",new_levels=0:3)
grid.arrange(plot1,plot2,ncol=2)
Make a histogram for all remaining numeric variables.
One thing we still need to deal with, though, is the large number of zeroes in many variables.
Get the number of zeroes per variable again, excluding the variables we already made barplots for.
numeric_vars <- colnames(train)[class_per_variable == "integer"]
numeric_vars <- setdiff(numeric_vars,c("OverallQual","OverallCond","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageCars","MoSold","YrSold"))
num_zeroes <- apply(train[,numeric_vars],2,function(x)length(which(x == 0)))
num_zeroes[num_zeroes > 0]
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 869 467 1293 118 37
## 2ndFlrSF LowQualFinSF GarageArea WoodDeckSF OpenPorchSF
## 829 1434 81 761 656
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal
## 1252 1436 1344 1453 1408
Let’s remove zeroes for all histograms.
Let’s also remove BsmtFinSF2, since we are not including the corresponding categorical variable BsmtFinType2.
Let’s make histograms for the following variables:
numeric_vars <- setdiff(numeric_vars,"BsmtFinSF2")
numeric_vars
## [1] "LotFrontage" "LotArea" "YearBuilt" "YearRemodAdd"
## [5] "MasVnrArea" "BsmtFinSF1" "BsmtUnfSF" "TotalBsmtSF"
## [9] "1stFlrSF" "2ndFlrSF" "LowQualFinSF" "GrLivArea"
## [13] "GarageYrBlt" "GarageArea" "WoodDeckSF" "OpenPorchSF"
## [17] "EnclosedPorch" "3SsnPorch" "ScreenPorch" "PoolArea"
## [21] "MiscVal" "SalePrice"
par(mfrow=c(2,2))
for(var in numeric_vars[1:20])
{
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
}
par(mfrow=c(1,2))
var = numeric_vars[21]
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
var = numeric_vars[22]
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
I checked and the two very high-value items for MiscVal are both “Gar2”. Surprisingly, the tennis court is only valued at $2,000, comparable to some of the fancier sheds.
Anyway, we see that many variables are skewed to varying degrees, with right skew especially common. This makes sense for measurements of area.
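To quantify this, we could compute a rough sample skewness for each variable (a sketch; zeroes and NAs dropped, matching the histograms above):
# Rough sample skewness: third central moment over sd cubed (zeroes/NAs excluded).
sample_skewness <- function(x){
x <- x[!is.na(x) & x > 0]
mean((x - mean(x))^3) / sd(x)^3
}
sort(sapply(train[, numeric_vars], sample_skewness), decreasing = TRUE)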
Get summary stats for values > 0 for all of the variables we just made histograms for.
for(var in numeric_vars)
{
print(var)
print(summary(train[train[,var] > 0,var]))
}
## [1] "LotFrontage"
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 59.00 69.00 70.05 80.00 313.00 259
## [1] "LotArea"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
## [1] "YearBuilt"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1872 1954 1973 1971 2000 2010
## [1] "YearRemodAdd"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1950 1967 1994 1985 2004 2010
## [1] "MasVnrArea"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 113.0 203.0 254.7 330.5 1600.0
## [1] "BsmtFinSF1"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 371.0 604.0 652.3 867.0 5644.0
## [1] "BsmtUnfSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.0 288.0 536.0 617.1 843.2 2336.0
## [1] "TotalBsmtSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 105.0 810.5 1004.0 1084.9 1309.5 6110.0
## [1] "1stFlrSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 882 1087 1163 1391 4692
## [1] "2ndFlrSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 110.0 625.0 776.0 802.9 926.5 2065.0
## [1] "LowQualFinSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 53.0 168.2 377.5 328.2 477.5 572.0
## [1] "GrLivArea"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
## [1] "GarageYrBlt"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1900 1962 1980 1979 2001 2010
## [1] "GarageArea"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 160.0 380.0 484.0 500.8 580.0 1418.0
## [1] "WoodDeckSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.0 120.0 171.0 196.8 240.0 857.0
## [1] "OpenPorchSF"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 39.00 63.00 84.73 112.00 547.00
## [1] "EnclosedPorch"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.0 104.2 144.5 154.1 205.0 552.0
## [1] "3SsnPorch"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.0 150.8 180.0 207.4 239.8 508.0
## [1] "ScreenPorch"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.0 143.8 180.0 189.6 224.0 480.0
## [1] "PoolArea"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 480.0 515.5 555.0 575.4 612.0 738.0
## [1] "MiscVal"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 54.0 437.5 500.0 1221.0 887.5 15500.0
## [1] "SalePrice"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
For now, I will run correlations between variables mainly in the context of the Data 605 final, which asks:
“Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?”
Three continuous numeric variables that seem like they would be correlated with SalePrice are LotArea, 1stFlrSF, and GrLivArea.
Let’s make scatterplots.
sale_price_vs_vars_of_interest <- train[,c("LotArea","1stFlrSF","GrLivArea")] %>% gather()
sale_price_vs_vars_of_interest <- data.frame(sale_price_vs_vars_of_interest,SalePrice = rep(train$SalePrice,times=3))
ggplot(sale_price_vs_vars_of_interest,
aes(x=value,y=SalePrice)) +
facet_wrap(~key,switch="x",scales="free_x") +
xlab("") +
geom_point()
1stFlrSF and GrLivArea are both definitely correlated with SalePrice.
LotArea appears to be as well, but it is a bit hard to tell because of the outliers. Replot only for homes with LotArea < 20,000.
ggplot(train[train$LotArea < 20000,],
aes(LotArea,SalePrice)) +
geom_point() +
ggtitle("Homes with LotArea < 20,000 only")
It actually looks like LotArea is less correlated with SalePrice than expected, at least within the range where most homes fall and without applying any transformations to LotArea. But there does appear to be some level of correlation, even if fairly weak.
Let’s check the correlations within the same three independent variables we just plotted.
Start with pairs scatterplots.
pairs(train[,c("LotArea","1stFlrSF","GrLivArea")])
pairs(train[train$LotArea < 20000,c("LotArea","1stFlrSF","GrLivArea")],main="Homes with LotArea < 20,000 only")
print("Does 1stFlrSF + 2ndFlrSF + LowQualFinSF always = GrLivArea in this dataset?")
## [1] "Does 1stFlrSF + 2ndFlrSF + LowQualFinSF always = GrLivArea in this dataset?"
length(which((train[,"1stFlrSF"] + train[,"2ndFlrSF"] + train[,"LowQualFinSF"]) == train$GrLivArea)) == nrow(train)
## [1] TRUE
For many homes, GrLivArea and 1stFlrSF are identical.
Even where they differ, they still look quite correlated.
This is because GrLivArea (“Above grade (ground) living area square feet”) is simply the sum of 1stFlrSF, 2ndFlrSF, and LowQualFinSF.
The two are identical for homes with no second floor or low-quality finished area. Even among homes that do have a second floor, first- and second-floor sizes are obviously highly correlated, so GrLivArea remains highly correlated with 1stFlrSF.
As for LotArea, it does appear somewhat correlated with both 1stFlrSF and GrLivArea, but the correlation does not appear to be very strong.
Get the correlation coefficient matrix.
cor(train[,c("LotArea","1stFlrSF","GrLivArea")])
## LotArea 1stFlrSF GrLivArea
## LotArea 1.0000000 0.2994746 0.2631162
## 1stFlrSF 0.2994746 1.0000000 0.5660240
## GrLivArea 0.2631162 0.5660240 1.0000000
Run hypothesis testing for the 3 pairs of correlations.
print("LotArea vs. 1stFlrSF")
## [1] "LotArea vs. 1stFlrSF"
cor.test(train[,"LotArea"],train[,"1stFlrSF"],conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: train[, "LotArea"] and train[, "1stFlrSF"]
## t = 11.985, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2686127 0.3297222
## sample estimates:
## cor
## 0.2994746
print("LotArea vs. GrLivArea:")
## [1] "LotArea vs. GrLivArea:"
cor.test(train[,"LotArea"],train[,"GrLivArea"],conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: train[, "LotArea"] and train[, "GrLivArea"]
## t = 10.414, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2315997 0.2940809
## sample estimates:
## cor
## 0.2631162
print("1stFlrSF vs. GrLivArea:")
## [1] "1stFlrSF vs. GrLivArea:"
cor.test(train[,"1stFlrSF"],train[,"GrLivArea"],conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: train[, "1stFlrSF"] and train[, "GrLivArea"]
## t = 26.217, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5427732 0.5884078
## sample estimates:
## cor
## 0.566024
print("p-value LotArea vs. 1stFlrSF")
## [1] "p-value LotArea vs. 1stFlrSF"
cor.test(train[,"LotArea"],train[,"1stFlrSF"],conf.level=0.80)$p.value
## [1] 1.234238e-31
print("p-value LotArea vs. GrLivArea:")
## [1] "p-value LotArea vs. GrLivArea:"
cor.test(train[,"LotArea"],train[,"GrLivArea"],conf.level=0.80)$p.value
## [1] 1.520481e-24
print("p-value 1stFlrSF vs. GrLivArea:")
## [1] "p-value 1stFlrSF vs. GrLivArea:"
cor.test(train[,"1stFlrSF"],train[,"GrLivArea"],conf.level=0.80)$p.value
## [1] 1.936813e-124
print("Min p-value for FDR 0.1% plus Bonferonni correction after pairwise correlations of 79 features:")
## [1] "Min p-value for FDR 0.1% plus Bonferonni correction after pairwise correlations of 79 features:"
.001/ncol(combn(1:79,2))
## [1] 3.245699e-07
A hypothesis test run on these 3 pairs of correlations suggests it is very unlikely that any of them is equal to 0. The p-value is the probability of observing a sample correlation at least this extreme if the true correlation were 0, and the p-values here are all extremely low.
The 80% confidence interval means that if we repeatedly collected new samples of 1460 observations from a theoretically infinite population of home sales and computed an interval each time, about 80% of those intervals would contain the true correlation coefficient.
For LotArea and 1stFlrSF, the 80% confidence interval is 0.269 to 0.330. For LotArea and GrLivArea, it is 0.232 to 0.294. For 1stFlrSF and GrLivArea, it is 0.543 to 0.588.
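These intervals can be reproduced by hand with the Fisher z-transformation, which is what cor.test uses: transform r with atanh, build a normal interval with standard error 1/sqrt(n - 3), and back-transform with tanh. A sketch for LotArea vs. 1stFlrSF:
r <- 0.2994746 # correlation of LotArea and 1stFlrSF from above
n <- 1460
z <- atanh(r) # Fisher z-transform of r
se <- 1 / sqrt(n - 3) # approximate standard error on the z scale
ci_z <- z + c(-1, 1) * qnorm(0.90) * se # 80% interval on the z scale
tanh(ci_z) # back-transform: ~ (0.269, 0.330), matching cor.test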
The familywise error rate is the probability of making at least one type I error (declaring something significant when it actually isn’t) across a family of tests. For this analysis, a type I error would mean falsely concluding that a correlation is non-zero when it is actually zero. Based on the extremely low p-values, I am not very concerned about a type I error here.
It is true that we ran multiple tests, which increases the familywise error rate. However, this can be addressed with a Bonferroni correction, which divides the significance cutoff by the number of tests (e.g., use p = 0.05/3 for 3 tests at an overall 5% level). Even with a very strict 0.1% cutoff spread across all pairwise comparisons in a 79-feature dataset (3081 tests), the corrected cutoff (about 3.2e-7) would still be far above the p-values from our tests.
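Equivalently, we could Bonferroni-adjust the three p-values directly with p.adjust (a sketch using the values computed above):
pvals <- c(1.234238e-31, 1.520481e-24, 1.936813e-124) # from the cor.test calls above
p.adjust(pvals, method = "bonferroni") # each multiplied by 3; all remain essentially 0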
Also from the Data 605 final:
“Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.”
correlation_matrix <- cor(train[,c("LotArea","1stFlrSF","GrLivArea")])
print("Correlation matrix:")
## [1] "Correlation matrix:"
correlation_matrix
## LotArea 1stFlrSF GrLivArea
## LotArea 1.0000000 0.2994746 0.2631162
## 1stFlrSF 0.2994746 1.0000000 0.5660240
## GrLivArea 0.2631162 0.5660240 1.0000000
precision_matrix <- solve(correlation_matrix)
print("Precision matrix:")
## [1] "Precision matrix:"
precision_matrix
## LotArea 1stFlrSF GrLivArea
## LotArea 1.1143027 -0.2468334 -0.1534774
## 1stFlrSF -0.2468334 1.5260943 -0.7988601
## GrLivArea -0.1534774 -0.7988601 1.4925563
correlation_times_precision = correlation_matrix %*% precision_matrix
correlation_times_precision[abs(correlation_times_precision) < 2.2e-16] = 0
print("Correlation x precision:")
## [1] "Correlation x precision:"
correlation_times_precision
## LotArea 1stFlrSF GrLivArea
## LotArea 1 0 0
## 1stFlrSF 0 1 0
## GrLivArea 0 0 1
precision_times_correlation = precision_matrix %*% correlation_matrix
precision_times_correlation[abs(precision_times_correlation) < 2.2e-16] = 0
print("Precision x correlation:")
## [1] "Precision x correlation:"
precision_times_correlation
## LotArea 1stFlrSF GrLivArea
## LotArea 1 0 0
## 1stFlrSF 0 1 0
## GrLivArea 0 0 1
Either order of matrix multiplication results in the identity matrix.
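A programmatic check (sketch) that both products equal the identity up to floating-point error:
all.equal(correlation_times_precision, diag(3), check.attributes = FALSE) # TRUE
all.equal(precision_times_correlation, diag(3), check.attributes = FALSE) # TRUE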
Run LU decomposition on the correlation matrix.
factorize_matrix <- function(input_matrix){
if(nrow(input_matrix) < 2 | ncol(input_matrix) < 2){print("Require at least 2 rows and columns. Exiting.");return(NA)}
if(nrow(input_matrix) != ncol(input_matrix)){print("Not a square matrix. Exiting.");return(NA)}
combinations <- combn(rev(1:nrow(input_matrix)),2)
combinations <- combinations[,rev(1:ncol(combinations))]
if(nrow(input_matrix) == 2){combinations <- data.frame(column = combinations)} #Object combinations must be a data frame, not a vector. It turns into a vector for 2x2, so we need to fix.
lower_triangular_matrix <- matrix(NA,nrow=nrow(input_matrix),ncol=nrow(input_matrix)) #Set up blank lower triangular matrix.
upper_triangular_matrix <- input_matrix
for(i in 1:ncol(combinations))
{
row_to_zero_out <- combinations[1,i]
row_to_use_to_calculate_coefficient <- combinations[2,i]
column_to_zero_out <- row_to_use_to_calculate_coefficient #Same variable to have two names for clarity in later part of this function.
coefficient_to_zero <- -1 * (upper_triangular_matrix[row_to_zero_out,column_to_zero_out]/upper_triangular_matrix[row_to_use_to_calculate_coefficient,column_to_zero_out])
upper_triangular_matrix[row_to_zero_out,] <- upper_triangular_matrix[row_to_zero_out,] +
coefficient_to_zero*upper_triangular_matrix[row_to_use_to_calculate_coefficient,]
lower_triangular_matrix[row_to_zero_out,column_to_zero_out] <- -1*coefficient_to_zero
}
for(i in 1:nrow(input_matrix))
{
lower_triangular_matrix[i,i] <- 1
if(i == ncol(input_matrix)){break}
for(j in seq(from=(i + 1),to=ncol(input_matrix),by=1)){
lower_triangular_matrix[i,j] <- 0
}
}
return(list(L = lower_triangular_matrix,U = upper_triangular_matrix))
}
print("Correlation matrix:")
## [1] "Correlation matrix:"
correlation_matrix
## LotArea 1stFlrSF GrLivArea
## LotArea 1.0000000 0.2994746 0.2631162
## 1stFlrSF 0.2994746 1.0000000 0.5660240
## GrLivArea 0.2631162 0.5660240 1.0000000
LU_decomp = factorize_matrix(correlation_matrix)
print("LU decomposition:")
## [1] "LU decomposition:"
LU_decomp
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.2994746 1.0000000 0
## [3,] 0.2631162 0.5352294 1
##
## $U
## LotArea 1stFlrSF GrLivArea
## LotArea 1 0.2994746 0.2631162
## 1stFlrSF 0 0.9103150 0.4872274
## GrLivArea 0 0.0000000 0.6699915
print("Lower from LU decomposition x Upper from LU decomposition")
## [1] "Lower from LU decomposition x Upper from LU decomposition"
LU_decomp$L %*% LU_decomp$U
## LotArea 1stFlrSF GrLivArea
## [1,] 1.0000000 0.2994746 0.2631162
## [2,] 0.2994746 1.0000000 0.5660240
## [3,] 0.2631162 0.5660240 1.0000000
print("inverse(Upper from LU decomposition) x inverse(Lower from LU decomposition)")
## [1] "inverse(Upper from LU decomposition) x inverse(Lower from LU decomposition)"
solve(LU_decomp$U) %*% solve(LU_decomp$L)
## [,1] [,2] [,3]
## LotArea 1.1143027 -0.2468334 -0.1534774
## 1stFlrSF -0.2468334 1.5260943 -0.7988601
## GrLivArea -0.1534774 -0.7988601 1.4925563
print("Precision matrix:")
## [1] "Precision matrix:"
precision_matrix
## LotArea 1stFlrSF GrLivArea
## LotArea 1.1143027 -0.2468334 -0.1534774
## 1stFlrSF -0.2468334 1.5260943 -0.7988601
## GrLivArea -0.1534774 -0.7988601 1.4925563
We get the LU decomposition of the correlation matrix.
We also confirm two properties of the LU decomposition: multiplying L by U recovers the original correlation matrix, and multiplying inverse(U) by inverse(L) recovers the inverse of the correlation matrix (the precision matrix).
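One practical use of the LU factors is solving linear systems by forward and back substitution rather than inverting the matrix; a sketch using base R’s forwardsolve and backsolve:
# Solve correlation_matrix %*% x = b using the LU factors:
b <- c(1, 0, 0)
y <- forwardsolve(LU_decomp$L, b) # solve L y = b (lower triangular)
x <- backsolve(LU_decomp$U, y) # solve U x = y (upper triangular)
x # should equal solve(correlation_matrix, b), i.e. the first column of the precision matrix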
From the Data 605 final:
“Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lambda)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.”
From our histograms, LotArea looks very right-skewed. Let’s use that.
fitted <- fitdistr(train$LotArea,densfun="exponential")
lambda <- fitted$estimate
print("Rate (lambda) of fitted exponential distribution:")
## [1] "Rate (lambda) of fitted exponential distribution:"
lambda
## rate
## 9.50857e-05
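For the exponential distribution, the maximum-likelihood estimate of the rate is simply the reciprocal of the sample mean, so we can sanity-check fitdistr’s estimate:
1 / mean(train$LotArea) # 1/10516.83 = 9.50857e-05, matching the fitted rate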
set.seed(1392)
samples_fitted <- rexp(1000,lambda)
par(mfrow=c(1,2))
hist(train$LotArea,xlab="LotArea",ylab="Num observations",main="Original",labels=TRUE)
hist(samples_fitted,xlab="LotArea",ylab="Num observations",main="Fitted")
hist(train$LotArea[train$LotArea < 60000],xlab="LotArea",ylab="Num observations",main="Original\nfiltered to < 60k",breaks=seq(from=0,to=60000,by=5000))
hist(samples_fitted,xlab="LotArea",ylab="Num observations",main="Fitted",breaks=seq(from=0,to=60000,by=5000))
The shapes are actually not entirely dissimilar, though we do find some major differences.
print("5th percentile fitted exponential distribution:")
## [1] "5th percentile fitted exponential distribution:"
qexp(p=.05,rate=lambda)
## [1] 539.4428
print("95th percentile fitted exponential distribution:")
## [1] "95th percentile fitted exponential distribution:"
qexp(p=.95,rate=lambda)
## [1] 31505.6
The 5th percentile of the fitted exponential distribution is 539. The 95th percentile of the fitted exponential distribution is 31506.
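These quantiles follow directly from the exponential CDF, F(x) = 1 - exp(-lambda * x), whose inverse is q(p) = -log(1 - p)/lambda; a quick check:
-log(1 - 0.05) / lambda # ~ 539.4, matches qexp(p = .05) above
-log(1 - 0.95) / lambda # ~ 31505.6, matches qexp(p = .95) above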
First, get percentiles from a normal distribution with same mean and sd as the actual data.
print("Mean actual data")
## [1] "Mean actual data"
mean(train$LotArea)
## [1] 10516.83
print("Sd actual data")
## [1] "Sd actual data"
sd(train$LotArea)
## [1] 9981.265
print("5th percentile normal dist with same mean and sd:")
## [1] "5th percentile normal dist with same mean and sd:"
qnorm(p=.05,mean=mean(train$LotArea),sd=sd(train$LotArea))
## [1] -5900.892
print("95th percentile normal dist with same mean and sd:")
## [1] "95th percentile normal dist with same mean and sd:"
qnorm(p=.95,mean=mean(train$LotArea),sd=sd(train$LotArea))
## [1] 26934.55
The 5th percentile of a normal distribution with same mean and sd as the actual data is equal to -5901. The 95th percentile is 26935.
Compare to the percentiles from the actual data.
print("5th percentile actual data:")
## [1] "5th percentile actual data:"
quantile(train$LotArea,prob=.05,type=2)
## 5%
## 3273
print("95th percentile actual data:")
## [1] "95th percentile actual data:"
quantile(train$LotArea,prob=.95,type=2)
## 95%
## 17411.5
print("73rd and 74th LotArea values after order:")
## [1] "73rd and 74th LotArea values after order:"
train$LotArea[order(train$LotArea)[73:74]]
## [1] 3230 3316
print("1387th and 1388th LotArea values after order:")
## [1] "1387th and 1388th LotArea values after order:"
train$LotArea[order(train$LotArea)[1387:1388]]
## [1] 17400 17423
The 5th and 95th percentile of the actual data are 3273 and 17411.5 (mean of the 73rd and 74th values and 1387th and 1388th values respectively).
The 5th percentile in the actual data is larger than the one based on a normal distribution with the same mean and sd. This makes sense because the normal distribution, unlike the actual data, does not exclude negative values.
The 95th percentile in the actual data is smaller than the normal-based one, because the mean and standard deviation of the actual data are inflated by a few very large outliers.
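Indeed, a normal distribution with this mean and sd would place a substantial share of its mass below zero, which is impossible for a lot area; a quick check:
pnorm(0, mean = mean(train$LotArea), sd = sd(train$LotArea)) # ~ 0.146, i.e. ~15% of mass below zero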
What does this all mean for our analysis?
For one, any model that assumes this variable is normally distributed is clearly not going to work well here.
If we did want to fit a closed form distribution to the data, an exponential distribution (while not perfect) might actually not be a bad choice.
Let’s map the data onto the fitted exponential distribution and compare the fitted values to the actual values.
For each observation, get the corresponding value in the exponential distribution from its rank: for an observation ranked number 25 out of 1460 (rank 1 = lowest), the fitted value is qexp(25/1460, rate = lambda).
The last observation is a bit tricky, since qexp(1460/1460) = qexp(1) is infinite. Let’s use 1459.9/1460 as the last probability.
actual_values <- train$LotArea[order(train$LotArea)]
fitted_values <- qexp(c(1:1459,1459.9)/1460,rate=lambda)
plot(actual_values,fitted_values,xlab="Actual",ylab="Fitted")
abline(0,1,lty=2)
As expected based on the histograms, we find that the fitted values are lower than the actual values when the actual values are very low or very high, and the fitted values are higher when the actual values are toward the middle of the range.
This is not necessarily a bad thing. The whole point is to reduce the right skew a bit, since heavy skew can make it hard to fit a good model.
How does the correlation of the actual values with SalePrice compare to the correlation of the fitted values?
print("Correlation actual LotArea vs SalePrice:")
## [1] "Correlation actual LotArea vs SalePrice:"
cor(train$LotArea,train$SalePrice)
## [1] 0.2638434
print("Correlation LotArea fitted to an exponential distribution vs. SalePrice:")
## [1] "Correlation LotArea fitted to an exponential distribution vs. SalePrice:"
cor(fitted_values,train$SalePrice[order(train$LotArea)])
## [1] 0.426399
par(mfrow=c(1,2))
plot(train$LotArea,train$SalePrice,xlab="LotArea",ylab="SalePrice",main="Actual data")
abline(lm(train$SalePrice ~ train$LotArea))
plot(fitted_values,train$SalePrice[order(train$LotArea)],xlab="LotArea",ylab="SalePrice",main="LotArea fitted to exp")
abline(lm(train$SalePrice[order(train$LotArea)] ~ fitted_values))
plot(train$LotArea[train$LotArea < 20000],train$SalePrice[train$LotArea < 20000],xlab="LotArea",ylab="SalePrice",main="Actual data\nActual LotArea < 20k")
abline(lm(train$SalePrice[train$LotArea < 20000] ~ train$LotArea[train$LotArea < 20000]))
plot(fitted_values[1:length(which(train$LotArea < 20000))],train$SalePrice[order(train$LotArea)[1:length(which(train$LotArea < 20000))]],xlab="LotArea",ylab="SalePrice",main="LotArea fitted to exp\nActual LotArea < 20k")
abline(lm(train$SalePrice[order(train$LotArea)[1:length(which(train$LotArea < 20000))]] ~ fitted_values[1:length(which(train$LotArea < 20000))]))
The correlation with SalePrice is definitely improved by fitting to an exponential distribution! This seems to be because the fitted distribution is less skewed by the extreme outliers we see in the actual data.
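Since the fitted values are a monotone function of the ranks of LotArea, this improved correlation is closely related to (though not identical to) a rank-based measure; for comparison, one could compute Spearman’s correlation:
cor(train$LotArea, train$SalePrice, method = "spearman") # rank-based, so robust to the outliers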
From the Data 605 final:
“Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.”
This section of the final will be contained within a different Rmarkdown document and Rpubs.