suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(kableExtra)))
suppressWarnings(suppressMessages(library(GGally)))
suppressWarnings(suppressMessages(library(gridExtra)))
suppressWarnings(suppressMessages(library(MASS)))
suppressWarnings(suppressMessages(library(car)))
Question:
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.
train_full_df <- read.csv('./data/train.csv')
train_full_df[train_full_df == "NA"] <- NA
Answer:
For this question, I picked first floor area as the \(X\) value and sale price as the \(Y\) variable. The \(X\) variable is right skewed, as requested.
summary(train_full_df[c("X1stFlrSF", "SalePrice")])
## X1stFlrSF SalePrice
## Min. : 334 Min. : 34900
## 1st Qu.: 882 1st Qu.:129975
## Median :1087 Median :163000
## Mean :1163 Mean :180921
## 3rd Qu.:1391 3rd Qu.:214000
## Max. :4692 Max. :755000
probability_df <- train_full_df %>%
filter(!(is.na(X1stFlrSF))) %>%
dplyr::select(X1stFlrSF,
SalePrice)
ggplot(probability_df,
aes(x=X1stFlrSF)) +
geom_histogram(bins = 40, aes(y = ..density..)) +
stat_function(fun=dnorm,
color="steelblue",
args=list(mean=mean(probability_df$X1stFlrSF),
sd=sd(probability_df$X1stFlrSF))) +
labs(x="First Floor Square Feet", y="Density") +
ggtitle("Probability Question Plot") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
qqnorm(probability_df$X1stFlrSF)
qqline(probability_df$X1stFlrSF)
Question:
Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
Answer:
This is the probability that a sample’s first floor Area is greater than the 1st quartile of the total sample first floor area GIVEN that a sample’s sale price is greater than the 1st quartile of the total sample price. This can be calculated by simply applying the conditional probability formula:
\[P(X>x | Y>y) = \frac{P(X>x \cap Y>y)}{P(Y>y)}\]
p_Y_gt_y <- nrow(probability_df[probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],]) / nrow(probability_df)
p_X_gt_x_and_Y_gt_y <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],]) / nrow(probability_df)
probability_a <- (p_X_gt_x_and_Y_gt_y) / p_Y_gt_y
The probability of this event is 82.74%.
Question:
Answer:
This is the probability that a sample’s first floor Area is greater than the 1st quartile of the total sample first floor area AND that a sample’s sale price is greater than the 1st quartile of the total sample price.
probability_b <- p_X_gt_x_and_Y_gt_y
The probability of this event is 62.05%.
Question:
Answer:
This is the probability that a sample’s first floor Area is less than the 1st quartile of the total sample first floor area GIVEN that a sample’s sale price is greater than the 1st quartile of the total sample price.
p_Y_gt_y <- nrow(probability_df[probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],])
p_X_lt_x_and_Y_gt_y <- nrow(probability_df[probability_df$X1stFlrSF < quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],])
probability_c <- p_X_lt_x_and_Y_gt_y / p_Y_gt_y
The probability of this event is 17.08%.
Question:
In addition, make a table of counts as shown below.
probability_table_df <- data.frame(
"x_y" = c("le_1st_Quartile", "gt_1st_Quartile", "Total"),
"le_1st_Quartile" = c(NA, NA, NA),
"gt_1st_Quartile" = c(NA, NA, NA),
"Total" = c(NA, NA, NA)
)
probability_table_df %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(2, background = "lightgrey")
| x_y | le_1st_Quartile | gt_1st_Quartile | Total |
|---|---|---|---|
| le_1st_Quartile | NA | NA | NA |
| gt_1st_Quartile | NA | NA | NA |
| Total | NA | NA | NA |
Answer:
The requested table is shown below:
probability_table_df[probability_table_df$x_y == "le_1st_Quartile","le_1st_Quartile"] <- nrow(probability_df[probability_df$X1stFlrSF <= quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice <= quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "le_1st_Quartile","gt_1st_Quartile"] <- nrow(probability_df[probability_df$X1stFlrSF <= quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "le_1st_Quartile","Total"] <- nrow(probability_df[probability_df$X1stFlrSF <= quantile(probability_df$X1stFlrSF)[[2]],])
probability_table_df[probability_table_df$x_y == "gt_1st_Quartile","le_1st_Quartile"] <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice <= quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "gt_1st_Quartile","gt_1st_Quartile"] <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "gt_1st_Quartile","Total"] <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]],])
probability_table_df[probability_table_df$x_y == "Total","le_1st_Quartile"] <- nrow(probability_df[probability_df$SalePrice <= quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "Total","gt_1st_Quartile"] <- nrow(probability_df[probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],])
probability_table_df[probability_table_df$x_y == "Total","Total"] <- nrow(probability_df)
probability_table_df %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(2, background = "lightgrey")
| x_y | le_1st_Quartile | gt_1st_Quartile | Total |
|---|---|---|---|
| le_1st_Quartile | 179 | 189 | 368 |
| gt_1st_Quartile | 186 | 906 | 1092 |
| Total | 365 | 1095 | 1460 |
Question:
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
Answer:
p_AB <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]] & probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],]) / nrow(probability_df)
p_A <- nrow(probability_df[probability_df$X1stFlrSF > quantile(probability_df$X1stFlrSF)[[2]],]) / nrow(probability_df)
p_B <- nrow(probability_df[probability_df$SalePrice > quantile(probability_df$SalePrice)[[2]],]) / nrow(probability_df)
p_A_p_B <- p_A * p_B
The probablity of that both events are true, \(P(AB)\) is 62.05%. The probability that A is true, \(P(A)\), is 74.79%. The probability that B is true, \(P(B)\), is 75%. Therefore, the multiplying the two independent probabilities (\(P(A) * P(B)\)) returns 56.1%, which shows us that splitting the training data in this fashion does not make them independent.
Now, we can check the original question with a chi-square test:
chisq.test(probability_table_df[1:2,c("le_1st_Quartile", "gt_1st_Quartile")])
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: probability_table_df[1:2, c("le_1st_Quartile", "gt_1st_Quartile")]
## X-squared = 144.98, df = 1, p-value < 2.2e-16
The chi-square test returns a p-value < 0.05; therefore, we can reject the null hypothesis that the two variables are independent.
Question:
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.
Answer:
First, we need to do a little formatting and some adjustments for the future lm() fit. Note, some levels within the atributes of unordered factors are so rarely used that I have combined them to make a larger “other” group so that errors do not occur due to levels being missing upon the lm() fit.
format_function <- function(data_df) {
data_df <- data_df %>%
mutate(Alley = ifelse(is.na(as.character(Alley)), "NA", as.character(Alley)),
MasVnrType = ifelse(is.na(as.character(MasVnrType)), "NA", as.character(MasVnrType)),
Electrical = ifelse(is.na(as.character(Electrical)), "NA", as.character(Electrical)),
GarageType = ifelse(is.na(as.character(GarageType)), "NA", as.character(GarageType)),
MiscFeature = ifelse(is.na(as.character(MiscFeature)), "NA", as.character(MiscFeature))) %>%
mutate(MSSubClass = as.factor(MSSubClass),
MSZoning = as.factor(MSZoning),
Alley = as.factor(Alley),
MasVnrType = as.factor(MasVnrType),
BsmtQual = as.factor(BsmtQual),
BsmtCond = as.factor(BsmtCond),
BsmtExposure = as.factor(BsmtExposure),
Electrical = as.factor(Electrical),
GarageType = as.factor(GarageType),
MiscFeature = as.factor(MiscFeature),
OverallQual = factor(OverallQual, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), ordered = TRUE),
OverallCond = factor(OverallCond, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), ordered = TRUE),
ExterQual = factor(ExterQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
ExterCond = factor(ExterCond, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
BsmtQual = factor(BsmtQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
BsmtCond = factor(BsmtCond, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
BsmtExposure = factor(BsmtExposure, levels = c("No", "Mn", "Av", "Gd"), ordered = TRUE),
BsmtFinType1 = factor(BsmtFinType1, levels = c("Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"), ordered = TRUE),
BsmtFinType2 = factor(BsmtFinType1, levels = c("Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"), ordered = TRUE),
HeatingQC = factor(HeatingQC, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
CentralAir = factor(CentralAir, levels = c("N", "Y"), ordered = TRUE),
KitchenQual = factor(KitchenQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
Functional = factor(Functional, levels = c("Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"), ordered = TRUE),
FireplaceQu = factor(FireplaceQu, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
GarageFinish = factor(GarageFinish, levels = c("Unf", "Rfn", "Fin"), ordered = TRUE),
GarageQual = factor(GarageQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
GarageCond = factor(GarageCond, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
PavedDrive = factor(PavedDrive, levels = c("N", "P", "Y"), ordered = TRUE),
PoolQC = factor(PoolQC, levels = c("N", "P", "Y"), ordered = TRUE),
Fence = factor(Fence, levels = c("MnWw", "GdWo", "MnPrv", "GdPrv"), ordered = TRUE)
) %>%
#Some categories are used so infrequently they need to be either grouped together(ordered fators) or NA needs to be an option (unordered factors)
mutate(OverallQual = ifelse(OverallQual==1, 2, OverallQual)) %>%
mutate(OverallQual = factor(OverallQual, levels = c(2,3,4,5,6,7,8,9,10), ordered = TRUE)) %>%
mutate(OverallCond = ifelse(OverallCond==1, 2, OverallCond)) %>%
mutate(OverallCond = factor(OverallCond, levels = c(2,3,4,5,6,7,8,9,10), ordered = TRUE)) %>%
mutate(ExterCond = ifelse(as.character(ExterCond)=="Po", "Fa", as.character(ExterCond))) %>%
mutate(ExterCond = factor(ExterCond, levels = c("Fa", "TA", "Gd", "Ex"), ordered = TRUE)) %>%
mutate(Functional = ifelse(as.character(Functional)=="Sal" | as.character(Functional)=="Sev", "Maj2", as.character(Functional))) %>%
mutate(MSZoning = ifelse(as.character(MSZoning) %in% c("C (all)", "FV", "RH", "RM"), "Other", as.character(MSZoning)),
MSZoning = as.factor(MSZoning)) %>%
mutate(Functional = factor(Functional, levels = c("Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"), ordered = TRUE)) %>%
mutate(RoofStyle = ifelse(as.character(RoofStyle) %in% c("Flat", "Gambrel", "Mansard", "Shed"), "Other", as.character(RoofStyle))) %>%
mutate(RoofStyle = as.factor(RoofStyle)) %>%
mutate(Condition1 = ifelse(as.character(Condition1) %in% c("Artery", "PosA", "PosN", "RRAe", "RRAn", "RRNe", "RRNn"), "Other", as.character(Condition1))) %>%
mutate(Condition1 = as.factor(Condition1)) %>%
mutate(RoofMatl = ifelse(as.character(RoofMatl)!="CompShg", "Other", as.character(RoofMatl))) %>%
mutate(RoofMatl = as.factor(RoofMatl)) %>%
mutate(Exterior1st = ifelse(as.character(Exterior1st) %in% c("AsphShn", "AsbShng", "BrkComm", "BrkFace", "CBlock", "CemntBd", "ImStucc", "Stone", "Stucco", "WdShing"), "Other", as.character(Exterior1st))) %>%
mutate(Exterior1st = as.factor(Exterior1st)) %>%
mutate(Exterior2nd = ifelse(as.character(Exterior2nd) %in% c("AsbShng", "AsphShn", "Brk Cmn", "BrkFace", "CBlock", "ImStucc", "Other", "Stone", "Stucco", "Wd Shng"), "Other", as.character(Exterior2nd))) %>%
mutate(Exterior2nd = as.factor(Exterior2nd)) %>%
mutate(Foundation = ifelse(as.character(Foundation) %in% c("Slab", "Stone", "Wood"), "Other", as.character(Foundation))) %>%
mutate(Foundation = as.factor(Foundation)) %>%
mutate(Heating = ifelse(as.character(Heating) %in% c("Floor", "Wall", "Grav", "OthW"), "Other", as.character(Heating))) %>%
#training discrepancies
mutate(Heating = as.factor(Heating)) %>%
mutate(HeatingQC = ifelse(as.character(HeatingQC)=="Po", "Fa", as.character(HeatingQC))) %>%
mutate(HeatingQC = factor(HeatingQC, levels = c("Fa", "TA", "Gd", "Ex"), ordered = TRUE)) %>%
mutate(MSSubClass = ifelse(as.character(MSSubClass) %in% c("150", "180", "40", "45", "75", "85"), "Other", as.character(MSSubClass)),
MSSubClass = factor(MSSubClass)) %>%
mutate(MasVnrType = ifelse(as.character(MasVnrType) == "NA" | as.character(MasVnrType) == "BrkCmn" | as.character(MasVnrType) == "None" | as.character(MasVnrType) == "Stone", "Other", as.character(MasVnrType)),
MasVnrType = as.factor(MasVnrType)) %>%
mutate(SaleType = ifelse(as.character(SaleType) %in% c("Con", "ConLD", "ConLI", "ConLw", "CWD", "Oth"), "Other", as.character(SaleType)),
SaleType = as.factor(SaleType)) %>%
mutate(GarageType = ifelse(as.character(GarageType) %in% c("2Types", "Basment", "CarPort", "NA"), "Other", as.character(GarageType)),
GarageType = as.factor(GarageType))
return(data_df)
}
After that, we can show the summary statistics for each attribute:
summary(format_function(train_full_df) %>% dplyr::select(-Id))
## MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities
## 20 :536 Other: 309 Min. : 21.00 Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 60 :299 RL :1151 1st Qu.: 59.00 1st Qu.: 7554 Pave:1454 NA :1369 IR2: 41 HLS: 50 NoSeWa: 1
## 50 :144 Median : 69.00 Median : 9478 Pave: 41 IR3: 10 Low: 36
## 120 : 87 Mean : 70.05 Mean : 10517 Reg:925 Lvl:1311
## 30 : 69 3rd Qu.: 80.00 3rd Qu.: 11602
## 160 : 63 Max. :313.00 Max. :215245
## (Other):262 NA's :259
## LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond
## Corner : 263 Gtl:1382 NAmes :225 Feedr: 81 Norm :1445 1Fam :1220 1Story :726 5 :397 5 :821
## CulDSac: 94 Mod: 65 CollgCr:150 Norm :1260 Feedr : 6 2fmCon: 31 2Story :445 6 :374 6 :252
## FR2 : 47 Sev: 13 OldTown:113 Other: 119 Artery : 2 Duplex: 52 1.5Fin :154 7 :319 7 :205
## FR3 : 4 Edwards:100 PosN : 2 Twnhs : 43 SLvl : 65 8 :168 8 : 72
## Inside :1052 Somerst: 86 RRNn : 2 TwnhsE: 114 SFoyer : 37 4 :116 4 : 57
## Gilbert: 79 PosA : 1 1.5Unf : 14 9 : 43 3 : 25
## (Other):707 (Other): 2 (Other): 19 (Other): 43 (Other): 28
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## Min. :1872 Min. :1950 Gable:1141 CompShg:1434 HdBoard:222 CmentBd: 60 BrkFace: 445 Min. : 0.0 Po: 0
## 1st Qu.:1954 1st Qu.:1967 Hip : 286 Other : 26 MetalSd:220 HdBoard:207 Other :1015 1st Qu.: 0.0 Fa: 14
## Median :1973 Median :1994 Other: 33 Other :189 MetalSd:214 Median : 0.0 TA:906
## Mean :1971 Mean :1985 Plywood:108 Other :136 Mean : 103.7 Gd:488
## 3rd Qu.:2000 3rd Qu.:2004 VinylSd:515 Plywood:142 3rd Qu.: 166.0 Ex: 52
## Max. :2010 Max. :2010 Wd Sdng:206 VinylSd:504 Max. :1600.0
## Wd Sdng:197 NA's :8
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Fa: 29 BrkTil:146 Po : 0 Po : 2 No :953 Unf :430 Min. : 0.0 Unf :430 Min. : 0.00
## TA:1282 CBlock:634 Fa : 35 Fa : 45 Mn :114 LwQ : 74 1st Qu.: 0.0 LwQ : 74 1st Qu.: 0.00
## Gd: 146 Other : 33 TA :649 TA :1311 Av :221 Rec :133 Median : 383.5 Rec :133 Median : 0.00
## Ex: 3 PConc :647 Gd :618 Gd : 65 Gd :134 BLQ :148 Mean : 443.6 BLQ :148 Mean : 46.55
## Ex :121 Ex : 0 NA's: 38 ALQ :220 3rd Qu.: 712.2 ALQ :220 3rd Qu.: 0.00
## NA's: 37 NA's: 37 GLQ :418 Max. :5644.0 GLQ :418 Max. :1474.00
## NA's: 37 NA's: 37
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 0.0 GasA :1428 Fa: 50 N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 223.0 1st Qu.: 795.8 GasW : 18 TA:428 Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 477.5 Median : 991.5 Other: 14 Gd:241 FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mean : 567.2 Mean :1057.4 Ex:741 Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.: 808.0 3rd Qu.:1298.2 NA : 1 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :2336.0 Max. :6110.0 SBrkr:1334 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000 Po: 0
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000 Median :3.000 Median :1.000 TA:735
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047 Gd:586
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 Ex:100
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## Min. : 2.000 Maj2: 6 Min. :0.000 Po : 20 Attchd :870 Min. :1900 Unf :605 Min. :0.000
## 1st Qu.: 5.000 Maj1: 14 1st Qu.:0.000 Fa : 33 BuiltIn: 88 1st Qu.:1961 Rfn : 0 1st Qu.:1.000
## Median : 6.000 Mod : 15 Median :1.000 TA :313 Detchd :387 Median :1980 Fin :352 Median :2.000
## Mean : 6.518 Min2: 34 Mean :0.613 Gd :380 Other :115 Mean :1979 NA's:503 Mean :1.767
## 3rd Qu.: 7.000 Min1: 31 3rd Qu.:1.000 Ex : 24 3rd Qu.:2002 3rd Qu.:2.000
## Max. :14.000 Typ :1360 Max. :3.000 NA's:690 Max. :2010 Max. :4.000
## NA's :81
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.0 Po : 3 Po : 7 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 480.0 TA :1311 TA :1326 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 473.0 Gd : 14 Gd : 9 Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.: 576.0 Ex : 3 Ex : 2 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## Min. : 0.00 Min. : 0.000 N : 0 MnWw : 11 Gar2: 2 Min. : 0.00 Min. : 1.000 Min. :2006
## 1st Qu.: 0.00 1st Qu.: 0.000 P : 0 GdWo : 54 NA :1406 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007
## Median : 0.00 Median : 0.000 Y : 0 MnPrv: 157 Othr: 2 Median : 0.00 Median : 6.000 Median :2008
## Mean : 15.06 Mean : 2.759 NA's:1460 GdPrv: 59 Shed: 49 Mean : 43.49 Mean : 6.322 Mean :2008
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 TenC: 1 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000 Max. :2010
##
## SaleType SaleCondition SalePrice
## COD : 43 Abnorml: 101 Min. : 34900
## New : 122 AdjLand: 4 1st Qu.:129975
## Other: 28 Alloca : 12 Median :163000
## WD :1267 Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
Specifically, for our case (First Floor Area and Sale Price), we can look a little deeper:
two_vals_df <- format_function(train_full_df)[,c("X1stFlrSF", "SalePrice")] %>% gather(category, val, 1:2)
p1 <- ggplot(two_vals_df[two_vals_df$category=="X1stFlrSF",], aes(x=category, y=val)) + geom_boxplot()
p2 <- ggplot(two_vals_df[two_vals_df$category=="SalePrice",], aes(x=category, y=val)) + geom_boxplot()
p3 <- ggplot(two_vals_df[two_vals_df$category=="X1stFlrSF",], aes(x=val)) + geom_histogram(bins = 30) + labs(x="X11stFlrSF")
p4 <- ggplot(two_vals_df[two_vals_df$category=="SalePrice",], aes(x=val)) + geom_histogram(bins = 30) + labs(x="SalePrice")
grid.arrange(p1, p2, p3, p4, ncol=2)
Question:
Provide a scatterplot of X and Y.
Answer:
ggplot(train_full_df, aes(x=X1stFlrSF, y=SalePrice)) +
geom_point() +
labs(x="First Floor Square Feet", y="Sale Price") +
ggtitle("Correlation Scatter Plot") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
Question:
Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval.
Answer:
Here we can first do a Correlation test for Lot Area, Year Built, and 1st Floor Area. Note, I did not take any columns with NA values so those rows do not need to be sorted out. Then we can do the pairwise t-test for each variable, as requested.
correlation_matrix <- round(cor(train_full_df[,c("LotArea", "YearBuilt", "X1stFlrSF")]), 4)
(correlation_matrix)
## LotArea YearBuilt X1stFlrSF
## LotArea 1.0000 0.0142 0.2995
## YearBuilt 0.0142 1.0000 0.2820
## X1stFlrSF 0.2995 0.2820 1.0000
cor.test(train_full_df$LotArea, train_full_df$YearBuilt, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train_full_df$LotArea and train_full_df$YearBuilt
## t = 0.54332, df = 1458, p-value = 0.587
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## -0.03162553 0.06002107
## sample estimates:
## cor
## 0.01422765
cor.test(train_full_df$LotArea, train_full_df$X1stFlrSF, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train_full_df$LotArea and train_full_df$X1stFlrSF
## t = 11.985, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2571719 0.3406317
## sample estimates:
## cor
## 0.2994746
cor.test(train_full_df$YearBuilt, train_full_df$X1stFlrSF, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train_full_df$YearBuilt and train_full_df$X1stFlrSF
## t = 11.223, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.2392453 0.3236357
## sample estimates:
## cor
## 0.2819859
Question:
Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Answer:
As can be seen above, the t-test correlation results are the same values as the cell entries in the correlation matrix. This is shows how usefual a correlation matrix is, since it can provide a large number of results without doing the individual t-tests.
To summarize our results, we failed to reject the null hypothesis for the following:
This means that we accept the null hypothesis of there being no correlation between the two variables.
For the following, we were able to reject the null hypothesis:
This means that we reject the null hypothesis of there being no correlation between the two variables.
Pertaining to the worriedness about familywise error, I would be very cautious about Type I errors (false positives) if I were using this method to determine correlation. As in our example above, we accepted that there was some correlation for two variables that had values below 0.3. While this does show some correlation, I’d be suspicious on saying tha this was definitely the case, and that there could be other underlying factors influencing the results just enough to get a null rejection. For Type II errors, I think i’d be less concerned since it seems like if the variable is not even moderately correlated (|value| > 0.5), then we would not be missing out on much by excluding it.
Question:
Linear Algebra and Correlation. Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
Answer:
Matrix Multiplication:
Here, we can make the precision matrix fairly easily. As mentioned, the variance is along the diagonal of the matrix and all other values are the covariance, or unstandardized correlations.
(precision_matrix <- solve(correlation_matrix))
## LotArea YearBuilt X1stFlrSF
## LotArea 1.1050494 0.0843473 -0.3547482
## YearBuilt 0.0843473 1.0928326 -0.3334408
## X1stFlrSF -0.3547482 -0.3334408 1.2002774
We can multiple these matrices (precistion and correlation), in which case we see that multiplying matrices by their inverses results in the identity matrix being returned.
(round(correlation_matrix %*% precision_matrix, 3))
## LotArea YearBuilt X1stFlrSF
## LotArea 1 0 0
## YearBuilt 0 1 0
## X1stFlrSF 0 0 1
(round(precision_matrix %*% correlation_matrix, 3))
## LotArea YearBuilt X1stFlrSF
## LotArea 1 0 0
## YearBuilt 0 1 0
## X1stFlrSF 0 0 1
LU Decomposition:
Earlier this year, I wrote a rather robust function that does this operation for me. I figured it would be worth including here and testing to see if it actually works. Also, I most likely would have just re-traced these steps to get the appropriate solution.
lower_matrix <- function(A) {
mtx <- A
Imtx <- diag(dim(A)[1])
mtxs <- list()
i = 1
fixrow = 0
adj_mtx <- mtx
for (col in 1:as.integer(dim(mtx)[2])) {
for (row in 1:as.integer(dim(mtx)[1])) {
if(row==col & mtx[row, col]==0) {
for (subrow in (row+1):as.integer(dim(mtx)[1])) {
if(subrow>dim(mtx)[1]) {stop("A more complex matrix solver is necessary")}
if(mtx[subrow, col] != 0) {
tmprow = mtx[row,]
mtx[row,] = mtx[subrow,]
mtx[subrow,] = tmprow
fixrow = 1
adj_mtx <- mtx
}
}
if (fixrow==0) {stop("A more complex matrix solver is necessary")}
}
if(row>col) {
Imtxtmp <- Imtx
Imtxtmp[row, col] <- -1 * mtx[row, col] / mtx[col,col]
mtxs[[i]] <- Imtxtmp
mtx <- Imtxtmp %*% mtx
i = i + 1
}
}
}
L <- solve(mtxs[[1]])
for (j in 2:(i-1)) {
solve(mtxs[[j]])
L <- L %*% solve(mtxs[[j]])
}
return(list("A"=adj_mtx, "U"=mtx, "L"=L))
}
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(correlation_matrix)
## LotArea YearBuilt X1stFlrSF
## LotArea 1.0000 0.0142 0.2995
## YearBuilt 0.0142 1.0000 0.2820
## X1stFlrSF 0.2995 0.2820 1.0000
returnlist <- lower_matrix(correlation_matrix)
print("L Matrix")
## [1] "L Matrix"
L <- print(returnlist$L)
## [,1] [,2] [,3]
## [1,] 1.0000 0.0000000 0
## [2,] 0.0142 1.0000000 0
## [3,] 0.2995 0.2778031 1
print("U Matrix")
## [1] "U Matrix"
U <- print(returnlist$U)
## LotArea YearBuilt X1stFlrSF
## [1,] 1 0.0142000 0.2995000
## [2,] 0 0.9997984 0.2777471
## [3,] 0 0.0000000 0.8331407
print("Adjusted A Matrix")
## [1] "Adjusted A Matrix"
A_adj <- print(returnlist$A)
## LotArea YearBuilt X1stFlrSF
## LotArea 1.0000 0.0142 0.2995
## YearBuilt 0.0142 1.0000 0.2820
## X1stFlrSF 0.2995 0.2820 1.0000
print("Result of LU, which should be equal to A_adj")
## [1] "Result of LU, which should be equal to A_adj"
Question:
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
Answer:
For our data, the minimum is already above zero:
min(train_full_df$X1stFlrSF)
## [1] 334
The result of the fit distribution is as follows:
(distribution <- fitdistr(train_full_df$X1stFlrSF, "exponential"))
## rate
## 0.0008601213
## (0.0000225104)
Question:
Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable.
Answer:
The exponential distribution function is the following:
\[f(x) = \lambda{e}^{-\lambda x}\]
From the fitdistr() function above, we have an estimated value of \(\lambda\) = 8.60121310^{-4}. However, if the request is to optimize \(\lambda\), then we can use some properties of the exponential function to get the optimal value. For instance, the mean value of an exponential distribution is \(\mu = \frac{1}{\lambda}\); therefore, we can simply take the known mean and calculate \(\lambda\) as \(\lambda\) = \(\frac{1}{\mu}\) = \(\frac{1}{1162.627}\) = \(0.0008601213\), which is the same result as the fitdistr estimate and verrrrry interesting. Is this how fitdistr() works for this estimate? My guess is that this finding is not an accident and that the line of questioning was intentionally guding me towards the result.
Now, let’s take some samples and plot the histograms together. Note, I go on to find a more optimal distribution fit as part of the next question.
exponential_samples = rexp(1000, distribution$estimate[[1]])
x_hist_df <- rbind(data.frame(val = train_full_df$X1stFlrSF, source = "data"), data.frame(val = exponential_samples, source = "samples"))
ggplot(x_hist_df, aes(x=val, fill=source)) +
geom_histogram(bins = 40, aes(y=..density..), alpha = 0.5, position="identity") +
labs(x="First Floor Square Feet", y="Density") +
ggtitle("Histogram Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
Question:
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Answer:
#Confidence interval assuming normality
range <- qnorm(0.975) * sd(train_full_df$X1stFlrSF) / sqrt(nrow(probability_df))
percentile_confidence_5th <- mean(train_full_df$X1stFlrSF) - range
percentile_confidence_95th <- mean(train_full_df$X1stFlrSF) + range
#5th and 95th percentile from CDF
percentile_sample_5th <- qexp(0.05, distribution$estimate[[1]])
percentile_sample_95th <- qexp(0.95, distribution$estimate[[1]])
#empirical 5th and 95th percentile
percentile_empirical_list <- quantile(train_full_df$X1stFlrSF, c(0.05, 0.95))
Using the cumulative distribution function, the 95% confidence interval is [59.63, 3482.92]. Assuming normality, and using the empirical data, the 95% confidence interval is [1142.8, 1182.46]. Using the empirical data alone, the 95% confidence interval is [672.95, 1831.25]
To start, the shape of the data distribution does not appear to be similar to an exponential distribution. This type of distribution only decreases as x increases; however, we can see in the empirical data that there are a few small values of x, which then increase rapidly, and then slowly decay. The data is most certainly not normal, so the 95% confidence interval using that assumption obviously does not return meaningful results. Additionally, from these summary statistics we can see tha the actual data has a much tighter interval (1158) than the CDF (3423), so the shape does not appear to be consistent with an exponential distribution. Given that the data is continuous, right skewed, and may have an atypical shape, I believe a gamma distribution is likely to be a better fit. Let’s give it a try:
(gamma_distribution <- fitdistr(train_full_df$X1stFlrSF, "gamma"))
## shape rate
## 9.9800685138 0.0085840706
## (0.3352008324) (0.0002936689)
gamma_samples = rgamma(1000, gamma_distribution$estimate[[1]], gamma_distribution$estimate[[2]])
x_gamma_hist_df <- rbind(data.frame(val = train_full_df$X1stFlrSF, source = "data"), data.frame(val = gamma_samples, source = "samples"))
ggplot(x_gamma_hist_df, aes(x=val, fill=source)) +
geom_histogram(bins = 40, aes(y=..density..), alpha = 0.5, position="identity") +
labs(x="First Floor Square Feet", y="Density") +
ggtitle("Histogram Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
percentile_sample_gamma_5th <- qgamma(0.05, gamma_distribution$estimate[[1]], gamma_distribution$estimate[[2]])
percentile_sample_gamma_95th <- qgamma(0.95, gamma_distribution$estimate[[1]], gamma_distribution$estimate[[2]])
Well, that is a much better fit. Let’s check it out. The 95% confidence interval using a gamma distribution function is [630.32, 1826.64], which is much closer to our empirical data than all other previously used distributions.
Question:
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
Answer:
First, let’s hold back some data for future testing.
set.seed(979876234)
train_full_format_df <- format_function(train_full_df) %>%
dplyr::select(-Id) %>%
mutate(SalePriceLog = log(SalePrice))
sample_size <- floor(0.9 * nrow(train_full_format_df))
train_dev_ind <- sample(seq_len(nrow(train_full_format_df)), size = sample_size)
train_df <-train_full_format_df[train_dev_ind,]
dev_df <- train_full_format_df[-train_dev_ind,]
Okay, now let’s take a look at sale price all on its own and see what kind of shape we are dealing with. Here we can see that the response variable is approximately normal if we use a log() transform, so we know that this data could benefit from some type of transformation. For now, let’s just use the log() function.
p1 <- ggplot(train_df, aes(x=SalePrice)) +
geom_histogram(bins = 30, fill = "grey", color = "black", aes(y=..density..)) +
stat_function(fun=dnorm, color="steelblue", args=list(mean=mean(train_df$SalePrice), sd=sd(train_df$SalePrice))) +
labs(x="Sale Price", y="Density") +
ggtitle("Sale Price") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
p2 <- ggplot(train_df, aes(x=log(SalePrice))) +
geom_histogram(bins = 30, fill = "grey", color = "black", aes(y=..density..)) +
stat_function(fun=dnorm, color="steelblue", args=list(mean=mean(log(train_df$SalePrice)), sd=sd(log(train_df$SalePrice)))) +
labs(x="Sale Price (Log)", y="Density") +
ggtitle("Sale Price Log Transform") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
grid.arrange(p1, p2, ncol=2)
qqnorm(log(train_df$SalePrice))
qqline(log(train_df$SalePrice))
Now, let’s look at each attribute. Here I have plotted the outputs for each variable against SalePrice using the ggpairs() function and a little editing. Note, I will do a powerTransform() on some of the numeric data later on. I specifically only applied it to attributes based on square footage, where I added a very small value (0.01) whenever the first input was zero. Additionally note, as mentioned before, I have combined some levels of categorical attributes due to very rare use.
train_numeric_list <- unlist(lapply(train_df, is.numeric))
train_numeric_df <- train_df[, train_numeric_list]
cor(train_numeric_df, train_df$SalePrice)
train_df <- train_df %>%
mutate(SalePriceLog = log(SalePrice))
ggp <- ggpairs(train_full_format_df, cardinality_threshold = 26)
plot_cor <- lapply(seq(1, ncol(train_full_format_df)), function(x) {getPlot(ggp, x, ncol(train_full_format_df))})
plot_dist <- lapply(seq(1, ncol(train_full_format_df)), function(x) {getPlot(ggp, x, x)})
plot_plot <- lapply(seq(1, ncol(train_full_format_df)), function(x) {getPlot(ggp, ncol(train_full_format_df), x)})
plot_list <- lapply(seq(1, ncol(train_full_format_df)), function(x) {list(plot_cor[x], plot_dist[x], plot_plot[x])})
plot_list_formatted <- unlist(unlist(plot_list, recursive = FALSE), recursive = FALSE)
do.call(grid.arrange, c(plot_list_formatted, ncol = 3))
So, we have looked at the preliminary data. Let’s start a regression model by including all pertinent data points using the log() transofrmation of the sale price. By pertinent, I mean meaningful attributes. I have removed some attributes that were categorical and either mostly consisted of NA values or only consisted of one level; both of which are not very helpful. for the numerical data, I only included those categories that had a decent (|value| > 0.2) correlation coefficient.
#remove Alley, LandContour, Street, Utilities, PoolQC, Fence, FireplaceQu, GarageFinish, Condition2, SaleType, SaleTypeCondition, MiscFeature, PavedDrive, GarageCond, GarageQual, Functional, LandSlope, RoofMatl, BldgType
#remove BsmtFinSF2, LowQualFinSF, BsmtHalfBath, KitchenAbvGr, EnclosedPorch, X3SsnPorch, ScreenPorch, PoolArea, MiscFeature, MiscVal, MoSold, YrSold,
train_lm_df <- train_df %>%
dplyr::select(MSSubClass, MSZoning, LotFrontage, LotArea, LotShape, LotConfig, Neighborhood, Condition1, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, X1stFlrSF, X2ndFlrSF, GrLivArea, BsmtFullBath, FullBath, HalfBath, BedroomAbvGr, KitchenQual, Fireplaces, GarageType, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, SaleType, SaleCondition, SalePrice)
summary(train_lm_df)
## MSSubClass MSZoning LotFrontage LotArea LotShape LotConfig Neighborhood Condition1 HouseStyle
## 20 :485 Other: 284 Min. : 21.00 Min. : 1300 IR1:436 Corner :240 NAmes :202 Feedr: 76 1Story :651
## 60 :270 RL :1030 1st Qu.: 59.00 1st Qu.: 7509 IR2: 37 CulDSac: 82 CollgCr:133 Norm :1128 2Story :401
## 50 :128 Median : 70.00 Median : 9485 IR3: 8 FR2 : 43 OldTown:103 Other: 110 1.5Fin :138
## 120 : 81 Mean : 70.09 Mean : 10592 Reg:833 FR3 : 3 Edwards: 86 SLvl : 60
## 160 : 59 3rd Qu.: 80.00 3rd Qu.: 11576 Inside :946 Somerst: 83 SFoyer : 31
## 30 : 59 Max. :313.00 Max. :215245 NridgHt: 72 1.5Unf : 14
## (Other):232 NA's :233 (Other):635 (Other): 19
## OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle Exterior1st Exterior2nd MasVnrType
## 5 :358 5 :744 Min. :1872 Min. :1950 Gable:1027 HdBoard:200 CmentBd: 52 BrkFace:409
## 6 :334 6 :230 1st Qu.:1954 1st Qu.:1967 Hip : 258 MetalSd:207 HdBoard:184 Other :905
## 7 :279 7 :191 Median :1973 Median :1994 Other: 29 Other :163 MetalSd:202
## 8 :161 8 : 57 Mean :1972 Mean :1985 Plywood: 96 Other :116
## 4 :104 4 : 49 3rd Qu.:2001 3rd Qu.:2004 VinylSd:468 Plywood:130
## 9 : 38 3 : 21 Max. :2010 Max. :2010 Wd Sdng:180 VinylSd:459
## (Other): 40 (Other): 22 Wd Sdng:171
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Min. : 0.0 Po: 0 Fa: 25 BrkTil:131 Po : 0 Po : 2 No :857 Unf :393 Min. : 0.0 Unf :393
## 1st Qu.: 0.0 Fa: 14 TA:1157 CBlock:567 Fa : 31 Fa : 38 Mn :102 LwQ : 67 1st Qu.: 0.0 LwQ : 67
## Median : 0.0 TA:804 Gd: 129 Other : 27 TA :580 TA :1184 Av :201 Rec :114 Median : 378.0 Rec :114
## Mean : 106.2 Gd:449 Ex: 3 PConc :589 Gd :560 Gd : 59 Gd :122 BLQ :136 Mean : 437.3 BLQ :136
## 3rd Qu.: 169.0 Ex: 47 Ex :112 Ex : 0 NA's: 32 ALQ :198 3rd Qu.: 698.8 ALQ :198
## Max. :1600.0 NA's: 31 NA's: 31 GLQ :375 Max. :5644.0 GLQ :375
## NA's :7 NA's: 31 NA's: 31
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF GrLivArea
## Min. : 0.0 Min. : 0.0 GasA :1287 Fa: 48 N: 80 FuseA: 86 Min. : 334 Min. : 0.0 Min. : 334
## 1st Qu.: 234.2 1st Qu.: 796.5 GasW : 13 TA:378 Y:1234 FuseF: 18 1st Qu.: 884 1st Qu.: 0.0 1st Qu.:1128
## Median : 487.5 Median : 993.5 Other: 14 Gd:219 FuseP: 3 Median :1087 Median : 0.0 Median :1468
## Mean : 575.8 Mean :1060.9 Ex:669 Mix : 1 Mean :1163 Mean : 350.8 Mean :1519
## 3rd Qu.: 815.8 3rd Qu.:1297.8 NA : 1 3rd Qu.:1388 3rd Qu.: 728.0 3rd Qu.:1776
## Max. :2336.0 Max. :6110.0 SBrkr:1205 Max. :4692 Max. :2065.0 Max. :5642
##
## BsmtFullBath FullBath HalfBath BedroomAbvGr KitchenQual Fireplaces GarageType GarageYrBlt
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000 Po: 0 Min. :0.0000 Attchd :785 Min. :1900
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 Fa: 33 1st Qu.:0.0000 BuiltIn: 82 1st Qu.:1962
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000 TA:664 Median :1.0000 Detchd :342 Median :1980
## Mean :0.4231 Mean :1.568 Mean :0.3843 Mean :2.862 Gd:527 Mean :0.6157 Other :105 Mean :1979
## 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 Ex: 90 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :3.0000 Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.0000 Max. :2010
## NA's :71
## GarageCars GarageArea WoodDeckSF OpenPorchSF SaleType SaleCondition SalePrice
## Min. :0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00 COD : 40 Abnorml: 90 Min. : 34900
## 1st Qu.:1.000 1st Qu.: 336.0 1st Qu.: 0.00 1st Qu.: 0.00 New : 113 AdjLand: 4 1st Qu.:130000
## Median :2.000 Median : 480.0 Median : 0.00 Median : 25.00 Other: 27 Alloca : 10 Median :163250
## Mean :1.779 Mean : 477.3 Mean : 96.04 Mean : 47.44 WD :1134 Family : 17 Mean :181645
## 3rd Qu.:2.000 3rd Qu.: 576.0 3rd Qu.:168.00 3rd Qu.: 68.00 Normal :1077 3rd Qu.:214375
## Max. :4.000 Max. :1418.0 Max. :857.00 Max. :547.00 Partial: 116 Max. :755000
##
train_lm_df <- train_lm_df[complete.cases(train_lm_df),]
train_lm_df$MasVnrArea <- sapply(train_lm_df$MasVnrArea, function(x) {as.numeric(x) + 0.01})
train_lm_df$BsmtFinSF1 <- sapply(train_lm_df$BsmtFinSF1, function(x) {as.numeric(x) + 0.01})
train_lm_df$BsmtUnfSF <- sapply(train_lm_df$BsmtUnfSF, function(x) {as.numeric(x) + 0.01})
train_lm_df$X1stFlrSF <- sapply(train_lm_df$X1stFlrSF, function(x) {as.numeric(x) + 0.01})
train_lm_df$X2ndFlrSF <- sapply(train_lm_df$X2ndFlrSF, function(x) {as.numeric(x) + 0.01})
train_lm_df$GrLivArea <- sapply(train_lm_df$GrLivArea, function(x) {as.numeric(x) + 0.01})
train_lm_df$GarageArea <- sapply(train_lm_df$GarageArea, function(x) {as.numeric(x) + 0.01})
train_lm_df$WoodDeckSF <- sapply(train_lm_df$WoodDeckSF, function(x) {as.numeric(x) + 0.01})
train_lm_df$OpenPorchSF <- sapply(train_lm_df$OpenPorchSF, function(x) {as.numeric(x) + 0.01})
Additionally, I have used the powerTransform() function to transform the numeric data. Note, I only transform the continuous numeric attributes, and apply a small value to them so that the first entry is not zero.
pt_lm <- powerTransform(cbind(train_lm_df$SalePrice, train_lm_df$LotFrontage, train_lm_df$LotArea, train_lm_df$MasVnrArea, train_lm_df$BsmtFinSF1, train_lm_df$BsmtUnfSF, train_lm_df$TotalBsmtSF, train_lm_df$X1stFlrSF, train_lm_df$X2ndFlrSF, train_lm_df$GrLivArea, train_lm_df$GarageArea, train_lm_df$WoodDeckSF, train_lm_df$OpenPorchSF))
pt_lm
## Estimated transformation parameters
## Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y8 Y9 Y10
## -0.02262616 0.14767653 -0.04975022 -0.09459272 0.25886898 0.66928626 0.34772316 0.18136347 -0.01084807 0.10660509
## Y11 Y12 Y13
## 0.33835784 -0.01139760 0.07045970
lm_sale_price <- lm(data = train_lm_df, formula = I(SalePrice^pt_lm$lambda[[1]]) ~ MSSubClass + MSZoning + I(LotFrontage^pt_lm$lambda[[2]])+ I(LotArea^pt_lm$lambda[[3]]) + LotShape + LotConfig + Neighborhood + Condition1 + HouseStyle + OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle + Exterior1st + Exterior2nd + MasVnrType + I(MasVnrArea^pt_lm$lambda[[4]]) + ExterQual + ExterCond + Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + I(BsmtFinSF1^pt_lm$lambda[[5]]) + BsmtFinType2 + I(BsmtUnfSF^pt_lm$lambda[[6]]) + I(TotalBsmtSF^pt_lm$lambda[[7]]) + Heating + HeatingQC + CentralAir + Electrical + I(X1stFlrSF^pt_lm$lambda[[8]]) + I(X2ndFlrSF^pt_lm$lambda[[9]]) + I(GrLivArea^pt_lm$lambda[[10]]) + BsmtFullBath + FullBath + HalfBath + BedroomAbvGr + KitchenQual + Fireplaces + GarageType + GarageYrBlt + GarageCars + I(GarageArea^pt_lm$lambda[[11]]) + I(WoodDeckSF^pt_lm$lambda[[12]]) + I(OpenPorchSF^pt_lm$lambda[[13]]))
summary(lm_sale_price)
##
## Call:
## lm(formula = I(SalePrice^pt_lm$lambda[[1]]) ~ MSSubClass + MSZoning +
## I(LotFrontage^pt_lm$lambda[[2]]) + I(LotArea^pt_lm$lambda[[3]]) +
## LotShape + LotConfig + Neighborhood + Condition1 + HouseStyle +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle +
## Exterior1st + Exterior2nd + MasVnrType + I(MasVnrArea^pt_lm$lambda[[4]]) +
## ExterQual + ExterCond + Foundation + BsmtQual + BsmtCond +
## BsmtExposure + BsmtFinType1 + I(BsmtFinSF1^pt_lm$lambda[[5]]) +
## BsmtFinType2 + I(BsmtUnfSF^pt_lm$lambda[[6]]) + I(TotalBsmtSF^pt_lm$lambda[[7]]) +
## Heating + HeatingQC + CentralAir + Electrical + I(X1stFlrSF^pt_lm$lambda[[8]]) +
## I(X2ndFlrSF^pt_lm$lambda[[9]]) + I(GrLivArea^pt_lm$lambda[[10]]) +
## BsmtFullBath + FullBath + HalfBath + BedroomAbvGr + KitchenQual +
## Fireplaces + GarageType + GarageYrBlt + GarageCars + I(GarageArea^pt_lm$lambda[[11]]) +
## I(WoodDeckSF^pt_lm$lambda[[12]]) + I(OpenPorchSF^pt_lm$lambda[[13]]),
## data = train_lm_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0060915 -0.0009369 -0.0000754 0.0008640 0.0153213
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.708e-01 2.511e-02 34.672 < 2e-16 ***
## MSSubClass160 1.455e-03 9.752e-04 1.492 0.136125
## MSSubClass190 1.250e-03 9.543e-04 1.309 0.190740
## MSSubClass20 -7.145e-04 4.368e-04 -1.636 0.102307
## MSSubClass30 -4.147e-04 6.685e-04 -0.620 0.535156
## MSSubClass50 1.172e-03 1.175e-03 0.997 0.319158
## MSSubClass60 1.656e-04 9.244e-04 0.179 0.857839
## MSSubClass70 8.589e-05 9.858e-04 0.087 0.930588
## MSSubClass80 1.123e-03 1.271e-03 0.883 0.377346
## MSSubClass90 1.379e-03 8.082e-04 1.706 0.088298 .
## MSSubClassOther -1.399e-06 8.808e-04 -0.002 0.998733
## MSZoningRL -5.551e-05 3.278e-04 -0.169 0.865567
## I(LotFrontage^pt_lm$lambda[[2]]) 5.768e-04 1.457e-03 0.396 0.692385
## I(LotArea^pt_lm$lambda[[3]]) 4.169e-02 1.014e-02 4.110 4.34e-05 ***
## LotShapeIR2 -9.981e-04 4.942e-04 -2.019 0.043752 *
## LotShapeIR3 4.633e-03 1.110e-03 4.174 3.30e-05 ***
## LotShapeReg -2.840e-04 1.810e-04 -1.569 0.117070
## LotConfigCulDSac -4.887e-04 4.680e-04 -1.044 0.296619
## LotConfigFR2 1.365e-03 4.720e-04 2.891 0.003936 **
## LotConfigFR3 2.276e-03 1.276e-03 1.783 0.074863 .
## LotConfigInside 3.339e-04 1.946e-04 1.716 0.086535 .
## NeighborhoodBlueste 7.950e-04 1.801e-03 0.441 0.658966
## NeighborhoodBrDale 8.196e-04 1.129e-03 0.726 0.468171
## NeighborhoodBrkSide -9.801e-05 9.680e-04 -0.101 0.919378
## NeighborhoodClearCr 5.111e-04 1.090e-03 0.469 0.639236
## NeighborhoodCollgCr 5.269e-04 7.710e-04 0.683 0.494567
## NeighborhoodCrawfor -1.390e-03 8.896e-04 -1.562 0.118558
## NeighborhoodEdwards 2.435e-03 8.502e-04 2.863 0.004293 **
## NeighborhoodGilbert 4.899e-04 8.390e-04 0.584 0.559418
## NeighborhoodIDOTRR 2.635e-03 1.071e-03 2.461 0.014056 *
## NeighborhoodMeadowV 3.066e-03 1.261e-03 2.431 0.015260 *
## NeighborhoodMitchel 1.063e-03 8.883e-04 1.197 0.231636
## NeighborhoodNAmes 1.008e-03 8.214e-04 1.227 0.220275
## NeighborhoodNoRidge -1.621e-03 8.609e-04 -1.883 0.059978 .
## NeighborhoodNPkVill -4.768e-04 1.259e-03 -0.379 0.705029
## NeighborhoodNridgHt -1.586e-03 7.695e-04 -2.061 0.039653 *
## NeighborhoodNWAmes 1.609e-03 8.638e-04 1.863 0.062856 .
## NeighborhoodOldTown 1.779e-03 9.668e-04 1.840 0.066080 .
## NeighborhoodSawyer 9.738e-04 8.756e-04 1.112 0.266402
## NeighborhoodSawyerW 4.500e-04 8.350e-04 0.539 0.590114
## NeighborhoodSomerst -1.124e-03 8.521e-04 -1.319 0.187642
## NeighborhoodStoneBr -2.811e-03 9.041e-04 -3.109 0.001940 **
## NeighborhoodSWISU 7.812e-04 1.003e-03 0.779 0.436109
## NeighborhoodTimber 2.895e-04 8.737e-04 0.331 0.740424
## NeighborhoodVeenker -1.457e-03 1.197e-03 -1.218 0.223726
## Condition1Norm -1.288e-03 3.548e-04 -3.630 0.000301 ***
## Condition1Other -4.636e-04 4.214e-04 -1.100 0.271583
## HouseStyle1.5Unf 9.199e-04 1.457e-03 0.631 0.527993
## HouseStyle1Story 7.931e-04 1.184e-03 0.670 0.503316
## HouseStyle2.5Fin 8.197e-04 1.489e-03 0.550 0.582221
## HouseStyle2.5Unf 2.152e-03 1.318e-03 1.633 0.102792
## HouseStyle2Story 1.412e-03 1.073e-03 1.315 0.188779
## HouseStyleSFoyer 7.556e-05 1.373e-03 0.055 0.956131
## HouseStyleSLvl -5.393e-04 1.422e-03 -0.379 0.704546
## OverallQual.L -8.037e-03 1.214e-03 -6.620 6.37e-11 ***
## OverallQual.Q 2.049e-03 1.004e-03 2.040 0.041652 *
## OverallQual.C -7.838e-04 8.569e-04 -0.915 0.360586
## OverallQual^4 1.970e-03 7.417e-04 2.656 0.008049 **
## OverallQual^5 -1.990e-04 6.147e-04 -0.324 0.746260
## OverallQual^6 7.702e-04 4.417e-04 1.744 0.081571 .
## OverallQual^7 1.966e-05 2.861e-04 0.069 0.945224
## OverallQual^8 8.852e-05 1.730e-04 0.512 0.609108
## OverallCond.L -6.143e-03 1.101e-03 -5.579 3.26e-08 ***
## OverallCond.Q 1.441e-03 1.005e-03 1.434 0.152067
## OverallCond.C 2.851e-04 8.420e-04 0.339 0.735005
## OverallCond^4 -1.292e-03 6.647e-04 -1.943 0.052305 .
## OverallCond^5 6.270e-04 5.012e-04 1.251 0.211332
## OverallCond^6 -7.485e-04 3.849e-04 -1.945 0.052138 .
## OverallCond^7 2.121e-04 2.427e-04 0.874 0.382379
## YearBuilt -9.620e-06 9.202e-06 -1.045 0.296128
## YearRemodAdd -1.466e-05 6.522e-06 -2.248 0.024857 *
## RoofStyleHip -1.134e-04 1.973e-04 -0.575 0.565360
## RoofStyleOther 8.338e-04 5.682e-04 1.468 0.142595
## Exterior1stMetalSd -1.529e-03 9.684e-04 -1.579 0.114775
## Exterior1stOther -5.527e-04 5.877e-04 -0.940 0.347276
## Exterior1stPlywood 2.482e-04 5.841e-04 0.425 0.671012
## Exterior1stVinylSd -3.730e-04 9.152e-04 -0.408 0.683665
## Exterior1stWd Sdng 3.985e-04 6.710e-04 0.594 0.552741
## Exterior2ndHdBoard 2.103e-04 7.349e-04 0.286 0.774833
## Exterior2ndMetalSd 1.254e-03 1.100e-03 1.140 0.254635
## Exterior2ndOther 6.876e-04 5.373e-04 1.280 0.200993
## Exterior2ndPlywood 7.494e-05 6.922e-04 0.108 0.913806
## Exterior2ndVinylSd 1.375e-04 9.154e-04 0.150 0.880616
## Exterior2ndWd Sdng -5.478e-04 6.667e-04 -0.822 0.411470
## MasVnrTypeOther -7.248e-05 2.748e-04 -0.264 0.792073
## I(MasVnrArea^pt_lm$lambda[[4]]) 9.683e-05 3.028e-04 0.320 0.749210
## ExterQual.L -2.141e-04 7.505e-04 -0.285 0.775522
## ExterQual.Q 4.623e-04 5.428e-04 0.852 0.394637
## ExterQual.C 5.161e-06 2.752e-04 0.019 0.985041
## ExterCond.L -2.253e-03 1.323e-03 -1.703 0.088879 .
## ExterCond.Q -1.533e-03 9.688e-04 -1.583 0.113811
## ExterCond.C -1.043e-03 4.519e-04 -2.307 0.021281 *
## FoundationCBlock -7.491e-04 3.630e-04 -2.064 0.039357 *
## FoundationOther 2.674e-05 9.985e-04 0.027 0.978638
## FoundationPConc -8.184e-04 3.931e-04 -2.082 0.037636 *
## BsmtQual.L -6.593e-04 4.738e-04 -1.392 0.164410
## BsmtQual.Q -5.115e-04 3.056e-04 -1.674 0.094528 .
## BsmtQual.C 7.406e-05 1.932e-04 0.383 0.701586
## BsmtCond.L -1.584e-03 1.989e-03 -0.796 0.425990
## BsmtCond.Q 1.154e-03 1.447e-03 0.798 0.425320
## BsmtCond.C -5.754e-04 6.852e-04 -0.840 0.401337
## BsmtExposure.L -8.139e-04 2.299e-04 -3.540 0.000421 ***
## BsmtExposure.Q -3.858e-04 2.030e-04 -1.901 0.057703 .
## BsmtExposure.C -1.125e-04 2.189e-04 -0.514 0.607500
## BsmtFinType1.L -8.109e-04 3.848e-04 -2.107 0.035374 *
## BsmtFinType1.Q 9.986e-05 3.463e-04 0.288 0.773157
## BsmtFinType1.C 4.674e-05 2.673e-04 0.175 0.861207
## BsmtFinType1^4 7.191e-05 2.484e-04 0.289 0.772293
## BsmtFinType1^5 4.425e-05 2.328e-04 0.190 0.849266
## I(BsmtFinSF1^pt_lm$lambda[[5]]) -7.659e-06 1.363e-04 -0.056 0.955216
## BsmtFinType2.L NA NA NA NA
## BsmtFinType2.Q NA NA NA NA
## BsmtFinType2.C NA NA NA NA
## BsmtFinType2^4 NA NA NA NA
## BsmtFinType2^5 NA NA NA NA
## I(BsmtUnfSF^pt_lm$lambda[[6]]) 6.012e-06 5.020e-06 1.198 0.231433
## I(TotalBsmtSF^pt_lm$lambda[[7]]) -3.295e-04 1.504e-04 -2.191 0.028703 *
## HeatingGasW -2.782e-03 7.176e-04 -3.877 0.000114 ***
## HeatingOther -4.336e-04 1.450e-03 -0.299 0.765036
## HeatingQC.L -3.969e-05 3.423e-04 -0.116 0.907719
## HeatingQC.Q -2.621e-04 2.635e-04 -0.995 0.320100
## HeatingQC.C 1.690e-04 1.824e-04 0.927 0.354379
## CentralAir.L -1.179e-03 3.184e-04 -3.702 0.000228 ***
## ElectricalFuseF -1.456e-03 8.020e-04 -1.816 0.069733 .
## ElectricalFuseP -3.091e-03 1.900e-03 -1.627 0.104049
## ElectricalMix NA NA NA NA
## ElectricalNA -1.211e-03 2.293e-03 -0.528 0.597536
## ElectricalSBrkr -2.588e-04 3.255e-04 -0.795 0.426905
## I(X1stFlrSF^pt_lm$lambda[[8]]) 3.353e-03 1.770e-03 1.895 0.058435 .
## I(X2ndFlrSF^pt_lm$lambda[[9]]) -8.194e-03 6.932e-03 -1.182 0.237527
## I(GrLivArea^pt_lm$lambda[[10]]) -3.581e-02 5.454e-03 -6.565 9.07e-11 ***
## BsmtFullBath -3.797e-04 2.032e-04 -1.869 0.062028 .
## FullBath -7.414e-04 2.440e-04 -3.038 0.002453 **
## HalfBath -8.207e-04 2.296e-04 -3.574 0.000371 ***
## BedroomAbvGr -1.596e-04 1.429e-04 -1.116 0.264552
## KitchenQual.L -1.204e-03 4.773e-04 -2.523 0.011815 *
## KitchenQual.Q -2.660e-04 3.337e-04 -0.797 0.425578
## KitchenQual.C -4.127e-04 1.873e-04 -2.204 0.027806 *
## Fireplaces -4.351e-04 1.499e-04 -2.903 0.003792 **
## GarageTypeBuiltIn 2.550e-04 3.590e-04 0.710 0.477808
## GarageTypeDetchd -6.228e-05 2.459e-04 -0.253 0.800113
## GarageTypeOther 8.322e-04 4.977e-04 1.672 0.094849 .
## GarageYrBlt -2.949e-06 6.455e-06 -0.457 0.647932
## GarageCars -7.291e-04 2.454e-04 -2.971 0.003052 **
## I(GarageArea^pt_lm$lambda[[11]]) -7.525e-05 1.571e-04 -0.479 0.631955
## I(WoodDeckSF^pt_lm$lambda[[12]]) 1.486e-03 1.444e-03 1.029 0.303614
## I(OpenPorchSF^pt_lm$lambda[[13]]) -4.180e-04 2.892e-04 -1.445 0.148713
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00206 on 849 degrees of freedom
## Multiple R-squared: 0.9217, Adjusted R-squared: 0.9088
## F-statistic: 71.41 on 140 and 849 DF, p-value: < 2.2e-16
Well, that’s a great fit but a lot of variables. Let’s use the step() function to backward eliminate some of the attributes that are not particularly useful.
lm_sale_price_step <- step(lm_sale_price, trace = FALSE)
summary(lm_sale_price_step)
##
## Call:
## lm(formula = I(SalePrice^pt_lm$lambda[[1]]) ~ MSSubClass + I(LotArea^pt_lm$lambda[[3]]) +
## LotShape + LotConfig + Neighborhood + Condition1 + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + BsmtQual + BsmtExposure +
## I(BsmtFinSF1^pt_lm$lambda[[5]]) + I(TotalBsmtSF^pt_lm$lambda[[7]]) +
## Heating + CentralAir + I(X1stFlrSF^pt_lm$lambda[[8]]) + I(X2ndFlrSF^pt_lm$lambda[[9]]) +
## I(GrLivArea^pt_lm$lambda[[10]]) + BsmtFullBath + FullBath +
## HalfBath + KitchenQual + Fireplaces + GarageCars + I(OpenPorchSF^pt_lm$lambda[[13]]),
## data = train_lm_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0061180 -0.0009642 -0.0001240 0.0008912 0.0160351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.045e-01 2.059e-02 43.933 < 2e-16 ***
## MSSubClass160 1.906e-03 6.990e-04 2.727 0.006519 **
## MSSubClass190 6.030e-04 7.537e-04 0.800 0.423904
## MSSubClass20 -7.054e-04 3.860e-04 -1.828 0.067947 .
## MSSubClass30 -3.431e-04 5.939e-04 -0.578 0.563624
## MSSubClass50 -1.197e-04 6.346e-04 -0.189 0.850499
## MSSubClass60 4.160e-04 6.093e-04 0.683 0.494929
## MSSubClass70 1.423e-04 6.940e-04 0.205 0.837564
## MSSubClass80 -2.808e-04 5.552e-04 -0.506 0.613142
## MSSubClass90 1.621e-03 6.656e-04 2.435 0.015079 *
## MSSubClassOther -2.907e-04 5.430e-04 -0.535 0.592475
## I(LotArea^pt_lm$lambda[[3]]) 3.867e-02 8.637e-03 4.478 8.51e-06 ***
## LotShapeIR2 -9.849e-04 4.786e-04 -2.058 0.039891 *
## LotShapeIR3 3.827e-03 1.010e-03 3.787 0.000162 ***
## LotShapeReg -2.745e-04 1.743e-04 -1.574 0.115724
## LotConfigCulDSac -6.015e-04 4.066e-04 -1.480 0.139348
## LotConfigFR2 1.121e-03 4.536e-04 2.472 0.013636 *
## LotConfigFR3 1.969e-03 1.244e-03 1.583 0.113801
## LotConfigInside 2.018e-04 1.839e-04 1.098 0.272665
## NeighborhoodBlueste 8.072e-04 1.694e-03 0.476 0.633842
## NeighborhoodBrDale 1.011e-03 1.027e-03 0.984 0.325175
## NeighborhoodBrkSide 1.962e-04 8.556e-04 0.229 0.818687
## NeighborhoodClearCr 7.641e-04 1.014e-03 0.753 0.451353
## NeighborhoodCollgCr 5.555e-04 7.277e-04 0.763 0.445445
## NeighborhoodCrawfor -1.252e-03 8.261e-04 -1.516 0.129873
## NeighborhoodEdwards 2.454e-03 7.860e-04 3.122 0.001856 **
## NeighborhoodGilbert 6.577e-04 7.944e-04 0.828 0.407934
## NeighborhoodIDOTRR 2.796e-03 8.982e-04 3.113 0.001912 **
## NeighborhoodMeadowV 2.382e-03 1.046e-03 2.276 0.023075 *
## NeighborhoodMitchel 1.324e-03 8.358e-04 1.584 0.113626
## NeighborhoodNAmes 9.896e-04 7.636e-04 1.296 0.195283
## NeighborhoodNoRidge -1.526e-03 8.196e-04 -1.862 0.062973 .
## NeighborhoodNPkVill 2.521e-04 1.089e-03 0.231 0.816995
## NeighborhoodNridgHt -1.436e-03 7.248e-04 -1.981 0.047881 *
## NeighborhoodNWAmes 1.828e-03 7.942e-04 2.302 0.021565 *
## NeighborhoodOldTown 1.817e-03 8.133e-04 2.234 0.025724 *
## NeighborhoodSawyer 1.053e-03 8.188e-04 1.286 0.198633
## NeighborhoodSawyerW 7.040e-04 7.734e-04 0.910 0.362969
## NeighborhoodSomerst -1.028e-03 7.388e-04 -1.391 0.164508
## NeighborhoodStoneBr -2.716e-03 8.594e-04 -3.160 0.001631 **
## NeighborhoodSWISU 7.030e-04 9.323e-04 0.754 0.451000
## NeighborhoodTimber 3.971e-04 8.300e-04 0.478 0.632468
## NeighborhoodVeenker -1.273e-03 1.126e-03 -1.130 0.258668
## Condition1Norm -1.288e-03 3.372e-04 -3.820 0.000142 ***
## Condition1Other -5.391e-04 3.999e-04 -1.348 0.177969
## OverallQual.L -7.734e-03 1.131e-03 -6.837 1.49e-11 ***
## OverallQual.Q 1.793e-03 9.438e-04 1.899 0.057834 .
## OverallQual.C -4.010e-04 8.028e-04 -0.500 0.617503
## OverallQual^4 1.790e-03 6.914e-04 2.590 0.009765 **
## OverallQual^5 1.333e-04 5.767e-04 0.231 0.817197
## OverallQual^6 5.239e-04 4.166e-04 1.258 0.208814
## OverallQual^7 1.077e-04 2.694e-04 0.400 0.689543
## OverallQual^8 8.380e-05 1.684e-04 0.498 0.618817
## OverallCond.L -7.128e-03 8.974e-04 -7.943 5.83e-15 ***
## OverallCond.Q 1.647e-03 7.989e-04 2.062 0.039486 *
## OverallCond.C -3.274e-04 6.928e-04 -0.473 0.636597
## OverallCond^4 -1.232e-03 5.663e-04 -2.175 0.029852 *
## OverallCond^5 5.857e-04 4.580e-04 1.279 0.201337
## OverallCond^6 -8.219e-04 3.552e-04 -2.314 0.020886 *
## OverallCond^7 2.798e-04 2.272e-04 1.232 0.218327
## YearBuilt -2.486e-05 7.898e-06 -3.148 0.001699 **
## YearRemodAdd -1.610e-05 5.953e-06 -2.704 0.006980 **
## BsmtQual.L -6.889e-04 4.359e-04 -1.580 0.114379
## BsmtQual.Q -4.607e-04 2.793e-04 -1.649 0.099428 .
## BsmtQual.C 2.384e-05 1.790e-04 0.133 0.894089
## BsmtExposure.L -7.830e-04 2.143e-04 -3.655 0.000272 ***
## BsmtExposure.Q -3.551e-04 1.957e-04 -1.815 0.069913 .
## BsmtExposure.C -1.185e-04 2.106e-04 -0.563 0.573674
## I(BsmtFinSF1^pt_lm$lambda[[5]]) -2.099e-04 3.982e-05 -5.270 1.70e-07 ***
## I(TotalBsmtSF^pt_lm$lambda[[7]]) -2.321e-04 1.182e-04 -1.964 0.049888 *
## HeatingGasW -2.275e-03 6.691e-04 -3.400 0.000703 ***
## HeatingOther 3.107e-04 1.300e-03 0.239 0.811241
## CentralAir.L -1.216e-03 2.751e-04 -4.420 1.10e-05 ***
## I(X1stFlrSF^pt_lm$lambda[[8]]) 3.341e-03 1.556e-03 2.147 0.032042 *
## I(X2ndFlrSF^pt_lm$lambda[[9]]) -1.080e-02 5.986e-03 -1.804 0.071565 .
## I(GrLivArea^pt_lm$lambda[[10]]) -3.580e-02 4.636e-03 -7.722 3.03e-14 ***
## BsmtFullBath -5.523e-04 1.788e-04 -3.089 0.002072 **
## FullBath -8.819e-04 2.301e-04 -3.832 0.000136 ***
## HalfBath -8.146e-04 2.195e-04 -3.712 0.000218 ***
## KitchenQual.L -1.076e-03 4.529e-04 -2.376 0.017691 *
## KitchenQual.Q -2.591e-04 3.153e-04 -0.822 0.411482
## KitchenQual.C -2.805e-04 1.765e-04 -1.589 0.112378
## Fireplaces -3.714e-04 1.407e-04 -2.641 0.008417 **
## GarageCars -8.923e-04 1.583e-04 -5.638 2.31e-08 ***
## I(OpenPorchSF^pt_lm$lambda[[13]]) -5.225e-04 2.750e-04 -1.900 0.057759 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00206 on 905 degrees of freedom
## Multiple R-squared: 0.9166, Adjusted R-squared: 0.9088
## F-statistic: 118.3 on 84 and 905 DF, p-value: < 2.2e-16
Typically, in a “full” analysis, we should check every variable to make sure the fitted values are linear with the response variable, and that the residuals are normall distributed. However, due to time (and practicality), i’ll show a few… Just note that this should be done for all variables in a work setting.
ggplot(train_lm_df, aes(x=GrLivArea^pt_lm$lambda[[10]], y=as.numeric(lm_sale_price_step$fitted.values)^pt_lm$lambda[[1]])) +
geom_point() +
labs(x="GrLivArea (Fitted)", y="Sale Price (Fitted)") +
ggtitle("Continuous Variable Linear Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
ggplot(train_lm_df, aes(x=OverallQual, y=as.numeric(lm_sale_price_step$fitted.values)^pt_lm$lambda[[1]])) +
geom_boxplot() +
labs(x="Overall Quality", y="Sale Price (Fitted)") +
ggtitle("Categorical Variable Linear Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
ggplot(train_lm_df, aes(x=GrLivArea^pt_lm$lambda[[10]], y=as.numeric(lm_sale_price_step$residuals))) +
geom_point() +
labs(x="GrLivArea (Fitted)", y="Residuals") +
ggtitle("Continuous Variable Linear Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
ggplot(train_lm_df, aes(x=OverallQual, y=as.numeric(lm_sale_price_step$residuals))) +
geom_boxplot() +
labs(x="Overall Quality", y="Residuals") +
ggtitle("Categorical Variable Linear Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
So, there does appear to be a linear relationship for both of these variables and the residuals look to have fairly uniform variability, although the GrLivArea results are a little suspect since they appear to have some residuals that could have high leverage. Anyways, it looks like we have a good fit for the variables we checked.
However, we have a slight issue. There are some observations with NA values that will cause issues when they get fit to the lm() function. For these instances, I replaced the input with the median (or middle in ordered factors) value. For unordered factors, I picked the most common option. I know there are other, more scientific ways to deal with this; however, there are only a few observations that actually have this issue.
fill_in <- function(data_df) {
data_df <- data_df %>%
mutate(MSZoning = ifelse(is.na(as.character(MSZoning)), "RL", as.character(MSZoning)),
MSZoning = as.factor(MSZoning),
Exterior1st = ifelse(is.na(as.character(Exterior1st)), "VinylSd", as.character(Exterior1st)),
Exterior1st = as.factor(Exterior1st),
Exterior2nd = ifelse(is.na(as.character(Exterior2nd)), "VinylSd", as.character(Exterior2nd)),
Exterior2nd = as.factor(Exterior2nd),
SaleType = ifelse(is.na(as.character(SaleType)), "Other", as.character(SaleType)),
SaleType = as.factor(SaleType),
LotFrontage = ifelse(is.na(LotFrontage), median(LotFrontage, na.rm = TRUE), LotFrontage),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), median(BsmtFinSF1, na.rm = TRUE), BsmtFinSF1),
BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), median(BsmtFinSF2, na.rm = TRUE), BsmtFinSF2),
BsmtUnfSF = ifelse(is.na(BsmtUnfSF), median(BsmtUnfSF, na.rm = TRUE), BsmtUnfSF),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), median(TotalBsmtSF, na.rm = TRUE), TotalBsmtSF),
GarageArea = ifelse(is.na(GarageArea), median(GarageArea, na.rm = TRUE), GarageArea),
BsmtFullBath = ifelse(is.na(BsmtFullBath), median(BsmtFullBath, na.rm = TRUE), BsmtFullBath),
BsmtQual = ifelse(is.na(as.character(BsmtQual)), "TA", as.character(BsmtQual)),
BsmtQual = factor(BsmtQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
BsmtCond = ifelse(is.na(as.character(BsmtCond)), "TA", as.character(BsmtCond)),
BsmtCond = factor(BsmtCond, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
KitchenQual = ifelse(is.na(as.character(KitchenQual)), "TA", as.character(KitchenQual)),
KitchenQual = factor(KitchenQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
Functional = ifelse(is.na(as.character(Functional)), "Mod", as.character(Functional)),
Functional = factor(Functional, levels = c("Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"), ordered = TRUE),
KitchenQual = factor(KitchenQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
BsmtExposure = ifelse(is.na(as.character(BsmtExposure)), "Av", as.character(BsmtExposure)),
BsmtExposure = factor(BsmtExposure, levels = c("No", "Mn", "Av", "Gd"), ordered = TRUE),
BsmtFinType1 = ifelse(is.na(as.character(BsmtFinType1)), "Rec", as.character(BsmtFinType1)),
BsmtFinType1 = factor(BsmtFinType1, levels = c("Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"), ordered = TRUE),
GarageCars = ifelse(is.na(GarageCars), median(GarageCars, na.rm = TRUE), GarageCars),
GarageQual = ifelse(is.na(as.character(GarageQual)), "TA", as.character(GarageQual)),
GarageQual = factor(GarageQual, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE),
GarageCond = ifelse(is.na(as.character(GarageCond)), "TA", as.character(GarageCond)),
GarageCond = factor(GarageCond, levels = c("Po", "Fa", "TA", "Gd", "Ex"), ordered = TRUE)
)
data_df$MasVnrArea <- sapply(data_df$MasVnrArea, function(x) {as.numeric(x) + 0.01})
data_df$BsmtFinSF1 <- sapply(data_df$BsmtFinSF1, function(x) {as.numeric(x) + 0.01})
data_df$BsmtUnfSF <- sapply(data_df$BsmtUnfSF, function(x) {as.numeric(x) + 0.01})
data_df$X1stFlrSF <- sapply(data_df$X1stFlrSF, function(x) {as.numeric(x) + 0.01})
data_df$X2ndFlrSF <- sapply(data_df$X2ndFlrSF, function(x) {as.numeric(x) + 0.01})
data_df$GrLivArea <- sapply(data_df$GrLivArea, function(x) {as.numeric(x) + 0.01})
data_df$GarageArea <- sapply(data_df$GarageArea, function(x) {as.numeric(x) + 0.01})
data_df$WoodDeckSF <- sapply(data_df$WoodDeckSF, function(x) {as.numeric(x) + 0.01})
data_df$OpenPorchSF <- sapply(data_df$OpenPorchSF, function(x) {as.numeric(x) + 0.01})
return(data_df)
}
train_fill_in_df <- fill_in(train_df)
train_fill_in_df$SalePricePredicted <- predict(object = lm_sale_price_step, newdata = train_fill_in_df)
train_fill_in_df$SalePricePredicted <- sapply(train_fill_in_df$SalePricePredicted, function(x) {x^(1/pt_lm$lambda[[1]])})
plot_prices_df <- train_fill_in_df %>% dplyr::select(SalePrice, SalePricePredicted) %>% rename(Actual = SalePrice, Predicted = SalePricePredicted) %>% gather(SaleType, Val, 1:2)
ggplot(plot_prices_df, aes(x=Val, fill=SaleType)) +
geom_histogram(bins = 40, aes(y=..density..), alpha = 0.5, position="identity") +
labs(x="Sale Price Comparison", y="Density") +
ggtitle("Histogram Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
plot_df <- data.frame(residuals = as.numeric(lm_sale_price$residuals))
ggplot(plot_df, aes(x=residuals)) +
geom_histogram(bins =40, aes(y=..density..), color = "black", fill = "darkgrey", position="identity") +
stat_function(fun=dnorm, color="steelblue", args=list(mean=mean(lm_sale_price_step$residuals), sd=sd(lm_sale_price_step$residuals))) +
labs(x="Residuals", y="Density") +
ggtitle("Response Variable Reisdual Plot") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
qqnorm(as.numeric(plot_df$residuals))
qqline(as.numeric(plot_df$residuals))
Well, it looks like we have an issue with the data distribution having “short tails”; however, we will move forward since it looks like we are on the right track. Let’s see how good our estimate was based on a t-test.
Here, we can see that that using a 0.05 level of significance, we can accept the null hypothesis that there is no difference between the predicted and actual sale prices. Therefore, it appears the predictor algorithm is a good estimate.
dev_df$SalePricePredicted = predict(object = lm_sale_price_step, newdata = dev_df)
dev_df$SalePricePredicted <- sapply(dev_df$SalePricePredicted, function(x) {x^(1/pt_lm$lambda[[1]])})
diff <- dev_df$SalePricePredicted - dev_df$SalePrice
t.test(diff)
##
## One Sample t-test
##
## data: diff
## t = -1.5772, df = 57, p-value = 0.1203
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -9614.574 1142.330
## sample estimates:
## mean of x
## -4236.122
Here is a histogram plot of the predicted sales prices for the test data set.
test_df <- read.csv('./data/test.csv')
test_df[test_df == "NA"] <- NA
test_predict_df <- test_df %>% format_function(.) %>% fill_in(.)
test_predict_df$SalePrice = predict(object = lm_sale_price_step, newdata = test_predict_df)
test_predict_df$SalePrice<- sapply(test_predict_df$SalePrice, function(x) {x^(1/pt_lm$lambda[[1]])})
ggplot(test_predict_df, aes(x=SalePrice)) +
geom_histogram(bins = 40, aes(y=..density..), fill = "grey", color = "black", alpha = 0.5, position="identity") +
labs(x="Sale Price Comparison For Test Data", y="Density") +
ggtitle("Histogram Comparison") +
theme(plot.title = element_text(hjust = 0.5),
plot.background = element_rect(fill = "lightgrey"),
panel.background = element_rect(fill = "white"),
panel.grid.major.x = element_line(color = "lightgrey"),
panel.grid.major.y = element_line(color = "lightgrey"),
axis.text = element_text(size=12, color = "grey55"),
axis.title = element_text(size=14, color = "grey55"),
title = element_text(size=14, color = "grey55"))
write.csv(test_predict_df[,c("Id", "SalePrice")], "./data/prediction_submittal.csv", row.names = FALSE)
My user name for the kaggle competition is JohnGrando (https://www.kaggle.com/johngrando) and my current rank is 1,449 (score - 0.12889).