You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.
summary(training_data)
## id ms_sub_class ms_zoning lot_frontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## lot_area street alley lot_shape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## land_contour utilities lot_config land_slope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## neighborhood condition1 condition2 bldg_type
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## house_style overall_qual overall_cond year_built
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## year_remod_add roof_style roof_matl exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## exterior2nd mas_vnr_type mas_vnr_area exter_qual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## exter_cond foundation bsmt_qual bsmt_cond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## bsmt_exposure bsmt_fin_type1 bsmt_fin_sf1 bsmt_fin_type2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## heating_qc central_air electrical x1st_flr_sf
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## x2nd_flr_sf low_qual_fin_sf gr_liv_area bsmt_full_bath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## bsmt_half_bath full_bath half_bath bedroom_abv_gr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## kitchen_abv_gr kitchen_qual tot_rms_abv_grd functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## fireplaces fireplace_qu garage_type garage_yr_blt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## garage_finish garage_cars garage_area garage_qual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## garage_cond paved_drive wood_deck_sf open_porch_sf
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## enclosed_porch x3ssn_porch screen_porch pool_area
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## pool_qc fence misc_feature misc_val
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## mo_sold yr_sold sale_type sale_condition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## sale_price
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Here we look at the histogram for our dependent variable sale_price
hist(training_data$sale_price)
Next we take a look at histograms and scatter plots for each of the continuous variables as well as the relationship between the different continuous variables and sale_price
hist(training_data$overall_qual)
plot(training_data$overall_qual, training_data$sale_price)
hist(training_data$year_built)
plot(training_data$year_built, training_data$sale_price)
hist(training_data$year_remod_add)
plot(training_data$year_remod_add, training_data$sale_price)
hist(training_data$bsmt_fin_sf1)
plot(training_data$bsmt_fin_sf1, training_data$sale_price)
hist(training_data$total_bsmt_sf)
plot(training_data$total_bsmt_sf, training_data$sale_price)
hist(training_data$x1st_flr_sf)
plot(training_data$x1st_flr_sf, training_data$sale_price)
hist(training_data$x2nd_flr_sf)
plot(training_data$x2nd_flr_sf, training_data$sale_price)
hist(training_data$gr_liv_area)
plot(training_data$gr_liv_area, training_data$sale_price)
hist(training_data$full_bath)
plot(training_data$full_bath, training_data$sale_price)
hist(training_data$tot_rms_abv_grd)
plot(training_data$tot_rms_abv_grd, training_data$sale_price)
hist(training_data$fireplaces)
plot(training_data$fireplaces, training_data$sale_price)
hist(training_data$garage_cars)
plot(training_data$garage_cars, training_data$sale_price)
hist(training_data$garage_area)
plot(training_data$garage_area, training_data$sale_price)
hist(training_data$wood_deck_sf)
plot(training_data$wood_deck_sf, training_data$sale_price)
hist(training_data$open_porch_sf)
plot(training_data$open_porch_sf, training_data$sale_price)
Next we get a list of the charater or categorical variables in the training data
training_data %>% select_if(., is.character) %>% str()
## tibble [1,460 × 43] (S3: tbl_df/tbl/data.frame)
## $ ms_zoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ alley : chr [1:1460] NA NA NA NA ...
## $ lot_shape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ land_contour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ lot_config : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ land_slope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ bldg_type : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ house_style : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ roof_style : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ roof_matl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ mas_vnr_type : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ exter_qual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ exter_cond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ bsmt_qual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ bsmt_cond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ bsmt_exposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ bsmt_fin_type1: chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ bsmt_fin_type2: chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ heating_qc : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ central_air : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ kitchen_qual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ fireplace_qu : chr [1:1460] NA "TA" "TA" "Gd" ...
## $ garage_type : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ garage_finish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ garage_qual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ garage_cond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ paved_drive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ pool_qc : chr [1:1460] NA NA NA NA ...
## $ fence : chr [1:1460] NA NA NA NA ...
## $ misc_feature : chr [1:1460] NA NA NA NA ...
## $ sale_type : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ sale_condition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
And we then look at boxplots of the sale_price based on several different categorical variables
ggplot(training_data) +
geom_boxplot(aes(x=ms_zoning, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=street, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=lot_shape, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=land_contour, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=utilities, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=lot_config, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=land_slope, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=neighborhood, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=condition1, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=condition2, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=bldg_type, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=house_style, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=roof_style, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=exterior1st, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=exterior2nd, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=exter_qual, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=exter_cond, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=heating, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=heating_qc, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=central_air, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=electrical, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=kitchen_qual, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=functional, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=garage_type, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=garage_finish, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=sale_type, y=sale_price))
ggplot(training_data) +
geom_boxplot(aes(x=sale_condition, y=sale_price))
d1 <- training_data %>%
select(sale_price, garage_area, x1st_flr_sf)
plot(d1, pch=20 , cex=1.5 , col="purple")
num_items = ncol(d1)
cor_data <- c()
for(i in seq(num_items)) {
for(j in seq(num_items)) {
cor_data <- c(cor_data,cor(d1[[i]],d1[[j]]))
}
}
(corr_matrix <- matrix(cor_data, ncol=num_items, nrow=num_items))
## [,1] [,2] [,3]
## [1,] 1.0000000 0.6234314 0.6058522
## [2,] 0.6234314 1.0000000 0.4897817
## [3,] 0.6058522 0.4897817 1.0000000
cor.test(d1$sale_price, d1$garage_area, conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: d1$sale_price and d1$garage_area
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6024756 0.6435283
## sample estimates:
## cor
## 0.6234314
cor.test(d1$sale_price, d1$x1st_flr_sf, conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: d1$sale_price and d1$x1st_flr_sf
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5841687 0.6266715
## sample estimates:
## cor
## 0.6058522
cor.test(d1$garage_area, d1$x1st_flr_sf, conf.level=0.80)
##
## Pearson's product-moment correlation
##
## data: d1$garage_area and d1$x1st_flr_sf
## t = 21.451, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4638446 0.5148798
## sample estimates:
## cor
## 0.4897817
prec_matrix <- solve(corr_matrix)
cp <- corr_matrix %*% prec_matrix
pc <- prec_matrix %*% corr_matrix
lu_decomp <- lu.decomposition(corr_matrix)
l <- lu_decomp$L
u <- lu_decomp$U
lu <- l %*% u
corr_matrix == lu
## [,1] [,2] [,3]
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE
Many times, it makes sense to fit a closed form distribution to
data.
- Select a variable in the Kaggle.com training dataset that is skewed to
the right, shift it so that the minimum value is absolutely above zero
if necessary.
- Then load the MASS package and run fitdistr to fit an exponential
probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).
- Find the optimal value of \(\lambda\)
for this distribution, and then take 1000 samples from this exponential
distribution using this value (e.g., rexp(1000, \(\lambda\))).
- Plot a histogram and compare it with a histogram of your original
variable.
- Using the exponential pdf, find the 5th and 95th percentiles using the
cumulative distribution function (CDF). - Also generate a 95% confidence
interval from the empirical data, assuming normality.
- Finally, provide the empirical 5th percentile and 95th percentile of
the data.
- Discuss.
This variable has a minimum value of 1300
ggplot(training_data) +
geom_histogram(aes(x=lot_area), fill='white', color='black', bins=50)
lot_area <- training_data %>% select(lot_area)
summary(lot_area)
## lot_area
## Min. : 1300
## 1st Qu.: 7554
## Median : 9478
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
lot_area_dist <- MASS::fitdistr(lot_area$lot_area, densfun = 'exponential')
lambda_opt <- lot_area_dist$estimate
set.seed(51323)
sample_dist <- rexp(1000,lambda_opt)
hist(training_data$lot_area)
hist(sample_dist)
qexp(.05, lambda_opt)
## [1] 539.4428
qexp(.95, lambda_opt)
## [1] 31505.6
x <- mean(lot_area$lot_area)
std_dev <- sd(lot_area$lot_area)
n <- length(lot_area$lot_area)
z_score <- qnorm(.975)
(lower_limit <- x - (z_score*(std_dev/sqrt(n))))
## [1] 10004.84
(upper_limit <- x+(z_score*(std_dev/sqrt(n))))
## [1] 11028.81
quantile(lot_area$lot_area, probs=c(0.05, 0.95))
## 5% 95%
## 3311.70 17401.15
sp_quantiles <- quantile(training_data$sale_price)
sp_quantiles
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
IQR <- sp_quantiles[[4]]-sp_quantiles[[2]]
upper_limit <- sp_quantiles[[4]] + 1.5*IQR
lower_limit <- sp_quantiles[[2]] - 1.5*IQR
training_adj <- training_data %>% filter(between(sale_price, lower_limit, upper_limit))
training_corr <- cor(training_adj %>% select_if(is.numeric))
training_corr <- round(training_corr, digits=2)
sales_corr <- training_corr[,'sale_price']
sales_corr
## id ms_sub_class lot_frontage lot_area overall_qual
## -0.01 -0.06 NA 0.25 0.78
## overall_cond year_built year_remod_add mas_vnr_area bsmt_fin_sf1
## -0.05 0.56 0.54 NA 0.29
## bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf x1st_flr_sf x2nd_flr_sf
## 0.01 0.22 0.54 0.52 0.32
## low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath
## -0.06 0.66 0.20 -0.02 0.58
## half_bath bedroom_abv_gr kitchen_abv_gr tot_rms_abv_grd fireplaces
## 0.28 0.20 -0.15 0.47 0.45
## garage_yr_blt garage_cars garage_area wood_deck_sf open_porch_sf
## NA 0.63 0.61 0.30 0.33
## enclosed_porch x3ssn_porch screen_porch pool_area misc_val
## -0.14 0.04 0.11 0.05 -0.01
## mo_sold yr_sold sale_price
## 0.07 -0.03 1.00
For my model, I used the earlier correlation plot as well as the various plots showing the relationship between sales price and the different categorical and numerical variables as a basis for determining which variables to include in the model. Ultimately, this was a pretty manual and tedious process.
training_adj <- training_adj %>% mutate(overall_qual = as.factor(overall_qual))
training_adj <- training_adj %>% mutate(garage_cars = as.factor(garage_cars))
training_adj <- training_adj %>% mutate(year_built_cat = case_when(
year_built < 1980~'pre_1980',
TRUE ~ 'post_1980'
))
training_adj <- training_adj %>% mutate(full_bath = as.factor(full_bath))
training_adj <- training_adj %>% mutate(tot_rms_abv_grd = as.factor(tot_rms_abv_grd))
training_adj <- training_adj %>% mutate(has_2nd_flr = ifelse(x2nd_flr_sf == 0,'no','yes'))
m3 <- lm(sale_price ~ overall_qual + log(gr_liv_area) + garage_cars +
year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf +
(has_2nd_flr)*(x2nd_flr_sf) + tot_rms_abv_grd +
kitchen_qual + electrical + central_air + exter_qual +
exter_cond + heating + sale_type,
data=training_adj)
summary(m3)
##
## Call:
## lm(formula = sale_price ~ overall_qual + log(gr_liv_area) + garage_cars +
## year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf +
## (has_2nd_flr) * (x2nd_flr_sf) + tot_rms_abv_grd + kitchen_qual +
## electrical + central_air + exter_qual + exter_cond + heating +
## sale_type, data = training_adj)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148847 -13056 290 12539 123454
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.434e+05 7.864e+04 -3.095 0.002007 **
## overall_qual2 1.004e+04 3.089e+04 0.325 0.745186
## overall_qual3 -1.981e+04 2.908e+04 -0.681 0.495972
## overall_qual4 -7.521e+03 2.857e+04 -0.263 0.792395
## overall_qual5 -8.122e+02 2.872e+04 -0.028 0.977446
## overall_qual6 7.544e+03 2.877e+04 0.262 0.793180
## overall_qual7 2.097e+04 2.881e+04 0.728 0.466728
## overall_qual8 4.521e+04 2.895e+04 1.562 0.118635
## overall_qual9 7.421e+04 2.969e+04 2.499 0.012558 *
## overall_qual10 -3.258e+04 3.239e+04 -1.006 0.314707
## log(gr_liv_area) 7.083e+04 1.194e+04 5.933 3.79e-09 ***
## garage_cars1 9.305e+03 3.376e+03 2.756 0.005923 **
## garage_cars2 1.756e+04 3.425e+03 5.128 3.35e-07 ***
## garage_cars3 3.282e+04 4.390e+03 7.477 1.37e-13 ***
## garage_cars4 3.913e+04 1.187e+04 3.295 0.001009 **
## year_built_catpre_1980 -6.168e+03 2.413e+03 -2.556 0.010693 *
## full_bath1 -4.545e+03 9.852e+03 -0.461 0.644662
## full_bath2 -5.593e+03 9.897e+03 -0.565 0.572066
## full_bath3 2.548e+03 1.165e+04 0.219 0.826923
## total_bsmt_sf 1.528e+01 3.045e+00 5.018 5.92e-07 ***
## x1st_flr_sf 3.273e+00 8.520e+00 0.384 0.700911
## has_2nd_flryes -3.513e+04 4.285e+03 -8.199 5.63e-16 ***
## x2nd_flr_sf 3.997e+01 8.177e+00 4.888 1.14e-06 ***
## tot_rms_abv_grd3 -6.355e+04 3.801e+04 -1.672 0.094784 .
## tot_rms_abv_grd4 -6.679e+04 3.766e+04 -1.774 0.076336 .
## tot_rms_abv_grd5 -6.785e+04 3.782e+04 -1.794 0.073065 .
## tot_rms_abv_grd6 -7.054e+04 3.798e+04 -1.858 0.063442 .
## tot_rms_abv_grd7 -7.113e+04 3.806e+04 -1.869 0.061884 .
## tot_rms_abv_grd8 -7.442e+04 3.812e+04 -1.952 0.051134 .
## tot_rms_abv_grd9 -6.817e+04 3.822e+04 -1.784 0.074692 .
## tot_rms_abv_grd10 -7.737e+04 3.845e+04 -2.012 0.044384 *
## tot_rms_abv_grd11 -1.031e+05 3.898e+04 -2.646 0.008250 **
## tot_rms_abv_grd12 -1.264e+05 3.931e+04 -3.216 0.001333 **
## tot_rms_abv_grd14 -8.526e+04 4.724e+04 -1.805 0.071326 .
## kitchen_qualFa -2.712e+04 6.390e+03 -4.244 2.35e-05 ***
## kitchen_qualGd -1.677e+04 3.985e+03 -4.208 2.75e-05 ***
## kitchen_qualTA -2.612e+04 4.228e+03 -6.178 8.61e-10 ***
## electricalFuseF -1.119e+03 5.804e+03 -0.193 0.847124
## electricalFuseP -6.052e+03 1.528e+04 -0.396 0.692074
## electricalMix -4.419e+04 2.562e+04 -1.725 0.084765 .
## electricalSBrkr 6.404e+03 2.897e+03 2.211 0.027234 *
## central_airY 1.721e+04 3.526e+03 4.883 1.17e-06 ***
## exter_qualFa -3.331e+04 1.083e+04 -3.075 0.002150 **
## exter_qualGd -1.188e+04 6.669e+03 -1.782 0.074982 .
## exter_qualTA -1.874e+04 6.930e+03 -2.704 0.006932 **
## exter_condFa -4.960e+04 1.618e+04 -3.065 0.002221 **
## exter_condGd -3.739e+04 1.541e+04 -2.427 0.015368 *
## exter_condPo -7.224e+04 3.035e+04 -2.381 0.017420 *
## exter_condTA -4.076e+04 1.528e+04 -2.668 0.007721 **
## heatingGasA -2.290e+04 2.603e+04 -0.879 0.379307
## heatingGasW -3.121e+04 2.663e+04 -1.172 0.241532
## heatingGrav -1.441e+04 2.768e+04 -0.520 0.602827
## heatingOthW -8.464e+04 3.157e+04 -2.681 0.007427 **
## heatingWall -1.114e+04 2.850e+04 -0.391 0.695874
## sale_typeCon 6.544e+04 1.813e+04 3.610 0.000318 ***
## sale_typeConLD 9.236e+03 9.600e+03 0.962 0.336171
## sale_typeConLI -7.055e+03 1.314e+04 -0.537 0.591424
## sale_typeConLw 1.263e+04 1.240e+04 1.018 0.308646
## sale_typeCWD 2.814e+04 1.337e+04 2.105 0.035485 *
## sale_typeNew 1.595e+04 4.912e+03 3.247 0.001197 **
## sale_typeOth 1.533e+04 1.514e+04 1.012 0.311608
## sale_typeWD 1.479e+04 3.931e+03 3.763 0.000175 ***
## has_2nd_flryes:x2nd_flr_sf NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24850 on 1336 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8317, Adjusted R-squared: 0.8241
## F-statistic: 108.3 on 61 and 1336 DF, p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(m3)
## Warning: not plotting observations with leverage one:
## 241, 359, 380, 508, 607, 1265
hist(resid(m3))
Overall my model used a mix of continuous variables and categorical
variables. Additionally I include one log-transformed term and an
interaction term. The model had Adjusted R-Squared of 08216, and the
p-value for the F-Statistic was significantly smaller than .05, so we
can assume the model is a good model.
When looking at the residual plots, we see that the residuals appear to be normally distributed and there doesn’t to be any significant patterns in the residuals, thus concluding that our model doesn’t suffer from heteroskedasity. Therefore we conclude that our model is a pretty good model for predicting sales prices for homes.
Ran the model on the test data set to generate predictions, and then imported the final sales price predictions to Kaggle
test_data <- read_delim('/Users/korymartin/Library/Mobile Documents/com~apple~CloudDocs/Grad Programs/CUNY SPS/DATA 605/Final Exam/train.csv')
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_data <- clean_names(test_data)
test_data <- test_data %>% mutate(overall_qual = as.factor(overall_qual))
test_data <- test_data %>% mutate(garage_cars = as.factor(garage_cars))
test_data <- test_data %>% mutate(year_built_cat = case_when(
year_built < 1980~'pre_1980',
TRUE ~ 'post_1980'
))
test_data <- test_data %>% mutate(full_bath = as.factor(full_bath))
test_data <- test_data %>% mutate(tot_rms_abv_grd = as.factor(tot_rms_abv_grd))
test_data <- test_data %>% mutate(has_2nd_flr = ifelse(x2nd_flr_sf == 0,'no','yes'))
predict.y <- predict(m3, test_data)
## Warning in predict.lm(m3, test_data): prediction from a rank-deficient fit may
## be misleading
actual.y <- test_data$sale_price
test_data2 <- tibble(cbind(test_data, data.frame(predict.y)))
seq_count <- seq(1459)
test_data2 <- test_data2 %>% select(id, predict.y) %>% rename(SalePrice = predict.y, Id = id)
test_data2 <- test_data2 %>% drop_na(SalePrice)
test_data2 <- cbind(test_data2, seq_count)
test_data2 <- tibble(test_data2)
test_data2 <- test_data2 %>% mutate(Id = (seq_count+1460))
test_data2 <- test_data2 %>% select(Id, SalePrice)
write_csv(test_data2, './test_data2.csv')