Thesis

I think that these four things matter the most in determining the price of a home: the number of full bathrooms, first-floor square footage, total basement square footage, and year built. I will be testing these to see how they correlate with the sale price.

Set up

Histograms

# Distribution of each of the four candidate predictors in the raw data.
# The calls are kept as-is because hist() builds its default title and x-axis
# label from the expression passed to it.

# Full bathrooms
hist(t_raw$full_bath)

# First-floor square footage
hist(t_raw$first_flr_sf)

# Total basement square footage
hist(t_raw$total_bsmt_sf)

# Year built
hist(t_raw$year_built)

Correlation

# Keep only the four candidate predictors and the response.
# This must run before anything that reads `t`. Note that the name is shared
# with base::t() (matrix transpose); it is kept because the model code further
# down refers to `t`.
t <- t_raw %>%
  select(full_bath, first_flr_sf, total_bsmt_sf, year_built, sale_price)

# cor() rejects non-numeric columns, so restrict to numeric ones explicitly.
# (The earlier version built t_numeric from `t` before `t` existed and passed
# no columns to select(), which produced an empty, unused frame.)
t_numeric <- t %>%
  select(where(is.numeric))

library(ggplot2)
library(ggcorrplot) # provides ggcorrplot(); attaching ggplot2 alone does not


# Pairwise Pearson correlations, drawn as a labelled heat map.
# colors = c(low, mid, high) fill values for the correlation scale.
ggcorrplot(cor(t_numeric),
           colors = c('green', 'white', 'yellow'),
           lab = TRUE)

Model

# The same four-predictor linear model of sale price, fitted to three data
# sets. The formula is written out in every call so the `Call:` line printed
# by summary() keeps showing it in full.

# m: fitted on t.
m <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t
)

# m2: fitted on t_train; this is the model used for predictions later on.
m2 <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t_train
)

# m3: fitted on t_test.
m3 <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t_test
)

# Coefficients, residual quantiles and fit statistics for m (fitted on t).
summary(m)
## 
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + 
##     year_built, data = t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465121  -27371   -6054   19894  379648 
## 
## Coefficients:
##                   Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)   -1361301.079    71757.159  -18.97 <0.0000000000000002 ***
## full_bath        35885.572     2007.327   17.88 <0.0000000000000002 ***
## first_flr_sf        52.198        4.116   12.68 <0.0000000000000002 ***
## total_bsmt_sf       43.239        3.719   11.63 <0.0000000000000002 ***
## year_built         699.974       37.282   18.77 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50960 on 2925 degrees of freedom
## Multiple R-squared:  0.5937, Adjusted R-squared:  0.5931 
## F-statistic:  1068 on 4 and 2925 DF,  p-value: < 0.00000000000000022
# Same report for m2 (fitted on t_train).
summary(m2)
## 
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + 
##     year_built, data = t_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -414966  -28846   -5892   18966  377104 
## 
## Coefficients:
##                   Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)   -1365990.570   113234.495 -12.063 < 0.0000000000000002 ***
## full_bath        38555.369     3157.516  12.211 < 0.0000000000000002 ***
## first_flr_sf        47.100        6.885   6.841    0.000000000012735 ***
## total_bsmt_sf       47.076        6.510   7.232    0.000000000000868 ***
## year_built         701.131       58.839  11.916 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53120 on 1150 degrees of freedom
## Multiple R-squared:  0.5874, Adjusted R-squared:  0.5859 
## F-statistic: 409.3 on 4 and 1150 DF,  p-value: < 0.00000000000000022
# Same report for m3 (fitted on t_test).
summary(m3)
## 
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + 
##     year_built, data = t_test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -465602  -26610   -5873   20324  373525 
## 
## Coefficients:
##                   Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)   -1359725.117    93289.513 -14.575 <0.0000000000000002 ***
## full_bath        33916.621     2617.130  12.959 <0.0000000000000002 ***
## first_flr_sf        55.417        5.153  10.755 <0.0000000000000002 ***
## total_bsmt_sf       41.259        4.511   9.146 <0.0000000000000002 ***
## year_built         699.891       48.472  14.439 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49540 on 1770 degrees of freedom
## Multiple R-squared:  0.5987, Adjusted R-squared:  0.5978 
## F-statistic: 660.3 on 4 and 1770 DF,  p-value: < 0.00000000000000022

Test the model on the test data

RMSE

# Root-mean-squared error of m2 on the rows it was fitted to (t_train).
rsme_train <- (m2$residuals^2) %>%
  mean() %>%
  sqrt()

# Score the test rows with the training-set model and keep the per-row error
# (actual minus predicted) and its square next to the original columns.
t_test_results <- t_test %>%
  mutate(
    predicted = predict(m2, newdata = t_test),
    residuals = sale_price - predicted,
    squared_residuals = residuals^2
  )

# Root-mean-squared error of m2's predictions on t_test.
rsme_test <- t_test_results$squared_residuals %>%
  mean() %>%
  sqrt()

R-squared

# Residual sum of squares (train): squared residuals of m2, one per row of
# t_train.

residual_sum_of_squares <- t_train %>%
  mutate(
    residuals = m2$residuals,
    residuals_squared = residuals^2
  )

sum(residual_sum_of_squares[["residuals_squared"]])
## [1] 3245112787317
# Total variation (train): squared deviation of each sale price from the
# training-set mean.

total_variation_in_model <- t_train %>%
  mutate(
    average = mean(sale_price) - sale_price,
    average_squared = average^2
  )

sum(total_variation_in_model[["average_squared"]])
## [1] 7864565495090
# Despite its name, this variable holds 1 - RSS / total variation for the
# training data, not a sum of squares.

total_sum_of_squares <- 1 -
  sum(residual_sum_of_squares[["residuals_squared"]]) /
    sum(total_variation_in_model[["average_squared"]])

# Residual sum of squares (test)
# Residuals are the test-set sale prices minus the predictions of m2, the
# model fitted on t_train. This used to read m3$residuals, but m3 is fitted on
# t_test itself, so that measured in-sample fit rather than how well the
# trained model performs on unseen rows.

residual_sum_of_squares2 <- t_test %>%
  mutate(residuals = sale_price - predict(m2, newdata = t_test)) %>%
  mutate(residuals_squared = residuals^2)

sum(residual_sum_of_squares2$residuals_squared)
# NOTE: the previously rendered output here, 4344351278070, was m3's in-sample
# residual sum of squares; re-run to obtain the out-of-sample value.
# Total variation (test): squared deviation of each sale price from the
# test-set mean.

total_variation_in_model2 <- t_test %>%
  mutate(average = mean(sale_price) - sale_price) %>%
  mutate(average_squared = average^2)

sum(total_variation_in_model2$average_squared)
## [1] 10826563443551
# R-squared (test): 1 - RSS / total variation. The variable keeps its original
# name for compatibility, but it holds R-squared, not a sum of squares.
# Because the residuals come from a model fitted on other data, this value can
# be lower than the training R-squared and may even be negative.

total_sum_of_squares2 <- 1 -
  sum(residual_sum_of_squares2$residuals_squared) /
    sum(total_variation_in_model2$average_squared)

Visualize

# Actual vs. predicted sale price for the test rows, using the predictions of
# m2 stored in t_test_results. Explicit labels replace the default ones, which
# were the raw R expressions passed to plot().
plot(
  x = t_test_results$predicted,
  y = t_test_results$sale_price,
  xlab = "Predicted sale price",
  ylab = "Actual sale price",
  main = "Actual vs. predicted sale price (test set)"
)
# Dashed y = x reference line: points on it are predicted exactly; points
# above it are homes that sold for more than the model predicted.
abline(a = 0, b = 1, lty = 2)

Conclusion

I found that these four details are not actually that strongly correlated with the sale price: together they explain only about 59% of its variation (R-squared of roughly 0.59). They seem to predict the price well up to about the $200,000 range; above that, I assume that prices are higher for other reasons that I didn't include in the model.