I think these four things matter the most in determining the price of a home: the number of full bathrooms, first-floor square footage, total basement square footage, and the year the home was built. I will test how each of them correlates with the sale price.
# Distribution of each candidate predictor in the raw data, one histogram per
# column. hist() derives the default title and x-axis label from the argument
# expression, so each plot is labelled with its `t_raw$...` expression.
hist(t_raw$full_bath)
hist(t_raw$first_flr_sf)
hist(t_raw$total_bsmt_sf)
hist(t_raw$year_built)
# Keep only the four candidate predictors plus the response variable.
# `t` has to be assigned before anything is derived from it: in a fresh
# session `t` is base R's transpose function, so piping it into select()
# fails.
t <- t_raw %>%
  select(full_bath, first_flr_sf, total_bsmt_sf, year_built, sale_price)

# Numeric columns of `t`. The previous call, select() with no arguments,
# kept zero columns.
# NOTE(review): "numeric columns only" is inferred from the variable name --
# confirm this is the intended selection.
t_numeric <- t %>%
  select(where(is.numeric))
library(ggplot2)
# ggcorrplot() is exported by the ggcorrplot package, not by ggplot2, so the
# package has to be attached explicitly.
library(ggcorrplot)

# Heat map of the pairwise correlations between the selected columns of `t`,
# with each coefficient printed in its cell (lab = TRUE).
# `colors` is given in low / mid / high order.
ggcorrplot(
  cor(t),
  colors = c("green", "white", "yellow"),
  lab = TRUE
)
# Fit the same four-predictor linear model of sale_price on three data sets.
# The formula is written out in each call so that the `Call:` line printed by
# summary() shows the full formula.

# m: all rows of `t`.
m <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t
)

# m2: rows of `t_train` only.
m2 <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t_train
)

# m3: rows of `t_test` only.
m3 <- lm(
  sale_price ~ full_bath + first_flr_sf + total_bsmt_sf + year_built,
  data = t_test
)
# Coefficients and fit statistics for `m`, the model fitted on all of `t`.
summary(m)
##
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf +
## year_built, data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465121 -27371 -6054 19894 379648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1361301.079 71757.159 -18.97 <0.0000000000000002 ***
## full_bath 35885.572 2007.327 17.88 <0.0000000000000002 ***
## first_flr_sf 52.198 4.116 12.68 <0.0000000000000002 ***
## total_bsmt_sf 43.239 3.719 11.63 <0.0000000000000002 ***
## year_built 699.974 37.282 18.77 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50960 on 2925 degrees of freedom
## Multiple R-squared: 0.5937, Adjusted R-squared: 0.5931
## F-statistic: 1068 on 4 and 2925 DF, p-value: < 0.00000000000000022
# Coefficients and fit statistics for `m2`, the model fitted on `t_train`.
summary(m2)
##
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf +
## year_built, data = t_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -414966 -28846 -5892 18966 377104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1365990.570 113234.495 -12.063 < 0.0000000000000002 ***
## full_bath 38555.369 3157.516 12.211 < 0.0000000000000002 ***
## first_flr_sf 47.100 6.885 6.841 0.000000000012735 ***
## total_bsmt_sf 47.076 6.510 7.232 0.000000000000868 ***
## year_built 701.131 58.839 11.916 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53120 on 1150 degrees of freedom
## Multiple R-squared: 0.5874, Adjusted R-squared: 0.5859
## F-statistic: 409.3 on 4 and 1150 DF, p-value: < 0.00000000000000022
# Coefficients and fit statistics for `m3`, the model fitted on `t_test`.
summary(m3)
##
## Call:
## lm(formula = sale_price ~ full_bath + first_flr_sf + total_bsmt_sf +
## year_built, data = t_test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465602 -26610 -5873 20324 373525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1359725.117 93289.513 -14.575 <0.0000000000000002 ***
## full_bath 33916.621 2617.130 12.959 <0.0000000000000002 ***
## first_flr_sf 55.417 5.153 10.755 <0.0000000000000002 ***
## total_bsmt_sf 41.259 4.511 9.146 <0.0000000000000002 ***
## year_built 699.891 48.472 14.439 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49540 on 1770 degrees of freedom
## Multiple R-squared: 0.5987, Adjusted R-squared: 0.5978
## F-statistic: 660.3 on 4 and 1770 DF, p-value: < 0.00000000000000022
# Root-mean-squared error of m2 on the data it was fitted on (t_train).
rsme_train <- sqrt(mean(m2[["residuals"]]^2))

# Score the test rows with the model trained on t_train, keeping the
# prediction, its error, and the squared error alongside the original columns.
t_test_results <- t_test %>%
  mutate(predicted = predict(m2, newdata = t_test)) %>%
  mutate(
    residuals = sale_price - predicted,
    squared_residuals = residuals^2
  )

# Root-mean-squared error of m2's predictions on t_test.
rsme_test <- sqrt(mean(t_test_results$squared_residuals))
# Residual sum of squares (train): squared residuals of m2 on t_train.
residual_sum_of_squares <- t_train %>%
  mutate(
    residuals = m2$residuals,
    residuals_squared = residuals^2
  )
sum(residual_sum_of_squares$residuals_squared)
## [1] 3245112787317

# Total sum of squares (train): squared deviations of sale_price from its mean.
total_variation_in_model <- t_train %>%
  mutate(
    average = mean(sale_price) - sale_price,
    average_squared = average^2
  )
sum(total_variation_in_model$average_squared)
## [1] 7864565495090

# Despite its name, this value is 1 - RSS / TSS, i.e. the R-squared of m2 on
# the training data.
total_sum_of_squares <- 1 -
  sum(residual_sum_of_squares$residuals_squared) /
    sum(total_variation_in_model$average_squared)
# Residual sum of squares (test), measured out of sample.
# The residuals are the prediction errors on t_test of `m2`, the model fitted
# on t_train. The earlier version used m3$residuals -- the in-sample residuals
# of a model fitted on t_test itself -- which only reproduced the R-squared
# already reported by summary(m3) and said nothing about how the trained model
# performs on unseen data.
residual_sum_of_squares2 <- t_test %>%
  mutate(residuals = sale_price - predict(m2, newdata = t_test)) %>%
  mutate(residuals_squared = residuals^2)
sum(residual_sum_of_squares2$residuals_squared)
# NOTE(review): re-knit to refresh the printed value; the previously recorded
# 4344351278070 was computed from m3's in-sample residuals.

# Total sum of squares (test): squared deviations of sale_price from its mean.
total_variation_in_model2 <- t_test %>%
  mutate(average = mean(sale_price) - sale_price) %>%
  mutate(average_squared = average^2)
sum(total_variation_in_model2$average_squared)
## [1] 10826563443551

# Despite its name, this value is 1 - RSS / TSS, i.e. the R-squared of m2's
# predictions on the test data.
total_sum_of_squares2 <- 1 -
  sum(residual_sum_of_squares2$residuals_squared) /
    sum(total_variation_in_model2$average_squared)
# Actual sale price against the price predicted by m2 for every test row.
# Explicit axis labels replace the defaults, which were the raw argument
# expressions including their surrounding parentheses.
plot(
  x = t_test_results$predicted,
  y = t_test$sale_price,
  xlab = "Predicted sale price (model m2)",
  ylab = "Actual sale price"
)
I found that these four variables are not actually that strongly correlated with the sale price. They track it well up to about the $200,000 range; above that, I assume the prices are higher for other reasons that I didn't include in the model.