library(AmesHousing)
library(rsample)
library(dslabs)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the Ames housing data set and take a first look at the response.
ames <- AmesHousing::make_ames()

# Numeric summary of Sale_Price (recorded output follows).
summary(ames[["Sale_Price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   12789  129500  160000  180796  213500  755000

# Histogram of Sale_Price drawn with 30 bins.
ggplot(data = ames, mapping = aes(x = Sale_Price)) +
  geom_histogram(bins = 30)

# Seed the RNG immediately before splitting so the partition is reproducible.
set.seed(123)

# Hold-out split: prop = 0.7 of the rows are assigned to training, the
# remainder to testing.
split <- rsample::initial_split(data = ames, prop = 0.7)
train_data <- rsample::training(split)
test_data <- rsample::testing(split)

# Dimensions (rows, columns) of each partition; recorded output follows
# each call.
dim(train_data)
## [1] 2051   81
dim(test_data)
## [1] 879  81
# Linear model of Sale_Price on Gr_Liv_Area and Year_Built, fitted on the
# training partition only.
model <- lm(
  formula = Sale_Price ~ Gr_Liv_Area + Year_Built,
  data = train_data
)

# Coefficient table and fit statistics (recorded output follows).
summary(model)
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = train_data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -451739  -26970   -2970   18328  308123
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.157e+06  6.923e+04  -31.16   <2e-16 ***
## Gr_Liv_Area  9.436e+01  2.124e+00   44.41   <2e-16 ***
## Year_Built   1.114e+03  3.548e+01   31.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47200 on 2048 degrees of freedom
## Multiple R-squared:  0.6543, Adjusted R-squared:  0.654
## F-statistic:  1938 on 2 and 2048 DF,  p-value: < 2.2e-16
# Predict Sale_Price for the held-out rows, then compute the root mean
# squared error between the observed and predicted values.
pred <- predict(model, newdata = test_data)
rmse <- sqrt(mean((test_data[["Sale_Price"]] - pred)^2))

# Test-set RMSE (recorded output follows).
rmse
## [1] 45444.77