library(AmesHousing)
library(rsample)
library(dslabs)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the Ames data set via the AmesHousing package.
ames <- AmesHousing::make_ames()

# Numeric summary of the response variable used throughout the script.
summary(ames[["Sale_Price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12789  129500  160000  180796  213500  755000

# Histogram of sale prices (30 bins). The mean sits above the median in the
# summary above, which points to a right-skewed distribution.
ggplot(data = ames, mapping = aes(x = Sale_Price)) +
  geom_histogram(bins = 30)

# Fix the RNG state so the random partition below is reproducible.
set.seed(123)

# Randomly assign 70% of the rows to the training portion; the rest is held
# out for testing.
split <- initial_split(data = ames, prop = 0.7)

# Materialise both partitions from the split object.
test_data  <- testing(split)
train_data <- training(split)

# Rows and columns of each partition.
dim(train_data)
## [1] 2051   81
dim(test_data)
## [1] 879  81
# Fit an ordinary least-squares regression of Sale_Price on two predictors,
# Gr_Liv_Area and Year_Built, using only the training partition.
model <- lm(Sale_Price ~ Gr_Liv_Area + Year_Built, data = train_data)

# Print the coefficient table and goodness-of-fit statistics.
# The lines starting with `##` below are the recorded console output.
summary(model)
## 
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Year_Built, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -451739  -26970   -2970   18328  308123 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.157e+06  6.923e+04  -31.16   <2e-16 ***
## Gr_Liv_Area  9.436e+01  2.124e+00   44.41   <2e-16 ***
## Year_Built   1.114e+03  3.548e+01   31.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47200 on 2048 degrees of freedom
## Multiple R-squared:  0.6543, Adjusted R-squared:  0.654 
## F-statistic:  1938 on 2 and 2048 DF,  p-value: < 2.2e-16
# Reading the recorded output: holding the other predictor fixed, a one-unit
# increase in Gr_Liv_Area is associated with about 94.36 higher Sale_Price,
# and a one-unit increase in Year_Built with about 1114 higher Sale_Price.
# Both slopes have p-values below 2e-16, and the model accounts for roughly
# 65% of the variance in Sale_Price on the training data (R-squared 0.6543).
# Predict sale prices for the held-out rows with the fitted linear model.
pred <- predict(object = model, newdata = test_data)

# Root-mean-squared error between predicted and observed test-set prices.
rmse <- sqrt(mean((pred - test_data[["Sale_Price"]])^2))

# Auto-print the test RMSE; it is on the same scale as Sale_Price.
rmse
## [1] 45444.77