## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'fivethirtyeight' was built under R version 3.4.4
## Warning: package 'moderndive' was built under R version 3.4.4
## Warning: package 'skimr' was built under R version 3.4.4
The response variable would be price and the explanatory variable would be bathrooms.
# Summary statistics for the explanatory variable (bathrooms) and the
# response variable (price).
skim(select(house_prices, bathrooms, price))
## Skim summary statistics
## n obs: 21613
## n variables: 2
## Warning: package 'bindrcpp' was built under R version 3.4.4
##
## -- Variable type:numeric -------------------------------------------------------------------
## variable missing complete n mean sd p0 p25
## bathrooms 0 21613 21613 2.11 0.77 0 1.75
## price 0 21613 21613 540088.14 367127.2 75000 321950
## p50 p75 p100 hist
## 2.25 2.5 8 <U+2583><U+2586><U+2587><U+2582><U+2581><U+2581><U+2581><U+2581>
## 450000 645000 7700000 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
As the number of bathrooms increases, price tends to increase slowly as well.
# Scatterplot of price against number of bathrooms, overlaid with the
# least-squares regression line (no confidence band).
ggplot(house_prices, mapping = aes(x = bathrooms, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship of Number of bathrooms versus Price",
    x = "Number of Bathrooms",
    y = "Price"
  )
$\widehat{\text{price}} = 10708 + 250327 \times \text{bathrooms}$
# Fit the simple linear regression of price on bathrooms, then print the
# fitted coefficients.
Score <- lm(formula = price ~ bathrooms, data = house_prices)
print(Score)
##
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
##
## Coefficients:
## (Intercept) bathrooms
## 10708 250327
# Fit the price ~ bathrooms model and print it without storing the result.
print(lm(price ~ bathrooms, data = house_prices))
##
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
##
## Coefficients:
## (Intercept) bathrooms
## 10708 250327
Residual = observed price − fitted price: $\text{resid} = y - \hat{y} = 662500 - \hat{y}$
# Fit the model and display its regression table (estimates, standard
# errors, test statistics, p-values and confidence intervals).
Score <- lm(formula = price ~ bathrooms, data = house_prices)
Score %>%
  get_regression_table()
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 10708. 6211. 1.72 0.085 -1465. 22882.
## 2 bathrooms 250327. 2760. 90.7 0 244918. 255735.
# Observed price, fitted price (price_hat) and residual for every house.
regression_points <- get_regression_points(model = Score)
print(regression_points)
## # A tibble: 21,613 x 5
## ID price bathrooms price_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 221900 1 261035. -39135.
## 2 2 538000 2.25 573943. -35943.
## 3 3 180000 1 261035. -81035.
## 4 4 604000 3 761688. -157688.
## 5 5 510000 2 511361. -1361.
## 6 6 1225000 4.5 1137178. 87822.
## 7 7 257500 2.25 573943. -316443.
## 8 8 291850 1.5 386198. -94348.
## 9 9 229500 1 261035. -31535.
## 10 10 323000 2.5 636525. -313525.
## # ... with 21,603 more rows
# Residuals plotted against bathrooms, with a horizontal reference line at
# zero to make systematic over- or under-prediction easier to see.
regression_points <- get_regression_points(model = Score)
regression_points %>%
  ggplot(mapping = aes(x = bathrooms, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, colour = "blue", size = 1) +
  labs(x = "Bathroom", y = "Residual")
g) Now let’s focus on the same response variable, price but focus our attention on the variable waterfront which is a binary variable where TRUE means the property is waterfront and FALSE means it is not waterfront. Create a boxplot that compares the price of a house for waterfront and non-waterfront houses. Comment on the boxplot.
# Side-by-side boxplots of price for waterfront (TRUE) and non-waterfront
# (FALSE) houses.
#
# The outline colour is a fixed appearance setting, so it is passed to
# geom_boxplot() directly. Placing col = "pink" inside aes() maps the constant
# string as if it were data, which adds a meaningless legend and does not
# actually draw in pink.
#
# No geom_smooth(method = "lm") layer: waterfront is a discrete (logical) x
# variable, so each group has a single x value and a fitted line adds nothing
# to a boxplot comparison.
ggplot(data = house_prices, aes(x = waterfront, y = price)) +
  geom_boxplot(colour = "pink") +
  labs(x = "Waterfront", y = "Price",
       title = "House Price by Waterfront Status")
# Mean price for waterfront and non-waterfront houses (one row per group).
#
# Previously both summaries were named `mean`, so the first one was silently
# overwritten and only the overall mean price survived, with no comparison
# between the two groups. Grouping by waterfront and using a distinct column
# name gives the per-group means.
waterfront <- house_prices %>%
  group_by(waterfront) %>%
  summarize(mean_price = mean(price, na.rm = TRUE))
# Append a `difference` column (price minus waterfront) and show the first
# rows of the result.
# NOTE(review): waterfront is a logical column, so it is coerced to 0/1 in
# this subtraction and `difference` differs from price by at most 1. This is
# probably not the intended waterfront vs non-waterfront comparison; confirm.
difference <- mutate(house_prices, difference = price - waterfront)
head(difference)
## # A tibble: 6 x 22
## id date price bedrooms bathrooms sqft_living sqft_lot
## <chr> <dttm> <dbl> <int> <dbl> <int> <int>
## 1 7129~ 2014-10-13 00:00:00 2.22e5 3 1 1180 5650
## 2 6414~ 2014-12-09 00:00:00 5.38e5 3 2.25 2570 7242
## 3 5631~ 2015-02-25 00:00:00 1.80e5 2 1 770 10000
## 4 2487~ 2014-12-09 00:00:00 6.04e5 4 3 1960 5000
## 5 1954~ 2015-02-18 00:00:00 5.10e5 3 2 1680 8080
## 6 7237~ 2014-05-12 00:00:00 1.23e6 4 4.5 5420 101930
## # ... with 15 more variables: floors <dbl>, waterfront <lgl>, view <int>,
## # condition <fct>, grade <fct>, sqft_above <int>, sqft_basement <int>,
## # yr_built <int>, yr_renovated <int>, zipcode <fct>, lat <dbl>,
## # long <dbl>, sqft_living15 <int>, sqft_lot15 <int>, difference <dbl>
# Fit the price ~ bathrooms model and print it without storing the result.
print(lm(price ~ bathrooms, data = house_prices))
##
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
##
## Coefficients:
## (Intercept) bathrooms
## 10708 250327
# Fit the model and display its regression table (estimates, standard
# errors, test statistics, p-values and confidence intervals).
Score <- lm(formula = price ~ bathrooms, data = house_prices)
Score %>%
  get_regression_table()
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 10708. 6211. 1.72 0.085 -1465. 22882.
## 2 bathrooms 250327. 2760. 90.7 0 244918. 255735.
# Observed price, fitted price (price_hat) and residual for every house.
regression_points <- get_regression_points(model = Score)
print(regression_points)
## # A tibble: 21,613 x 5
## ID price bathrooms price_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 221900 1 261035. -39135.
## 2 2 538000 2.25 573943. -35943.
## 3 3 180000 1 261035. -81035.
## 4 4 604000 3 761688. -157688.
## 5 5 510000 2 511361. -1361.
## 6 6 1225000 4.5 1137178. 87822.
## 7 7 257500 2.25 573943. -316443.
## 8 8 291850 1.5 386198. -94348.
## 9 9 229500 1 261035. -31535.
## 10 10 323000 2.5 636525. -313525.
## # ... with 21,603 more rows
The intercept coefficient 10708 means that for houses with zero bathrooms we would expect a price of 10708. For every increase of 1 bathroom, there is an associated increase of, on average, 250327 units of price.
The regression table tells us the statistical and practical significance of our model results.
Knit this file and turn in a hard copy of your final document. Put all names at the top of this document.