## Warning: package 'dplyr' was built under R version 3.4.4

## Warning: package 'ggplot2' was built under R version 3.4.4

## Warning: package 'fivethirtyeight' was built under R version 3.4.4

## Warning: package 'moderndive' was built under R version 3.4.4

## Warning: package 'skimr' was built under R version 3.4.4

Consider the data file “house_prices” that is in the fivethirtyeight R package.

Consider the variables bathrooms, which gives the number of bathrooms of a house, and price, which is the asking price of the house. Which variable would you consider the response variable and which variable would you consider the explanatory variable?

The response variable would be price and the explanatory variable would be bathrooms.

Using the skim() function in R, look at summary statistics of both variables.

# Summary statistics for the explanatory (bathrooms) and response (price)
# variables only; every other column of house_prices is dropped first.
skim(select(house_prices, bathrooms, price))

## Skim summary statistics
##  n obs: 21613 
##  n variables: 2

## Warning: package 'bindrcpp' was built under R version 3.4.4

## 
## -- Variable type:numeric -------------------------------------------------------------------
##   variable missing complete     n      mean        sd    p0       p25
##  bathrooms       0    21613 21613      2.11      0.77     0      1.75
##      price       0    21613 21613 540088.14 367127.2  75000 321950   
##        p50      p75    p100     hist
##       2.25      2.5       8 <U+2583><U+2586><U+2587><U+2582><U+2581><U+2581><U+2581><U+2581>
##  450000    645000   7700000 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>

Create a scatterplot of the two variables and comment on the association between bathrooms and price. Is the relationship linear? Comment on the strength and direction of the association.

As the number of bathrooms increases, price tends to increase slowly as well, so the direction of the association is positive.

# Scatterplot of price against number of bathrooms, with the least-squares
# line (no confidence band) drawn on top of the points.
house_prices %>%
  ggplot(mapping = aes(x = bathrooms, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Number of Bathrooms",
    y = "Price",
    title = "Relationship of Number of bathrooms versus Price"
  )

Using the lm function in R, compute the regression equation. Report your regression equation below.

predicted price = 10708 + 250327 * bathrooms

# Simple linear regression of price on bathrooms; printing the fitted model
# shows the call and the two estimated coefficients.
Score <- lm(formula = price ~ bathrooms, data = house_prices)
print(Score)

## 
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
## 
## Coefficients:
## (Intercept)    bathrooms  
##       10708       250327

# Same fit as the stored model above, printed directly without assigning it.
print(lm(price ~ bathrooms, data = house_prices))

## 
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
## 
## Coefficients:
## (Intercept)    bathrooms  
##       10708       250327

Suppose that a 3 bathroom house has a price of $662500. Compute the residual.

For x = 3 bathrooms: yhat = 10708 + 250327 * 3 = 761689

resid = 662500 - yhat = 662500 - 761689 = -99189

# Refit price on bathrooms and show the regression table (estimate, standard
# error, test statistic, p-value and confidence bounds for each term).
Score <- lm(formula = price ~ bathrooms, data = house_prices)
get_regression_table(model = Score)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   10708.     6211.      1.72   0.085   -1465.   22882.
## 2 bathrooms  250327.     2760.     90.7    0      244918.  255735.

# One row per house: observed price, bathrooms, fitted price and residual.
regression_points <- get_regression_points(model = Score)
print(regression_points)

## # A tibble: 21,613 x 5
##       ID   price bathrooms price_hat residual
##    <int>   <dbl>     <dbl>     <dbl>    <dbl>
##  1     1  221900      1      261035.  -39135.
##  2     2  538000      2.25   573943.  -35943.
##  3     3  180000      1      261035.  -81035.
##  4     4  604000      3      761688. -157688.
##  5     5  510000      2      511361.   -1361.
##  6     6 1225000      4.5   1137178.   87822.
##  7     7  257500      2.25   573943. -316443.
##  8     8  291850      1.5    386198.  -94348.
##  9     9  229500      1      261035.  -31535.
## 10    10  323000      2.5    636525. -313525.
## # ... with 21,603 more rows

Create a residual plot and comment on the plot. You should look for a random scattering of points on the residual plot. Comment on whether there is a random scattering of points or some sort of pattern. See page 124-125 of your course packet.

# Residual plot: residuals against the explanatory variable, with a blue
# horizontal reference line at zero.
regression_points <- get_regression_points(model = Score)
regression_points %>%
  ggplot(mapping = aes(x = bathrooms, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, colour = "blue", size = 1) +
  labs(x = "Bathroom", y = "Residual")

g) Now let’s focus on the same response variable, price but focus our attention on the variable waterfront which is a binary variable where TRUE means the property is waterfront and FALSE means it is not waterfront. Create a boxplot that compares the price of a house for waterfront and non-waterfront houses. Comment on the boxplot.

# Side-by-side boxplots of price for waterfront (TRUE) and non-waterfront
# (FALSE) houses.
# The colour is set as a fixed value on the geom rather than inside aes():
# inside aes() the string "pink" is treated as data, which adds a meaningless
# legend and does not actually draw in pink.
# No geom_smooth() layer: a least-squares line is not meaningful on a
# two-category x axis.
ggplot(data = house_prices, mapping = aes(x = waterfront, y = price)) +
  geom_boxplot(colour = "pink") +
  labs(
    x = "Waterfront",
    y = "Price",
    title = "Price of Waterfront versus Non-Waterfront Houses"
  )

Calculate the difference between the mean price of waterfront housing and the mean price of non-waterfront housing.

# Mean price within each waterfront group (FALSE = not waterfront,
# TRUE = waterfront). Grouping is required: an ungrouped summarize() returns a
# single overall mean, and subtracting the logical waterfront column from
# price row by row does not give a difference in group means.
mean_price_by_waterfront <- house_prices %>%
  group_by(waterfront) %>%
  summarize(mean_price = mean(price, na.rm = TRUE))
mean_price_by_waterfront

# Difference in means: waterfront minus non-waterfront. The logical
# waterfront column is used to pick out each group's row.
difference <- mean_price_by_waterfront %>%
  summarize(difference = mean_price[waterfront] - mean_price[!waterfront])
difference

## # A tibble: 6 x 22
##   id    date                 price bedrooms bathrooms sqft_living sqft_lot
##   <chr> <dttm>               <dbl>    <int>     <dbl>       <int>    <int>
## 1 7129~ 2014-10-13 00:00:00 2.22e5        3      1           1180     5650
## 2 6414~ 2014-12-09 00:00:00 5.38e5        3      2.25        2570     7242
## 3 5631~ 2015-02-25 00:00:00 1.80e5        2      1            770    10000
## 4 2487~ 2014-12-09 00:00:00 6.04e5        4      3           1960     5000
## 5 1954~ 2015-02-18 00:00:00 5.10e5        3      2           1680     8080
## 6 7237~ 2014-05-12 00:00:00 1.23e6        4      4.5         5420   101930
## # ... with 15 more variables: floors <dbl>, waterfront <lgl>, view <int>,
## #   condition <fct>, grade <fct>, sqft_above <int>, sqft_basement <int>,
## #   yr_built <int>, yr_renovated <int>, zipcode <fct>, lat <dbl>,
## #   long <dbl>, sqft_living15 <int>, sqft_lot15 <int>, difference <dbl>

Fit a linear regression model using the lm function. Report the regression table and the actual regression model below.

# Fit and print the regression of price on bathrooms.
# NOTE(review): this part of the assignment asks about waterfront, but the
# model uses bathrooms as the predictor -- confirm the intended variable.
print(lm(price ~ bathrooms, data = house_prices))

## 
## Call:
## lm(formula = price ~ bathrooms, data = house_prices)
## 
## Coefficients:
## (Intercept)    bathrooms  
##       10708       250327

# Regression table for the price ~ bathrooms model.
# NOTE(review): the surrounding question concerns waterfront -- confirm that
# bathrooms is the intended predictor here.
Score <- lm(formula = price ~ bathrooms, data = house_prices)
get_regression_table(model = Score)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   10708.     6211.      1.72   0.085   -1465.   22882.
## 2 bathrooms  250327.     2760.     90.7    0      244918.  255735.

# Fitted values and residuals for every house under the current model.
regression_points <- get_regression_points(model = Score)
print(regression_points)

## # A tibble: 21,613 x 5
##       ID   price bathrooms price_hat residual
##    <int>   <dbl>     <dbl>     <dbl>    <dbl>
##  1     1  221900      1      261035.  -39135.
##  2     2  538000      2.25   573943.  -35943.
##  3     3  180000      1      261035.  -81035.
##  4     4  604000      3      761688. -157688.
##  5     5  510000      2      511361.   -1361.
##  6     6 1225000      4.5   1137178.   87822.
##  7     7  257500      2.25   573943. -316443.
##  8     8  291850      1.5    386198.  -94348.
##  9     9  229500      1      261035.  -31535.
## 10    10  323000      2.5    636525. -313525.
## # ... with 21,603 more rows

Interpret both regression coefficients in context of the problem.

The intercept coefficient 10708 means that for houses with zero bathrooms we would expect a price of 10708. For every increase of 1 bathroom, there is an associated increase of, on average, 250327 units of price.

The regression table tells us the statistical and practical significance of our model results.

Knit this file and turn in a hard copy of your final document. Put all names at the top of this document.

Problem Set 6

Your name(s)

October 8, 2018

3

10708 + 250327*x (x=1.00)