library( dplyr )
library( ggplot2 )
library( moderndive )
library( gapminder )
library( stringr )
library( gridExtra )
Background on Modeling for Exploration
General modeling framework formula: \[y = f( \vec{x}) + \epsilon\]
- \(y\): outcome variable of interest
- \(\vec{x}\): explanatory/predictor variable(s)
- \(f()\): function making the relationship between \(y\) and \(\vec{x}\) explicit. AKA: the signal
- \(\epsilon\): unsystematic error component. AKA the noise
Motives for modeling
- Explanation: the \(\vec{x}\) are treated as explanatory variables; the goal is to describe relationships within the range of the observed data
- Prediction: the \(\vec{x}\) are treated as predictor variables; the goal is to predict \(y\), potentially beyond the range of the observed data
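To make the signal-plus-noise framing concrete, here is a minimal simulation sketch (my own illustration, not course code): pick a known \(f()\), add random noise \(\epsilon\), and check that a fitted model roughly recovers the signal.
#simulate y = f(x) + epsilon with a known linear f(), then fit it back
set.seed( 1 )
x <- runif( 200, 0, 10 ) #explanatory variable
f_x <- 2 + 0.5 * x #the signal f(x)
epsilon <- rnorm( 200, sd = 1 ) #the noise
y <- f_x + epsilon #the observed outcome
coef( lm( y ~ x ) ) #estimates should be close to 2 and 0.5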
glimpse( evals )
## Rows: 463
## Columns: 14
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ prof_ID <int> 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,…
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4…
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, …
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, …
## $ gender <fct> female, female, female, female, male, male, male, male, …
## $ ethnicity <fct> minority, minority, minority, minority, not minority, no…
## $ language <fct> english, english, english, english, english, english, en…
## $ rank <fct> tenure track, tenure track, tenure track, tenure track, …
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, not form…
## $ pic_color <fct> color, color, color, color, color, color, color, color, …
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14,…
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20, …
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, upper, …
Exploratory Data Analysis
- Looking at the data (e.g. w/ str() or glimpse())
- Creating Visualizations
- Computing Summary Statistics
#visualize the numeric feature score
ggplot( evals, aes( x = score ) ) +
geom_histogram( binwidth = 0.25 ) +
labs( x = 'teaching score', y = 'count' )
evals %>%
summarise( mean_score = mean( score ),
median_score = median( score ),
sd_score = sd( score ) )
## # A tibble: 1 x 3
## mean_score median_score sd_score
## <dbl> <dbl> <dbl>
## 1 4.17 4.3 0.544
# Compute summary stats
evals %>%
summarise(mean_age = mean(age),
median_age = median(age),
sd_age = sd(age))
## # A tibble: 1 x 3
## mean_age median_age sd_age
## <dbl> <int> <dbl>
## 1 48.4 48 9.80
# Plot the histogram
ggplot(evals, aes(x = age)) +
geom_histogram(binwidth = 5) +
labs(x = "age", y= "count")
# Compute summary stats
evals %>%
summarise(mean_cls_students = mean(cls_students),
median_cls_students = median(cls_students),
sd_cls_students = sd(cls_students))
## # A tibble: 1 x 3
## mean_cls_students median_cls_students sd_cls_students
## <dbl> <int> <dbl>
## 1 55.2 29 75.1
# Plot the histogram
ggplot(evals, aes(x = cls_students)) +
geom_histogram(binwidth = 10) +
labs(x = "cls_students", y= "count")
evals %>%
summarise(mean_bty_avg = mean(bty_avg),
median_bty_avg = median(bty_avg),
sd_bty_avg = sd(bty_avg),
min_bty_avg = min( bty_avg),
max_bty_avg = max( bty_avg ))
## # A tibble: 1 x 5
## mean_bty_avg median_bty_avg sd_bty_avg min_bty_avg max_bty_avg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.42 4.33 1.53 1.67 8.17
# Plot the histogram
ggplot(evals, aes(x = bty_avg)) +
geom_histogram(binwidth = 1) +
labs(x = "bty_avg", y= "count")
Background on Modeling for Prediction
Question: Can we predict the sale price of houses based on their features?
Variables:
- \(y\): House sale price in USD
- \(\vec{x}\): Features like sqft_living, condition, bedrooms, yr_built, waterfront
kc_data_url <- 'https://raw.githubusercontent.com/SmilodonCub/ReadingLearningTinkering/master/DataCamp/Statistics_with_R/kc_house_data.csv'
house_price_kg <- read.csv( kc_data_url )
glimpse( house_price_kg )
## Rows: 21,613
## Columns: 21
## $ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 1954400…
## $ date <chr> "20141013T000000", "20141209T000000", "20150225T000000"…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 18…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, …
## $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, …
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 18…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0,…
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zipcode <int> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198,…
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 4…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113,…
glimpse( house_prices )
## Rows: 21,613
## Columns: 21
## $ id <chr> "7129300520", "6414100192", "5631500400", "2487200875",…
## $ date <date> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-0…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 18…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, …
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0…
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4…
## $ grade <fct> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, …
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 18…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0,…
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zipcode <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198,…
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 4…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113,…
all_equal( house_prices, house_price_kg )
## [1] "- Different types for column `id`: character vs double\n- Different types for column `date`: date vs character\n- Different types for column `waterfront`: logical vs integer\n- Different types for column `condition`: factor<69cd8> vs integer\n- Different types for column `grade`: factor<fdaa9> vs integer\n- Different types for column `zipcode`: factor<2571c> vs integer\n"
DataCamp made some small changes to the Kaggle dataset. Here I try to wrangle the Kaggle set into the same shape the course prepared it in, using dplyr methods.
house_price_kg2 <- house_price_kg %>%
mutate( id = as.character( id ),
date = as.Date( date, format = "%Y%m%d" ),
waterfront = as.logical( waterfront ),
condition = factor( condition ),
grade = factor( grade ),
zipcode = factor( zipcode )) %>%
mutate( id = if_else(nchar(id)<10, stringr::str_pad(id,width=10,side="left",pad="0"), id) )
rownames( house_price_kg2 ) <- c()
#all_equal( house_prices, house_price_kg2 )
all.equal( house_price_kg2, house_prices )
## [1] "Attributes: < Component \"class\": Lengths (1, 3) differ (string compare on first 1) >"
## [2] "Attributes: < Component \"class\": 1 string mismatch >"
Good enough for the government. Now back to following the demo…
Visualization:
ggplot( house_prices, aes( x = price ) ) +
geom_histogram() +
labs( x = 'house price', y = 'count' )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
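ggplot defaults to 30 bins here; an explicit binwidth silences the message. The $50,000 bin width below is an illustrative choice on my part, not a value from the course.
#same histogram with an explicit (arbitrary) binwidth
ggplot( house_prices, aes( x = price ) ) +
geom_histogram( binwidth = 50000 ) +
labs( x = 'house price', y = 'count' )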
The distribution is right-skewed (there is a long tail leading off to the right). This structure makes it difficult to compare lower-valued house prices. A similar issue appears in the gapminder dataset:
gapminder %>%
filter( year == 1952 ) %>%
ggplot( aes( x=pop, y=lifeExp, color=continent ) ) +
geom_point() +
ggtitle( '1952 country-level life expectancy vs population' )
Rescale the data on a log10 scale to better distinguish data points in the lower range
gapminder %>%
filter( year == 1952 ) %>%
ggplot( aes( x=log10( pop ), y=lifeExp, color=continent ) ) +
geom_point() +
ggtitle( 'Log10 rescaling: 1952 country-level life expectancy vs population' )
With the rescaling, horizontal distances on the x-axis now correspond to multiplicative differences instead of additive ones.
Now to try something similar with the house_prices dataset
house_price_scaled <- house_prices %>%
mutate( log10_price = log10( price ) ) %>%
select( price, log10_price )
A log10 transform is monotonic: the ordering of the values is preserved.
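A quick check of this claim (my own addition, not course code): ranking houses by price and by log10_price gives identical orderings.
#monotonic check: the log10 transform does not change the ranking of prices
house_price_scaled %>%
summarise( same_order = all( rank( price ) == rank( log10_price ) ) )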
native_scale <- ggplot( house_prices, aes( x = price ) ) +
geom_histogram() +
labs( x = 'house price', y = 'count' ) +
ggtitle( 'Price' )
log10_scale <- ggplot( house_price_scaled, aes( x = log10_price ) ) +
geom_histogram() +
labs( x = 'log10 house price', y = 'count' ) +
ggtitle( 'log10( Price )' )
grid.arrange( native_scale, log10_scale, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
After transformation, the price distribution is much less skewed; it looks roughly normal.
Now to see whether the predictor feature, sqft_living, warrants a similar transformation
# Plot the histogram
ggplot(house_prices, aes(x = sqft_living)) +
geom_histogram() +
labs(x = 'Size (sq.feet)', y='count')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution is right skewed.
# Add log10_size
house_prices_2 <- house_prices %>%
mutate(log10_size = log10(sqft_living))
native_scale <- ggplot( house_prices, aes( x = sqft_living ) ) +
geom_histogram() +
labs( x = 'size', y = 'count' ) +
ggtitle( 'Size' )
log10_scale <- ggplot( house_prices_2, aes( x = log10_size ) ) +
geom_histogram() +
labs( x = 'log10 size', y = 'count' ) +
ggtitle( 'log10( Size )' )
grid.arrange( native_scale, log10_scale, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Modeling the Problem for Explanation
- Generally, \(f()\) and \(\epsilon\) are unknown
- Known: the dataset provides \(n\) observations of \(y\) and \(\vec{x}\)
- Goal: fit a model, \(\hat{f}()\) that approximates \(f()\) while ignoring \(\epsilon\)
- Goal Restated: separate the signal from the noise
- with the model, can then generate fitted/predicted values \(\hat{y}=\hat{f}(\vec{x})\)
Modeling means exploring relationships between variables. EDA of the relationship between two variables: age and score
pnt_plot <- ggplot( evals, aes( x = age, y = score ) ) +
geom_point() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' )
jitter_plot <- ggplot( evals, aes( x = age, y = score ) ) +
geom_jitter() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' )
grid.arrange( pnt_plot, jitter_plot, ncol = 2 )
Is the relationship positive? investigate with the correlation coefficient
evals %>%
summarize( correlation = cor( score, age ) )
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 -0.107
The negative correlation suggests that as professors age, they tend to receive lower scores.
EDA of the relationship between teaching and ‘beauty’ scores
# Plot the histogram
score_plot <- ggplot(evals, aes(x=score)) +
geom_histogram( ) +
labs(x = "Score", y = "count",
title = 'Score')
beauty_plot <- ggplot(evals, aes(x=bty_avg)) +
geom_histogram( binwidth = 0.5 ) +
labs(x = "Beauty score", y = "count",
title = 'Beauty')
SMB <- ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "teaching score",
title = 'Score ~ Beauty')
SMB_jitter <- ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "teaching score",
title = 'Score ~ Beauty w/ jitter')
grid.arrange( score_plot, beauty_plot, SMB, SMB_jitter, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
evals %>%
summarize(correlation = cor(score, bty_avg))
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 0.187
The Modeling Problem for Prediction
The mechanics are similar, but there are subtle differences in the goals:
- Explanation: We care about the form of \(\hat{f}()\), in particular any values quantifying relationships between \(y\) and \(\vec{x}\) (e.g. model coefficients)
- Prediction: We don’t care so much about the form of \(\hat{f}()\), only that it yields ‘good’ predictions \(\hat{y}\) of \(y\) based on \(\vec{x}\)
EDA using the categorical feature, condition:
house_prices <- house_prices %>%
mutate( log10_price = log10( price ) )
house_prices %>%
select( log10_price, condition ) %>%
glimpse() %>%
ggplot( aes( x = condition, y = log10_price ) ) +
geom_boxplot() +
labs( x = 'house condition', y = 'log10 price',
title = 'log10 house price over condition')
## Rows: 21,613
## Columns: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, 6.08813…
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, …
As the house condition improves, there is an increase in median house price.
sumvals <- house_prices %>%
group_by( condition ) %>%
summarize( mean = mean( log10_price ),
sd = sd( log10_price ),
n = n() )
## `summarise()` ungrouping output (override with `.groups` argument)
sumvals
## # A tibble: 5 x 4
## condition mean sd n
## <fct> <dbl> <dbl> <int>
## 1 1 5.42 0.293 30
## 2 2 5.45 0.233 172
## 3 3 5.67 0.224 14031
## 4 4 5.65 0.228 5679
## 5 5 5.71 0.244 1701
From the summary stats we see that the mean generally increases with condition, the standard deviation varies between condition levels, and there are uneven numbers of houses in each group (most are condition 3 >> 4 > 5).
sumvals %>%
mutate( mean = 10^( as.numeric( mean ) ) ) %>%
select( condition, mean )
## # A tibble: 5 x 2
## condition mean
## <fct> <dbl>
## 1 1 265979.
## 2 2 278758.
## 3 3 468075.
## 4 4 447737.
## 5 5 518250.
EDA of the relationship between house price and waterfront
# View the structure of log10_price and waterfront
house_prices %>%
select(log10_price, waterfront) %>%
glimpse()
## Rows: 21,613
## Columns: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, 6.08813…
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
# Plot
ggplot(house_prices, aes(x = waterfront, y = log10_price)) +
geom_boxplot() +
labs(x = "waterfront", y = "log10 price")
# Calculate stats
house_prices %>%
group_by(waterfront) %>%
summarize(mean_log10_price = mean( log10_price), n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## waterfront mean_log10_price n
## <lgl> <dbl> <int>
## 1 FALSE 5.66 21450
## 2 TRUE 6.12 163
# Predicted price for houses with a waterfront view
10^(6.12)
## [1] 1318257
# Predicted price for houses without a waterfront view
10^(5.66)
## [1] 457088.2
Modeling with Basic Regression
Explaining Teaching Score with Age
Basic linear regression: adding a linear best-fit line to describe the relationship between two features
ggplot( evals, aes( x = age, y = score ) ) +
geom_point() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' ) +
geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
The overall relationship between the variables score and age is negative; scores have a tendency to decrease with increasing age. This result is consistent with the correlation between the features (correlation = -0.107).
NOTE: correlation \(\neq\) causation!
Modeling with basic linear regression
- Truth:
- Assume the relationship can be described with a line
- \(f(x) = \beta_0 + \beta_1 \cdot x\)
- the Observed value \(y = f(x) + \epsilon = \beta_0 + \beta_1 \cdot x + \epsilon\)
- Fitted:
- Assume \(\hat{f}(x) = \hat{\beta}_0 + \hat{\beta}_1 \cdot x\)
- the Fitted/Predicted values \(\hat{y} = \hat{f}(x)= \hat{\beta}_0 + \hat{\beta}_1 \cdot x\)
Let R compute the coefficients \(\hat{\beta}_0\) & \(\hat{\beta}_1\)
#Fit regression model using formula of form: y ~ x
model_score_1 <- lm( score ~ age, data = evals )
model_score_1
##
## Call:
## lm(formula = score ~ age, data = evals)
##
## Coefficients:
## (Intercept) age
## 4.461932 -0.005938
How to interpret?
- Intercept: 4.46193 has no practical meaning; it represents the predicted score at age = 0.
- Slope: -0.00594; for every one-year increase in age, there is an associated decrease of 0.00594 in teaching score, on average.
#from the moderndive library:
( get_regression_table( model_score_1 ) )
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.46 0.127 35.2 0 4.21 4.71
## 2 age -0.006 0.003 -2.31 0.021 -0.011 -0.001
#using summary
#almost what moderndive is doing. add confidence intervals and reformat as table tho
summod <- summary( model_score_1 )
regtab <- data.frame( round( summod$coefficients, 4 ) )
regtab
## Estimate Std..Error t.value Pr...t..
## (Intercept) 4.4619 0.1268 35.1947 0.0000
## age -0.0059 0.0026 -2.3114 0.0213
Linear model attempt of score ~ bty_avg
# Plot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "score") +
geom_smooth(method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
# Fit model
model_score_2 <- lm(score ~ bty_avg, data = evals)
# Output content
model_score_2
##
## Call:
## lm(formula = score ~ bty_avg, data = evals)
##
## Coefficients:
## (Intercept) bty_avg
## 3.88034 0.06664
# Output regression table
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
According to our linear model, for every one-point increase in beauty score, there is an associated increase of 0.067 points in teaching score, on average.
Predicting Teaching Score using Age
Making predictions about a target variable.
The regression line can be used both for explanation and for prediction of the outcome: \[\hat{y} = \hat{f}(x) = \hat{\beta}_0 + \hat{\beta}_1 \cdot x \] Ex: a professor’s age is 40. What is the predicted score? \[\hat{y} = 4.46 - 0.006 \cdot 40 = 4.22\]
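The same prediction can be made from the fitted model object; a minimal sketch using model_score_1 from above (small differences from 4.22 are just rounding in the hand calculation):
#predicted teaching score for a 40-year-old professor
new_prof <- data.frame( age = 40 )
predict( model_score_1, new_prof )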
Prediction Error
Prediction error = the difference between observed and predicted values
- Residual = \(y - \hat{y}\)
- Residuals correspond to the \(\epsilon\) term in \(y = f(\vec{x}) + \epsilon\)
What is meant by a ‘best-fit’ line: it is the line that minimizes the sum of the squared residuals
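To see the minimization in action, here is a minimal check of my own using model_score_1 from above: perturbing the fitted coefficients can only increase the sum of squared residuals.
#sum of squared residuals for an arbitrary line score = intercept + slope * age
ssr <- function( intercept, slope, data = evals ) {
sum( ( data$score - ( intercept + slope * data$age ) )^2 )
}
b <- coef( model_score_1 )
ssr( b[1], b[2] ) #SSR of the least-squares fit
ssr( b[1], b[2] + 0.001 ) #nudged slope gives a larger SSR
ssr( b[1] + 0.05, b[2] ) #nudged intercept gives a larger SSR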
Making predictions of score based on beauty
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
test_beautyscore <- data.frame( bty_avg = 5 )
y_hat <- model_score_2 %>% predict( test_beautyscore )
y_hat
## 1
## 4.213523
A score of 4.7 is actually observed for this professor (the first row of evals, with bty_avg = 5).
# Compute residual y - y_hat
residual <- 4.7 - y_hat
residual
## 1
## 0.4864769
The moderndive library has some nifty wrapper functions to help find the residuals…
# Get regression table
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
# Get all fitted/predicted values and residuals
get_regression_points(model_score_2)
## # A tibble: 463 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.52
## 6 6 4.3 3 4.08 0.22
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.002
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
## # … with 453 more rows
…but let’s do the work and use dplyr methods to find them:
evals_residuals <- evals %>%
select( ID, score, bty_avg ) %>%
mutate( score_hat = predict( model_score_2, . ),
residual = score - score_hat )
head( evals_residuals, 10 )
## # A tibble: 10 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.520
## 6 6 4.3 3 4.08 0.220
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.00244
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
Bingo! That wasn’t so hard and didn’t take too much code. On the other hand, it sure is nice to have convenient wrapper functions.
Explaining Teaching Score with Gender
Linear regression with a binary categorical explanatory variable (still fit with lm(); this is not logistic regression)
sg_box <- ggplot( evals, aes( x = gender, y = score ) ) +
geom_boxplot() +
labs( x = 'gender', y = 'score' )
sg_hist <- ggplot( evals, aes( x = score, fill = gender ) ) +
geom_histogram( binwidth = 0.25 ) +
labs( x = 'score', y = 'count' )
grid.arrange( sg_box, sg_hist, ncol = 2 )
#fit regression model
model_score_3 <- lm( score ~ gender, data = evals )
#get the regression table
get_regression_table( model_score_3 )
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.09 0.039 106. 0 4.02 4.17
## 2 gendermale 0.142 0.051 2.78 0.006 0.042 0.241
#plot summary too just cause I'm used to seeing this
summary( model_score_3 )
##
## Call:
## lm(formula = score ~ gender, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83433 -0.36357 0.06567 0.40718 0.90718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.09282 0.03867 105.852 < 2e-16 ***
## gendermale 0.14151 0.05082 2.784 0.00558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5399 on 461 degrees of freedom
## Multiple R-squared: 0.01654, Adjusted R-squared: 0.01441
## F-statistic: 7.753 on 1 and 461 DF, p-value: 0.005583
sanity check by computing the means of each group
evals %>%
group_by( gender ) %>%
summarise( avg_score = mean( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## gender avg_score
## <fct> <dbl>
## 1 female 4.09
## 2 male 4.23
Use another categorical variable: rank
evals %>%
group_by( rank ) %>%
summarize( n = n() )
## # A tibble: 3 x 2
## rank n
## <fct> <int>
## 1 teaching 102
## 2 tenure track 108
## 3 tenured 253
EDA of the relationship between score and rank
ggplot(evals, aes( x = rank, y = score)) +
geom_boxplot() +
labs(x = "rank", y = "score")
Look at some summary stats
evals %>%
group_by(rank) %>%
summarise(n = n(), mean_score = mean(score), sd_score = sd(score))
## # A tibble: 3 x 4
## rank n mean_score sd_score
## <fct> <int> <dbl> <dbl>
## 1 teaching 102 4.28 0.498
## 2 tenure track 108 4.15 0.561
## 3 tenured 253 4.14 0.550
Fit a linear regression model to the data
# Fit regression model
model_score_4 <- lm(score ~ rank, data = evals)
# Get regression table
get_regression_table(model_score_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.28 0.054 79.9 0 4.18 4.39
## 2 ranktenure track -0.13 0.075 -1.73 0.084 -0.277 0.017
## 3 ranktenured -0.145 0.064 -2.28 0.023 -0.27 -0.02
Calculate some values from the regression table outcome
# teaching mean
teaching_mean <- 4.28
# tenure track mean
tenure_track_mean <- teaching_mean - 0.13
# tenured mean
tenured_mean <- teaching_mean - 0.145
mes <- paste( 'Teaching Mean:', round( teaching_mean, 3),
'\nTenure Track Mean:', round( tenure_track_mean, 3 ),
'\nTenured:', round( tenured_mean, 3 ) )
cat( mes, sep = '\n')
## Teaching Mean: 4.28
## Tenure Track Mean: 4.15
## Tenured: 4.135
Do we get the same result using dplyr methods?
evals %>%
group_by( rank ) %>%
summarise( mean = mean( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## rank mean
## <fct> <dbl>
## 1 teaching 4.28
## 2 tenure track 4.15
## 3 tenured 4.14
Yes, the values check out.
Predicting Teaching Score using Gender
Predicting with a categorical variable
evals %>%
group_by( gender ) %>%
summarize( mean_score =mean( score ), sd_score = sd( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## gender mean_score sd_score
## <fct> <dbl> <dbl>
## 1 female 4.09 0.564
## 2 male 4.23 0.522
Now to find the residuals for all records
get_regression_points( model_score_3 )
## # A tibble: 463 x 5
## ID score gender score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 female 4.09 0.607
## 2 2 4.1 female 4.09 0.007
## 3 3 3.9 female 4.09 -0.193
## 4 4 4.8 female 4.09 0.707
## 5 5 4.6 male 4.23 0.366
## 6 6 4.3 male 4.23 0.066
## 7 7 2.8 male 4.23 -1.43
## 8 8 4.1 male 4.23 -0.134
## 9 9 3.4 male 4.23 -0.834
## 10 10 4.5 female 4.09 0.407
## # … with 453 more rows
#OR
evals_residuals <- evals %>%
select( ID, score, gender ) %>%
mutate( score_hat = predict( model_score_3, . ),
residual = score - score_hat )
head( evals_residuals, 10 )
## # A tibble: 10 x 5
## ID score gender score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 female 4.09 0.607
## 2 2 4.1 female 4.09 0.00718
## 3 3 3.9 female 4.09 -0.193
## 4 4 4.8 female 4.09 0.707
## 5 5 4.6 male 4.23 0.366
## 6 6 4.3 male 4.23 0.0657
## 7 7 2.8 male 4.23 -1.43
## 8 8 4.1 male 4.23 -0.134
## 9 9 3.4 male 4.23 -0.834
## 10 10 4.5 female 4.09 0.407
Plot a histogram of the residuals
ggplot( evals_residuals, aes( x = residual ) ) +
geom_histogram() +
labs( x = 'residuals',
title = 'Residuals from score ~ gender' )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The residuals are roughly centered on 0
A quick look at predicting score ~ rank
# Calculate predictions and residuals
model_score_4_points <- get_regression_points(model_score_4)
model_score_4_points
## # A tibble: 463 x 5
## ID score rank score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 tenure track 4.16 0.545
## 2 2 4.1 tenure track 4.16 -0.055
## 3 3 3.9 tenure track 4.16 -0.255
## 4 4 4.8 tenure track 4.16 0.645
## 5 5 4.6 tenured 4.14 0.461
## 6 6 4.3 tenured 4.14 0.161
## 7 7 2.8 tenured 4.14 -1.34
## 8 8 4.1 tenured 4.14 -0.039
## 9 9 3.4 tenured 4.14 -0.739
## 10 10 4.5 tenured 4.14 0.361
## # … with 453 more rows
# Plot residuals
ggplot(model_score_4_points, aes( x = residual )) +
geom_histogram() +
labs(x = "residuals", title = "Residuals from score ~ rank model")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Modeling with Multiple Regression
Explaining house price with year and size
Multiple Linear Regression with numeric features
house_prices <- house_prices %>%
mutate( log10_price = log10( price ),
log10_size = log10( sqft_living ) )
3D Scatter Plot: visualizing the joint relationship between 3 variables.
The generalization of a regression line in a 3D plot is a regression plane. Of all possible planes, the regression plane minimizes the sum of squared residuals (the discrepancies between observed and predicted values).
#use plotly
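The course shows an interactive 3D figure at this point. A minimal sketch with the plotly package (assuming it is installed; the columns come from the mutate() above) could be:
library( plotly )
#interactive 3D scatter of log10_price against log10_size and yr_built
plot_ly( house_prices,
x = ~log10_size, y = ~yr_built, z = ~log10_price,
type = 'scatter3d', mode = 'markers',
marker = list( size = 2 ) )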
Quantifying the relationship:
model_price_1 <- lm( log10_price ~ log10_size + yr_built,
data = house_prices )
get_regression_table( model_price_1)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.38 0.075 71.4 0 5.24 5.53
## 2 log10_size 0.913 0.006 141. 0 0.901 0.926
## 3 yr_built -0.001 0 -33.8 0 -0.001 -0.001
Interpreting: holding the other variables in the model fixed, a one-unit increase in a given feature is associated with a mean change in the outcome equal to that feature’s estimate. Ex: accounting for yr_built, a one-unit increase in log10_size is associated with an increase of 0.913 in log10_price, on average.
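A quick numerical check of this interpretation (my own sketch, using model_price_1): two hypothetical houses that differ by 1 in log10_size but share the same yr_built should have predicted log10_price values that differ by exactly the log10_size coefficient.
#hold yr_built fixed and change log10_size by one unit
two_houses <- data.frame( log10_size = c( 3.0, 4.0 ), yr_built = c( 1980, 1980 ) )
preds <- predict( model_price_1, two_houses )
diff( preds ) #difference in predicted log10_price
coef( model_price_1 )[ 'log10_size' ] #matches the coefficient (~0.913)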
EDA of relationships
# Create scatterplot with regression line
nativedat <- ggplot(house_prices, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Remove outlier
house_prices_transform <- house_prices %>%
filter( bedrooms < 15 )
# Create scatterplot with regression line
dat_outlier_rm <- ggplot(house_prices_transform, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
grid.arrange( nativedat, dat_outlier_rm, ncol = 2 )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
fit a multiple linear regression model
# Fit model
model_price_2 <- lm(log10_price ~ log10_size + bedrooms,
data = house_prices)
# Get regression table
get_regression_table( model_price_2 )
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.72 0.023 118. 0 2.67 2.76
## 2 log10_size 0.931 0.008 118. 0 0.916 0.947
## 3 bedrooms -0.03 0.002 -19.3 0 -0.033 -0.027
Accounting for log10_size, every extra bedroom is associated with an average decrease of 0.03 in log10_price.
Predicting House Price using Year & Size
get_regression_table( model_price_1)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.38 0.075 71.4 0 5.24 5.53
## 2 log10_size 0.913 0.006 141. 0 0.901 0.926
## 3 yr_built -0.001 0 -33.8 0 -0.001 -0.001
#make a prediction for log10_size = 3.06 and yr_built = 1980
newval <- data.frame( 'log10_size' = 3.06, 'yr_built' = 1980 )
prediction <- newval %>%
mutate( log10_price = predict( model_price_1, . ),
price = 10^( log10_price ) )
prediction
## log10_size yr_built log10_price price
## 1 3.06 1980 5.454154 284546.8
Compute all Predicted values and Residuals
#using moderndive methods
get_regression_points( model_price_1 )
## # A tibble: 21,613 x 6
## ID log10_price log10_size yr_built log10_price_hat residual
## <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 5.35 3.07 1955 5.50 -0.153
## 2 2 5.73 3.41 1951 5.81 -0.083
## 3 3 5.26 2.89 1933 5.36 -0.105
## 4 4 5.78 3.29 1965 5.69 0.094
## 5 5 5.71 3.22 1987 5.60 0.112
## 6 6 6.09 3.73 2001 6.04 0.047
## 7 7 5.41 3.23 1995 5.59 -0.182
## 8 8 5.46 3.02 1963 5.45 0.019
## 9 9 5.36 3.25 1960 5.66 -0.295
## 10 10 5.51 3.28 2003 5.62 -0.111
## # … with 21,603 more rows
#using dplyr methods
price_residuals <- house_prices %>%
select( id , log10_price, log10_size, yr_built ) %>%
mutate( log10_price_hat = predict( model_price_1, . ),
residual = log10_price - log10_price_hat )
head( price_residuals, 10 )
## # A tibble: 10 x 6
## id log10_price log10_size yr_built log10_price_hat residual
## <chr> <dbl> <dbl> <int> <dbl> <dbl>
## 1 7129300520 5.35 3.07 1955 5.50 -0.153
## 2 6414100192 5.73 3.41 1951 5.81 -0.0828
## 3 5631500400 5.26 2.89 1933 5.36 -0.105
## 4 2487200875 5.78 3.29 1965 5.69 0.0941
## 5 1954400510 5.71 3.23 1987 5.60 0.112
## 6 7237550310 6.09 3.73 2001 6.04 0.0473
## 7 1321400060 5.41 3.23 1995 5.59 -0.182
## 8 2008000270 5.47 3.03 1963 5.45 0.0193
## 9 2414600126 5.36 3.25 1960 5.66 -0.295
## 10 3793500160 5.51 3.28 2003 5.62 -0.111
Sum of Squared Residuals: the sum of squared residuals is difficult to interpret in absolute terms, but it is handy for comparing the performance of different modeling approaches.
sumsqrres <- price_residuals %>% summarise( sum( residual^2 ) )
sumsqrres
## # A tibble: 1 x 1
## `sum(residual^2)`
## <dbl>
## 1 585.
Modeling Price by Size and Bedrooms
# Automate prediction and residual computation
get_regression_points(model_price_2) %>%
mutate(sq_residuals = residual^2) %>%
summarize(sum_sq_residuals = sum(sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 605.
Residuals: the observed outcome minus the predicted value. They can be thought of as prediction errors, or as the lack of fit of the predictions to the truth.
Explaining House Price with Size & Condition
Using both numeric and categorical variables
model_price_3 <- lm( log10_price ~ log10_size,
data = house_prices )
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition, fill = FALSE ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_line( aes( x = log10_size, y = prediction ), size = 2, color = 'black' )
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition, fill = FALSE ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_smooth(method = "lm", se = FALSE, size = 2)
## `geom_smooth()` using formula 'y ~ x'
Parallel Slopes Model: all 5 condition levels are described by the same slope but with different intercepts.
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_smooth(method = "lm", se = FALSE, size = 2, color = 'red') +
geom_line( aes( x = log10_size, y = prediction ), size = 2, color = 'black' ) +
facet_wrap( ~condition )
## `geom_smooth()` using formula 'y ~ x'
Quantify the relationship
model_price_3 <- lm( log10_price ~ log10_size + condition, data = house_prices )
get_regression_table( model_price_3 )
## # A tibble: 6 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.88 0.036 80.0 0 2.81 2.95
## 2 log10_size 0.837 0.006 134. 0 0.825 0.85
## 3 condition2 -0.039 0.033 -1.16 0.246 -0.104 0.027
## 4 condition3 0.032 0.031 1.04 0.3 -0.028 0.092
## 5 condition4 0.044 0.031 1.42 0.155 -0.017 0.104
## 6 condition5 0.096 0.031 3.09 0.002 0.035 0.156
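To draw the parallel slopes fit itself (rather than the separate per-condition fits from geom_smooth() above), here is a minimal sketch of my own using predictions from the refit model_price_3: one line per condition level, all sharing the same slope.
house_prices %>%
mutate( log10_price_hat = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, color = condition ) ) +
geom_point( aes( y = log10_price ), alpha = 0.1 ) +
geom_line( aes( y = log10_price_hat ), size = 1 ) +
labs( x = 'log 10 square footage', y = 'log 10 price',
title = 'Parallel slopes: log10_price ~ log10_size + condition' )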
Parallel slopes model but with waterfront
# Fit model
model_price_4 <- lm(log10_price ~ log10_size + waterfront,
data = house_prices)
# Get regression table
get_regression_table(model_price_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.96 0.02 146. 0 2.92 3.00
## 2 log10_size 0.825 0.006 134. 0 0.813 0.837
## 3 waterfrontTRUE 0.322 0.013 24.5 0 0.296 0.348
Predicting House Price using Size and Condition
Making predictions on new data, using the estimate values from the regression table.
get_regression_table( model_price_3 )
## # A tibble: 6 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.88 0.036 80.0 0 2.81 2.95
## 2 log10_size 0.837 0.006 134. 0 0.825 0.85
## 3 condition2 -0.039 0.033 -1.16 0.246 -0.104 0.027
## 4 condition3 0.032 0.031 1.04 0.3 -0.028 0.092
## 5 condition4 0.044 0.031 1.42 0.155 -0.017 0.104
## 6 condition5 0.096 0.031 3.09 0.002 0.035 0.156
#create a new data frame for the new values
new_houses <- data.frame( log10_size = c( 2.9, 3.6 ),
condition = factor( c( 3, 4 ) ) )
( new_houses )
## log10_size condition
## 1 2.9 3
## 2 3.6 4
#make predictions on new data
get_regression_points( model_price_3, newdata = new_houses ) %>%
mutate( price_hat = 10^log10_price_hat )
## # A tibble: 2 x 5
## ID log10_size condition log10_price_hat price_hat
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 2.9 3 5.34 219786.
## 2 2 3.6 4 5.94 870964.
new_houses <- data.frame( log10_size = c( 2.9, 3.1 ),
waterfront = c( TRUE, FALSE ) )
get_regression_points( model_price_4, newdata = new_houses ) %>%
mutate( price_hat = 10^log10_price_hat )
## # A tibble: 2 x 5
## ID log10_size waterfront log10_price_hat price_hat
## <int> <dbl> <lgl> <dbl> <dbl>
## 1 1 2.9 TRUE 5.67 472063.
## 2 2 3.1 FALSE 5.52 328095.
Model Assessment and Selection
How do you know which model to choose? Which model is best?
Assessing quality of multiple regression models
Model 1: two numerical features, log10_size and yr_built -> SSR = 585
Model 3: a numerical feature, log10_size, and a categorical feature, condition -> SSR = 608
Model 3 has a larger sum of squared residuals and is therefore not as good a fit as Model 1.
# now to find the sum of squared residuals for Model 2 & Model 4
# Calculate squared residuals Model 2
get_regression_points( model_price_2) %>%
mutate( sq_residuals = residual^2) %>%
summarise( sum_sq_residuals = sum( sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 605.
# Calculate squared residuals Model 4
get_regression_points(model_price_4) %>%
mutate(sq_residuals = residual^2) %>%
summarize(sum_sq_residuals = sum( sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 599.
Assessing Model Fit with R-squared
R-Squared \[R^2 = 1 - \frac{Var(\mbox{residuals})}{Var(\mbox{y})} = 1 - \frac{\mbox{Variance of the Residuals}}{\mbox{Variance of the Outcome Variable}}\] While the sum of squared residuals is unbounded, \(R^2\) is standardized to lie between 0 and 1, where smaller values of \(R^2\) indicate poor model fit and values approaching 1 indicate a good fit to the data.
- \(R^2 = 1 \longrightarrow\) perfect fit
- \(R^2 = 0 \longrightarrow\) no relationship between target variable and modelled features
\(R^2\)’s Interpretation: the proportion of the total variation in the outcome variable \(y\) that the model explains
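As a cross-check (a minimal sketch, not from the course): because least-squares residuals from a model with an intercept have mean zero, 1 - var(residual)/var(y) equals the multiple R-squared that base R reports, so summary() should agree with the manual computations below.
summary( model_price_1 )$r.squared #should match the Model 1 value computed below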
#Compute the R^2 val of Model 1
get_regression_points( model_price_1 ) %>%
summarise( r_squared = 1 - var( residual )/var( log10_price ) )
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.483
#Compute the R^2 val of Model 3
get_regression_points( model_price_3 ) %>%
summarise( r_squared = 1 - var( residual )/var( log10_price ) )
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.462
#Compute the R^2 val of Model 2
get_regression_points(model_price_2) %>%
summarise(r_squared = 1 - var( residual )/var( log10_price ))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.465
#Compute the R^2 val of Model 4
get_regression_points(model_price_4) %>%
summarise(r_squared = 1 - var( residual )/var( log10_price ))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.470
Assessing Predictions with RMSE
Root Mean Squared Error: the square root of the Mean Squared Error (which gets back to the same units as the target variable). RMSE can be thought of as the typical error the model makes.
# Root Mean Squared Error
get_regression_points( model_price_1 ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarize( mse = mean( sq_residuals ) ) %>%
mutate( rmse = sqrt( mse ) )
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0271 0.164
Get the predicted values and rmse of new values
#Get predictions
new_houses2 <- data_frame( log10_size = c( 2.9, 3.6 ), condition = factor( c( 3, 4 ) ) )
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
get_regression_points( model_price_3, newdata = new_houses2 ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarize( mse = mean( sq_residuals ) ) %>%
mutate( rmse = sqrt( mse ) )
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.237 0.486
Validation Set Prediction framework
Use two independent datasets to:
- Train/fit your model
- Evaluate your model’s predictive power i.e. validate your model
Train/test set split
Randomly split all \(n\) observations into
- a training set: to fit models
- a test set: to make predictions on
By using independent samples from the dataset to test/train a model, you can get a sense of how the model will perform on new data.
#Randomly shuffle order of rows:
house_prices_shuffled <- house_prices %>%
sample_frac( size = 1, replace = FALSE) #sample everything without replacement
#split into train and test
train <- house_prices_shuffled %>%
slice( 1:10000 )
test <- house_prices_shuffled %>%
slice( 10001:dim( house_prices)[1] )
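One caveat (my own note, not from the course): sample_frac() above is not seeded, so the split, and therefore the RMSE values that follow, will change from run to run. A reproducible, proportion-based alternative (kept in separate objects so the course's train/test split above is untouched) might look like:
#seeded 70/30 split into alternative train/test objects
set.seed( 76 )
n_obs <- nrow( house_prices )
train_idx <- sample( seq_len( n_obs ), size = floor( 0.7 * n_obs ) )
train_alt <- house_prices[ train_idx, ]
test_alt <- house_prices[ -train_idx, ]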
Train the Model on Training Data
train_model_price_1 <- lm( log10_price ~ log10_size + yr_built,
data = train )
get_regression_table( train_model_price_1 )
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.31 0.111 47.7 0 5.09 5.53
## 2 log10_size 0.919 0.01 96.6 0 0.9 0.938
## 3 yr_built -0.001 0 -22.4 0 -0.001 -0.001
Make Predictions on test data
get_regression_points( train_model_price_1, newdata = test )
## # A tibble: 11,613 x 6
## ID log10_price log10_size yr_built log10_price_hat residual
## <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 5.6 2.77 1983 5.18 0.419
## 2 2 5.74 3.26 1928 5.71 0.033
## 3 3 5.58 3.02 1967 5.43 0.154
## 4 4 5.40 3.13 1953 5.56 -0.151
## 5 5 5.91 3.54 1998 5.86 0.047
## 6 6 5.91 3.03 1919 5.51 0.404
## 7 7 5.65 3.45 1972 5.82 -0.167
## 8 8 5.39 3.31 1977 5.69 -0.297
## 9 9 5.59 3.32 1981 5.69 -0.098
## 10 10 5.63 3.03 1950 5.46 0.165
## # … with 11,603 more rows
Assess Predictions with RMSE
get_regression_points( train_model_price_1, newdata = test ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarise( rmse = sqrt( mean( sq_residuals ) ) )
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.164
Repeat with a different model so that RMSE values can be compared
train_model_price_3 <- lm( log10_price ~ log10_size + condition,
data = train )
get_regression_points( train_model_price_3, newdata = test ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarise( rmse = sqrt( mean( sq_residuals ) ) )
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
# Fit model to training set
train_model_2 <- lm( log10_price ~ log10_size + bedrooms, data = train )
# Compute RMSE
get_regression_points(train_model_2, newdata = test) %>%
mutate(sq_residuals = residual^2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167