library( dplyr )
library( ggplot2 )
library( moderndive )
library( gapminder )
library( stringr )
library( gridExtra )
Background on Modeling for Exploration
General modeling framework formula: \[y = f( \vec{x}) + \epsilon\]
- \(y\): outcome variable of interest
- \(\vec{x}\): explanatory/predictor variable(s)
- \(f()\): function making the relationship between \(y\) and \(\vec{x}\) explicit. AKA: the signal
- \(\epsilon\): unsystematic error component. AKA the noise
Motives for modeling
- Explanation: the \(\vec{x}\) are treated as explanatory variables; the goal is to describe relationships within the range of the observed data
- Prediction: the \(\vec{x}\) are treated as predictor variables; the goal is to predict \(y\), potentially beyond the range of the observed data
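To make the signal-plus-noise framing concrete, here is a minimal simulation sketch (my own illustration, not course code): pick a known \(f()\), add random noise \(\epsilon\), and check that a fitted model roughly recovers the signal.
#simulate y = f(x) + epsilon with a known linear f(), then fit it back
set.seed( 1 )
x <- runif( 200, 0, 10 ) #explanatory variable
f_x <- 2 + 0.5 * x #the signal f(x)
epsilon <- rnorm( 200, sd = 1 ) #the noise
y <- f_x + epsilon #the observed outcome
coef( lm( y ~ x ) ) #estimates should be close to 2 and 0.5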
glimpse( evals )
## Rows: 463
## Columns: 14
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ prof_ID <int> 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,…
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4…
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, …
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.333, …
## $ gender <fct> female, female, female, female, male, male, male, male, …
## $ ethnicity <fct> minority, minority, minority, minority, not minority, no…
## $ language <fct> english, english, english, english, english, english, en…
## $ rank <fct> tenure track, tenure track, tenure track, tenure track, …
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, not form…
## $ pic_color <fct> color, color, color, color, color, color, color, color, …
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, 14,…
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 20, …
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, upper, …
Exploratory Data Analysis
- Looking at the data (e.g. w/ str() or glimpse())
- Creating Visualizations
- Computing Summary Statistics
#visualize the numeric feature score
ggplot( evals, aes( x = score ) ) +
geom_histogram( binwidth = 0.25 ) +
labs( x = 'teaching score', y = 'count' )
evals %>%
summarise( mean_score = mean( score ),
median_score = median( score ),
sd_score = sd( score ) )
## # A tibble: 1 x 3
## mean_score median_score sd_score
## <dbl> <dbl> <dbl>
## 1 4.17 4.3 0.544
# Compute summary stats
evals %>%
summarise(mean_age = mean(age),
median_age = median(age),
sd_age = sd(age))
## # A tibble: 1 x 3
## mean_age median_age sd_age
## <dbl> <int> <dbl>
## 1 48.4 48 9.80
# Plot the histogram
ggplot(evals, aes(x = age)) +
geom_histogram(binwidth = 5) +
labs(x = "age", y= "count")
# Compute summary stats
evals %>%
summarise(mean_cls_students = mean(cls_students),
median_cls_students = median(cls_students),
sd_cls_students = sd(cls_students))
## # A tibble: 1 x 3
## mean_cls_students median_cls_students sd_cls_students
## <dbl> <int> <dbl>
## 1 55.2 29 75.1
# Plot the histogram
ggplot(evals, aes(x = cls_students)) +
geom_histogram(binwidth = 10) +
labs(x = "cls_students", y= "count")
evals %>%
summarise(mean_bty_avg = mean(bty_avg),
median_bty_avg = median(bty_avg),
sd_bty_avg = sd(bty_avg),
min_bty_avg = min( bty_avg),
max_bty_avg = max( bty_avg ))
## # A tibble: 1 x 5
## mean_bty_avg median_bty_avg sd_bty_avg min_bty_avg max_bty_avg
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.42 4.33 1.53 1.67 8.17
# Plot the histogram
ggplot(evals, aes(x = bty_avg)) +
geom_histogram(binwidth = 1) +
labs(x = "bty_avg", y= "count")
Background on Modeling for Prediction
Question: Can we predict the sale price of houses based on their features?
Variables:
- \(y\): House sale price in USD
- \(\vec{x}\): Features like sqft_living, condition, bedrooms, yr_built, waterfront
kc_data_url <- 'https://raw.githubusercontent.com/SmilodonCub/ReadingLearningTinkering/master/DataCamp/Statistics_with_R/kc_house_data.csv'
house_price_kg <- read.csv( kc_data_url )
glimpse( house_price_kg )
## Rows: 21,613
## Columns: 21
## $ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 1954400…
## $ date <chr> "20141013T000000", "20141209T000000", "20150225T000000"…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 18…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, …
## $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, …
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 18…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0,…
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zipcode <int> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198,…
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 4…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113,…
glimpse( house_prices )
## Rows: 21,613
## Columns: 21
## $ id <chr> "7129300520", "6414100192", "5631500400", "2487200875",…
## $ date <date> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-0…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 18…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, …
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0…
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4…
## $ grade <fct> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, …
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 18…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0,…
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ zipcode <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198,…
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 4…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113,…
all_equal( house_prices, house_price_kg )
## [1] "- Different types for column `id`: character vs double\n- Different types for column `date`: date vs character\n- Different types for column `waterfront`: logical vs integer\n- Different types for column `condition`: factor<69cd8> vs integer\n- Different types for column `grade`: factor<fdaa9> vs integer\n- Different types for column `zipcode`: factor<2571c> vs integer\n"
DataCamp made some small changes to the Kaggle dataset. Here I try to wrangle the Kaggle set into the same shape the course prepared it in, using dplyr methods.
house_price_kg2 <- house_price_kg %>%
mutate( id = as.character( id ),
date = as.Date( date, format = "%Y%m%d" ),
waterfront = as.logical( waterfront ),
condition = factor( condition ),
grade = factor( grade ),
zipcode = factor( zipcode )) %>%
mutate( id = if_else(nchar(id)<10, stringr::str_pad(id,width=10,side="left",pad="0"), id) )
rownames( house_price_kg2 ) <- c()
#all_equal( house_prices, house_price_kg2 )
all.equal( house_price_kg2, house_prices )
## [1] "Attributes: < Component \"class\": Lengths (1, 3) differ (string compare on first 1) >"
## [2] "Attributes: < Component \"class\": 1 string mismatch >"
Good enough for the government. Now back to following the demo…
Visualization:
ggplot( house_prices, aes( x = price ) ) +
geom_histogram() +
labs( x = 'house price', y = 'count' )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
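ggplot defaults to 30 bins here; an explicit binwidth silences the message. The $50,000 bin width below is an illustrative choice on my part, not a value from the course.
#same histogram with an explicit (arbitrary) binwidth
ggplot( house_prices, aes( x = price ) ) +
geom_histogram( binwidth = 50000 ) +
labs( x = 'house price', y = 'count' )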
The distribution is right-skewed (there is a long tail leading off to the right). This structure makes it difficult to compare lower-valued house prices. A similar issue appears in the gapminder dataset:
gapminder %>%
filter( year == 1952 ) %>%
ggplot( aes( x=pop, y=lifeExp, color=continent ) ) +
geom_point() +
ggtitle( '1952 country-level life expectancy vs population' )
Rescale the data on a log10 scale to better distinguish data points in the lower range
gapminder %>%
filter( year == 1952 ) %>%
ggplot( aes( x=log10( pop ), y=lifeExp, color=continent ) ) +
geom_point() +
ggtitle( 'Log10 rescaling: 1952 country-level life expectancy vs population' )
With the rescaling, horizontal distances on the x-axis now correspond to multiplicative differences instead of additive ones.
Now to try something similar with the house_prices dataset
house_price_scaled <- house_prices %>%
mutate( log10_price = log10( price ) ) %>%
select( price, log10_price )
A log10 transform is monotonic: the ordering of the values is preserved.
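A quick check of this claim (my own addition, not course code): ranking houses by price and by log10_price gives identical orderings.
#monotonic check: the log10 transform does not change the ranking of prices
house_price_scaled %>%
summarise( same_order = all( rank( price ) == rank( log10_price ) ) )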
native_scale <- ggplot( house_prices, aes( x = price ) ) +
geom_histogram() +
labs( x = 'house price', y = 'count' ) +
ggtitle( 'Price' )
log10_scale <- ggplot( house_price_scaled, aes( x = log10_price ) ) +
geom_histogram() +
labs( x = 'log10 house price', y = 'count' ) +
ggtitle( 'log10( Price )' )
grid.arrange( native_scale, log10_scale, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
After transformation, the price distribution is much less skewed; it looks roughly normal.
Now to see whether the predictor feature, sqft_living, warrants a similar transformation
# Plot the histogram
ggplot(house_prices, aes(x = sqft_living)) +
geom_histogram() +
labs(x = 'Size (sq.feet)', y='count')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution is right skewed.
# Add log10_size
house_prices_2 <- house_prices %>%
mutate(log10_size = log10(sqft_living))
native_scale <- ggplot( house_prices, aes( x = sqft_living ) ) +
geom_histogram() +
labs( x = 'size', y = 'count' ) +
ggtitle( 'Size' )
log10_scale <- ggplot( house_prices_2, aes( x = log10_size ) ) +
geom_histogram() +
labs( x = 'log10 size', y = 'count' ) +
ggtitle( 'log10( Size )' )
grid.arrange( native_scale, log10_scale, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Modeling the Problem for Explanation
- Generally, \(f()\) and \(\epsilon\) are unknown
- Known: the dataset provides \(n\) observations of \(y\) and \(\vec{x}\)
- Goal: fit a model, \(\hat{f}()\) that approximates \(f()\) while ignoring \(\epsilon\)
- Goal Restated: separate the signal from the noise
- with the model, can then generate fitted/predicted values \(\hat{y}=\hat{f}(\vec{x})\)
Modeling means exploring relationships between variables. EDA of the relationship between two variables: age and score
pnt_plot <- ggplot( evals, aes( x = age, y = score ) ) +
geom_point() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' )
jitter_plot <- ggplot( evals, aes( x = age, y = score ) ) +
geom_jitter() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' )
grid.arrange( pnt_plot, jitter_plot, ncol = 2 )
Is the relationship positive? investigate with the correlation coefficient
evals %>%
summarize( correlation = cor( score, age ) )
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 -0.107
The negative correlation suggests that as professors age, they tend to receive lower scores.
EDA of the relationship between teaching and ‘beauty’ scores
# Plot the histogram
score_plot <- ggplot(evals, aes(x=score)) +
geom_histogram( ) +
labs(x = "Score", y = "count",
title = 'Score')
beauty_plot <- ggplot(evals, aes(x=bty_avg)) +
geom_histogram( binwidth = 0.5 ) +
labs(x = "Beauty score", y = "count",
title = 'Beauty')
SMB <- ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "teaching score",
title = 'Score ~ Beauty')
SMB_jitter <- ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "teaching score",
title = 'Score ~ Beauty w/ jitter')
grid.arrange( score_plot, beauty_plot, SMB, SMB_jitter, ncol = 2 )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
evals %>%
summarize(correlation = cor(score, bty_avg))
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 0.187
The Modeling Problem for Prediction
The mechanics are similar, but there are subtle differences in the goals:
- Explanation: We care about the form of \(\hat{f}()\), in particular any values quantifying relationships between \(y\) and \(\vec{x}\) (e.g. model coefficients)
- Prediction: We don’t care so much about the form of \(\hat{f}()\), only that it yields ‘good’ predictions \(\hat{y}\) of \(y\) based on \(\vec{x}\)
EDA using the categorical feature, condition:
house_prices <- house_prices %>%
mutate( log10_price = log10( price ) )
house_prices %>%
select( log10_price, condition ) %>%
glimpse() %>%
ggplot( aes( x = condition, y = log10_price ) ) +
geom_boxplot() +
labs( x = 'house condition', y = 'log10 price',
title = 'log10 house price over condition')
## Rows: 21,613
## Columns: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, 6.08813…
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, …
As the house condition improves, there is an increase in median house price.
sumvals <- house_prices %>%
group_by( condition ) %>%
summarize( mean = mean( log10_price ),
sd = sd( log10_price ),
n = n() )
## `summarise()` ungrouping output (override with `.groups` argument)
sumvals
## # A tibble: 5 x 4
## condition mean sd n
## <fct> <dbl> <dbl> <int>
## 1 1 5.42 0.293 30
## 2 2 5.45 0.233 172
## 3 3 5.67 0.224 14031
## 4 4 5.65 0.228 5679
## 5 5 5.71 0.244 1701
From the summary stats we see that the mean generally increases with condition, the standard deviation varies between condition levels, and there are uneven numbers of houses in each group (most are condition 3 >> 4 > 5).
sumvals %>%
mutate( mean = 10^( as.numeric( mean ) ) ) %>%
select( condition, mean )
## # A tibble: 5 x 2
## condition mean
## <fct> <dbl>
## 1 1 265979.
## 2 2 278758.
## 3 3 468075.
## 4 4 447737.
## 5 5 518250.
EDA of the relationship between house price and waterfront
# View the structure of log10_price and waterfront
house_prices %>%
select(log10_price, waterfront) %>%
glimpse()
## Rows: 21,613
## Columns: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, 6.08813…
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
# Plot
ggplot(house_prices, aes(x = waterfront, y = log10_price)) +
geom_boxplot() +
labs(x = "waterfront", y = "log10 price")
# Calculate stats
house_prices %>%
group_by(waterfront) %>%
summarize(mean_log10_price = mean( log10_price), n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## waterfront mean_log10_price n
## <lgl> <dbl> <int>
## 1 FALSE 5.66 21450
## 2 TRUE 6.12 163
# Predicted price for houses with a waterfront view
10^(6.12)
## [1] 1318257
# Predicted price for houses without a waterfront view
10^(5.66)
## [1] 457088.2
Modeling with Basic Regression
Explaining Teaching Score with Age
Basic linear regression: adding a linear best-fit line to describe the relationship between two features
ggplot( evals, aes( x = age, y = score ) ) +
geom_point() +
labs( x = 'age', y = 'score', title = 'Teaching score over age' ) +
geom_smooth( method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
The overall relationship between the variables score and age is negative; scores have a tendency to decrease with increasing age. This result is consistent with the correlation between the features (correlation = -0.107).
NOTE: correlation \(\neq\) causation!
Modeling with basic linear regression
- Truth:
- Assume the relationship can be described with a line
- \(f(x) = \beta_0 + \beta_1 \cdot x\)
- the Observed value \(y = f(x) + \epsilon = \beta_0 + \beta_1 \cdot x + \epsilon\)
- Fitted:
- Assume \(\hat{f}(x) = \hat{\beta}_0 + \hat{\beta}_1 \cdot x\)
- the Fitted/Predicted values \(\hat{y} = \hat{f}(x)= \hat{\beta}_0 + \hat{\beta}_1 \cdot x\)
Let R compute the coefficients \(\hat{\beta}_0\) & \(\hat{\beta}_1\)
#Fit regression model using formula of form: y ~ x
model_score_1 <- lm( score ~ age, data = evals )
model_score_1
##
## Call:
## lm(formula = score ~ age, data = evals)
##
## Coefficients:
## (Intercept) age
## 4.461932 -0.005938
How to interpret?
- Intercept: 4.46193 has no practical meaning; it represents the predicted score at age = 0.
- Slope: -0.00594; for every one-year increase in age, there is an associated decrease of 0.00594 in teaching score, on average.
#from the moderndive library:
( get_regression_table( model_score_1 ) )
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.46 0.127 35.2 0 4.21 4.71
## 2 age -0.006 0.003 -2.31 0.021 -0.011 -0.001
#using summary
#almost what moderndive is doing. add confidence intervals and reformat as table tho
summod <- summary( model_score_1 )
regtab <- data.frame( round( summod$coefficients, 4 ) )
regtab
## Estimate Std..Error t.value Pr...t..
## (Intercept) 4.4619 0.1268 35.1947 0.0000
## age -0.0059 0.0026 -2.3114 0.0213
Linear model attempt of score ~ bty_avg
# Plot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "score") +
geom_smooth(method = 'lm', se = FALSE )
## `geom_smooth()` using formula 'y ~ x'
# Fit model
model_score_2 <- lm(score ~ bty_avg, data = evals)
# Output content
model_score_2
##
## Call:
## lm(formula = score ~ bty_avg, data = evals)
##
## Coefficients:
## (Intercept) bty_avg
## 3.88034 0.06664
# Output regression table
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
According to our linear model, for every one-point increase in beauty score, there is an associated increase of 0.067 points in teaching score, on average.
Predicting Teaching Score using Age
Making predictions about a target variable.
The regression line can be used both for explanation and for prediction of the outcome: \[\hat{y} = \hat{f}(x) = \hat{\beta}_0 + \hat{\beta}_1 \cdot x \] Ex: a professor’s age is 40. What is the predicted score? \[\hat{y} = 4.46 - 0.006 \cdot 40 = 4.22\]
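The same prediction can be made from the fitted model object; a minimal sketch using model_score_1 from above (small differences from 4.22 are just rounding in the hand calculation):
#predicted teaching score for a 40-year-old professor
new_prof <- data.frame( age = 40 )
predict( model_score_1, new_prof )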
Prediction Error
Prediction error = the difference between observed and predicted values
- Residual = \(y - \hat{y}\)
- Residuals correspond to the \(\epsilon\) term in \(y = f(\vec{x}) + \epsilon\)
What is meant by a ‘best-fit’ line: it is the line that minimizes the sum of the squared residuals
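To see the minimization in action, here is a minimal check of my own using model_score_1 from above: perturbing the fitted coefficients can only increase the sum of squared residuals.
#sum of squared residuals for an arbitrary line score = intercept + slope * age
ssr <- function( intercept, slope, data = evals ) {
sum( ( data$score - ( intercept + slope * data$age ) )^2 )
}
b <- coef( model_score_1 )
ssr( b[1], b[2] ) #SSR of the least-squares fit
ssr( b[1], b[2] + 0.001 ) #nudged slope gives a larger SSR
ssr( b[1] + 0.05, b[2] ) #nudged intercept gives a larger SSR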
Making predictions of score based on beauty
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
test_beautyscore <- data.frame( bty_avg = 5 )
y_hat <- model_score_2 %>% predict( test_beautyscore )
y_hat
## 1
## 4.213523
A score of 4.7 is actually observed for this professor (the first row of evals, with bty_avg = 5).
# Compute residual y - y_hat
residual <- 4.7 - y_hat
residual
## 1
## 0.4864769
The moderndive library has some nifty wrapper functions to help find the residuals…
# Get regression table
get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
# Get all fitted/predicted values and residuals
get_regression_points(model_score_2)
## # A tibble: 463 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.52
## 6 6 4.3 3 4.08 0.22
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.002
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
## # … with 453 more rows
…but let’s do the work and use dplyr methods to find them:
evals_residuals <- evals %>%
select( ID, score, bty_avg ) %>%
mutate( score_hat = predict( model_score_2, . ),
residual = score - score_hat )
head( evals_residuals, 10 )
## # A tibble: 10 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.520
## 6 6 4.3 3 4.08 0.220
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.00244
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
Bingo! That wasn’t so hard and didn’t take too much code. On the other hand, it sure is nice to have convenient wrapper functions.
Explaining Teaching Score with Gender
Linear regression with a binary categorical explanatory variable (still fit with lm(); this is not logistic regression)
sg_box <- ggplot( evals, aes( x = gender, y = score ) ) +
geom_boxplot() +
labs( x = 'gender', y = 'score' )
sg_hist <- ggplot( evals, aes( x = score, fill = gender ) ) +
geom_histogram( binwidth = 0.25 ) +
labs( x = 'score', y = 'count' )
grid.arrange( sg_box, sg_hist, ncol = 2 )
#fit regression model
model_score_3 <- lm( score ~ gender, data = evals )
#get the regression table
get_regression_table( model_score_3 )
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.09 0.039 106. 0 4.02 4.17
## 2 gendermale 0.142 0.051 2.78 0.006 0.042 0.241
#plot summary too just cause I'm used to seeing this
summary( model_score_3 )
##
## Call:
## lm(formula = score ~ gender, data = evals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83433 -0.36357 0.06567 0.40718 0.90718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.09282 0.03867 105.852 < 2e-16 ***
## gendermale 0.14151 0.05082 2.784 0.00558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5399 on 461 degrees of freedom
## Multiple R-squared: 0.01654, Adjusted R-squared: 0.01441
## F-statistic: 7.753 on 1 and 461 DF, p-value: 0.005583
sanity check by computing the means of each group
evals %>%
group_by( gender ) %>%
summarise( avg_score = mean( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## gender avg_score
## <fct> <dbl>
## 1 female 4.09
## 2 male 4.23
Use another categorical variable: rank
evals %>%
group_by( rank ) %>%
summarize( n = n() )
## # A tibble: 3 x 2
## rank n
## <fct> <int>
## 1 teaching 102
## 2 tenure track 108
## 3 tenured 253
EDA of the relationship between score and rank
ggplot(evals, aes( x = rank, y = score)) +
geom_boxplot() +
labs(x = "rank", y = "score")
Look at some summary stats
evals %>%
group_by(rank) %>%
summarise(n = n(), mean_score = mean(score), sd_score = sd(score))
## # A tibble: 3 x 4
## rank n mean_score sd_score
## <fct> <int> <dbl> <dbl>
## 1 teaching 102 4.28 0.498
## 2 tenure track 108 4.15 0.561
## 3 tenured 253 4.14 0.550
Fit a linear regression model to the data
# Fit regression model
model_score_4 <- lm(score ~ rank, data = evals)
# Get regression table
get_regression_table(model_score_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.28 0.054 79.9 0 4.18 4.39
## 2 ranktenure track -0.13 0.075 -1.73 0.084 -0.277 0.017
## 3 ranktenured -0.145 0.064 -2.28 0.023 -0.27 -0.02
Calculate some values from the regression table outcome
# teaching mean
teaching_mean <- 4.28
# tenure track mean
tenure_track_mean <- teaching_mean - 0.13
# tenured mean
tenured_mean <- teaching_mean - 0.145
mes <- paste( 'Teaching Mean:', round( teaching_mean, 3),
'\nTenure Track Mean:', round( tenure_track_mean, 3 ),
'\nTenured:', round( tenured_mean, 3 ) )
cat( mes, sep = '\n')
## Teaching Mean: 4.28
## Tenure Track Mean: 4.15
## Tenured: 4.135
Do we get the same result using dplyr methods?
evals %>%
group_by( rank ) %>%
summarise( mean = mean( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## rank mean
## <fct> <dbl>
## 1 teaching 4.28
## 2 tenure track 4.15
## 3 tenured 4.14
Yes, the values check out.
Predicting Teaching Score using Gender
Predicting with a categorical variable
evals %>%
group_by( gender ) %>%
summarize( mean_score =mean( score ), sd_score = sd( score ) )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## gender mean_score sd_score
## <fct> <dbl> <dbl>
## 1 female 4.09 0.564
## 2 male 4.23 0.522
Now to find the residuals for all records
get_regression_points( model_score_3 )
## # A tibble: 463 x 5
## ID score gender score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 female 4.09 0.607
## 2 2 4.1 female 4.09 0.007
## 3 3 3.9 female 4.09 -0.193
## 4 4 4.8 female 4.09 0.707
## 5 5 4.6 male 4.23 0.366
## 6 6 4.3 male 4.23 0.066
## 7 7 2.8 male 4.23 -1.43
## 8 8 4.1 male 4.23 -0.134
## 9 9 3.4 male 4.23 -0.834
## 10 10 4.5 female 4.09 0.407
## # … with 453 more rows
#OR
evals_residuals <- evals %>%
select( ID, score, gender ) %>%
mutate( score_hat = predict( model_score_3, . ),
residual = score - score_hat )
head( evals_residuals, 10 )
## # A tibble: 10 x 5
## ID score gender score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 female 4.09 0.607
## 2 2 4.1 female 4.09 0.00718
## 3 3 3.9 female 4.09 -0.193
## 4 4 4.8 female 4.09 0.707
## 5 5 4.6 male 4.23 0.366
## 6 6 4.3 male 4.23 0.0657
## 7 7 2.8 male 4.23 -1.43
## 8 8 4.1 male 4.23 -0.134
## 9 9 3.4 male 4.23 -0.834
## 10 10 4.5 female 4.09 0.407
Plot a histogram of the residuals
ggplot( evals_residuals, aes( x = residual ) ) +
geom_histogram() +
labs( x = 'residuals',
title = 'Residuals from score ~ gender' )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The residuals are roughly centered on 0
A quick look at predicting score ~ rank
# Calculate predictions and residuals
model_score_4_points <- get_regression_points(model_score_4)
model_score_4_points
## # A tibble: 463 x 5
## ID score rank score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 tenure track 4.16 0.545
## 2 2 4.1 tenure track 4.16 -0.055
## 3 3 3.9 tenure track 4.16 -0.255
## 4 4 4.8 tenure track 4.16 0.645
## 5 5 4.6 tenured 4.14 0.461
## 6 6 4.3 tenured 4.14 0.161
## 7 7 2.8 tenured 4.14 -1.34
## 8 8 4.1 tenured 4.14 -0.039
## 9 9 3.4 tenured 4.14 -0.739
## 10 10 4.5 tenured 4.14 0.361
## # … with 453 more rows
# Plot residuals
ggplot(model_score_4_points, aes( x = residual )) +
geom_histogram() +
labs(x = "residuals", title = "Residuals from score ~ rank model")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Modeling with Multiple Regression
Explaining house price with year and size
Multiple Linear Regression with numeric features
house_prices <- house_prices %>%
mutate( log10_price = log10( price ),
log10_size = log10( sqft_living ) )
3D Scatter Plot: visualizing the joint relationship between 3 variables.
The generalization of a regression line in a 3D plot is a regression plane. Of all possible planes, the regression plane minimizes the sum of squared residuals (the discrepancies between observed and predicted values).
#use plotly
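The course shows an interactive 3D figure at this point. A minimal sketch with the plotly package (assuming it is installed; the columns come from the mutate() above) could be:
library( plotly )
#interactive 3D scatter of log10_price against log10_size and yr_built
plot_ly( house_prices,
x = ~log10_size, y = ~yr_built, z = ~log10_price,
type = 'scatter3d', mode = 'markers',
marker = list( size = 2 ) )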
Quantifying the relationship:
model_price_1 <- lm( log10_price ~ log10_size + yr_built,
data = house_prices )
get_regression_table( model_price_1)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.38 0.075 71.4 0 5.24 5.53
## 2 log10_size 0.913 0.006 141. 0 0.901 0.926
## 3 yr_built -0.001 0 -33.8 0 -0.001 -0.001
Interpreting: holding the other variables in the model fixed, a one-unit increase in a given feature is associated with a mean change in the outcome equal to that feature’s estimate. Ex: accounting for yr_built, a one-unit increase in log10_size is associated with an increase of 0.913 in log10_price, on average.
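A quick numerical check of this interpretation (my own sketch, using model_price_1): two hypothetical houses that differ by 1 in log10_size but share the same yr_built should have predicted log10_price values that differ by exactly the log10_size coefficient.
#hold yr_built fixed and change log10_size by one unit
two_houses <- data.frame( log10_size = c( 3.0, 4.0 ), yr_built = c( 1980, 1980 ) )
preds <- predict( model_price_1, two_houses )
diff( preds ) #difference in predicted log10_price
coef( model_price_1 )[ 'log10_size' ] #matches the coefficient (~0.913)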
EDA of relationships
# Create scatterplot with regression line
nativedat <- ggplot(house_prices, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Remove outlier
house_prices_transform <- house_prices %>%
filter( bedrooms < 15 )
# Create scatterplot with regression line
dat_outlier_rm <- ggplot(house_prices_transform, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
grid.arrange( nativedat, dat_outlier_rm, ncol = 2 )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
fit a multiple linear regression model
# Fit model
model_price_2 <- lm(log10_price ~ log10_size + bedrooms,
data = house_prices)
# Get regression table
get_regression_table( model_price_2 )
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.72 0.023 118. 0 2.67 2.76
## 2 log10_size 0.931 0.008 118. 0 0.916 0.947
## 3 bedrooms -0.03 0.002 -19.3 0 -0.033 -0.027
Accounting for log10_size, every extra bedroom is associated with an average decrease of 0.03 in log10_price.
Predicting House Price using Year & Size
get_regression_table( model_price_1)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.38 0.075 71.4 0 5.24 5.53
## 2 log10_size 0.913 0.006 141. 0 0.901 0.926
## 3 yr_built -0.001 0 -33.8 0 -0.001 -0.001
#make a prediction for log10_size = 3.06 and yr_built = 1980
newval <- data.frame( 'log10_size' = 3.06, 'yr_built' = 1980 )
prediction <- newval %>%
mutate( log10_price = predict( model_price_1, . ),
price = 10^( log10_price ) )
prediction
## log10_size yr_built log10_price price
## 1 3.06 1980 5.454154 284546.8
Compute all Predicted values and Residuals
#using moderndive methods
get_regression_points( model_price_1 )
## # A tibble: 21,613 x 6
## ID log10_price log10_size yr_built log10_price_hat residual
## <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 5.35 3.07 1955 5.50 -0.153
## 2 2 5.73 3.41 1951 5.81 -0.083
## 3 3 5.26 2.89 1933 5.36 -0.105
## 4 4 5.78 3.29 1965 5.69 0.094
## 5 5 5.71 3.22 1987 5.60 0.112
## 6 6 6.09 3.73 2001 6.04 0.047
## 7 7 5.41 3.23 1995 5.59 -0.182
## 8 8 5.46 3.02 1963 5.45 0.019
## 9 9 5.36 3.25 1960 5.66 -0.295
## 10 10 5.51 3.28 2003 5.62 -0.111
## # … with 21,603 more rows
#using dplyr methods
price_residuals <- house_prices %>%
select( id , log10_price, log10_size, yr_built ) %>%
mutate( log10_price_hat = predict( model_price_1, . ),
residual = log10_price - log10_price_hat )
head( price_residuals, 10 )
## # A tibble: 10 x 6
## id log10_price log10_size yr_built log10_price_hat residual
## <chr> <dbl> <dbl> <int> <dbl> <dbl>
## 1 7129300520 5.35 3.07 1955 5.50 -0.153
## 2 6414100192 5.73 3.41 1951 5.81 -0.0828
## 3 5631500400 5.26 2.89 1933 5.36 -0.105
## 4 2487200875 5.78 3.29 1965 5.69 0.0941
## 5 1954400510 5.71 3.23 1987 5.60 0.112
## 6 7237550310 6.09 3.73 2001 6.04 0.0473
## 7 1321400060 5.41 3.23 1995 5.59 -0.182
## 8 2008000270 5.47 3.03 1963 5.45 0.0193
## 9 2414600126 5.36 3.25 1960 5.66 -0.295
## 10 3793500160 5.51 3.28 2003 5.62 -0.111
Sum of Squared Residuals: the sum of squared residuals is difficult to interpret in absolute terms, but it is handy for comparing the performance of different modeling approaches.
sumsqrres <- price_residuals %>% summarise( sum( residual^2 ) )
sumsqrres
## # A tibble: 1 x 1
## `sum(residual^2)`
## <dbl>
## 1 585.
Modeling Price by Size and Bedrooms
# Automate prediction and residual computation
get_regression_points(model_price_2) %>%
mutate(sq_residuals = residual^2) %>%
summarize(sum_sq_residuals = sum(sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 605.
Residuals: the observed outcome minus the predicted value. They can be thought of as prediction errors, or as the lack of fit of the predictions to the truth.
Explaining House Price with Size & Condition
Using both numeric and categorical variables
model_price_3 <- lm( log10_price ~ log10_size,
data = house_prices )
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition, fill = FALSE ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_line( aes( x = log10_size, y = prediction ), size = 2, color = 'black' )
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition, fill = FALSE ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_smooth(method = "lm", se = FALSE, size = 2)
## `geom_smooth()` using formula 'y ~ x'
Parallel Slopes Model: all 5 condition levels are described by the same slope but with different intercepts.
house_prices %>%
mutate( prediction = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, y = log10_price, color = condition ) ) +
geom_point( ) +
labs( x = 'log 10 square footage',
y = 'log 10 price' ) +
geom_smooth(method = "lm", se = FALSE, size = 2, color = 'red') +
geom_line( aes( x = log10_size, y = prediction ), size = 2, color = 'black' ) +
facet_wrap( ~condition )
## `geom_smooth()` using formula 'y ~ x'
Quantify the relationship
model_price_3 <- lm( log10_price ~ log10_size + condition, data = house_prices )
get_regression_table( model_price_3 )
## # A tibble: 6 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.88 0.036 80.0 0 2.81 2.95
## 2 log10_size 0.837 0.006 134. 0 0.825 0.85
## 3 condition2 -0.039 0.033 -1.16 0.246 -0.104 0.027
## 4 condition3 0.032 0.031 1.04 0.3 -0.028 0.092
## 5 condition4 0.044 0.031 1.42 0.155 -0.017 0.104
## 6 condition5 0.096 0.031 3.09 0.002 0.035 0.156
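To draw the parallel slopes fit itself (rather than the separate per-condition fits from geom_smooth() above), here is a minimal sketch of my own using predictions from the refit model_price_3: one line per condition level, all sharing the same slope.
house_prices %>%
mutate( log10_price_hat = predict( model_price_3, . ) ) %>%
ggplot( aes( x = log10_size, color = condition ) ) +
geom_point( aes( y = log10_price ), alpha = 0.1 ) +
geom_line( aes( y = log10_price_hat ), size = 1 ) +
labs( x = 'log 10 square footage', y = 'log 10 price',
title = 'Parallel slopes: log10_price ~ log10_size + condition' )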
Parallel slopes model but with waterfront
# Fit model
model_price_4 <- lm(log10_price ~ log10_size + waterfront,
data = house_prices)
# Get regression table
get_regression_table(model_price_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.96 0.02 146. 0 2.92 3.00
## 2 log10_size 0.825 0.006 134. 0 0.813 0.837
## 3 waterfrontTRUE 0.322 0.013 24.5 0 0.296 0.348
Predicting House Price using Size and Condition
Making predictions on new data, using the estimate values from the regression table.
get_regression_table( model_price_3 )
## # A tibble: 6 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.88 0.036 80.0 0 2.81 2.95
## 2 log10_size 0.837 0.006 134. 0 0.825 0.85
## 3 condition2 -0.039 0.033 -1.16 0.246 -0.104 0.027
## 4 condition3 0.032 0.031 1.04 0.3 -0.028 0.092
## 5 condition4 0.044 0.031 1.42 0.155 -0.017 0.104
## 6 condition5 0.096 0.031 3.09 0.002 0.035 0.156
#create a new data frame for the new values
new_houses <- data.frame( log10_size = c( 2.9, 3.6 ),
condition = factor( c( 3, 4 ) ) )
( new_houses )
## log10_size condition
## 1 2.9 3
## 2 3.6 4
#make predictions on new data
get_regression_points( model_price_3, newdata = new_houses ) %>%
mutate( price_hat = 10^log10_price_hat )
## # A tibble: 2 x 5
## ID log10_size condition log10_price_hat price_hat
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 2.9 3 5.34 219786.
## 2 2 3.6 4 5.94 870964.
new_houses <- data.frame( log10_size = c( 2.9, 3.1 ),
waterfront = c( TRUE, FALSE ) )
get_regression_points( model_price_4, newdata = new_houses ) %>%
mutate( price_hat = 10^log10_price_hat )
## # A tibble: 2 x 5
## ID log10_size waterfront log10_price_hat price_hat
## <int> <dbl> <lgl> <dbl> <dbl>
## 1 1 2.9 TRUE 5.67 472063.
## 2 2 3.1 FALSE 5.52 328095.
Model Assessment and Selection
How do you know which model to choose? Which model is best?
Assessing quality of multiple regression models
Model 1: two numerical features, log10_size and yr_built -> SSR = 585
Model 3: a numerical feature, log10_size, and a categorical feature, condition -> SSR = 608
Model 3 has a larger sum of squared residuals and is therefore not as good a fit as Model 1.
# now to find the sum of squared residuals for Model 2 & Model 4
# Calculate squared residuals Model 2
get_regression_points( model_price_2) %>%
mutate( sq_residuals = residual^2) %>%
summarise( sum_sq_residuals = sum( sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 605.
# Calculate squared residuals Model 4
get_regression_points(model_price_4) %>%
mutate(sq_residuals = residual^2) %>%
summarize(sum_sq_residuals = sum( sq_residuals))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 599.
Assessing Model Fit with R-squared
R-Squared \[R^2 = 1 - \frac{Var(\mbox{residuals})}{Var(\mbox{y})} = 1 - \frac{\mbox{Variance of the Residuals}}{\mbox{Variance of the Outcome Variable}}\] While the sum of squared residuals is unbounded, \(R^2\) is standardized to lie between 0 and 1, where smaller values of \(R^2\) indicate poor model fit and values approaching 1 indicate a good fit to the data.
- \(R^2 = 1 \longrightarrow\) perfect fit
- \(R^2 = 0 \longrightarrow\) no relationship between target variable and modelled features
\(R^2\)’s Interpretation: the proportion of the total variation in the outcome variable \(y\) that the model explains
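As a cross-check (a minimal sketch, not from the course): because least-squares residuals from a model with an intercept have mean zero, 1 - var(residual)/var(y) equals the multiple R-squared that base R reports, so summary() should agree with the manual computations below.
summary( model_price_1 )$r.squared #should match the Model 1 value computed below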
#Compute the R^2 val of Model 1
get_regression_points( model_price_1 ) %>%
summarise( r_squared = 1 - var( residual )/var( log10_price ) )
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.483
#Compute the R^2 val of Model 3
get_regression_points( model_price_3 ) %>%
summarise( r_squared = 1 - var( residual )/var( log10_price ) )
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.462
#Compute the R^2 val of Model 2
get_regression_points(model_price_2) %>%
summarise(r_squared = 1 - var( residual )/var( log10_price ))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.465
#Compute the R^2 val of Model 4
get_regression_points(model_price_4) %>%
summarise(r_squared = 1 - var( residual )/var( log10_price ))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.470
Assessing Predictions with RMSE
Root Mean Squared Error: the square root of the Mean Squared Error (which gets back to the same units as the target variable). RMSE can be thought of as the typical error the model makes.
# Root Mean Squared Error
get_regression_points( model_price_1 ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarize( mse = mean( sq_residuals ) ) %>%
mutate( rmse = sqrt( mse ) )
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0271 0.164
Get the predicted values and rmse of new values
#Get predictions
new_houses2 <- data_frame( log10_size = c( 2.9, 3.6 ), condition = factor( c( 3, 4 ) ) )
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
get_regression_points( model_price_3, newdata = new_houses2 ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarize( mse = mean( sq_residuals ) ) %>%
mutate( rmse = sqrt( mse ) )
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.237 0.486
Validation Set Prediction framework
Use two independent datasets to:
- Train/fit your model
- Evaluate your model’s predictive power i.e. validate your model
Train/test set split
Randomly split all \(n\) observations into
- a training set: to fit models
- a test set: to make predictions on
By using independent samples from the dataset to test/train a model, you can get a sense of how the model will perform on new data.
#Randomly shuffle order of rows:
house_prices_shuffled <- house_prices %>%
sample_frac( size = 1, replace = FALSE) #sample everything without replacement
#split into train and test
train <- house_prices_shuffled %>%
slice( 1:10000 )
test <- house_prices_shuffled %>%
slice( 10001:dim( house_prices)[1] )
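One caveat (my own note, not from the course): sample_frac() above is not seeded, so the split, and therefore the RMSE values that follow, will change from run to run. A reproducible, proportion-based alternative (kept in separate objects so the course's train/test split above is untouched) might look like:
#seeded 70/30 split into alternative train/test objects
set.seed( 76 )
n_obs <- nrow( house_prices )
train_idx <- sample( seq_len( n_obs ), size = floor( 0.7 * n_obs ) )
train_alt <- house_prices[ train_idx, ]
test_alt <- house_prices[ -train_idx, ]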
Train the Model on Training Data
train_model_price_1 <- lm( log10_price ~ log10_size + yr_built,
data = train )
get_regression_table( train_model_price_1 )
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 5.31 0.111 47.7 0 5.09 5.53
## 2 log10_size 0.919 0.01 96.6 0 0.9 0.938
## 3 yr_built -0.001 0 -22.4 0 -0.001 -0.001
Make Predictions on test data
get_regression_points( train_model_price_1, newdata = test )
## # A tibble: 11,613 x 6
## ID log10_price log10_size yr_built log10_price_hat residual
## <int> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 5.6 2.77 1983 5.18 0.419
## 2 2 5.74 3.26 1928 5.71 0.033
## 3 3 5.58 3.02 1967 5.43 0.154
## 4 4 5.40 3.13 1953 5.56 -0.151
## 5 5 5.91 3.54 1998 5.86 0.047
## 6 6 5.91 3.03 1919 5.51 0.404
## 7 7 5.65 3.45 1972 5.82 -0.167
## 8 8 5.39 3.31 1977 5.69 -0.297
## 9 9 5.59 3.32 1981 5.69 -0.098
## 10 10 5.63 3.03 1950 5.46 0.165
## # … with 11,603 more rows
Assess Predictions with RMSE
get_regression_points( train_model_price_1, newdata = test ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarise( rmse = sqrt( mean( sq_residuals ) ) )
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.164
Repeat with a different model so that RMSE values can be compared
train_model_price_3 <- lm( log10_price ~ log10_size + condition,
data = train )
get_regression_points( train_model_price_3, newdata = test ) %>%
mutate( sq_residuals = residual^2 ) %>%
summarise( rmse = sqrt( mean( sq_residuals ) ) )
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
# Fit model to training set
train_model_2 <- lm( log10_price ~ log10_size + bedrooms, data = train )
# Compute RMSE
get_regression_points(train_model_2, newdata = test) %>%
mutate(sq_residuals = residual^2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167