library(tidyverse)
# Load the wine data set from a local RDS file into `wine`.
wine <- read_rds(file = "/Users/Rose/Downloads/wine.rds")
library(moderndive) # Package helpful for regression result readability
## Warning: package 'moderndive' was built under R version 3.5.2
# Add a logical column `bordeaux`: TRUE when the wine's province is exactly
# "Bordeaux", FALSE otherwise (NA where province itself is missing).
wine <- mutate(wine, bordeaux = province == "Bordeaux")
# Preview the first six rows, now including the new column.
head(wine, n = 6)
## # A tibble: 6 x 16
## id country description designation points price province region_1 region_2
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 2 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 3 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 4 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 5 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 6 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## # … with 7 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, winery <chr>, year <dbl>, bordeaux <lgl>
# Model 1: simple linear regression of price on rating points (full data set).
m1 <- lm(formula = price ~ points, data = wine)
# Tidy coefficient table (estimate, std error, p-value, CI) from moderndive.
get_regression_table(model = m1)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -489. 3.97 -123. 0 -497. -481.
## 2 points 5.92 0.045 132. 0 5.83 6.01
## Interpreting the results
### Estimate: each additional rating point is associated with a price about $5.92 (roughly $6) higher, so a $12 bottle would be predicted to cost about $18 if its rating went up by one point.
### How confident are we? Pretty confident: the standard error of 0.045 is tiny relative to the estimate, so the 95% confidence interval (5.83 to 6.01) is very tight and we have high confidence that points really do help predict price.
# Look at the first rows again before adding `bordeaux` as a predictor.
head(x = wine)
## # A tibble: 6 x 16
## id country description designation points price province region_1 region_2
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 2 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 3 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 4 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 5 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 6 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## # … with 7 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, winery <chr>, year <dbl>, bordeaux <lgl>
# Model 2: price on points plus the Bordeaux indicator (additive effect only,
# i.e. a shift in intercept for Bordeaux wines with a common slope).
m2 <- lm(formula = price ~ points + bordeaux, data = wine)
get_regression_table(model = m2)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -492. 3.97 -124. 0 -500. -484.
## 2 points 5.95 0.045 133. 0 5.86 6.03
## 3 bordeauxTRUE 8.70 0.661 13.2 0 7.41 10.00
# Model 3: points, Bordeaux indicator, and their interaction, so Bordeaux
# wines get their own intercept and their own slope on points.
m3 <- lm(formula = price ~ points * bordeaux, data = wine)
get_regression_table(model = m3)
## # A tibble: 4 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -461. 4.04 -114. 0 -469. -453.
## 2 points 5.60 0.045 123. 0 5.51 5.69
## 3 bordeauxTRUE -666. 18.8 -35.5 0 -703. -629.
## 4 points:bordeauxTRUE 7.66 0.213 36.0 0 7.24 8.07
# Model-level fit summaries for the three full-data models, printed in order
# so they can be compared side by side.
get_regression_summaries(model = m1)
get_regression_summaries(model = m2)
get_regression_summaries(model = m3)
library(caret) # Best ML library in R right now
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## TidyModels coming out soon
set.seed(504) # Seed the RNG so the same random split is drawn on every run
# Randomly select the row indices of the training partition: one resample
# (times = 1) holding 80% of the rows (p = 0.8), sampled with respect to
# wine$price.
train_index <- createDataPartition(wine$price, times = 1, p = 0.8, list = FALSE)
# list = FALSE makes createDataPartition() return a one-column matrix. Tibbles
# expect a plain vector as row index (matrix row indices are deprecated as of
# tibble 3.0.0), so flatten the matrix before subsetting.
train <- wine[as.vector(train_index), ] # Rows selected for training (80%)
test <- wine[-as.vector(train_index), ] # All remaining rows, held out for testing
# Refit the same three specifications on the training partition only (the
# earlier fits used the whole data set), overwriting m1, m2 and m3.
m1 <- lm(formula = price ~ points, data = train)
m2 <- lm(formula = price ~ points + bordeaux, data = train)
m3 <- lm(formula = price ~ points * bordeaux, data = train)
# Root-mean-squared error (RMSE) of `model` evaluated on `newdata`.
#
# model:   a fitted regression model accepted by get_regression_points().
# newdata: data frame of observations to score the model on.
# Returns a one-row tibble with a single column `rmse`.
#
# Rows whose residual is NA are dropped before averaging so that a single
# missing residual cannot turn the whole RMSE into NA.
holdout_rmse <- function(model, newdata) {
  get_regression_points(model, newdata = newdata) %>%
    drop_na(residual) %>%
    summarize(rmse = sqrt(mean(residual^2)))
}

# Test-set RMSE for model 1 (price ~ points).
r1 <- holdout_rmse(m1, newdata = test)
r1
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 48.5
# Test-set RMSE for model 2 (price ~ points + bordeaux).
r2 <- holdout_rmse(m2, newdata = test)
r2
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 48.5
# Test-set RMSE for model 3 (price ~ points * bordeaux).
r3 <- holdout_rmse(m3, newdata = test)
r3
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 48.1
# Reasonable people will disagree over subtle matters of right and wrong; thus, the important part of data ethics is committing to consider the ethical consequences of your choices.
# The difference between “regular” ethics and data ethics is that algorithms scale really easily. Thus, seemingly small decisions can have wide-ranging impact.