# Setup ----
# Every package is attached exactly once. The console output that had been
# pasted between the calls is summarised here instead of being repeated:
#   * tidyverse 1.3.1 attached ggplot2 3.3.6, tibble 3.1.6, tidyr 1.2.0,
#     readr 2.1.2, purrr 0.3.4, dplyr 1.0.8, stringr 1.4.0, forcats 0.5.1.
#   * The packages loaded below were reported as built under R 4.1.3.
#   * dplyr::filter() and dplyr::lag() mask stats::filter() and stats::lag().
#   * lubridate masks date, intersect, setdiff and union from package:base.
library(tidyverse)
library(modelr)
library(nycflights13)
library(lubridate)

# Use modelr's `na.warn` as the global NA handler for model fitting.
# It must be set after modelr is attached so that `na.warn` resolves.
options(na.action = na.warn)


# Price versus each quality attribute ----
# One boxplot of price per categorical attribute: cut, color, then clarity.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = color, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = clarity, y = price)) +
  geom_boxplot()

# Price versus carat as a hexagonal-bin plot (50 bins).
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50)

# Working data set ----
# Keep diamonds of at most 2.5 carats and add base-2 logs of price and carat.
diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(
    lprice = log2(price),
    lcarat = log2(carat)
  )

# Same hexagonal-bin view as before, now on the log2-log2 scale.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lprice)) +
  geom_hex(bins = 50)

# Model 1: log2(price) as a linear function of log2(carat) ----
mod_diamond <- lm(formula = lprice ~ lcarat, data = diamonds2)

# Prediction grid: 20 carat values spanning the range in diamonds2. The model
# works on the log2 scale, so carat is logged before predicting and the
# prediction is raised back to the price scale afterwards.
grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, n = 20)) %>%
  mutate(lcarat = log2(carat)) %>%
  add_predictions(model = mod_diamond, var = "lprice") %>%
  mutate(price = 2^lprice)

# Overlay the back-transformed fit (red line) on the raw carat/price hex plot.
ggplot(data = diamonds2, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50) +
  geom_line(data = grid, color = "red", size = 1)

# Residuals of model 1 ----
# Store the log2-scale residuals from mod_diamond in a new column `lresid`.
diamonds2 <- add_residuals(diamonds2, model = mod_diamond, var = "lresid")

# Residuals against the predictor used in the model.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid)) +
  geom_hex(bins = 50)

# Residuals against each attribute that is not in model 1.
ggplot(data = diamonds2, mapping = aes(x = cut, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = color, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = clarity, y = lresid)) +
  geom_boxplot()

# Model 2: add color, cut and clarity to the log2(carat) term ----
mod_diamond2 <- lm(
  formula = lprice ~ lcarat + color + cut + clarity,
  data = diamonds2
)

# One grid row per level of cut. Passing `.model` makes data_grid() fill in
# the other predictors the model needs (lcarat, color, clarity), which are
# constant across rows in the printed result.
grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions(model = mod_diamond2)

# Print the grid together with its `pred` column.
grid
## # A tibble: 5 x 5
##   cut       lcarat color clarity  pred
##   <ord>      <dbl> <chr> <chr>   <dbl>
## 1 Fair      -0.515 G     VS2      11.2
## 2 Good      -0.515 G     VS2      11.3
## 3 Very Good -0.515 G     VS2      11.4
## 4 Premium   -0.515 G     VS2      11.4
## 5 Ideal     -0.515 G     VS2      11.4
#> # A tibble: 5 × 5
#> cut lcarat color clarity pred
#> <ord> <dbl> <chr> <chr> <dbl>
#> 1 Fair -0.515 G SI1 11.0
#> 2 Good -0.515 G SI1 11.1
#> 3 Very Good -0.515 G SI1 11.2
#> 4 Premium -0.515 G SI1 11.2
#> 5 Ideal -0.515 G SI1 11.2
# Predicted log2(price) for each level of cut, taken from the grid built from
# mod_diamond2.
ggplot(data = grid, mapping = aes(x = cut, y = pred)) +
  geom_point()

# Residuals of model 2 ----
# Store the log2-scale residuals from mod_diamond2 in a new column `lresid2`.
diamonds2 <- add_residuals(diamonds2, model = mod_diamond2, var = "lresid2")

ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid2)) +
  geom_hex(bins = 50)

# Diamonds with unusually large model-2 residuals ----
# |lresid2| > 1 on the log2 scale means the observed price differs from the
# fitted price by more than a factor of two. The prediction is converted back
# to the price scale and rounded so it can be compared with `price`.
diamonds2 %>%
  filter(abs(lresid2) > 1) %>%
  add_predictions(model = mod_diamond2) %>%
  mutate(pred = round(2^pred)) %>%
  select(price, pred, carat:table, x:z) %>%
  arrange(price)
## # A tibble: 16 x 11
##    price  pred carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1013   264  0.25 Fair      F     SI2      54.4    64  4.3   4.23  2.32
##  2  1186   284  0.25 Premium   G     SI2      59      60  5.33  5.28  3.12
##  3  1186   284  0.25 Premium   G     SI2      58.8    60  5.33  5.28  3.12
##  4  1262  2644  1.03 Fair      E     I1       78.2    54  5.72  5.59  4.42
##  5  1415   639  0.35 Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  6  1415   639  0.35 Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  7  1715   576  0.32 Fair      F     VS2      59.6    60  4.42  4.34  2.61
##  8  1776   412  0.29 Fair      F     SI1      55.8    60  4.48  4.41  2.48
##  9  2160   314  0.34 Fair      F     I1       55.8    62  4.72  4.6   2.6 
## 10  2366   774  0.3  Very Good D     VVS2     60.6    58  4.33  4.35  2.63
## 11  3360  1373  0.51 Premium   F     SI1      62.7    62  5.09  4.96  3.15
## 12  3807  1540  0.61 Good      F     SI2      62.5    65  5.36  5.29  3.33
## 13  3920  1705  0.51 Fair      F     VVS2     65.4    60  4.98  4.9   3.23
## 14  4368  1705  0.51 Fair      F     VVS2     60.7    66  5.21  5.11  3.13
## 15 10011  4048  1.01 Fair      D     SI2      64.6    58  6.25  6.2   4.02
## 16 10470 23622  2.46 Premium   E     SI2      59.7    59  8.82  8.76  5.25
#> # A tibble: 16 × 11
#> price pred carat cut color clarity depth table x
#> <int> <dbl> <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl>
#> 1 1013 264 0.25 Fair F SI2 54.4 64 4.30
#> 2 1186 284 0.25 Premium G SI2 59.0 60 5.33
#> 3 1186 284 0.25 Premium G SI2 58.8 60 5.33
#> 4 1262 2644 1.03 Fair E I1 78.2 54 5.72
#> 5 1415 639 0.35 Fair G VS2 65.9 54 5.57
#> 6 1415 639 0.35 Fair G VS2 65.9 54 5.57
#> # ... with 10 more rows, and 2 more variables: y <dbl>,
#> # z <dbl>