library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
We are trying to see what influences the price of a diamond using data in the diamonds dataset from the ggplot2 package.
There are a few bad values in the variables x, y and z. Eliminate these and make a cleaned copy of diamonds as d.
# Build the cleaned data set `d`: keep only the rows whose x, y and z columns
# are all strictly positive, then add ppc = price / carat.
# filter() keeps a row only when every condition is TRUE, so a row with a
# missing x, y or z is dropped rather than returned as an all-NA row, which is
# what logical `[` indexing would do.
d <- diamonds %>%
  filter(x > 0, y > 0, z > 0) %>%
  mutate(ppc = price / carat)

# Show the structure of the cleaned copy (row count and column types).
str(d)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53920 obs. of 11 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
## $ ppc : num 1417 1552 1422 1152 1081 ...
We have decided to incorporate the weight of the diamond by focusing on price/carat rather than price itself. There are 3 categorical variables to consider: cut, color and clarity. We have apparently dismissed depth, table, x, y and z, but we could verify this.
# Show the structure of the original, uncleaned data set for comparison.
utils::str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Let’s do scatterplots of these other quantitative variables against ppc.
# Scatterplot of ppc against the x column of the cleaned data.
# Points are blue and mostly transparent (alpha = 0.05); the red curve is the
# default geom_smooth() fit.
ggplot(d, aes(x = x, y = ppc)) +
  geom_point(colour = "blue", alpha = 0.05) +
  geom_smooth(colour = "red") +
  ggtitle("x against ppc")
## `geom_smooth()` using method = 'gam'
# Scatterplot of ppc against the y column of the cleaned data.
# Points are blue and mostly transparent (alpha = 0.05); the red curve is the
# default geom_smooth() fit.
ggplot(d, aes(x = y, y = ppc)) +
  geom_point(colour = "blue", alpha = 0.05) +
  geom_smooth(colour = "red") +
  ggtitle("y against ppc")
## `geom_smooth()` using method = 'gam'
# Scatterplot of ppc against the z column of the cleaned data.
# Points are blue and mostly transparent (alpha = 0.05); the red curve is the
# default geom_smooth() fit.
ggplot(d, aes(x = z, y = ppc)) +
  geom_point(colour = "blue", alpha = 0.05) +
  geom_smooth(colour = "red") +
  ggtitle("z against ppc")
## `geom_smooth()` using method = 'gam'
# Scatterplot of ppc against the depth column of the cleaned data.
# Points are blue and mostly transparent (alpha = 0.05); the red curve is the
# default geom_smooth() fit.
ggplot(d, aes(x = depth, y = ppc)) +
  geom_point(colour = "blue", alpha = 0.05) +
  geom_smooth(colour = "red") +
  ggtitle("depth against ppc")
## `geom_smooth()` using method = 'gam'
# Scatterplot of ppc against the table column of the cleaned data.
# Points are blue and mostly transparent (alpha = 0.05); the red curve is the
# default geom_smooth() fit.
ggplot(d, aes(x = table, y = ppc)) +
  geom_point(colour = "blue", alpha = 0.05) +
  geom_smooth(colour = "red") +
  ggtitle("table against ppc")
## `geom_smooth()` using method = 'gam'
The variable x may have some explanatory power. There is a positive relationship below about 8. What fraction of diamonds has an x value greater than 8?
# Share of rows whose x exceeds 8: the mean of a logical vector is the
# proportion of TRUE values. This is computed on the original `diamonds`,
# not on the cleaned copy `d`.
mean(diamonds[["x"]] > 8)
## [1] 0.03474231
Putting the x variable aside for the time being, how do we examine the relationships among the three categorical variables and the quantitative variable of interest, ppc?
We can create a 2-dimensional grid with facet_grid() and then somehow view the relationship between the third categorical variable and ppc in each cell of the grid.
Here is one way to do that.
# Boxplots of ppc by cut, drawn in a grid of panels with one row per color
# level and one column per clarity level.
ggplot(d, aes(cut, ppc)) +
  facet_grid(color ~ clarity) +
  geom_boxplot()
Whoops: Do some research on how to tilt the labels on the x-axis.
# Boxplots of ppc by cut in a color (rows) by clarity (columns) grid, with the
# x-axis labels tilted 45 degrees so the cut names fit in the narrow panels.
# hjust = 1 anchors each rotated label at its end, so the text finishes at its
# tick mark instead of being centred on it and drifting towards the
# neighbouring ticks.
ggplot(d, aes(x = cut, y = ppc)) +
  geom_boxplot() +
  facet_grid(color ~ clarity) +
  theme(
    axis.text.x = element_text(
      face = "bold", colour = "#993333", angle = 45, hjust = 1
    )
  )
# Alternative view: frequency polygons of ppc, one line per cut, in the same
# color (rows) by clarity (columns) grid. The y aesthetic is the computed
# density rather than the raw count.
# NOTE(review): `..density..` is the older dot-dot notation; ggplot2 >= 3.4.0
# deprecates it in favour of after_stat(density). Switch once the installed
# ggplot2 version is confirmed.
ggplot(d, aes(ppc, ..density.., colour = cut)) +
  facet_grid(color ~ clarity) +
  geom_freqpoly() +
  theme(
    axis.text.x = element_text(angle = 45, face = "bold", color = "#993333")
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Exercise: Try another way.
Look at one cell.
# Zoom in on a single cell of the grid: only the rows with color "D" and
# clarity "IF", drawn as density-scaled frequency polygons, one line per cut.
ggplot(
  d[with(d, color == "D" & clarity == "IF"), ],
  aes(ppc, ..density.., colour = cut)
) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Write the most recently displayed plot to savesample.png, scaled by a
# factor of 2.
ggsave(filename = "savesample.png", plot = last_plot(), scale = 2)
## Saving 14 x 10 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.