# Multivariate data visualization: diamonds dataset

library(ggplot2)

# Load the diamonds data set into the workspace.
data(list = "diamonds")

# Print a compact summary of every column (type and first values).
str(object = diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Q1) Price histograms, faceted by color and filled by cut

# Histogram of price on a log10 x axis: one panel per diamond color, bars
# filled by cut using a qualitative brewer palette. No binwidth is given,
# so stat_bin falls back to its default for every panel.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram() +
  scale_x_log10() +
  scale_fill_brewer(type = "qual") +
  facet_wrap(~color)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Q2) Price vs table, colored by cut

# Scatter plot of price (y) against table (x), points coloured by cut.
# The points map `cut` to the colour aesthetic, so the palette has to be set
# with scale_color_brewer(); a fill scale has no effect on them. `table` is
# numeric, so the axis breaks belong on a continuous x scale.
ggplot(data = diamonds, mapping = aes(x = table, y = price)) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2)) +
  # Zoom the view to table values 50-80 without dropping any rows.
  coord_cartesian(xlim = c(50, 80))

# Q3) Price vs volume, colored by clarity

# Add a volume column: the product of the x, y and z columns.
diamonds[["volume"]] <- diamonds[["x"]] * diamonds[["y"]] * diamonds[["z"]]

# Preview the first ten rows, including the new volume column.
head(diamonds, n = 10)
##    carat       cut color clarity depth table price    x    y    z   volume
## 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43 38.20203
## 2   0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31 34.50586
## 3   0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31 38.07688
## 4   0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63 46.72458
## 5   0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75 51.91725
## 6   0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48 38.69395
## 7   0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47 38.83087
## 8   0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53 42.32108
## 9   0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49 36.42521
## 10  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39 38.71800
# Price against volume. Price goes on the y axis, matching the section
# heading and the other price scatter plots in this file.
ggplot(data = diamonds, mapping = aes(x = volume, y = price)) +
  geom_point()

# Same scatter plot with the points coloured by clarity.
ggplot(data = diamonds, mapping = aes(x = volume, y = price)) +
  geom_point(aes(color = clarity))

# Price against volume with the top 1% of each variable removed: the axis
# limits stop at the 99th percentiles, and xlim()/ylim() discard the rows
# that fall outside them (ggplot2 reports these in a "Removed ... rows
# containing missing values" warning).
ggplot(data = diamonds, mapping = aes(x = volume, y = price)) +
  xlim(0, quantile(diamonds$volume, 0.99)) +
  ylim(0, quantile(diamonds$price, 0.99)) +
  geom_point(aes(color = clarity))

Q4)