Multivariate data visualization: diamonds dataset
# Attach ggplot2, which provides both the plotting functions and the
# `diamonds` dataset used throughout this script.
library("ggplot2")
# Load the dataset into the workspace and inspect its column types.
data("diamonds", package = "ggplot2")
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Q1) Price Histograms with Facet and Color
# Histogram of price on a log10 x axis: one panel per diamond color,
# bars filled by cut using a qualitative Brewer palette.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram() +
  scale_x_log10() +
  scale_fill_brewer(type = "qual") +
  facet_wrap(~ color)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Q2) Price vs Table
# Scatter plot of price (y) against table (x), points colored by cut.
# - The points map `cut` to the color aesthetic, so the Brewer palette has
#   to be attached with scale_color_brewer(); a fill scale is never used
#   because no fill aesthetic is mapped.
# - `table` is numeric, so the breaks every 2 units belong on a continuous
#   x scale rather than a discrete one.
# - coord_cartesian() zooms the view to 50-80 without dropping observations.
ggplot(diamonds, aes(x = table, y = price)) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2)) +
  coord_cartesian(xlim = c(50, 80))
Q3) Price vs volume and clarity
# Add a `volume` column: the product of the x, y and z columns of each row.
diamonds$volume <- with(diamonds, x * y * z)
# Preview the first 10 rows to check the new column.
head(diamonds, n = 10)
## carat cut color clarity depth table price x y z volume
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 38.20203
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 34.50586
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 38.07688
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63 46.72458
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 51.91725
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 38.69395
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 38.83087
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 42.32108
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 36.42521
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39 38.71800
# Price (y) against volume (x). The axes follow the "price vs volume" heading
# and the y = price orientation used by the other price scatter plots in
# this file; the previous version had them swapped.
ggplot(diamonds, aes(x = volume, y = price)) +
  geom_point()
# with colour: same plot, points colored by clarity
ggplot(diamonds, aes(x = volume, y = price)) +
  geom_point(aes(color = clarity))
# Remove the top 1%: both axes run from 0 to the 99th percentile of their
# variable. xlim()/ylim() set scale limits, so points beyond them are
# dropped from the plot (ggplot2 reports them as removed rows).
# NOTE(review): x is carat here although this section is about volume;
# confirm whether volume was intended.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(aes(color = clarity)) +
  xlim(0, quantile(diamonds$carat, probs = 0.99)) +
  ylim(0, quantile(diamonds$price, probs = 0.99))
## Warning: Removed 926 rows containing missing values (geom_point).
Q4)