We are going to use the diamonds dataset from the ggplot2 package for data visualization exercises.

Q1) Examine the diamonds dataset: number of observations, variables, and ordered factors

library(ggplot2)

# Load the diamonds data shipped with ggplot2 into the workspace.
data("diamonds", package = "ggplot2")

# Show the number of observations, the variables, and each variable's type.
str(object = diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Q2) Create a histogram of the price of all the diamonds in the diamonds dataset.

# Base-graphics histogram of diamond prices.
hist(diamonds$price)

# ggplot2 version with 500-wide price bins.
# The column is mapped by its bare name: aes() looks variables up in the
# `data` argument, so `diamonds$price` inside aes() would bypass the layer data.
ggplot(data = diamonds) +
  geom_histogram(aes(x = price), binwidth = 500) +
  ggtitle("Diamond Price Distribution") +
  xlab("Diamond Price U$") +
  ylab("Frequency")

# Q3) Summary statistics of diamond prices
summary(diamonds[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820
# The distribution is right-skewed: a long tail extends toward high prices,
# so the mean (3933) is greater than the median (2401).

Q4) Diamond counts

# Count the diamonds in each price band.
# nrow() reports the count directly instead of reading it off a str() dump;
# the subsets stay assigned to c500 / c250 / c15k as before.

# Diamonds costing less than 500
c500 <- subset(diamonds, price < 500)
nrow(c500)
## [1] 1729

# Diamonds costing less than 250 (none)
c250 <- subset(diamonds, price < 250)
nrow(c250)
## [1] 0

# Diamonds costing 15000 or more
c15k <- subset(diamonds, price >= 15000)
nrow(c15k)
## [1] 1656

Q5) Cheaper diamonds

# Histogram of the lower price range: 50-wide bins, x-axis limited to 0-2400
# with a tick every 400.
# Written with ggplot() + geom_histogram() because qplot() is deprecated.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50, colour = "black", fill = "#099DD9") +
  scale_x_continuous(limits = c(0, 2400), breaks = seq(0, 2400, 400))

Q6) Histogram of diamond price by cut

# Price histograms (500-wide bins), one panel per cut.
# Written with ggplot() + geom_histogram() because qplot() is deprecated.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 500, colour = "black", fill = "#099DD9") +
  facet_wrap(~cut)

# Price summary for each cut, printed at the session's `digits` setting.
by(
  data = diamonds$price,
  INDICES = diamonds$cut,
  FUN = summary,
  digits = getOption("digits")
)
## diamonds$cut: Fair
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   337.000  2050.250  3282.000  4358.758  5205.500 18574.000 
## -------------------------------------------------------- 
## diamonds$cut: Good
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   327.000  1145.000  3050.500  3928.864  5028.000 18788.000 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   336.00   912.00  2648.00  3981.76  5372.75 18818.00 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000  1046.000  3185.000  4584.258  6296.000 18823.000 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000   878.000  1810.000  3457.542  4678.500 18806.000

Q7) Scales and multiple histograms

# Price histograms per cut using the default binning.
# scales = "free_y" gives every panel its own y-axis, so panels with fewer
# diamonds are not flattened by the tallest panel.
# Written with ggplot() + geom_histogram() because qplot() is deprecated.
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Price summary for each cut, printed at the session's `digits` setting.
by(
  data = diamonds$price,
  INDICES = diamonds$cut,
  FUN = summary,
  digits = getOption("digits")
)
## diamonds$cut: Fair
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   337.000  2050.250  3282.000  4358.758  5205.500 18574.000 
## -------------------------------------------------------- 
## diamonds$cut: Good
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   327.000  1145.000  3050.500  3928.864  5028.000 18788.000 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   336.00   912.00  2648.00  3981.76  5372.75 18818.00 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000  1046.000  3185.000  4584.258  6296.000 18823.000 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000   878.000  1810.000  3457.542  4678.500 18806.000

Q8) Boxplots

# Price boxplots by cut; coord_cartesian() sets the visible y-range to 0-7500.
ggplot(diamonds, aes(x = factor(cut), y = price, fill = cut)) +
  geom_boxplot() +
  labs(
    title = "Diamond Price according Cut",
    x = "Type of Cut",
    y = "Diamond Price U$"
  ) +
  coord_cartesian(ylim = c(0, 7500))

# Price boxplots by clarity, with the same visible y-range.
ggplot(diamonds, aes(x = factor(clarity), y = price, fill = clarity)) +
  geom_boxplot() +
  labs(
    title = "Diamond Price according Clarity",
    x = "Clarity",
    y = "Diamond Price U$"
  ) +
  coord_cartesian(ylim = c(0, 7500))

Q9) Interquartile range (IQR) of price by color

# Interquartile range (3rd quartile minus 1st quartile) of price per color.
by(diamonds$price, diamonds$color, IQR)

# Full price summary per color; the quartiles behind each IQR are listed here.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

Q10) Investigate the price per carat of diamonds across the different colors of diamonds using boxplots.

# Boxplots of price per carat (price / carat) for each color.
ggplot(diamonds, aes(x = factor(color), y = price / carat, fill = color)) +
  geom_boxplot() +
  labs(
    title = "Diamond Price per Carat according Color",
    x = "Color",
    y = "Diamond Price per Carat U$"
  )

Q11) Frequency polygon of diamond counts by carat

# Frequency polygon of diamond counts across carat sizes (default binning).
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly() +
  labs(
    title = "Diamond Frequency by Carat",
    x = "Carat Size",
    y = "Count"
  )
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.