We are going to use the diamonds dataset in the ggplot2 package for data viz exercises
Q1) Examine the diamonds datasets- number of observations, variables and ordered factors
library(ggplot2)
# Load the diamonds data set shipped with ggplot2 and print its structure
# (row/column counts and the type of every column).
data("diamonds", package = "ggplot2")
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Q2) Create a histogram of the price of all the diamonds in the diamonds data set.
# Base-graphics histogram of diamond prices.
hist(diamonds$price)

# ggplot2 version with 500-dollar bins.
# The aesthetic refers to the bare column name `price`, resolved inside the
# `data` argument, rather than `diamonds$price`: using `$` inside aes()
# bypasses the layer data and misbehaves once facets or subsets are added.
ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(binwidth = 500) +
  ggtitle("Diamond Price Distribution") +
  xlab("Diamond Price U$") +
  ylab("Frequency")
# Q3: min, quartiles, median, mean and max of diamond prices
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
#The distribution is right-skewed: it has a long tail extending to the right
#Mean is greater than median
Q4) Diamond counts
# Diamonds priced below $500; str() reports the row count in its header line.
c500 <- subset(diamonds, subset = price < 500)
str(c500)
## 'data.frame': 1729 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Diamonds priced below $250; str() reports the row count in its header line.
c250 <- subset(diamonds, subset = price < 250)
str(c250)
## 'data.frame': 0 obs. of 10 variables:
## $ carat : num
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..:
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..:
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..:
## $ depth : num
## $ table : num
## $ price : int
## $ x : num
## $ y : num
## $ z : num
# Diamonds priced at $15,000 or more; str() reports the row count in its
# header line.
c15k <- subset(diamonds, subset = price >= 15000)
str(c15k)
## 'data.frame': 1656 obs. of 10 variables:
## $ carat : num 1.6 1.54 1.19 2.1 1.69 1.5 1.73 2.02 2.05 1.5 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 5 4 5 3 3 4 3 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 4 2 3 6 1 4 4 4 3 3 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 4 4 7 3 3 6 5 2 2 5 ...
## $ depth : num 61.9 62.3 61.5 61.5 60.8 62.9 62.8 63 61.9 61.6 ...
## $ table : num 56 58 55 57 57 56 57 59 56 58 ...
## $ price : int 15000 15002 15005 15007 15011 15013 15014 15014 15017 15022 ...
## $ x : num 7.53 7.31 6.82 8.25 7.69 7.22 7.57 8.05 8.13 7.35 ...
## $ y : num 7.47 7.39 6.84 8.21 7.71 7.32 7.72 7.95 8.18 7.43 ...
## $ z : num 4.64 4.58 4.2 5.06 4.68 4.57 4.8 5.03 5.05 4.55 ...
Q5) Cheaper diamonds
# Histogram of prices in the 0-2400 range with 50-dollar bins.
# Written with ggplot() + geom_histogram() instead of qplot(): qplot() is
# deprecated in current ggplot2, and the rest of this file already uses the
# ggplot() form. Bar outline and fill colours are unchanged.
# Note: setting `limits` on the scale discards observations outside 0-2400
# before binning (ggplot2 emits a "removed rows" warning for them).
ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50, colour = "black", fill = "#099DD9") +
  scale_x_continuous(limits = c(0, 2400), breaks = seq(0, 2400, 400))
Q6) Histogram of diamond price by cut
# Price histogram (500-dollar bins), one panel per cut.
# Uses ggplot() + geom_histogram() in place of the deprecated qplot(),
# matching the ggplot() calls used elsewhere in this file; colours and
# binwidth are unchanged.
ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(binwidth = 500, colour = "black", fill = "#099DD9") +
  facet_wrap(~cut)
# Price summary for each cut, printed with the session's full `digits`
# setting. The grouping expression is kept as `diamonds$cut` because by()
# uses its text for the group headers in the printed output.
by(
  data = diamonds$price,
  INDICES = diamonds$cut,
  FUN = summary,
  digits = getOption("digits")
)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337.000 2050.250 3282.000 4358.758 5205.500 18574.000
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327.000 1145.000 3050.500 3928.864 5028.000 18788.000
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336.00 912.00 2648.00 3981.76 5372.75 18818.00
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 1046.000 3185.000 4584.258 6296.000 18823.000
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 878.000 1810.000 3457.542 4678.500 18806.000
Q7) Scales and multiple histograms
# Price histogram per cut with an independent y-axis in every panel.
# - ggplot() + geom_histogram() replaces the deprecated qplot().
# - scales = "free_y" is what this "Scales" exercise is about: cut groups
#   differ greatly in size, so a shared y-axis flattens the smaller panels.
#   NOTE(review): the original call used fixed scales; confirm free y-axes
#   are the intended answer.
# - Binning is left at the ggplot2 default, so the stat_bin message recorded
#   below still applies.
ggplot(data = diamonds, aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Price summary for each cut, printed with the session's full `digits`
# setting. The grouping expression is kept as `diamonds$cut` because by()
# uses its text for the group headers in the printed output.
by(
  data = diamonds$price,
  INDICES = diamonds$cut,
  FUN = summary,
  digits = getOption("digits")
)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337.000 2050.250 3282.000 4358.758 5205.500 18574.000
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327.000 1145.000 3050.500 3928.864 5028.000 18788.000
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336.00 912.00 2648.00 3981.76 5372.75 18818.00
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 1046.000 3185.000 4584.258 6296.000 18823.000
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 878.000 1810.000 3457.542 4678.500 18806.000
Q8) Boxplots
# Boxplots of price for each cut, coloured by cut.
# coord_cartesian() zooms the y-axis to 0-7500 without dropping any data, so
# the box statistics are still computed from all prices.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, fill = cut)) +
  geom_boxplot() +
  labs(
    title = "Diamond Price according Cut",
    x = "Type of Cut",
    y = "Diamond Price U$"
  ) +
  coord_cartesian(ylim = c(0, 7500))
# Boxplots of price for each clarity grade, coloured by clarity.
# coord_cartesian() zooms the y-axis to 0-7500 without dropping any data, so
# the box statistics are still computed from all prices.
ggplot(
  data = diamonds,
  mapping = aes(x = clarity, y = price, fill = clarity)
) +
  geom_boxplot() +
  labs(
    title = "Diamond Price according Clarity",
    x = "Clarity",
    y = "Diamond Price U$"
  ) +
  coord_cartesian(ylim = c(0, 7500))
Q9) IQR
# Price summary for each colour grade; the IQR of a group is its
# 3rd Qu. minus its 1st Qu. in the printed table.
# The grouping expression is kept as `diamonds$color` because by() uses its
# text for the group headers in the printed output.
by(data = diamonds$price, INDICES = diamonds$color, FUN = summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
Q10) Investigate the price per carat of diamonds across the different colors of diamonds using boxplots.
# Boxplots of price per carat (price / carat) for each colour grade,
# coloured by colour.
ggplot(
  data = diamonds,
  mapping = aes(x = color, y = price / carat, fill = color)
) +
  geom_boxplot() +
  labs(
    title = "Diamond Price per Carat according Color",
    x = "Color",
    y = "Diamond Price per Carat U$"
  )
Q11) Frequency polygon of diamond counts by carat
# Frequency polygon of diamond counts over carat size, using ggplot2's
# default binning.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly() +
  labs(
    title = "Diamond Frequency by Carat",
    x = "Carat Size",
    y = "Count"
  )
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.