This is the exercise on visualizing one-dimensional variables in the dataset

  # Attach ggplot2 for the plotting functions used throughout.
  library(ggplot2)
  # Load the diamonds example data into the workspace and keep a short
  # alias, `df`, that the rest of the analysis refers to.
  data("diamonds", package = "ggplot2")
  df <- diamonds
  # Show each column's name and type.
  str(df)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
  # Per-column overview: quartiles and mean for the numeric columns,
  # level counts for the factor columns (see the output below).
  summary(df)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 
  # Number of rows and columns in the dataset.
  dim(df)
## [1] 53940    10
  # Structure of the color column on its own.
  str(df$color)
##  Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...

Let's observe the price distribution of the diamonds

  # Histogram of diamond prices (bins of width 250) with marker segments
  # at the mean (blue) and the median (green).
  #
  # The x aesthetic is the bare column name `price`, which qplot resolves
  # inside `data = df`. Writing `df$price` in an aesthetic bypasses the
  # `data` argument and is discouraged by ggplot2.
  price_mean <- mean(df$price)
  price_median <- median(df$price)

  qplot(x = price, data = df,
        binwidth = 250,
        xlab = "Diamond Price",
        ylab = "Count",
        main = "Distribution for diamond prices",
        col = I("orange")) +
    scale_x_continuous(breaks = seq(0, 20000, 2000)) +
    # Mean marker; the segment height (2500) is a fixed visual choice.
    annotate("segment", x = price_mean, xend = price_mean,
             y = 0, yend = 2500,
             colour = "blue",
             size = 1) +
    # Median marker, drawn to the same height as the mean marker.
    annotate("segment", x = price_median, xend = price_median,
             y = 0, yend = 2500,
             colour = "green",
             size = 1)

Summary of the diamond prices

  # Quartiles, mean, minimum and maximum of price.
  # NOTE(review): the Max in the output below (18820) differs from the Max
  # reported for price by summary(df) (18823) -- presumably display
  # rounding in summary(); confirm before quoting either number.
  summary(df$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

Counting diamond prices

# Count diamonds in three price bands. Summing a logical vector counts its
# TRUE values directly, so no ifelse(..., 1, 0) conversion is needed.
count_less_500 <- sum(df$price < 500)
count_less_250 <- sum(df$price < 250)
# This bound is inclusive (>= 15000), unlike the two strict bounds above.
count_more_15000 <- sum(df$price >= 15000)

print(count_less_500)
## [1] 1729
print(count_less_250)
## [1] 0
print(count_more_15000)
## [1] 1656

Zooming around the peak of the price distribution

# Price histogram with narrower bins (width 100), zoomed to prices between
# 500 and 2000. coord_cartesian() only changes the visible window, so the
# bins are still computed from all observations.
#
# The x aesthetic is the bare column `price` (looked up in `data = df`)
# rather than `df$price`, which ggplot2 discourages inside aesthetics.
qplot(x = price, data = df,
      binwidth = 100,
      xlab = "Diamond Price",
      ylab = "Count",
      main = "Distribution for diamond prices",
      col = I("orange")) +
  coord_cartesian(xlim = c(500, 2000)) +
  scale_y_continuous(breaks = seq(0, 3000, 250))

Saving the plot

# Write a plot to a JPEG file. Neither a plot object nor a size is passed,
# so ggsave() falls back on its defaults (the output below reports a
# 7 x 5 in image).
ggsave(filename = "zoomed version of diamond price distrubution.jpg")
## Saving 7 x 5 in image

Price distribution by diamond cut

library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.3
# Price histograms (bins of width 200), one panel per cut level, with
# independent axes per panel (scales = "free") and x limited to 0-10000.
#
# The x aesthetic must be the bare column `price` so that it is evaluated
# inside `data = df` together with the faceting variable. Passing the
# external vector `df$price` is discouraged by ggplot2 and is fragile
# when combined with facets.
f1 <- qplot(x = price, data = df,
            binwidth = 200,
            xlab = "Diamond Price",
            ylab = "Count",
            main = "Distribution for diamond prices",
            col = I("gray"),
            xlim = c(0, 10000)) +
  facet_wrap(~ cut, ncol = 3, scales = "free")
  

# Boxplot of price for each cut level. The y axis is zoomed to 0-10000
# with coord_cartesian(), which changes the view without altering the
# data the boxes are computed from.
f2 <- qplot(
  x = cut, y = price, data = df,
  geom = "boxplot",
  fill = I("pink"),
  main = "Diamond prices Vs Cut",
  xlab = "Diamond Cut",
  ylab = "Diamond price"
) +
  coord_cartesian(ylim = c(0, 10000))

# Stack the faceted histograms (f1) above the boxplot (f2).
grid.arrange(f1, f2, nrow = 2)

Price per Carat distribution by Cut

# Histograms of price per carat (bins of width 500), one panel per cut
# level, with independent axes per panel and x limited to 0-10000.
#
# The ratio is written with bare column names so it is evaluated inside
# `data = df` alongside the faceting variable; `df$price / df$carat` is
# discouraged by ggplot2 and fragile with facets. The labels also fix the
# "carot" misspelling.
qplot(x = price / carat, data = df,
      binwidth = 500,
      xlab = "Price per carat",
      ylab = "Count",
      main = "Distribution for diamond prices/carat",
      col = I("gray"),
      xlim = c(0, 10000)) +
  facet_wrap(~ cut, ncol = 3, scales = "free")

# Histogram of price per carat (bins of width 500) with bar outlines
# coloured by cut.
#
# Bare column names are used so the ratio is evaluated inside `data = df`
# (ggplot2 discourages `df$...` in aesthetics). The title now says
# "Histogram" to match the geom, and "carot" is corrected to "carat".
f1 <- qplot(x = price / carat, data = df,
            binwidth = 500,
            geom = "histogram",
            xlab = "Price per carat",
            ylab = "Count",
            main = "Histogram for diamond prices/carat",
            col = cut)

# Frequency polygon of price per carat (bins of width 500), one line per
# cut level, distinguished by colour.
#
# Bare column names are used so the ratio is evaluated inside `data = df`
# (ggplot2 discourages `df$...` in aesthetics); "carot" is corrected to
# "carat" in the labels.
f2 <- qplot(x = price / carat, data = df,
            binwidth = 500,
            geom = "freqpoly",
            xlab = "Price per carat",
            ylab = "Count",
            main = "Frequency polygon for diamond prices/carat",
            col = cut)
       
# Density curves of log10(price per carat), one filled curve per cut.
#
# Changes from the earlier version of this plot:
#   * alpha is wrapped in I() so 0.2 is used as a constant transparency;
#     unwrapped, qplot treats it as a mapped aesthetic, which rescales it
#     and adds a meaningless legend entry.
#   * binwidth is dropped: it is a histogram/frequency-polygon setting and
#     is not a density parameter.
#   * Labels describe what is drawn (a density of the log10 values) and
#     fix the "carot" misspelling.
#   * Bare column names replace `df$...` so the expression is evaluated
#     inside `data = df`.
f3 <- qplot(x = log10(price / carat), data = df,
            geom = "density",
            xlab = "log10(Price per carat)",
            ylab = "Density",
            main = "Density for log10 transformed diamond prices/carat",
            fill = cut,
            alpha = I(0.2))

# Stack the histogram, frequency polygon and density plot vertically.
grid.arrange(f1, f2, f3, nrow = 3)