Visualizing one-dimensional variables

This is an exercise on visualizing one-dimensional variables in the diamonds dataset.

  # Attach ggplot2 (plotting functions plus the diamonds data set), load the
  # data set explicitly, and keep a working copy under the short name `df`.
  library(ggplot2)
  data("diamonds", package = "ggplot2")
  df <- diamonds
  # Print the column names, types and first few values.
  str(df)

## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

  # Per-column summary: quartiles/mean for numeric columns, level counts for
  # the factor columns.
  summary(df)

##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
##

  # Number of rows (observations) and columns (variables) in the data set.
  dim(df)

## [1] 53940    10

  # Inspect the structure of the color column on its own.
  str(df$color)

##  Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...

Let's observe the price distribution of the diamonds

  # Histogram of diamond prices in 250-wide bins, with x-axis breaks every
  # 2000 and two vertical reference segments (from y = 0 to y = 2500):
  # blue at the mean price, green at the median price.
  # `price` is referenced by column name so that it is looked up in
  # `data = df`; passing `df$price` would bypass the data argument.
  qplot(x = price, data = df,
        binwidth = 250,
        xlab = "Diamond Price",
        ylab = "Count",
        main = "Distribution for diamond prices",
        col = I("orange")) +
    scale_x_continuous(breaks = seq(0, 20000, 2000)) +
    # Mean line
    annotate("segment", x = mean(df$price), xend = mean(df$price),
             y = 0, yend = 2500,
             colour = "blue",
             size = I(1)) +
    # Median line
    annotate("segment", x = median(df$price), xend = median(df$price),
             y = 0, yend = 2500,
             colour = "green",
             size = I(1))

Summary of the diamond prices

  # Five-number summary plus the mean of the price column.
  summary(df$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

Counting diamond prices

# Count the diamonds in each price band. A comparison yields a logical
# vector and sum() counts its TRUE entries directly, so no ifelse() recoding
# to 1/0 is needed.
count_less_500 <- sum(df$price < 500)
count_less_250 <- sum(df$price < 250)
# Note: this band is inclusive of 15000 (>=).
count_more_15000 <- sum(df$price >= 15000)

print(count_less_500)

## [1] 1729

print(count_less_250)

## [1] 0

print(count_more_15000)

## [1] 1656

Zooming around the peak of the price distribution

# Price histogram with 100-wide bins, zoomed to prices between 500 and 2000.
# coord_cartesian() is used for the zoom, and the y axis gets a break every
# 250 counts. `price` is referenced by column name so it is resolved inside
# `data = df` rather than via `df$price`.
qplot(x = price, data = df,
      binwidth = 100,
      xlab = "Diamond Price",
      ylab = "Count",
      main = "Distribution for diamond prices",
      col = I("orange")) +
  coord_cartesian(xlim = c(500, 2000)) +
  scale_y_continuous(breaks = seq(0, 3000, 250))

Saving the plot

# Write a plot to a JPEG file in the working directory. No plot or size is
# passed, so ggsave() uses its defaults (presumably the most recently
# displayed plot -- confirm against the ggplot2 documentation).
ggsave("zoomed version of diamond price distrubution.jpg")

## Saving 7 x 5 in image

Price distribution by diamond cut

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.2.3

# Price histograms (200-wide bins), one panel per cut laid out in three
# columns with free axis scales; the x axis is limited to 0-10000.
# `price` is referenced by column name so that the aesthetic and the
# facetting variable both come from `data = df`; `df$price` would bypass it.
f1 <- qplot(x = price, data = df,
            binwidth = 200,
            xlab = "Diamond Price",
            ylab = "Count",
            main = "Distribution for diamond prices",
            col = I("gray"),
            xlim = c(0, 10000)) +
  facet_wrap(~ cut, ncol = 3, scales = "free")
  

# Box plot of price for each cut, filled pink.
f2 <- qplot(
  x = cut, y = price, data = df,
  geom = "boxplot",
  fill = I("pink"),
  main = "Diamond prices Vs Cut",
  xlab = "Diamond Cut",
  ylab = "Diamond price"
)
# Zoom the y axis to 0-10000 via the coordinate system.
f2 <- f2 + coord_cartesian(ylim = c(0, 10000))

# Stack the faceted histograms (f1) above the box plot (f2).
grid.arrange(f1, f2, nrow = 2)

Price per Carat distribution by Cut

# Histograms of price per carat (500-wide bins), one panel per cut laid out
# in three columns with free axis scales; the x axis is limited to 0-10000.
# The ratio is written with bare column names so it is evaluated inside
# `data = df` alongside the facetting variable. Labels spell "carat"
# correctly.
qplot(x = price / carat, data = df,
      binwidth = 500,
      xlab = "Price per carat",
      ylab = "Count",
      main = "Distribution for diamond prices/carat",
      col = I("gray"),
      xlim = c(0, 10000)) +
  facet_wrap(~ cut, ncol = 3, scales = "free")

# Histogram of price per carat (500-wide bins) with the bar outline colour
# mapped to cut. The ratio uses bare column names so it is evaluated inside
# `data = df`. The title names the geom actually drawn (a histogram).
f1 <- qplot(x = price / carat, data = df,
            binwidth = 500,
            geom = "histogram",
            xlab = "Price per carat",
            ylab = "Count",
            main = "Histogram of diamond prices/carat",
            col = cut)

# Frequency polygon of price per carat (500-wide bins), one line per cut.
# The ratio uses bare column names so it is evaluated inside `data = df`.
f2 <- qplot(x = price / carat, data = df,
            binwidth = 500,
            geom = "freqpoly",
            xlab = "Price per carat",
            ylab = "Count",
            main = "Frequency polygon for diamond prices/carat",
            col = cut)
       
# Density curves of log10(price per carat), filled by cut.
# - alpha is wrapped in I() so it is applied as a fixed transparency rather
#   than mapped as an aesthetic.
# - No binwidth is passed: a density geom does not bin.
# - Title and axis labels describe what is drawn (a density of log10 values).
f3 <- qplot(x = log10(price / carat), data = df,
            geom = "density",
            xlab = "log10(Price per carat)",
            ylab = "Density",
            main = "Density of log10 transformed diamond prices/carat",
            fill = cut,
            alpha = I(0.2))

# Stack the histogram, frequency polygon and density plot vertically.
grid.arrange(f1, f2, f3, nrow = 3)

Visualizing one-dimensional variables

Bilal

December 24, 2015

Let's observe the price distribution of the diamonds

Summary of the diamond prices

Counting diamond prices

Zooming around the peak of the price distribution

Saving the plot

Price distribution by diamond cut

Price per Carat distribution by Cut