Data Anlysis with R - Problem Set 3

What to Do First?

Notes:

Check the working directory and check the list of files
Load necessary libraries
The diamonds data set will be listed as a ‘Promise’ in the workspace. This is a special object in R, and you need to run a command on the data to fully load the data set.

getwd()

## [1] "C:/Users/amackay/Documents/R Scripts"

setwd("~/R Datasources")

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.1

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.2.1

Diamond Dataset

Notes: 1. No. of observations in the dataset: 53940 2. Number of variables in thedatset: 10 3. Number of Ordered factors in the dataset: 3 4. Whihc letter represents the best color: D

data("diamonds")
names(diamonds)

##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"

Examine the Price distribution

qplot(x = price, data = diamonds)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

summary(diamonds$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

#how many diamonds cost less that $500 == 1749
summary(diamonds$price < 501)

##    Mode   FALSE    TRUE    NA's 
## logical   52191    1749       0

#how many diamonds cost less that $250 == 0
summary(diamonds$price < 251)

##    Mode   FALSE    NA's 
## logical   53940       0

#how many diamonds cost  $15000 or more  == 1656
summary(diamonds$price >= 15000)

##    Mode   FALSE    TRUE    NA's 
## logical   52284    1656       0

#Explore the largest peak in the histo
qplot(x = price , data = diamonds, 
      binwidth = 1,
      color = I('black'), fill = I('#099DD9'),
      xlab = 'Price of Diamonds',
      ylab = 'Count of Diamonds in sample') +
  scale_x_continuous(limits = c(670,700), breaks = seq(600,1000,5))

#Break out the histogram of diamond prices by cut
qplot(x = price , data = diamonds, 
      binwidth = 1,
      color = I('black'), fill = I('#099DD9'),
      xlab = 'Price of Diamonds',
      ylab = 'Count of Diamonds in sample') +
  scale_x_continuous(limits = c(670,700), breaks = seq(600,1000,5)) +
  facet_wrap(~cut)

# Which cut has the highest priced diamond?
# dont use summary as it does rounding
by(diamonds$price, diamonds$cut,max)

## diamonds$cut: Fair
## [1] 18574
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 18788
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 18818
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 18823
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 18806

#which cut has the lowest priced diamond?
by(diamonds$price, diamonds$cut,min)

## diamonds$cut: Fair
## [1] 337
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 327
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 336
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 326
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 326

#which cut has the lowest median price ?
by(diamonds$price, diamonds$cut,median)

## diamonds$cut: Fair
## [1] 3282
## -------------------------------------------------------- 
## diamonds$cut: Good
## [1] 3050.5
## -------------------------------------------------------- 
## diamonds$cut: Very Good
## [1] 2648
## -------------------------------------------------------- 
## diamonds$cut: Premium
## [1] 3185
## -------------------------------------------------------- 
## diamonds$cut: Ideal
## [1] 1810

Examine the price by cut using free scales

NOTES:

# In the last exercise, we looked at the summary statistics
# for diamond price by cut. If we look at the output table, the
# the median and quartiles are reasonably close to each other.
# This means the distributions should be somewhat similar,
# but the histograms we created don't show that.
by(diamonds$price, diamonds$cut,summary)

## diamonds$cut: Fair
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3282    4359    5206   18570 
## -------------------------------------------------------- 
## diamonds$cut: Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327    1145    3050    3929    5028   18790 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     912    2648    3982    5373   18820 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1046    3185    4584    6296   18820 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     878    1810    3458    4678   18810

# The 'Fair' and 'Good' diamonds appear to have 
# different distributions compared to the better
# cut diamonds. They seem somewhat uniform
# on the left with long tails on the right

# Look up the documentation for facet_wrap in R Studio.
# T?facet_
# the y-axis in the histograms is not fixed. You want the y-axis to
# be different for each histogram.
# Look at free scales here: http://www.cookbook-r.com/Graphs/Facets_(ggplot2)/

qplot(x = price , data = diamonds, 
      binwidth = 1,
      color = I('black'), fill = I('#099DD9'),
      xlab = 'Price of Diamonds',
      ylab = 'Count of Diamonds in sample') +
  scale_x_continuous(limits = c(670,700), breaks = seq(600,1000,5)) +
  facet_wrap(~cut, scales = 'free_y')

***

Price Per Carat By Cut

names(diamonds)

names(diamonds)

##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"

diamonds$price_per_carat <- NA
diamonds$price_per_carat = diamonds$carat / diamonds$price

diamonds$price_per_carat = factor(diamonds$price_per_carat)
summary(diamonds)

##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z                      price_per_carat 
##  Min.   : 0.000   Min.   : 0.000   0.000444444444444444:  331  
##  1st Qu.: 4.720   1st Qu.: 2.910   0.000555555555555556:  215  
##  Median : 5.710   Median : 3.530   0.000238095238095238:  160  
##  Mean   : 5.735   Mean   : 3.539   0.000495867768595041:  157  
##  3rd Qu.: 6.540   3rd Qu.: 4.040   0.000476190476190476:  129  
##  Max.   :58.900   Max.   :31.800   0.00037037037037037 :  126  
##                                    (Other)             :52822

#plot histo with free y axis
qplot(x = price/carat, data = diamonds) +
  facet_wrap(~cut, scales = 'free_y')

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

#transsform to log scale
qplot(x = price/carat, data = diamonds) +
  facet_wrap(~cut, scales = 'free_y') +
  scale_x_log10()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

#add some color and titles
qplot(x = price/carat, data = diamonds,
      xlab = 'Price per Carat',
      ylab = 'Number of diamonds in sample',
      color = I('black'), fill = I('#099DD9')) +
  facet_wrap(~cut, scales = 'free_y') +
  scale_x_log10()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

***

Examing the IQR

NOTES:

by(diamonds$price,diamonds$color,summary)

## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

#You can use the function IQR() to find the interquartile range. Pass it a subset of the diamonds data frame. 

#For example... 
#IQR(subset(diamonds, price <1000)$price) 
#Remember subset returns a data frame so we need to use $price on the end to access that variable.

by(diamonds$price,diamonds$color,IQR)

## diamonds$color: D
## [1] 3302.5
## -------------------------------------------------------- 
## diamonds$color: E
## [1] 3121
## -------------------------------------------------------- 
## diamonds$color: F
## [1] 3886.25
## -------------------------------------------------------- 
## diamonds$color: G
## [1] 5117
## -------------------------------------------------------- 
## diamonds$color: H
## [1] 4996.25
## -------------------------------------------------------- 
## diamonds$color: I
## [1] 6081.25
## -------------------------------------------------------- 
## diamonds$color: J
## [1] 5834.5

Examine Price per carat box plots by color

qplot(x=color,y = price/carat,data = diamonds,
      geom = "boxplot")

#limit the y-axis
qplot(x=color,y = price/carat,data = diamonds,
      geom = "boxplot")+
  coord_cartesian(ylim = c(1500,6500))

#check and validate with numerical summary
by(diamonds$price/diamonds$carat,diamonds$color, summary)

## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1128    2455    3411    3953    4749   17830 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1078    2430    3254    3805    4508   14610 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1168    2587    3494    4135    4947   13860 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1139    2538    3490    4163    5500   12460 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1051    2397    3819    4008    5127   10190 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1152    2345    3780    3996    5197    9398 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1081    2563    3780    3826    4928    8647

Carat Frequency Polygon

NOTES: Determin if these carat sizes have a count greated than 2000 0.1,0.3,0.8,1.01,1.6,2.0,3.0,5.0

qplot(x = carat,
      data = diamonds,
      geom = 'freqpoly',
      xlab = 'Carat',
      ylab = 'Count of diamonds in the sample'
      ) +
  scale_x_continuous(breaks = seq(0,6,0.1)) +
  scale_y_continuous(breaks = seq(0,9000,1000))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

***

Questions to clarify?

What are ordered factors?
Difference between scale_x_continious and scale_x_discrete?
Understanding Free Scales
Unable to get the quiz right for Problem set 3 Carat Freq Poly section