Diamonds are Expensive

Load the libraries needed for this exercise, create a tibble out of the diamonds data set using dplyr, and take a look at a summary to see what we have.

#Load all my stuff
#Rmd
#r message=FALSE, echo=FALSE, warning=FALSE

library(ggplot2)
library(dplyr)
library(magrittr)
library(ggthemes)

# See what we have.
# Convert the diamonds data set to a tibble and summarise every column.
# as_tibble() is used because dplyr has deprecated tbl_df().
tbl_diamond <- as_tibble(diamonds)
summary(tbl_diamond)
     carat               cut        color        clarity     
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
 Max.   :5.0100                     I: 5422   VVS1   : 3655  
                                    J: 2808   (Other): 2531  
     depth           table           price             x         
 Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
 1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
 Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
 Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
 3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
 Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
                                                                 
       y                z         
 Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 4.720   1st Qu.: 2.910  
 Median : 5.710   Median : 3.530  
 Mean   : 5.735   Mean   : 3.539  
 3rd Qu.: 6.540   3rd Qu.: 4.040  
 Max.   :58.900   Max.   :31.800  
                                  
# Stand-alone factor copy of the clarity grades.
# as.factor() hands back an existing factor unchanged, so this is a no-op
# conversion if the column is already a factor.
Clarity <- as.factor(diamonds[["clarity"]])

Step 1: Use a dot-plot to compare price, carats, and clarity

This shows that as clarity and carat increase, the price increases…fast. For reference, the clarity chart:

# Scatter plot of price against carat, coloured by clarity grade.
# Colour is mapped to the `clarity` column of the plotted data rather than to
# a separate global vector, so the mapping always lines up with the rows.
# The legend title is set on the colour guide: no fill aesthetic is mapped, so
# guides(fill = ...) had nothing to act on.
# The title names carat (the x axis) instead of cut, which is not plotted.
a <- tbl_diamond %>%
  ggplot(aes(x = carat, y = price, color = clarity)) +
  geom_point() +
  xlab("Carats") +
  ylab("Price") +
  guides(color = guide_legend("Clarity")) +
  ggtitle("Comparing Diamond Carat, Price and Clarity")

# Be patient: there is a lot of data to plot, so this takes a second or two.
a

Step 2: Use a dot-plot to compare cut, price, and carat

This shows that while some cuts may have higher prices, the ‘ideal’ cut is all over the place.

# Stand-alone factor copy of cut; kept because other code may refer to it
# by name.
fact_cut <- as.factor(diamonds$cut)

# Scatter plot of price against carat, coloured by cut.
# Colour is mapped to the `cut` column of the plotted data instead of the
# global fact_cut vector, so the mapping cannot drift out of step with the
# rows if the data is filtered or reordered.
b <- tbl_diamond %>%
  ggplot(aes(x = carat, y = price, color = cut)) +
  geom_point() +
  xlab("Carats") +
  ylab("Price") +
  ggtitle("Comparing Diamond Carat and Price by Cut") +
  scale_colour_discrete(name = "Cut")

b

Step 3: Ideal Cut Distribution

We pull out the ideal cut to see the distribution. It still appears there is nothing special about the ideal cut.

# What is 'ideal'?
# Split the data on cut. First, every row whose cut is not "Ideal".
tbl_diamond_no_ideal <- filter(tbl_diamond, cut != "Ideal")

# Factor of the cuts left in that subset (as.factor() does not drop levels,
# so an existing "Ideal" level would still be listed).
fact_cut_noideal <- as.factor(tbl_diamond_no_ideal[["cut"]])

# Second, only the rows whose cut is "Ideal".
tbl_diamond_ideal <- filter(tbl_diamond, cut == "Ideal")

# Scatter plot of price against carat for the Ideal cut only.
# Nothing is mapped to fill (or to clarity) in this plot, so the former
# guides(fill = guide_legend("Clarity")) call had no effect and is removed.
c_ideal <- tbl_diamond_ideal %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point() +
  xlab("Carats") +
  ylab("Price") +
  ggtitle("Comparing Diamond Carat and Price for Ideal cuts") +
  theme_stata()
c_ideal

Step 4: Compare Price and Cut

This boxplot shows the quartiles and median for price of the different cuts. Contrary to what I would have expected, it doesn’t look like the ‘better’ cuts have a significantly higher median price.

# Box plot of price within each cut.
# x is mapped to the `cut` column of the plotted data instead of the global
# fact_cut vector, so the grouping always matches the rows being drawn.
d_box1 <- tbl_diamond %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot() +
  xlab("Cut") +
  ylab("Price") +
  ggtitle("Boxplot to see Cut and Price") +
  theme_solarized_2()
d_box1

Step 5: Compare Price and Clarity

This boxplot shows the quartiles and median for price of the different levels of clarity. Interestingly, the median price for the better clarity grades is not higher. In fact, it looks like it goes down in some instances.

# Box plot of price within each clarity grade.
d_box2 <- ggplot(tbl_diamond, aes(x = clarity, y = price)) +
  geom_boxplot() +
  labs(
    x = "Clarity",
    y = "Price",
    title = "Comparing Diamond Clarity and Price"
  ) +
  theme_solarized_2()
d_box2

Step 6: Compare Carat and Clarity

Let’s compare the carat and clarity to see if there is an indication as to what is going on. It looks like as clarity goes up, the average carat goes down!

# Box plot of carat weight within each clarity grade.
d_box3 <- ggplot(tbl_diamond, aes(x = clarity, y = carat)) +
  geom_boxplot() +
  labs(
    x = "Clarity",
    y = "Carat",
    title = "Comparing Diamond Carat and Clarity"
  ) +
  theme_solarized_2()
d_box3

Step 7: Line Graph of Mean Price by Cut

Do some analysis on the data and create a line graph of the mean price per carat by cut. We normalize for carat by dividing the price by the carat weight to get Price Per Carat. We see that the price per carat does increase as cut gets “better”, until we get to the “ideal” cut…which seems to be ideal if you are trying to save money!

# Average carat for each clarity grade; this may explain the price pattern.
gb_clarity <- tbl_diamond %>%
  group_by(clarity)

# The summary column is left unnamed, so it is labelled with the expression.
gb_clarity %>%
  summarise(mean(carat, na.rm = TRUE))
# A tibble: 8 x 2
  clarity `mean(carat, na.rm = TRUE)`
    <ord>                       <dbl>
1      I1                   1.2838462
2     SI2                   1.0776485
3     SI1                   0.8504822
4     VS2                   0.7639346
5     VS1                   0.7271582
6    VVS2                   0.5962021
7    VVS1                   0.5033215
8      IF                   0.5051229
# Price per carat puts diamonds of different sizes (carat) on a common scale.
tbl_mutate_caratprice <- mutate(tbl_diamond, mut_caratprice = price / carat)

# Mean price and mean price per carat within each clarity grade.
gb_clarity_ppc <- tbl_mutate_caratprice %>%
  group_by(clarity)
sum_clarity_ppc <- gb_clarity_ppc %>%
  summarise(
    mPrice = mean(price),
    mPPC = mean(mut_caratprice, na.rm = TRUE)
  )

# The same two averages within each cut.
gb_cut_ppc <- tbl_mutate_caratprice %>%
  group_by(cut)
sum_cut_PPC <- gb_cut_ppc %>%
  summarise(
    mPrice = mean(price),
    mPPC = mean(mut_caratprice, na.rm = TRUE)
  )

summary(sum_cut_PPC)
        cut        mPrice          mPPC     
 Fair     :1   Min.   :3458   Min.   :3767  
 Good     :1   1st Qu.:3929   1st Qu.:3860  
 Very Good:1   Median :3982   Median :3920  
 Premium  :1   Mean   :4062   Mean   :3957  
 Ideal    :1   3rd Qu.:4359   3rd Qu.:4014  
               Max.   :4584   Max.   :4223  
# Line plot of the mean price per carat for each cut.
# group = 1 puts all rows in a single group so geom_line() joins the per-cut
# points across the discrete x axis.
# theme_solarized() is a complete theme, so it is added first and the
# theme(legend.position = ...) tweak second; in the opposite order the
# complete theme replaces the tweak.
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept here for compatibility with the installed version.
line_cut_PPC <- sum_cut_PPC %>%
  ggplot(aes(x = cut, y = mPPC, group = 1)) +
  geom_point(size = 3) +
  geom_line(size = 2) +
  xlab("Diamond Cut") +
  ylab("Average Price per Carat") +
  ggtitle("Average Price Per Carat by Cut: is Ideal, Ideal?") +
  theme_solarized() +
  theme(legend.position = "bottom")

line_cut_PPC