# Load the main libraries used throughout the assignment.

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(scales))
suppressMessages(library(xlsx))
suppressMessages(library(tidyr))
suppressMessages(library(lubridate))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))

#Q4.1

# In this problem set, you'll continue
# to explore the diamonds data set.

# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.

# Scatterplot of price against x with heavy alpha blending to counter
# overplotting. The x-axis view is restricted to 3.5-12 and the y-axis is
# marked every $2,000 with dollar-formatted labels.
# `labels` is spelled out in full rather than relying on partial argument
# matching of `label`.
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point(alpha = 1 / 20) +
  coord_cartesian(xlim = c(3.5, 12)) +
  scale_y_continuous(breaks = seq(1000, 19000, 2000), labels = dollar)

#Q4.2

# What are your observations about the scatterplot of price vs x?

# The bulk of the data start at around x = 3.3 and price rises exponentially as it approaches 9. 
# There also seems to be an artificial ceiling in price around $19k.

#Q4.3

#a. What is the correlation between price and x?
# Pearson correlation test, expressed through the formula interface.
cor.test(~ price + x, data = diamonds)

## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

#b. What is the correlation between price and y?
# Pearson correlation test, expressed through the formula interface.
cor.test(~ price + y, data = diamonds)

## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

#c. What is the correlation between price and z?
# Pearson correlation test, expressed through the formula interface.
cor.test(~ price + z, data = diamonds)

## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

#Q4.4

# Create a simple scatter plot of price vs depth.
# Price against depth, drawn with translucent points (alpha = 1/20).
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 20)

#Q4.5

# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units. 

# Price against depth with very faint points (alpha = 1/100) and an x-axis
# tick every 2 units, starting from the smallest observed depth.
# The explicit `labels` argument was dropped: it repeated the `breaks`
# expression verbatim, and the scale labels numeric breaks with their own
# values by default.
ggplot(data = diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(
    breaks = seq(min(diamonds$depth), max(diamonds$depth), by = 2)
  )

#Q4.6

# Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?

# 58-64

#Q4.7

# What's the correlation of depth vs. price?
# Pearson correlation test between depth and price (formula interface).
cor.test(~ depth + price, data = diamonds)

## 
##  Pearson's product-moment correlation
## 
## data:  depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

#Q4.8

# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.

# Price against carat, omitting the top 1% of each variable by capping both
# scales at their 99th percentile; rows outside the limits are removed from
# the plot. The y-axis is marked every $2,000 with dollar-formatted labels.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 20) +
  scale_x_continuous(
    limits = c(0, quantile(diamonds$carat, probs = 0.99))
  ) +
  scale_y_continuous(
    breaks = seq(0, 18000, by = 2000),
    limits = c(0, quantile(diamonds$price, probs = 0.99)),
    labels = dollar
  )

## Warning: Removed 926 rows containing missing values (geom_point).

#Q4.9

# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.

# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.

# Don't make any adjustments to the plot just yet.

# Copy of diamonds with an extra `volume` column, the rough approximation
# x * y * z.
diamonds2 <- mutate(diamonds, volume = x * y * z)

# Unadjusted scatterplot of price against the new volume variable.
ggplot(data = diamonds2, mapping = aes(x = volume, y = price)) +
  geom_point()

#Q4.10

# What are your observations from the price vs volume scatterplot?

# There are 3 obvious outliers, with 1 massive diamond expanding the plot quite a bit.
# Prices rise exponentially with volume making transformations of the x-scale a good idea.
# There may also be diamonds with volumes at or near 0.

#Q4.11

# What's the correlation of price and volume?
# Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.

# Pearson correlation test between price and volume, restricted to diamonds
# whose volume is non-zero and strictly below 800.
cor.test(
  ~ price + volume,
  data = diamonds2,
  subset = volume != 0 & volume < 800
)

## 
##  Pearson's product-moment correlation
## 
## data:  price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

#Q4.12

# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)

# We encourage you to think about this next question and
# to post your thoughts in the discussion section.

# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?

# Keep only diamonds whose volume is non-zero and strictly below 800.
# The upper bound is `<` rather than `<=` so that a volume of exactly 800 is
# excluded, as the task statement above requires ("greater than or equal to
# 800").
smaller <- diamonds2 %>%
  filter(volume != 0,
         volume < 800)

# Translucent scatterplot of price against volume, overlaid with a fitted
# linear model and its confidence band (se = TRUE).
ggplot(smaller, aes(x = volume, y = price)) +
  geom_point(alpha = 1 / 20) +
  geom_smooth(method = "lm", se = TRUE)

# This is probably not the best model since the relationship does not appear to be linear.

#Q4.13

# Use functions from the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.

# Name the data frame diamondsByClarity

# The data frame should contain the following
# variables in this order.

#       (1) mean_price
#       (2) median_price
#       (3) min_price
#       (4) max_price
#       (5) n

# where n is the number of diamonds in each
# level of clarity.

# Per-clarity price summary: mean, median, minimum, maximum, and the number
# of diamonds (n) in each clarity level, ordered by clarity.
diamondsByClarity <- summarise(
  group_by(diamonds, clarity),
  mean_price = mean(price),
  median_price = median(price),
  min_price = min(price),
  max_price = max(price),
  n = n()
) %>%
  arrange(clarity)

#Q4.14

# We have created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.

# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.

# Group the data by clarity, then reduce each group to its mean price.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Same again, grouped by color.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# ===================================================================

# Bar chart of mean price per clarity level, filled by clarity with black
# bar outlines and a two-column legend.
c1 <- ggplot(
  data = diamonds_mp_by_clarity,
  mapping = aes(x = clarity, y = mean_price, fill = clarity)
) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_brewer(palette = "Set3") +
  guides(fill = guide_legend(ncol = 2, title.hjust = 0.3))

# Bar chart of mean price per color grade, built the same way with a
# different palette.
c2 <- ggplot(
  data = diamonds_mp_by_color,
  mapping = aes(x = color, y = mean_price, fill = color)
) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_brewer(palette = "Set2") +
  guides(fill = guide_legend(ncol = 2, title.hjust = 0.4))

# Draw both bar charts on one output image.
grid.arrange(c1, c2)

#Q4.15

# What do you notice in each of the bar charts for mean price by clarity and mean price by color?

# There's a downward trend in average diamond price as clarity goes from I1 to IF,
# and an upward trend in average price as color goes from D to J.

# EDA Project: Diamond Prices (Part 2)

# /mohammed

# February 26, 2016

#Q4.1

#Q4.3

#Q4.4

#Q4.7

#Q4.8

#Q4.11

#Q4.12

#Q4.13

#Q4.14

#Q4.15