# Load main libs that will be used during the assignment
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(scales))
suppressMessages(library(xlsx))
suppressMessages(library(tidyr))
suppressMessages(library(lubridate))
suppressMessages(library(ggthemes))
suppressMessages(library(gridExtra))
# In this problem set, you'll continue
# to explore the diamonds data set.
# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.
# Scatterplot of price against the x dimension. Points are drawn with a low
# alpha to soften overplotting, and the visible x-range is set to 3.5-12.
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point(alpha = 1 / 20) +
  coord_cartesian(xlim = c(3.5, 12)) +
  # The argument is spelled `labels`; the previous `label` relied on partial
  # argument matching.
  scale_y_continuous(breaks = seq(1000, 19000, 2000), labels = dollar)
#Q4.2
# What are your observations about the scatterplot of price vs x?
# The bulk of the data start at around x = 3.3 and price rises exponentially as it approaches 9.
# There also seems to be an artificial ceiling in price around $19k.
#a. What is the correlation between price and x?
# Two-sided Pearson correlation test between price and the x dimension.
with(
  diamonds,
  cor.test(price, x, alternative = "two.sided", method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
#b. What is the correlation between price and y?
# Two-sided Pearson correlation test between price and the y dimension.
with(
  diamonds,
  cor.test(price, y, alternative = "two.sided", method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
#c. What is the correlation between price and z?
# Two-sided Pearson correlation test between price and the z dimension.
with(
  diamonds,
  cor.test(price, z, alternative = "two.sided", method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
# Create a simple scatter plot of price vs depth.
# Price against depth, with semi-transparent points (alpha = 1/20).
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 20)
#Q4.5
# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units.
# Price against depth with fainter points (alpha = 1/100) and an x-axis
# break every 2 units, starting at the smallest observed depth.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(
    breaks = with(diamonds, seq(min(depth), max(depth), by = 2)),
    labels = with(diamonds, seq(min(depth), max(depth), by = 2))
  )
#Q4.6
# Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?
# 58-64
# What's the correlation of depth vs. price?
# Two-sided Pearson correlation test between depth and price.
with(
  diamonds,
  cor.test(depth, price, alternative = "two.sided", method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
# Price against carat. Both axes run from 0 to the 99th percentile of the
# respective variable, so the top 1% of carat and price values are left out.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 20) +
  scale_x_continuous(limits = c(0, quantile(diamonds$carat, probs = 0.99))) +
  scale_y_continuous(
    limits = c(0, quantile(diamonds$price, probs = 0.99)),
    breaks = seq(from = 0, to = 18000, by = 2000),
    labels = dollar
  )
## Warning: Removed 926 rows containing missing values (geom_point).
#Q4.9
# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.
# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.
# Don't make any adjustments to the plot just yet.
# Add a rough volume estimate (x * y * z) as a new column; the result is
# kept in `diamonds2` for later exercises.
diamonds2 <- mutate(diamonds, volume = x * y * z)

# Unadjusted scatterplot of price against the approximate volume.
ggplot(data = diamonds2, mapping = aes(x = volume, y = price)) +
  geom_point()
#Q4.10
# What are your observations from the price vs volume scatterplot?
# There are 3 obvious outliers, with 1 massive diamond expanding the plot quite a bit.
# Prices rise exponentially with volume making transformations of the x-scale a good idea.
# There may also be diamonds with volumes at or near 0.
# What's the correlation of price and volume?
# Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.
# Correlation of price and volume, keeping only diamonds whose volume is
# non-zero and strictly below 800.
with(
  subset(diamonds2, volume != 0 & volume < 800),
  cor.test(price, volume)
)
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
# We encourage you to think about this next question and
# to post your thoughts in the discussion section.
# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?
# Keep only diamonds with a non-zero volume strictly below 800. The upper
# bound is exclusive (`<`, not `<=`) so that a volume of exactly 800 is
# dropped, as the task states and as the correlation subset above does.
smaller <- diamonds2 %>%
  filter(volume != 0, volume < 800)

# Price against volume for the filtered data, with a fitted linear model
# (and its confidence band) overlaid on the semi-transparent points.
ggplot(smaller, aes(x = volume, y = price)) +
  geom_point(alpha = 1 / 20) +
  geom_smooth(method = "lm", se = TRUE)
# This is probably not the best model since the relationship does not appear to be linear.
# Use functions from the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.
# Name the data frame diamondsByClarity
# The data frame should contain the following
# variables in this order.
# (1) mean_price
# (2) median_price
# (3) min_price
# (4) max_price
# (5) n
# where n is the number of diamonds in each
# level of clarity.
# Per-clarity price summary: mean, median, minimum, maximum and the number
# of diamonds in each clarity level, ordered by clarity.
diamondsByClarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price   = mean(price),
    median_price = median(price),
    min_price    = min(price),
    max_price    = max(price),
    n            = n()
  ) %>%
  arrange(clarity)
# We have created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.
# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.
# Mean price per clarity level.
diamonds_by_clarity <- diamonds %>%
  group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price per color level.
diamonds_by_color <- diamonds %>%
  group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))
# ===================================================================
# Bar chart of mean price by clarity: one black-outlined bar per level,
# filled from the "Set3" palette, with a two-column legend.
c1 <- ggplot(
  data = diamonds_mp_by_clarity,
  mapping = aes(x = clarity, y = mean_price, fill = clarity)
) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_brewer(palette = "Set3") +
  guides(fill = guide_legend(ncol = 2, title.hjust = 0.3))

# Bar chart of mean price by color, built the same way with the "Set2"
# palette.
c2 <- ggplot(
  data = diamonds_mp_by_color,
  mapping = aes(x = color, y = mean_price, fill = color)
) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_brewer(palette = "Set2") +
  guides(fill = guide_legend(ncol = 2, title.hjust = 0.4))

# Draw both charts on a single output image.
grid.arrange(c1, c2)
# What do you notice in each of the bar charts for mean price by clarity and mean price by color?
# There's a downward trend in average diamond price as clarity goes from I1 to IF,
# and an upward trend in average price as color goes from D to J.