Loading the libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5

Load the diamonds data set

# Bring the diamonds data set (shipped with ggplot2) into the workspace.
data("diamonds")

Scatterplot of price vs x.

# Price against the x column; points are drawn at 5% opacity to cope with
# overplotting.
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point(alpha = 0.05)

What is the correlation between price and x?

# Pearson correlation test between price and x, via the formula interface.
cor.test(~ price + x, data = diamonds, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

What is the correlation between price and y?

# Pearson correlation test between price and y, via the formula interface.
cor.test(~ price + y, data = diamonds, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

What is the correlation between price and z?

# Pearson correlation test between price and z, via the formula interface.
cor.test(~ price + z, data = diamonds, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Simple scatterplot of price vs. depth

# Price against depth; points are nearly transparent (1% opacity) so that
# dense regions stand out.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 100)

Change the code so that the transparency of the points is 1/100 of what it is now, and mark the x-axis every 2 units.

# Price against depth at 1% opacity, with an x-axis tick every 2 units
# between 0 and 80.
ggplot(diamonds, aes(depth, price)) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2)) +
  geom_point(alpha = 0.01)

What is the correlation of depth and price?

# Pearson correlation test between depth and price, via the formula interface.
cor.test(~ depth + price, data = diamonds, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.

# Price against carat, with both axes capped at their 99th percentile so the
# top 1% of carat and price values fall outside the limits and are dropped.
ggplot(diamonds, aes(carat, price)) +
  geom_point() +
  scale_x_continuous(limits = c(0, quantile(diamonds$carat, probs = 0.99))) +
  scale_y_continuous(limits = c(0, quantile(diamonds$price, probs = 0.99)))
## Warning: Removed 926 rows containing missing values (geom_point).

Create a scatterplot of price vs. volume (x * y * z). This is a very rough approximation for a diamond’s volume. Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.

# Rough volume estimate: the product of the x, y and z columns, stored as a
# new column on the diamonds data frame.
diamonds$volume <- with(diamonds, x * y * z)

# Price against the approximate volume.
ggplot(diamonds, aes(volume, price)) +
  geom_point()

What is the correlation of price and volume? Exclude diamonds that have a volume of 0 or a volume greater than or equal to 800.

# Correlation test between volume and price, excluding diamonds whose volume
# is 0 or is 800 or more. The upper bound is strict (`<`), so a volume of
# exactly 800 is excluded as the exercise requires.
with(subset(diamonds, volume > 0 & volume < 800), cor.test(volume, price))
## 
##  Pearson's product-moment correlation
## 
## data:  volume and price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Use functions from the dplyr package to create a new data frame containing info on diamonds by clarity. Name the data frame diamondsByClarity. The data frame should contain the following variables in this order: (1) mean_price (2) median_price (3) min_price (4) max_price (5) n

where n is the number of diamonds in each level of clarity.

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))

# Per-clarity price summary: mean, median, minimum and maximum price, plus the
# number of diamonds (n) in each clarity level.
diamondsByClarity <- summarise(
  group_by(diamonds, clarity),
  mean_price = mean(price),
  median_price = median(price),
  min_price = min(price),
  max_price = max(price),
  n = n()
)
diamondsByClarity
## Source: local data frame [8 x 6]
## 
##   clarity mean_price median_price min_price max_price     n
##    (fctr)      (dbl)        (dbl)     (int)     (int) (int)
## 1      I1   3924.169         3344       345     18531   741
## 2     SI2   5063.029         4072       326     18804  9194
## 3     SI1   3996.001         2822       326     18818 13065
## 4     VS2   3924.989         2054       334     18823 12258
## 5     VS1   3839.455         2005       327     18795  8171
## 6    VVS2   3283.737         1311       336     18768  5066
## 7    VVS1   2523.115         1093       336     18777  3655
## 8      IF   2864.839         1080       369     18806  1790
# Group the diamonds by cut, then reduce each group to its mean price.
diamonds_by_cut <- diamonds %>% group_by(cut)
diamonds_mp_by_cut <- diamonds_by_cut %>%
  summarise(mean_price = mean(price))

# One bar per cut; bar height is the precomputed mean price.
ggplot(diamonds_mp_by_cut, aes(cut, mean_price, fill = cut)) +
  geom_bar(stat = "identity")