Lesson 6: Problem Set - Explore Two Variables

Dataset: diamonds. Create a scatterplot of price vs. x using the ggplot syntax.

library(ggplot2)
# Scatterplot of price against the x column of diamonds.
ggplot(data = diamonds, mapping = aes(x = x, y = price)) +
  # Points at 10% opacity, jittered with the vertical jitter set to 0.
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  # Show only the 3-10 range on the x axis.
  coord_cartesian(xlim = c(3, 10))

Correlations between price and x, y, and z.

# Pearson correlation test of price against the x column.
cor.test(x = diamonds$price, y = diamonds$x, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352
# Pearson correlation test of price against the y column.
cor.test(x = diamonds$price, y = diamonds$y, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209
# Pearson correlation test of price against the z column.
cor.test(x = diamonds$price, y = diamonds$z, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Create a simple scatter plot of price vs depth.

# Plain scatterplot of price against depth.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point()

Change the code to set the transparency of the points to 1/100 of what it is now and mark the x-axis every 2 units. See the instructor notes for two hints.

# Price against depth with points at 1% opacity and an x-axis break every
# 2 units between 0 and 80.
ggplot(diamonds, aes(depth, price)) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2)) +
  geom_point(alpha = 0.01)

Correlation between depth and price.

# Pearson correlation test of depth against price.
cor.test(x = diamonds$depth, y = diamonds$price, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$depth and diamonds$price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.

# Price against carat, with each axis limited to the range from 0 up to the
# 99th percentile of its variable. Rows outside the limits are reported as
# removed in the warning recorded below.
ggplot(diamonds, aes(carat, price)) +
  xlim(0, quantile(diamonds$carat, probs = 0.99)) +
  ylim(0, quantile(diamonds$price, probs = 0.99)) +
  geom_point()
## Warning: Removed 926 rows containing missing values (geom_point).

Create a new variable for volume in the diamonds data frame. Create a scatterplot of price vs. volume (x * y * z).

# Add a volume column computed as the product of the x, y and z columns.
diamonds$volume <- with(diamonds, x * y * z)
# Scatterplot of price against the new volume column.
ggplot(diamonds, aes(volume, price)) +
  geom_point()

# Count the diamonds whose computed volume is exactly 0. Base R's table() is
# used so that plyr does not have to be attached for a single call and then
# detached again (an unload that only produced a warning).
table(diamonds$volume == 0)
## 
## FALSE  TRUE 
## 53920    20
# Keep only diamonds with a non-zero volume below 800, then compute the
# correlation between price and volume on that subset.
volume_sub <- subset(diamonds, volume != 0 & volume < 800)
cor(volume_sub$price, volume_sub$volume)
## [1] 0.9235455

Subset the data to exclude diamonds with a volume greater than or equal to 800. Also, exclude diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot.

# Price against volume for the filtered subset, with points at 2% opacity and
# a fitted straight line. method = "lm" is given explicitly because the
# exercise asks for a linear model; a bare geom_smooth() picked a GAM here.
# xlim() removes the rows with volume above 500 from both layers, which is
# what the two warnings below report.
ggplot(data = volume_sub, aes(x = volume, y = price)) +
  geom_point(alpha = 1 / 50) +
  xlim(0, 500) +
  geom_smooth(method = "lm", formula = y ~ x)
## Warning: Removed 14 rows containing non-finite values (stat_smooth).
## Warning: Removed 14 rows containing missing values (geom_point).

Use functions from the dplyr package to create a new data frame containing info on diamonds by clarity. Name the data frame diamondsByClarity. The data frame should contain the following variables in this order: (1) mean_price, (2) median_price, (3) min_price, (4) max_price, (5) n, where n is the number of diamonds in each level of clarity.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the diamonds by clarity level.
clarity_groups <- group_by(diamonds, clarity)

# One row per clarity level with the mean, median, minimum and maximum price
# and the number of diamonds (n) in that level. The result is stored under
# the name the exercise asks for, diamondsByClarity.
diamondsByClarity <- summarise(
  clarity_groups,
  mean_price = mean(price),
  median_price = median(price),
  min_price = min(price),
  max_price = max(price),
  n = n()
)

# Keep the earlier lower-case spelling available for any code that uses it.
diamondsbyClarity <- diamondsByClarity

Write additional code to create three bar plots on one output image using the grid.arrange() function from the package gridExtra.

# Load the diamonds dataset again, replacing the current `diamonds` object
# in the workspace with the packaged version.
data(diamonds)
library(dplyr)

# Mean price within each clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price within each color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# Mean price within each cut level.
diamonds_by_cut <- diamonds %>% group_by(cut)
diamonds_mp_by_cut <- diamonds_by_cut %>%
  summarise(mean_price = mean(price))

# Bar chart of mean price for each clarity level.
p1 <- ggplot(diamonds_mp_by_clarity, aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")

# Bar chart of mean price for each color level.
p2 <- ggplot(diamonds_mp_by_color, aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")

# Bar chart of mean price for each cut level.
p3 <- ggplot(diamonds_mp_by_cut, aes(x = cut, y = mean_price)) +
  geom_bar(stat = "identity")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Draw the three charts on one output image, arranged in three columns.
grid.arrange(p1, p2, p3, ncol = 3)