Problem Set 4

Your first task is to create a scatterplot of price vs x using the ggplot syntax.

Note: diamonds is part of ggplot2, so I need to load that first

# Attach ggplot2 (plotting functions and the diamonds dataset), then load
# diamonds into the workspace.
library(ggplot2)
data("diamonds", package = "ggplot2")

# Scatterplot of price vs x: the x column goes on the horizontal axis and
# price on the vertical axis, as the task statement ("price vs x") asks.
ggplot(data = diamonds, aes(x = x, y = price)) +
  geom_point()

Correlation between x and price

# Pearson correlation test between the x column and price.
cor.test(x = diamonds$x, y = diamonds$price, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$x and diamonds$price
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

# Pearson correlation test between the y column and price.
cor.test(x = diamonds$y, y = diamonds$price, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$y and diamonds$price
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

# Pearson correlation test between the z column and price.
cor.test(x = diamonds$z, y = diamonds$price, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$z and diamonds$price
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Scatter plot price and depth

# My own attempt: price on the horizontal axis, depth on the vertical axis.
ggplot(diamonds, mapping = aes(x = price, y = depth)) +
  geom_point()

Udacity solution, including alpha param

# Depth on the horizontal axis, price on the vertical axis.
# Points are drawn at 1% opacity and the x-axis is labelled every
# 2 units from 43 to 79.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 0.01) +
  scale_x_continuous(breaks = seq(from = 43, to = 79, by = 2))

Correlation between depth and price

# Pearson correlation test between depth and price.
cor.test(x = diamonds$depth, y = diamonds$price, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$depth and diamonds$price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.

# Scatterplot of price vs carat: carat on the horizontal axis, price on the
# vertical axis, as the task statement ("price vs carat") asks.
# Rows above the 99th percentile of either price or carat are dropped before
# plotting; points are drawn at 1% opacity.
ggplot(
  data = subset(
    diamonds,
    price <= quantile(price, probs = 0.99) &
      carat <= quantile(carat, probs = 0.99)
  ),
  aes(x = carat, y = price)
) +
  geom_point(alpha = 1 / 100)

Create a scatterplot of price vs. volume (x * y * z). This is a very rough approximation for a diamond’s volume.

Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.

Don’t make any adjustments to the plot just yet.

# Rough volume approximation: product of the x, y and z columns, stored as a
# new column so later steps can reuse it.
diamonds$volume <- with(diamonds, x * y * z)

# Unadjusted scatterplot of price against the new volume column.
ggplot(diamonds, aes(x = volume, y = price)) +
  geom_point()

How many have volume 0?

# Number of rows whose computed volume is exactly zero (NA comparisons, if
# any, are not counted).
sum(diamonds$volume == 0, na.rm = TRUE)

## [1] 20

# Better: tabulate how many rows have zero vs non-zero volume in one call,
# using count() from plyr.
library(plyr)
count(df = diamonds$volume == 0)

##       x  freq
## 1 FALSE 53920
## 2  TRUE    20

# Remove plyr from the search path again and ask R to unload its namespace.
detach("package:plyr", unload = TRUE, character.only = TRUE)

## Warning: 'plyr' namespace cannot be unloaded:
##   namespace 'plyr' is imported by 'reshape2', 'ggplot2', 'scales' so cannot be unloaded

Correlation between price and volume, excluding diamonds with volume == 0 or volume >= 800

# Keep only rows with a non-zero volume below 800, then test the Pearson
# correlation between volume and price on that subset.
d <- subset(diamonds, subset = volume != 0 & volume < 800)
cor.test(x = d$volume, y = d$price, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  d$volume and d$price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

# Price against volume for the filtered data: points at 5% opacity with a
# linear-model (y ~ x) fit drawn on top.
ggplot(d, aes(x = volume, y = price)) +
  geom_point(alpha = 1 / 20) +
  geom_smooth(formula = y ~ x, method = "lm", size = 1)

Use the dplyr package to create a new data frame containing info on diamonds by clarity. Name the data frame diamondsByClarity.

The data frame should contain the following variables in this order.

  (1) mean_price
  (2) median_price
  (3) min_price
  (4) max_price
  (5) n

where n is the number of diamonds in each level of clarity.

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Group the diamonds by clarity level.
byclarity <- diamonds %>% group_by(clarity)

# One row per clarity level with price statistics and the group size, in the
# column order the task asks for.
diamondsByClarity <- byclarity %>%
  summarise(
    mean_price = mean(price),
    median_price = median(as.numeric(price)),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

Working with the flights data to practice using dplyr

# install.packages("nycflights13")  # run once if the package is missing
library(nycflights13)

# Group the flights by tail number.
by_tailnum <- flights %>% group_by(tailnum)

# Per tail number: number of flights, mean distance and mean arrival delay
# (missing values ignored), keeping only groups with more than 20 flights
# and a mean distance under 2000.
delay <- by_tailnum %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dist < 2000)

Your task is to write additional code to create two bar plots on one output image using the grid.arrange() function from the package gridExtra.

# Mean price per clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price per color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# One bar chart per summary; stat = "identity" makes the bar heights the
# precomputed mean_price values.
graph_clarity <- ggplot(
  data = diamonds_mp_by_clarity,
  mapping = aes(x = factor(clarity), y = mean_price)
) +
  geom_bar(stat = "identity")
graph_color <- ggplot(
  data = diamonds_mp_by_color,
  mapping = aes(x = factor(color), y = mean_price)
) +
  geom_bar(stat = "identity")

library(gridExtra)

## Loading required package: grid

# Draw both bar charts on a single output image, one above the other.
grid.arrange(graph_clarity, graph_color, ncol = 1)

What I notice: The J color has the highest mean price, and the mean price goes up with the rank of the color. This is not the case for clarity.

Problem Set 4

Pleuni

July 9, 2015