# Switch the working directory to a machine-specific absolute Windows path.
# NOTE(review): nothing visible in this section reads or writes files, so this
# call may be unnecessary and will fail on machines without this directory;
# confirm before removing.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson4")

# Diamonds

## price vs. x

library(ggplot2)
# Scatterplot of price against the diamond's x dimension.
# "price vs. x" places price on the y-axis, in line with the depth, carat and
# volume scatterplots that also map price to y.
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point()

## price vs. depth

# Scatterplot of price against depth, with points coloured by cut.
# Price is mapped to the y-axis ("price vs. depth"). ggplot() is used rather
# than qplot(), which is deprecated as of ggplot2 3.4.0.
ggplot(diamonds, aes(x = depth, y = price, colour = cut)) +
  geom_point()

## another variant of price vs. depth (with adjustments)

# Price against depth once more, this time with translucent points
# (alpha = 0.1) to reduce overplotting and an x-axis tick every 2 units
# between 0 and 80.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2))

# Correlation between depth and price (cor() defaults to Pearson).
with(diamonds, cor(depth, price))
## [1] -0.0106474

## price vs. carat

# Scatterplot of price against carat with both axes truncated.
# NOTE(review): 2.18 and 17378 look like the 99th percentiles of carat and
# price respectively; confirm. Points outside the limits are dropped, which
# is what the recorded warning below reports.
# `limits` is written in full rather than relying on partial argument
# matching, and ggplot() is used because qplot() is deprecated as of
# ggplot2 3.4.0.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 2.18), breaks = seq(0, 2.18, 0.25)) +
  scale_y_continuous(limits = c(0, 17378), breaks = seq(0, 17378, 500))
## Warning: Removed 926 rows containing missing values (geom_point).

## price vs. volume

# Add a volume column computed as the product of the x, y and z dimensions.
diamonds$volume <- with(diamonds, x * y * z)
# Working subset for the volume plots: drop zero-volume rows and anything at
# or above 800.
new_data <- subset(diamonds, volume < 800 & volume != 0)

# Price against volume for the filtered data, overlaid with a linear-model
# smoother fitted to y ~ x + I(x^0.3).
# `limits` is written in full rather than relying on partial argument
# matching. Anything falling outside the axis limits is dropped, which is
# what the recorded warning below reports.
ggplot(new_data, aes(x = volume, y = price)) +
  geom_point(alpha = 0.1) +
  # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
  # `linewidth`; `size` is kept so the script still runs on older versions.
  stat_smooth(method = "lm", formula = y ~ x + I(x^0.3), size = 1) +
  scale_x_continuous(limits = c(0, 800), breaks = seq(0, 800, 50)) +
  scale_y_continuous(limits = c(0, 19000), breaks = seq(0, 19000, 1000))
## Warning: Removed 40 rows containing missing values (geom_path).

# Correlation between price and volume within the filtered subset.
with(new_data, cor(price, volume))
## [1] 0.9235455

## mean price by clarity

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the diamonds by clarity, then summarise price within each group:
# mean, median, minimum, maximum and the number of rows.
a <- diamonds %>% group_by(clarity)
diamondsByClarity <- a %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )
# Print the summary with the clarity column dropped.
diamondsByClarity %>% select(-clarity)
## Source: local data frame [8 x 5]
## 
##   mean_price median_price min_price max_price     n
## 1   3924.169         3344       345     18531   741
## 2   5063.029         4072       326     18804  9194
## 3   3996.001         2822       326     18818 13065
## 4   3924.989         2054       334     18823 12258
## 5   3839.455         2005       327     18795  8171
## 6   3283.737         1311       336     18768  5066
## 7   2523.115         1093       336     18777  3655
## 8   2864.839         1080       369     18806  1790

## bar charts of mean price

library(dplyr)
library(gridExtra)
## Loading required package: grid
# Mean price within each clarity grade.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price within each color grade.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# Bar chart of mean price per clarity grade; stat = "identity" draws the
# precomputed means as bar heights instead of counting rows.
p1 <- ggplot(diamonds_mp_by_clarity, aes(clarity, mean_price)) +
  geom_bar(stat = "identity") +
  labs(title = "diamonds_mp_by_clarity")

# Same chart for mean price per color grade.
p2 <- ggplot(diamonds_mp_by_color, aes(color, mean_price)) +
  geom_bar(stat = "identity") +
  labs(title = "diamonds_mp_by_color")

# Stack the two charts vertically in a single column.
grid.arrange(p1, p2, ncol = 1)