# Switch to the lesson directory only when it exists on this machine. The
# hard-coded absolute path is specific to one workstation, and an unguarded
# setwd() stops the whole script everywhere else. Where the directory is
# present the behaviour is unchanged.
lesson_dir <- "D:/R/Udacity/EDA_Course_Materials/lesson4"
if (dir.exists(lesson_dir)) {
  setwd(lesson_dir)
}
library(ggplot2)
# Scatterplot over the `diamonds` data: `price` on the horizontal axis and
# the `x` column on the vertical axis, one point per row.
ggplot(data = diamonds, mapping = aes(x = price, y = x)) +
  geom_point()
## price vs. depth
# Depth (vertical) against price (horizontal), coloured by cut. Written with
# ggplot() + geom_point() instead of qplot(), which is deprecated as of
# ggplot2 3.4.0; the mapping and the resulting scatterplot are the same.
ggplot(diamonds, aes(x = price, y = depth, colour = cut)) +
  geom_point()
# The same two variables with the axes swapped: strong transparency to cope
# with overplotting and an x-axis break every 2 units from 0 to 80.
ggplot(data = diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(breaks = seq(0, 80, 2))
# Correlation between depth and price (cor() with its default method);
# the recorded result from the original run is kept below.
cor(diamonds$depth, diamonds$price)
## [1] -0.0106474
# Price against carat with both axes restricted to fixed ranges. Uses
# ggplot() + geom_point() in place of the deprecated qplot(), and spells out
# `limits` in full rather than relying on partial argument matching (`lim`).
# Points outside the scale limits are dropped, as the recorded warning shows.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 2.18), breaks = seq(0, 2.18, 0.25)) +
  scale_y_continuous(limits = c(0, 17378), breaks = seq(0, 17378, 500))
## Warning: Removed 926 rows containing missing values (geom_point).
# create new column
# volume is the product of the three dimension columns x, y and z.
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z
# Keep only rows with a non-zero volume below 800 before plotting/correlating.
new_data <- subset(diamonds, volume != 0 & volume < 800)
# Price against volume with a linear-model smoother fitted on x and x^0.3.
# `limits` is written out in full (the original relied on partial matching of
# `lim`). NOTE(review): `size = 1` is kept as-is for compatibility with older
# ggplot2; from ggplot2 3.4.0 the preferred name for line width is
# `linewidth` -- switch once the minimum ggplot2 version is confirmed.
ggplot(aes(x = volume, y = price), data = new_data) +
  geom_point(alpha = 0.1) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^0.3), size = 1) +
  scale_x_continuous(limits = c(0, 800), breaks = seq(0, 800, 50)) +
  scale_y_continuous(limits = c(0, 19000), breaks = seq(0, 19000, 1000))
## Warning: Removed 40 rows containing missing values (geom_path).
# Correlation between price and volume on the filtered data (recorded result
# from the original run kept below).
cor(new_data$price, new_data$volume)
## [1] 0.9235455
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the diamonds by clarity, then reduce each group to its mean, median,
# minimum and maximum price plus the number of rows in the group.
a <- diamonds %>% group_by(clarity)
diamondsByClarity <- a %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )
# Show the summary without the clarity column (recorded output kept below).
diamondsByClarity %>% select(-clarity)
## Source: local data frame [8 x 5]
##
## mean_price median_price min_price max_price n
## 1 3924.169 3344 345 18531 741
## 2 5063.029 4072 326 18804 9194
## 3 3996.001 2822 326 18818 13065
## 4 3924.989 2054 334 18823 12258
## 5 3839.455 2005 327 18795 8171
## 6 3283.737 1311 336 18768 5066
## 7 2523.115 1093 336 18777 3655
## 8 2864.839 1080 369 18806 1790
library(dplyr)
library(gridExtra)
## Loading required package: grid
# Grouped copies of the data: one keyed by clarity, one keyed by color.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_by_color <- diamonds %>% group_by(color)
# Mean price per group for each of the two groupings.
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))
# One bar per clarity level; bar height is the precomputed mean price.
p1 <- ggplot(diamonds_mp_by_clarity, aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity") +
  labs(title = "diamonds_mp_by_clarity")
# One bar per color level; bar height is the precomputed mean price.
p2 <- ggplot(diamonds_mp_by_color, aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity") +
  labs(title = "diamonds_mp_by_color")
# Stack the two charts vertically in a single column.
grid.arrange(p1, p2, ncol = 1)