# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the diamonds dataset that ships with ggplot2 and print a compact
# overview of its columns and their types.
data("diamonds", package = "ggplot2")
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Scatterplot of price vs x: price (the response) goes on the y-axis and the
# diamond's x dimension on the horizontal axis, the same axis convention the
# other "price vs ..." plots in this file use.
ggplot(data = diamonds, aes(x = x, y = price)) +
  geom_point()
# Let's do the same plot but with ln(price)
# log() is the natural logarithm; only price is transformed, x is left as is.
ggplot(data = diamonds, aes(x = x, y = log(price))) +
  geom_point()
There appears to be a roughly linear relationship between ln(price) and x. There are also no x values for some ln(price) values between ln(price) = 7 and 8.
There are some outliers, such as diamonds with an x value of zero but a positive price.
# Pearson correlation test between price and the x dimension.
cor.test(~ price + x, data = diamonds, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.1594, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
# Pearson correlation test between price and the y dimension.
cor.test(~ price + y, data = diamonds, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.1415, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
# Pearson correlation test between price and the z dimension.
cor.test(~ price + z, data = diamonds, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6015, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
Create a simple scatter plot of price vs depth.
# Price against depth, with half-transparent points to soften overplotting.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 2)
# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units. See the instructor notes
# for two hints.
# Same plot with points at 1/100 opacity and an x-axis tick every 2 units.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2))
Based on the scatterplot of depth vs price, most diamonds are between what values of depth? Answer: 60 - 64
What is the correlation of depth and price?
# Pearson correlation test between depth and price.
cor.test(~ depth + price, data = diamonds, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
We can say there is almost no correlation between depth and price, since the correlation coefficient is close to zero.
# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
# Price vs carat with both axes capped at their 99th percentile; points beyond
# either limit are dropped from the plot (ggplot2 reports them in a warning).
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  scale_x_continuous(limits = c(0, quantile(diamonds$carat, 0.99))) +
  scale_y_continuous(limits = c(0, quantile(diamonds$price, 0.99)))
## Warning: Removed 926 rows containing missing values (geom_point).
# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.
# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.
# Rough volume estimate: the product of the three dimensions, stored as a new
# column on the data frame. Show the first three rows to check the result.
diamonds$volume <- with(diamonds, x * y * z)
head(diamonds, n = 3)
## carat cut color clarity depth table price x y z volume
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 38.20203
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 34.50586
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 38.07688
# Price against the approximate volume for every diamond.
ggplot(diamonds, aes(volume, price)) +
  geom_point()
# Some outliers with zero volume
# Count the diamonds whose computed volume is exactly zero.
with(diamonds, sum(volume == 0))
## [1] 20
# List the rows whose computed volume is exactly zero.
diamonds[with(diamonds, volume == 0), ]
## carat cut color clarity depth table price x y z volume
## 2208 1.00 Premium G SI2 59.1 59 3142 6.55 6.48 0 0
## 2315 1.01 Premium H I1 58.1 59 3167 6.66 6.60 0 0
## 4792 1.10 Premium G SI2 63.0 59 3696 6.50 6.47 0 0
## 5472 1.01 Premium F SI2 59.2 58 3837 6.50 6.47 0 0
## 10168 1.50 Good G I1 64.0 61 4731 7.15 7.04 0 0
## 11183 1.07 Ideal F SI2 61.6 56 4954 0.00 6.62 0 0
## 11964 1.00 Very Good H VS2 63.3 53 5139 0.00 0.00 0 0
## 13602 1.15 Ideal G VS2 59.2 56 5564 6.88 6.83 0 0
## 15952 1.14 Fair G VS1 57.5 67 6381 0.00 0.00 0 0
## 24395 2.18 Premium H SI2 59.4 61 12631 8.49 8.45 0 0
## 24521 1.56 Ideal G VS2 62.2 54 12800 0.00 0.00 0 0
## 26124 2.25 Premium I SI1 61.3 58 15397 8.52 8.42 0 0
## 26244 1.20 Premium D VVS1 62.1 59 15686 0.00 0.00 0 0
## 27113 2.20 Premium H SI1 61.2 59 17265 8.42 8.37 0 0
## 27430 2.25 Premium H SI2 62.8 59 18034 0.00 0.00 0 0
## 27504 2.02 Premium H VS2 62.7 53 18207 8.02 7.95 0 0
## 27740 2.80 Good G SI2 63.8 58 18788 8.90 8.85 0 0
## 49557 0.71 Good F SI2 64.1 60 2130 0.00 0.00 0 0
## 49558 0.71 Good F SI2 64.1 60 2130 0.00 0.00 0 0
## 51507 1.12 Premium G I1 60.4 59 2383 6.71 6.67 0 0
There appears to be a correlation; however, a few outliers with high volume but low price make it hard to see. Also, some (20) diamonds have zero volume.
What is the correlation of price and volume? Exclude the diamonds that have a volume of 0 or greater than or equal to 800.
# Pearson correlation between volume and price, excluding diamonds whose
# volume is 0 or greater than or equal to 800 (so the upper bound is strict).
with(subset(diamonds, volume > 0 & volume < 800), cor.test(volume, price))
##
## Pearson's product-moment correlation
##
## data: volume and price
## t = 559.1912, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)
# We encourage you to think about this next question and
# to post your thoughts in the discussion section.
# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?
Smoothing :
http://www.ats.ucla.edu/stat/r/faq/smooths.htm
# Base scatterplot of price vs volume, excluding diamonds with a volume of 0
# and those with a volume greater than or equal to 800 (strict upper bound).
p1 <- ggplot(data = subset(diamonds, volume > 0 & volume < 800),
             aes(x = volume, y = price)) +
  geom_point()
# Default smoother
p2 <- p1 + geom_smooth()
# Linear fit; the y-axis view is restricted to 0-20000 via coord_cartesian()
p3 <- p1 +
  stat_smooth(method = "lm", formula = y ~ x, size = 1) +
  coord_cartesian(ylim = c(0, 20000))
# Polynomial fit of order 2
p4 <- p1 +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), size = 1) +
  coord_cartesian(ylim = c(0, 20000))
# Polynomial fit of order 3
p5 <- p1 +
  stat_smooth(method = "lm", formula = y ~ poly(x, 3), size = 1) +
  coord_cartesian(ylim = c(0, 20000))
library(gridExtra)
## Loading required package: grid
# Draw the four smoother plots together in a two-column grid.
grid.arrange(p2, p3, p4, p5, ncol = 2)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
# Use the functions from the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.
# Name the data frame diamondsByClarity
# The data frame should contain the following
# variables in this order.
# (1) mean_price
# (2) median_price
# (3) min_price
# (4) max_price
# (5) n
# where n is the number of diamonds in each
# level of clarity.
# Per-clarity price summary: mean, median, minimum, maximum and the number of
# diamonds in each clarity level, in that column order.
diamondsByClarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )
# Print the summary table.
diamondsByClarity
## Source: local data frame [8 x 6]
##
## clarity mean_price median_price min_price max_price n
## 1 I1 3924.169 3344 345 18531 741
## 2 SI2 5063.029 4072 326 18804 9194
## 3 SI1 3996.001 2822 326 18818 13065
## 4 VS2 3924.989 2054 334 18823 12258
## 5 VS1 3839.455 2005 327 18795 8171
## 6 VVS2 3283.737 1311 336 18768 5066
## 7 VVS1 2523.115 1093 336 18777 3655
## 8 IF 2864.839 1080 369 18806 1790
# We've created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.
# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.
Note : BARCHARTS http://docs.ggplot2.org/0.9.3/geom_bar.html
DIFFERENCE BETWEEN BARCHART AND HISTOGRAM http://flowingdata.com/2014/02/27/how-to-read-histograms-and-use-them-in-r/
# Mean price per clarity level (grouped frame kept, then summarised).
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))
# Mean price per color level, built the same way.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))
# Bar chart of mean price per clarity level; bar heights are the precomputed
# means, hence stat = "identity".
p1 <- ggplot(diamonds_mp_by_clarity,
             aes(clarity, mean_price, fill = clarity)) +
  geom_bar(stat = "identity")
# Bar chart of mean price per color level.
p2 <- ggplot(diamonds_mp_by_color,
             aes(color, mean_price, fill = color)) +
  geom_bar(stat = "identity")
# Place both bar charts side by side on one output image.
grid.arrange(p1, p2, ncol = 2)
SI2 has the highest mean price whereas VVS1 has the lowest. However, there wasn't a very big difference among the other groups.
Mean price increases with color from D to J: J has the highest mean price and D and E have the lowest. This runs against expectations, since D is the best color grade and J the worst.
# Mean price per cut level, then a bar chart of those means.
diamonds_by_cut <- diamonds %>% group_by(cut)
diamonds_mp_by_cut <- diamonds_by_cut %>%
  summarise(mean_price = mean(price))
ggplot(diamonds_mp_by_cut, aes(cut, mean_price, fill = cut)) +
  geom_bar(stat = "identity")