Udacity Problem Set 4

Q1) Price vs. x

# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.

library(ggplot2)

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the diamonds data set and print a compact summary of its columns.
data("diamonds")

str(object = diamonds)

## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Scatterplot of price vs. x. Price goes on the y-axis and the x column on
# the x-axis, matching the other "price vs. ..." plots in this file.
ggplot(data = diamonds, aes(x = x, y = price)) +
  geom_point()

# Let's do the same plot but with ln(price)

ggplot(data = diamonds, aes(x = x, y = log(price))) +
  geom_point()

Q2) Findings - price vs. x

It looks like there is a roughly linear relationship between ln(price) and x. There are also no x values for some ln(price) values between ln(price) = 7 and 8.

There are some outliers, such as diamonds with an x value of zero but a positive price.

Q3) Correlations

What is the correlation between price and x?
What is the correlation between price and y?
What is the correlation between price and z?

# Pearson correlation test between price and the x column.
with(data = diamonds, expr = cor.test(price, x, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.1594, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

# Pearson correlation test between price and the y column.
with(data = diamonds, expr = cor.test(price, y, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.1415, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

# Pearson correlation test between price and the z column.
with(data = diamonds, expr = cor.test(price, z, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6015, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Q4) price vs. depth

Create a simple scatter plot of price vs depth.

# Price against depth, drawn with half-transparent points.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 2)

Q5) Adjustments - price vs. depth

# Change the code to make the transparency of the
# points to be 1/100 of what they are now and mark
# the x-axis every 2 units. See the instructor notes
# for two hints.

# Same price vs. depth plot, with points at 1/100 opacity and an x-axis
# break every 2 units from 0 to 80.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2))

Q6) Typical Depth Range

Based on the scatterplot of depth vs price, most diamonds are between what values of depth? Answer: 60 - 64

Q7) Correlation - price and depth

what is the correlation of depth and price?

# Pearson correlation test between depth and price.
with(data = diamonds, expr = cor.test(depth, price, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

We can say there is almost no correlation between depth and price, since the correlation coefficient is close to zero.

Q8) price vs. carat

# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.

# Cap both axes at their 99th percentile so that the top 1% of carat and
# price values fall outside the scale limits and are removed from the plot.
ggplot(diamonds, aes(carat, price)) +
  geom_point() +
  xlim(0, quantile(diamonds$carat, probs = 0.99)) +
  ylim(0, quantile(diamonds$price, probs = 0.99))

## Warning: Removed 926 rows containing missing values (geom_point).

Q9) price vs. volume

# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.

# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.

# Rough volume estimate: the product of the x, y and z columns, stored as a
# new column on the diamonds data frame.
diamonds$volume <- with(diamonds, x * y * z)

head(diamonds, n = 3)

##   carat     cut color clarity depth table price    x    y    z   volume
## 1  0.23   Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43 38.20203
## 2  0.21 Premium     E     SI1  59.8    61   326 3.89 3.84 2.31 34.50586
## 3  0.23    Good     E     VS1  56.9    65   327 4.05 4.07 2.31 38.07688

# Price against the approximate volume.
ggplot(diamonds, aes(volume, price)) +
  geom_point()

# Some outliers with zero volume: count them.
sum(with(diamonds, volume == 0))

## [1] 20

# List the rows whose computed volume is zero.
diamonds[with(diamonds, volume == 0), ]

##       carat       cut color clarity depth table price    x    y z volume
## 2208   1.00   Premium     G     SI2  59.1    59  3142 6.55 6.48 0      0
## 2315   1.01   Premium     H      I1  58.1    59  3167 6.66 6.60 0      0
## 4792   1.10   Premium     G     SI2  63.0    59  3696 6.50 6.47 0      0
## 5472   1.01   Premium     F     SI2  59.2    58  3837 6.50 6.47 0      0
## 10168  1.50      Good     G      I1  64.0    61  4731 7.15 7.04 0      0
## 11183  1.07     Ideal     F     SI2  61.6    56  4954 0.00 6.62 0      0
## 11964  1.00 Very Good     H     VS2  63.3    53  5139 0.00 0.00 0      0
## 13602  1.15     Ideal     G     VS2  59.2    56  5564 6.88 6.83 0      0
## 15952  1.14      Fair     G     VS1  57.5    67  6381 0.00 0.00 0      0
## 24395  2.18   Premium     H     SI2  59.4    61 12631 8.49 8.45 0      0
## 24521  1.56     Ideal     G     VS2  62.2    54 12800 0.00 0.00 0      0
## 26124  2.25   Premium     I     SI1  61.3    58 15397 8.52 8.42 0      0
## 26244  1.20   Premium     D    VVS1  62.1    59 15686 0.00 0.00 0      0
## 27113  2.20   Premium     H     SI1  61.2    59 17265 8.42 8.37 0      0
## 27430  2.25   Premium     H     SI2  62.8    59 18034 0.00 0.00 0      0
## 27504  2.02   Premium     H     VS2  62.7    53 18207 8.02 7.95 0      0
## 27740  2.80      Good     G     SI2  63.8    58 18788 8.90 8.85 0      0
## 49557  0.71      Good     F     SI2  64.1    60  2130 0.00 0.00 0      0
## 49558  0.71      Good     F     SI2  64.1    60  2130 0.00 0.00 0      0
## 51507  1.12   Premium     G      I1  60.4    59  2383 6.71 6.67 0      0

It looks like there is a correlation; however, a few outliers with high volume and low price make the correlation hard to see. Also, some (20) diamonds have zero volume.

Q9) Correlations on Subsets

What is the correlation of price and volume? Exclude the diamonds that have a volume of 0 or greater than or equal to 800.

# Correlation of price and volume, excluding diamonds with a volume of 0 or
# a volume greater than or equal to 800 (so the upper bound is strict).
with(subset(diamonds, (volume > 0) & (volume < 800)), cor.test(volume, price))

## 
##  Pearson's product-moment correlation
## 
## data:  volume and price
## t = 559.1912, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Q10) Adjustments - price vs. volume

# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot. (See the
# Instructor Notes or look up the documentation of
# geom_smooth() for more details about smoothers.)

# We encourage you to think about this next question and
# to post your thoughts in the discussion section.

# Do you think this would be a useful model to estimate
# the price of diamonds? Why or why not?

Smoothing :

http://www.ats.ucla.edu/stat/r/faq/smooths.htm

# Base scatterplot of price vs. volume. Diamonds with a volume of 0 or a
# volume greater than or equal to 800 are excluded (strict upper bound), and
# the points are drawn transparently, as the exercise asks.
p1 <- ggplot(data = subset(diamonds, (volume > 0) & (volume < 800)),
       aes(x = volume, y = price)) +
  geom_point(alpha = 1 / 20)

# Default smoother
p2 <- p1 + geom_smooth()

# Linear fit; the visible y-range is set to 0-20000.
p3 <- p1 +
  stat_smooth(method = "lm", formula = y ~ x, size = 1) +
  coord_cartesian(ylim = c(0, 20000))

# Polynomial fit of order 2
p4 <- p1 +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), size = 1) +
  coord_cartesian(ylim = c(0, 20000))

# Polynomial fit of order 3
p5 <- p1 +
  stat_smooth(method = "lm", formula = y ~ poly(x, 3), size = 1) +
  coord_cartesian(ylim = c(0, 20000))

library(gridExtra)

## Loading required package: grid

# Lay the four smoother plots out in a two-column grid.
grid.arrange(
  p2, p3,
  p4, p5,
  ncol = 2
)

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

Q11) Mean Price by Clarity

# Use the functions from the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.

# Name the data frame diamondsByClarity

# The data frame should contain the following
# variables in this order.

#       (1) mean_price
#       (2) median_price
#       (3) min_price
#       (4) max_price
#       (5) n

# where n is the number of diamonds in each
# level of clarity.

# Per-clarity price summary: mean, median, minimum, maximum and the number
# of diamonds in each clarity level, in that column order.
diamondsByClarity <- summarise(
  group_by(diamonds, clarity),
  mean_price = mean(price),
  median_price = median(price),
  min_price = min(price),
  max_price = max(price),
  n = n()
)

diamondsByClarity

## Source: local data frame [8 x 6]
## 
##   clarity mean_price median_price min_price max_price     n
## 1      I1   3924.169         3344       345     18531   741
## 2     SI2   5063.029         4072       326     18804  9194
## 3     SI1   3996.001         2822       326     18818 13065
## 4     VS2   3924.989         2054       334     18823 12258
## 5     VS1   3839.455         2005       327     18795  8171
## 6    VVS2   3283.737         1311       336     18768  5066
## 7    VVS1   2523.115         1093       336     18777  3655
## 8      IF   2864.839         1080       369     18806  1790

Q12) Bar Charts of Mean Price

# We've created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.

# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.

Note : BARCHARTS http://docs.ggplot2.org/0.9.3/geom_bar.html

DIFFERENCE BETWEEN BARCHART AND HISTOGRAM http://flowingdata.com/2014/02/27/how-to-read-histograms-and-use-them-in-r/

# Mean price per clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price per color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# One bar per level; bar heights are taken directly from mean_price.
p1 <- ggplot(
  diamonds_mp_by_clarity,
  aes(clarity, mean_price, fill = clarity)
) +
  geom_bar(stat = "identity")

p2 <- ggplot(
  diamonds_mp_by_color,
  aes(color, mean_price, fill = color)
) +
  geom_bar(stat = "identity")

# Show both bar charts in a two-column layout.
grid.arrange(p1, p2, ncol = 2)

SI2 has the highest mean price, whereas VVS1 has the lowest mean price. However, there wasn’t a very big change in the other groups.

Mean price by color increases from D to J. J has the highest mean price, and D and E have the lowest mean prices. This goes against expectations.

# Mean price per cut level, shown as one bar per cut.
diamonds_by_cut <- diamonds %>% group_by(cut)
diamonds_mp_by_cut <- diamonds_by_cut %>%
  summarise(mean_price = mean(price))

ggplot(diamonds_mp_by_cut, aes(cut, mean_price, fill = cut)) +
  geom_bar(stat = "identity")