Data Analysis with R - Problem Set 4

Clarifications:

  1. Adjustments - price vs. volume - is observation correct?
  2. Overplotting, Jitter and adjusting alpha and smoothing - any guidelines?
  3. Need help with a dataset from Gapminder ***

Get Started

Notes:

# Show the current working directory.
getwd()
## [1] "C:/Users/amackay/Documents/R Scripts"
# Changing the working directory is left disabled.
#setwd("~/R Datasources")
# List the files in the current working directory.
list.files()
##  [1] "Basic commands.R"                                                      
##  [2] "Data Analysis with R - Problem Set 3 - Gapminder dataset analysis.rmd" 
##  [3] "Data Analysis with R - Problem Set 3.rmd"                              
##  [4] "Data Analysis with R - Problem Set 4.rmd"                              
##  [5] "Data_Analysis_with_R_-_Problem_Set_3.html"                             
##  [6] "Data_Analysis_with_R_-_Problem_Set_3_-_Gapminder_dataset_analysis.html"
##  [7] "Data_Analysis_with_R_-_Problem_Set_4.html"                             
##  [8] "Data_Analysis_with_R_-_Problem_Set_4.rmd"                              
##  [9] "demystifying.R"                                                        
## [10] "demystifyingR2_v3.Rmd"                                                 
## [11] "lesson3_student.html"                                                  
## [12] "lesson3_student.rmd"                                                   
## [13] "lesson4_student.html"                                                  
## [14] "lesson4_student.rmd"                                                   
## [15] "rsconnect"
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.1
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.1
# Load the diamonds dataset into the workspace
# (presumably the one shipped with ggplot2, which is attached above).
data("diamonds")

# Column names of the dataset.
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
# Per-column summary statistics.
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 

price Vs x

Notes:

# Scatterplot of price vs x: price goes on the y-axis and x on the x-axis,
# matching the axis convention of the other "price vs ..." plots in this
# document (depth, carat and volume are all placed on the x-axis).
ggplot(aes(x = x, y = price), data = diamonds) +
  geom_point()

Findings: There seems to be a positive correlation between price and length (x) ***

Correlations

Notes:

library(alr3)
## Warning: package 'alr3' was built under R version 3.2.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.2.1
# Pearson correlation test (the cor.test default) between price and x.
cor.test(diamonds$price, diamonds$x, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352
# Pearson correlation test (the cor.test default) between price and y.
cor.test(diamonds$price, diamonds$y, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209
# Pearson correlation test (the cor.test default) between price and z.
cor.test(diamonds$price, diamonds$z, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Finding: the correlations between price and each physical dimension of a diamond (x, y, z) are all strongly positive and very similar (about 0.88, 0.87 and 0.86 respectively)


Price vs depth

Notes:

# Scatterplot with depth on the x-axis and price on the y-axis.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point()


Adjustments - price vs. depth

Notes:

# Minimum and maximum price (presumably used to choose the y-axis breaks).
range(diamonds$price)
## [1]   326 18823
# Same scatterplot with nearly transparent points (alpha = 1/100) and
# y-axis breaks every 2000 from 0 up to 19000.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_y_continuous(breaks = seq(0, 19000, by = 2000))

***

Typical Depth Range

Notes:

# Transparent scatterplot of price against depth, zoomed (without dropping
# any data) to depths between 55 and 65.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_y_continuous(breaks = seq(0, 19000, by = 2000)) +
  coord_cartesian(xlim = c(55, 65))

Findings: 1. most diamonds are of a depth between 60 and 63

***

Correlation - price and depth

Notes:

# Pearson correlation test (the cor.test default) between depth and price.
cor.test(diamonds$depth, diamonds$price, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$depth and diamonds$price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Findings: virtually no linear correlation between price and depth (r ≈ -0.01)


price vs. carat

Notes: Need to verify if this is correct?

#range(diamonds$price)
#range(diamonds$carat)
# Keep only the rows whose price and carat are both at or below their
# respective 99th percentiles.
diamonds_sub <- subset(
  diamonds,
  price <= quantile(price, 0.99) & carat <= quantile(carat, 0.99)
)

# Scatterplot of price against carat on the trimmed data, with explicit
# axis breaks on both axes.
ggplot(diamonds_sub, aes(x = carat, y = price)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 19000, by = 2000)) +
  scale_x_continuous(breaks = seq(0, 5.2, by = 0.5))

***

price vs. volume

Notes:

# Add a volume column computed as the product of the x, y and z columns,
# then show the first 20 rows.
diamonds$diamond_volume <- with(diamonds, x * y * z)
head(diamonds, n = 20)
##    carat       cut color clarity depth table price    x    y    z
## 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2   0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3   0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4   0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5   0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53
## 9   0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39
## 11  0.30      Good     J     SI1  64.0    55   339 4.25 4.28 2.73
## 12  0.23     Ideal     J     VS1  62.8    56   340 3.93 3.90 2.46
## 13  0.22   Premium     F     SI1  60.4    61   342 3.88 3.84 2.33
## 14  0.31     Ideal     J     SI2  62.2    54   344 4.35 4.37 2.71
## 15  0.20   Premium     E     SI2  60.2    62   345 3.79 3.75 2.27
## 16  0.32   Premium     E      I1  60.9    58   345 4.38 4.42 2.68
## 17  0.30     Ideal     I     SI2  62.0    54   348 4.31 4.34 2.68
## 18  0.30      Good     J     SI1  63.4    54   351 4.23 4.29 2.70
## 19  0.30      Good     J     SI1  63.8    56   351 4.23 4.26 2.71
## 20  0.30 Very Good     J     SI1  62.7    59   351 4.21 4.27 2.66
##    diamond_volume
## 1        38.20203
## 2        34.50586
## 3        38.07688
## 4        46.72458
## 5        51.91725
## 6        38.69395
## 7        38.83087
## 8        42.32108
## 9        36.42521
## 10       38.71800
## 11       49.65870
## 12       37.70442
## 13       34.71514
## 14       51.51574
## 15       32.26237
## 16       51.88373
## 17       50.13047
## 18       48.99609
## 19       48.83366
## 20       47.81802
# Scatterplot with the computed volume on the x-axis and price on the y-axis.
ggplot(diamonds, aes(x = diamond_volume, y = price)) +
  geom_point()

Findings: 1. Positive correlation between price and volume 2. few outliers observed ***

Correlations on Subsets

Notes:

# Pearson correlation test between volume and price on the full dataset.
cor.test(diamonds$diamond_volume, diamonds$price, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$diamond_volume and diamonds$price
## t = 486.33, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9008054 0.9039398
## sample estimates:
##       cor 
## 0.9023845
# Exclude diamonds that have a volume of 0 or a volume greater than or
# equal to 800, i.e. keep only rows with 0 < diamond_volume < 800.
# The upper bound is strict so that a volume of exactly 800 is excluded,
# as the exclusion rule states.
diamonds_subset <- subset(diamonds, diamond_volume > 0 & diamond_volume < 800)

# Pearson correlation test between volume and price on the filtered subset.
cor.test(diamonds_subset$diamond_volume, diamonds_subset$price,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds_subset$diamond_volume and diamonds_subset$price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Adjustments - price vs. volume

Notes:

# 1. Plain scatterplot of price against volume on the filtered subset.
ggplot(diamonds_subset, aes(x = diamond_volume, y = price)) +
  geom_point()

# 2. Same plot with semi-transparent points (alpha = 1/10).
ggplot(diamonds_subset, aes(x = diamond_volume, y = price)) +
  geom_point(alpha = 1 / 10)

# 3. Add a red linear-model smoother on top of the points.
ggplot(diamonds_subset, aes(x = diamond_volume, y = price)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(method = 'lm', color = 'red')

# 4. More transparency (alpha = 1/100) and zoom to volume 0-200 and
#    price 0-10000 via coord_cartesian.
ggplot(diamonds_subset, aes(x = diamond_volume, y = price)) +
  geom_point(alpha = 1 / 100) +
  geom_smooth(method = 'lm', color = 'red') +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 10000))

# Pearson correlation test between volume and price on the filtered subset.
cor.test(diamonds_subset$diamond_volume, diamonds_subset$price,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds_subset$diamond_volume and diamonds_subset$price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Findings:

Do you think this would be a useful model to estimate the price of diamonds? Why or why not? Yes, as there is a strong positive correlation (r ≈ 0.92)?


Mean Price by Clarity

Notes: You may need to cast the data as a numeric (float) type when using it on your local machine, e.g. median(as.numeric(var)).

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.1
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the diamonds by clarity, then compute the mean, median, minimum and
# maximum price and the number of rows for each clarity level.
diamondsByClarity <- diamonds %>%
  group_by(clarity)
diamonds.by_clarity <- diamondsByClarity %>%
  summarise(
    mean_price = mean(price),
    median_price = median(as.numeric(price)),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

# Show the first rows of the per-clarity summary.
head(diamonds.by_clarity)
## Source: local data frame [6 x 6]
## 
##   clarity mean_price median_price min_price max_price     n
## 1      I1   3924.169         3344       345     18531   741
## 2     SI2   5063.029         4072       326     18804  9194
## 3     SI1   3996.001         2822       326     18818 13065
## 4     VS2   3924.989         2054       334     18823 12258
## 5     VS1   3839.455         2005       327     18795  8171
## 6    VVS2   3283.737         1311       336     18768  5066

Findings: solution did not work when submitted


Bar Charts of Mean Price

Notes:

# Mean price per clarity level.
diamonds_by_clarity <- diamonds %>%
  group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price per color level.
diamonds_by_color <- diamonds %>%
  group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# Bar charts of the pre-computed means; stat = "identity" makes the bar
# height the mean_price value itself (see ?geom_bar for more on bar charts).
p1 <- ggplot(diamonds_mp_by_clarity, aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")

p2 <- ggplot(diamonds_mp_by_color, aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")

# Stack the two charts vertically in a single column.
grid.arrange(p1, p2, ncol = 1)

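Findings: 1. mean_price tends to decrease as color grade improves (it rises from D towards J) 2. similarly, mean_price tends to decrease as clarity improves (SI2 has the highest mean price)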


Gapminder Revisited

Notes: need help with a suggested dataset