We are going to use the diamonds dataset from the ggplot2 package for data visualization exercises. Preliminary data exploration:

# Attach ggplot2, which provides both the plotting functions and the
# diamonds dataset used throughout this file.
library(ggplot2)

# Load the diamonds dataset into the workspace.
data("diamonds", package = "ggplot2")

# Show the type and the first few values of every column.
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Q1) Create a scatterplot of price vs. x using the ggplot syntax.

# Price (y axis) against the x column (x axis); one point per diamond.
ggplot(data = diamonds, mapping = aes(x = x, y = price)) +
  geom_point()

Q3) Correlations

# Correlation (cor() default method) between price and each of the
# x, y and z columns.
with(diamonds, cor(price, x))
## [1] 0.8844352
with(diamonds, cor(price, y))
## [1] 0.8654209
with(diamonds, cor(price, z))
## [1] 0.8612494

Q4) create a scatterplot of price vs depth using the ggplot syntax.

# "Price vs depth": price goes on the y axis and depth on the x axis, the
# same orientation as the Q1 (price vs x) and Q5 (transparency) plots.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point()

# Correlation between price and depth; cor() is symmetric in its two
# arguments, so the axis orientation above does not affect this value.
cor(diamonds$price, diamonds$depth)
## [1] -0.0106474

Q5) Transparency

# Price against depth with every point drawn at 1% opacity, so that only
# heavily overplotted regions show up, and an x-axis break every 2 units.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2))

Q8) Scatterplot of price vs carat

# "Price vs carat": carat on the x axis, price on the y axis.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

# Price against carat with both axes capped at their 99th percentile, which
# drops the top 1% of carat and price values (hence the warning below).
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point() +
  xlim(0, quantile(diamonds$carat, probs = 0.99)) +
  ylim(0, quantile(diamonds$price, probs = 0.99))
## Warning: Removed 926 rows containing missing values (geom_point).

Q9) Price vs volume

# Add a volume column computed as the product of the x, y and z columns.
diamonds$volume <- with(diamonds, x * y * z)

# Inspect the new column on the first ten rows.
head(diamonds, n = 10)
##    carat       cut color clarity depth table price    x    y    z   volume
## 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43 38.20203
## 2   0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31 34.50586
## 3   0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31 38.07688
## 4   0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63 46.72458
## 5   0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75 51.91725
## 6   0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48 38.69395
## 7   0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47 38.83087
## 8   0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53 42.32108
## 9   0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49 36.42521
## 10  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39 38.71800

# "Price vs volume": volume on the x axis, price on the y axis.
ggplot(data = diamonds, mapping = aes(x = volume, y = price)) +
  geom_point()

Q11) correlations on subsets

# plyr supplies count(), used for the tabulation below.
library(plyr)

# How many diamonds have a computed volume of exactly 0?
count(diamonds$volume == 0)
##       x  freq
## 1 FALSE 53920
## 2  TRUE    20

# Keep only diamonds whose volume is strictly between 0 and 800.
new <- subset(diamonds, volume > 0 & volume < 800)

# Correlation of price and volume on that subset.
with(new, cor(price, volume))
## [1] 0.9235455

Q12) lines on subsets

# Price (y axis) against volume (x axis) on the filtered subset `new`.
p1 <- ggplot(data = new, mapping = aes(x = volume, y = price)) +
  geom_point()
p1

# Default smoother: with volume on x and price on y, the fitted curve
# describes price as a function of volume.
p2 <- p1 + geom_smooth()
p2
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

Q13) use of dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:nlme':
## 
##     collapse
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-clarity price summary: mean, median, minimum, maximum and the number
# of rows in each clarity group.
diamondsByClarity <-
  diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

# Print the summary table.
diamondsByClarity
## Source: local data frame [8 x 6]
## 
##   clarity mean_price median_price min_price max_price     n
##    (fctr)      (dbl)        (dbl)     (int)     (int) (int)
## 1      I1   3924.169         3344       345     18531   741
## 2     SI2   5063.029         4072       326     18804  9194
## 3     SI1   3996.001         2822       326     18818 13065
## 4     VS2   3924.989         2054       334     18823 12258
## 5     VS1   3839.455         2005       327     18795  8171
## 6    VVS2   3283.737         1311       336     18768  5066
## 7    VVS1   2523.115         1093       336     18777  3655
## 8      IF   2864.839         1080       369     18806  1790

Q13) bar chart

# Mean price within each clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price within each color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# One bar per clarity level; bar height is the precomputed mean price
# (stat = "identity" plots the value as given instead of counting rows).
ggplot(
  data = diamonds_mp_by_clarity,
  mapping = aes(x = clarity, y = mean_price, fill = clarity)
) +
  geom_bar(stat = "identity")

# Same chart, grouped by color instead of clarity.
ggplot(
  data = diamonds_mp_by_color,
  mapping = aes(x = color, y = mean_price, fill = color)
) +
  geom_bar(stat = "identity")