We are going to use the diamonds dataset from the ggplot2 package for data visualization exercises. Preliminary data exploration:
library(ggplot2)
# Load the diamonds data set into the workspace and print a compact
# overview of its structure (column names, types, first values).
data("diamonds")
str(object = diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
Q1) Create a scatterplot of price vs. x using the ggplot syntax.
# Scatterplot with the x column on the horizontal axis and price on the
# vertical axis, one point per diamond.
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point()
Q3) Correlations
# Correlation of price with each of the x, y and z columns
# (cor() uses its default method).
with(diamonds, cor(price, x))
## [1] 0.8844352
with(diamonds, cor(price, y))
## [1] 0.8654209
with(diamonds, cor(price, z))
## [1] 0.8612494
Q4) Create a scatterplot of price vs. depth using the ggplot syntax.
# "Price vs depth": price belongs on the y axis and depth on the x axis,
# matching the axis orientation used by the Q1 and Q5 plots.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point()
# Correlation between price and depth (symmetric in its two arguments).
cor(diamonds$price, diamonds$depth)
## [1] -0.0106474
Q5) Transparency
# Price against depth with heavily transparent points (alpha = 1/100) so
# dense regions stand out; x-axis ticks are placed every 2 units from 0 to 80.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(from = 0, to = 80, by = 2))
Q8) Scatterplot of price vs. carat
# Price (y) against carat (x) for all observations. The axes match the
# trimmed plot below so the two can be compared directly.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()
# Same plot with each axis limited to [0, 99th percentile], which drops the
# top 1% of carat and of price values (ggplot2 warns about the removed rows).
ggplot(data = diamonds, aes(x = carat, y = price)) +
  xlim(0, quantile(diamonds$carat, probs = 0.99)) +
  ylim(0, quantile(diamonds$price, probs = 0.99)) +
  geom_point()
## Warning: Removed 926 rows containing missing values (geom_point).
Q9) Price vs volume
# Add a volume column computed as the product of the x, y and z columns.
diamonds$volume <- with(diamonds, x * y * z)
# Show the first 10 rows, now including the new column.
head(diamonds, n = 10)
## carat cut color clarity depth table price x y z volume
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 38.20203
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 34.50586
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 38.07688
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63 46.72458
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 51.91725
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 38.69395
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 38.83087
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 42.32108
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 36.42521
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39 38.71800
# "Price vs volume": price on the y axis, volume on the x axis, consistent
# with the orientation of the Q1 and Q5 price plots.
ggplot(diamonds, aes(x = volume, y = price)) +
  geom_point()
Q11) Correlations on subsets
library(plyr)
# Tabulate how many diamonds have a computed volume of exactly 0.
count(diamonds$volume == 0)
## x freq
## 1 FALSE 53920
## 2 TRUE 20
# Keep only rows whose volume is strictly between 0 and 800, then compute
# the correlation between price and volume on that subset.
new <- subset(diamonds, volume > 0 & volume < 800)
with(new, cor(price, volume))
## [1] 0.9235455
Q12) Smoothing lines on subsets
# Price (y) against volume (x) on the filtered subset `new`.
p1 <- ggplot(new, aes(x = volume, y = price)) +
  geom_point()
p1
# Default smoother. geom_smooth() models y as a function of x, so price must
# sit on the y axis for the fitted line to describe price given volume.
p2 <- p1 + geom_smooth()
p2
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
Q13) Use of dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:nlme':
##
## collapse
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-clarity price summary: mean, median, minimum, maximum and row count.
diamondsByClarity <- summarise(
  group_by(diamonds, clarity),
  mean_price = mean(price),
  median_price = median(price),
  min_price = min(price),
  max_price = max(price),
  n = n()
)
# Print the summary table.
diamondsByClarity
## Source: local data frame [8 x 6]
##
## clarity mean_price median_price min_price max_price n
## (fctr) (dbl) (dbl) (int) (int) (int)
## 1 I1 3924.169 3344 345 18531 741
## 2 SI2 5063.029 4072 326 18804 9194
## 3 SI1 3996.001 2822 326 18818 13065
## 4 VS2 3924.989 2054 334 18823 12258
## 5 VS1 3839.455 2005 327 18795 8171
## 6 VVS2 3283.737 1311 336 18768 5066
## 7 VVS1 2523.115 1093 336 18777 3655
## 8 IF 2864.839 1080 369 18806 1790
Q13) Bar charts of mean price by clarity and by color
# Mean price for each clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price for each color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))

# One bar per level; stat = "identity" makes the bar height the precomputed
# mean_price instead of a row count.
ggplot(data = diamonds_mp_by_clarity,
       mapping = aes(x = clarity, y = mean_price, fill = clarity)) +
  geom_bar(stat = "identity")
ggplot(data = diamonds_mp_by_color,
       mapping = aes(x = color, y = mean_price, fill = color)) +
  geom_bar(stat = "identity")