# This is the exercise on visualizing one-dimensional variables in the dataset
library(ggplot2)
# Materialise the diamonds data set into the workspace and keep a working
# copy under the short name used by the rest of the script.
data("diamonds")
df <- diamonds
# Compact overview: one line per column with its type and leading values.
str(object = df)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Per-column summary of the working data frame: quartiles, mean and range for
# the numeric columns, level counts for the factor columns.
summary(df)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
# Number of rows and columns in the working data frame.
dim(df)
## [1] 53940 10
# Structure of the colour-grade column on its own.
str(df[["color"]])
## Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
# Histogram of diamond prices (bin width 250) with the mean and the median
# price marked by vertical segments.

# Compute each statistic once; both ends of a segment share the same x value.
price_mean <- mean(df$price)
price_median <- median(df$price)

# The column is referenced by its bare name so that it is looked up in `data`;
# writing `df$price` inside qplot() bypasses that lookup.
qplot(
  x = price, data = df,
  binwidth = 250,
  xlab = "Diamond Price",
  ylab = "Count",
  main = "Distribution for diamond prices",
  col = I("orange")
) +
  scale_x_continuous(breaks = seq(0, 20000, 2000)) +
  # Mean price (blue); the segment spans counts 0 to 2500.
  annotate(
    "segment",
    x = price_mean, xend = price_mean,
    y = 0, yend = 2500,
    colour = "blue",
    size = 1
  ) +
  # Median price (green), drawn over the same count range.
  annotate(
    "segment",
    x = price_median, xend = price_median,
    y = 0, yend = 2500,
    colour = "green",
    size = 1
  )
# Numeric summary of price, followed by the number of diamonds that fall in
# three price ranges.
summary(df$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
# Summing a logical vector counts its TRUE values directly, so the comparison
# does not need to be recoded to 1/0 with ifelse() first.
count_less_500 <- sum(df$price < 500)
count_less_250 <- sum(df$price < 250)
# Note the inclusive bound: this counts prices of 15000 and above.
count_more_15000 <- sum(df$price >= 15000)
print(count_less_500)
## [1] 1729
print(count_less_250)
## [1] 0
print(count_more_15000)
## [1] 1656
# Zoomed view of the price histogram (bin width 100) restricted to the
# 500-2000 price range, displayed and then written to disk.
# The plot is kept in a variable so that ggsave() is handed the exact object
# to write instead of relying on whatever plot was built last.
price_zoom <- qplot(
  x = price, data = df,
  binwidth = 100,
  xlab = "Diamond Price",
  ylab = "Count",
  main = "Distribution for diamond prices",
  col = I("orange")
) +
  # The x range is restricted on the coordinate system, not on the scale.
  coord_cartesian(xlim = c(500, 2000)) +
  scale_y_continuous(breaks = seq(0, 3000, 250))
price_zoom
# File name (including its spelling) is kept unchanged so that anything
# already pointing at this file keeps working.
ggsave(
  "zoomed version of diamond price distrubution.jpg",
  plot = price_zoom
)
## Saving 7 x 5 in image
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.3
# Price by cut, shown two ways and stacked in one figure:
#   f1 - one price histogram (bin width 200) per cut, each panel with its own
#        axis scaling;
#   f2 - box plots of price for each cut.

# `price` is referenced by bare name so that it is resolved inside `data`
# alongside the faceting variable; `df$price` would be evaluated outside the
# data frame that ggplot2 splits into panels.
f1 <- qplot(
  x = price, data = df,
  binwidth = 200,
  xlab = "Diamond Price",
  ylab = "Count",
  main = "Distribution for diamond prices",
  col = I("gray"),
  xlim = c(0, 10000)
) +
  facet_wrap(~cut, ncol = 3, scales = "free")

f2 <- qplot(
  x = cut, y = price, data = df,
  geom = "boxplot",
  xlab = "Diamond Cut",
  ylab = "Diamond price",
  main = "Diamond prices Vs Cut",
  fill = I("pink")
) +
  # Limit the visible price range to 0-10000 via the coordinate system.
  coord_cartesian(ylim = c(0, 10000))

# Histograms on top, box plots underneath.
grid.arrange(f1, f2, nrow = 2)
# Histogram of price per carat (bin width 500), one panel per cut with
# independent axis scaling.
# The ratio is written with bare column names so it is computed from `data`,
# and the labels use the correct spelling "carat".
qplot(
  x = price / carat, data = df,
  binwidth = 500,
  xlab = "Price per carat",
  ylab = "Count",
  main = "Distribution for diamond prices/carat",
  col = I("gray"),
  xlim = c(0, 10000)
) +
  facet_wrap(~cut, ncol = 3, scales = "free")
# Price per carat broken down by cut, drawn three ways and stacked in one
# figure: histogram, frequency polygon, and density of the log10 values.
# All aesthetics use bare column names so they are evaluated inside `data`.

# Histogram (bin width 500); bar outlines are coloured by cut.
f1 <- qplot(
  x = price / carat, data = df,
  binwidth = 500,
  geom = "histogram",
  xlab = "Price per carat",
  ylab = "Count",
  main = "Histogram for diamond prices/carat",
  col = cut
)

# Frequency polygon over the same bins, one line per cut.
f2 <- qplot(
  x = price / carat, data = df,
  binwidth = 500,
  geom = "freqpoly",
  xlab = "Price per carat",
  ylab = "Count",
  main = "Frequency polygon for diamond prices/carat",
  col = cut
)

# Density of log10(price per carat), filled by cut.
# - No `binwidth`: a density geom does not bin, so the argument had no effect.
# - `alpha` is wrapped in I() so it is applied as a fixed transparency rather
#   than mapped as a data value (which would add a meaningless legend entry).
# - Axis labels and title describe what is actually drawn: a density curve of
#   the log10-transformed ratio.
f3 <- qplot(
  x = log10(price / carat), data = df,
  geom = "density",
  xlab = "log10(price per carat)",
  ylab = "Density",
  main = "Density of log10 transformed diamond prices/carat",
  fill = cut,
  alpha = I(0.2)
)

grid.arrange(f1, f2, f3, nrow = 3)