Introduction

This report provides an analysis of the Diamonds dataset from the ggplot2 package. The dataset includes variables such as carat, cut, color, clarity, and price of diamonds.

Data Preparation

We will begin by loading the required libraries and exploring the dataset.

Unique values of carat and color, and the number of distinct values of each

# Distinct carat values, followed by how many distinct values there are.
unique(diamonds[["carat"]])
length(unique(diamonds[["carat"]]))
##   [1] 0.23 0.21 0.29 0.31 0.24 0.26 0.22 0.30 0.20 0.32 0.33 0.25 0.35 0.42 0.28
##  [16] 0.38 0.70 0.86 0.71 0.78 0.96 0.73 0.80 0.75 0.74 0.81 0.59 0.90 0.91 0.61
##  [31] 0.77 0.63 0.76 0.64 0.72 0.79 0.58 1.17 0.60 0.83 0.54 0.98 0.52 1.01 0.53
##  [46] 0.84 0.51 1.05 0.55 0.87 1.00 0.57 0.82 1.04 0.93 1.20 0.99 0.34 0.43 0.36
##  [61] 0.95 0.89 1.02 0.97 0.56 0.85 0.92 1.27 0.66 1.12 0.68 1.03 0.62 1.22 1.08
##  [76] 0.88 0.50 1.19 0.39 0.65 1.24 1.50 0.27 0.41 1.13 1.06 0.69 0.40 1.14 0.94
##  [91] 1.29 1.52 1.16 1.21 1.23 1.09 0.67 1.11 1.10 1.18 1.15 1.25 1.07 1.28 1.51
## [106] 0.37 1.31 1.26 1.39 1.44 1.35 1.30 1.32 1.41 1.36 1.45 1.34 1.58 1.54 1.38
## [121] 1.33 1.74 1.64 1.47 1.40 1.55 1.95 2.00 1.37 1.83 1.62 1.57 1.69 2.06 1.72
## [136] 1.66 2.14 1.49 1.46 2.15 1.96 2.22 1.70 1.53 1.85 2.01 2.27 1.68 1.56 1.81
## [151] 1.65 1.82 2.03 1.73 1.59 1.42 1.43 2.08 1.48 1.60 2.49 1.71 2.02 2.07 3.00
## [166] 2.21 2.10 1.91 2.25 2.17 2.32 2.72 1.61 2.23 2.11 2.05 1.63 2.30 2.31 1.75
## [181] 2.04 2.12 1.77 2.50 1.80 1.67 1.84 2.20 3.01 1.88 2.33 2.68 2.34 1.90 2.16
## [196] 2.74 1.78 1.76 2.28 1.79 1.94 2.43 1.86 3.11 1.87 2.09 1.89 2.52 2.19 2.18
## [211] 2.77 2.63 3.05 2.46 3.02 2.38 2.24 2.26 2.36 1.99 2.29 3.65 2.45 2.40 2.54
## [226] 3.24 2.13 2.58 3.22 3.50 2.48 1.98 2.44 2.75 1.93 2.41 2.61 2.35 2.51 2.70
## [241] 2.55 1.97 2.53 2.37 2.47 2.80 4.01 2.56 3.04 1.92 2.39 3.40 4.00 3.67 2.42
## [256] 2.66 2.65 2.59 2.60 2.57 2.71 4.13 2.64 5.01 4.50 2.67 3.51 0.44 0.45 0.47
## [271] 0.46 0.48 0.49
## [1] 273
# Distinct color grades, followed by how many distinct grades there are.
unique(diamonds[["color"]])
length(unique(diamonds[["color"]]))
## [1] E I J H F G D
## Levels: D < E < F < G < H < I < J
## [1] 7
# Convert the x and y columns to factors (categorical) and inspect the result.
# NOTE(review): the levels printed by str() below ("0", "3.73", "3.74", ...)
# look like numeric measurements rather than character data -- confirm that
# treating these columns as categorical is intended.

diamonds[["x"]] <- as.factor(diamonds[["x"]])
diamonds[["y"]] <- as.factor(diamonds[["y"]])
str(diamonds[["x"]])
##  Factor w/ 554 levels "0","3.73","3.74",..: 21 15 31 46 60 20 21 33 13 26 ...
str(diamonds[["y"]])
##  Factor w/ 552 levels "0","3.68","3.71",..: 27 13 36 52 64 25 27 40 8 34 ...
# Number of observations (rows) in the dataset.
n <- nrow(diamonds)

# Row index 1..n. seq_len(n) is the safe alternative to 1:n and seq(1, n):
# it returns an empty vector when n is 0 instead of counting down to c(1, 0).
# NOTE(review): this global `carat` shares its name with the diamonds column.
# aes(x = carat) in the plots below presumably resolves to the data column
# first, so this vector does not appear to be used by them -- confirm whether
# it is still needed.
carat <- seq_len(n)

# Layer 1: set up the two-dimensional plotting space (carat on x, color on y).
# No geometry is added yet.
ggplot(data = diamonds, mapping = aes(x = carat, y = color))

# Layer 2: choose a geometry to draw.
# Points are used here.

ggplot(data = diamonds, mapping = aes(x = carat, y = color)) +
  geom_point()

# Layer 3: fixed aesthetics of the points.
# Three options are shown in turn: size, shape, and color.

# Point size
ggplot(data = diamonds, mapping = aes(x = carat, y = color)) +
  geom_point(size = 5)

# Point size together with point shape
ggplot(data = diamonds, mapping = aes(x = carat, y = color)) +
  geom_point(size = 2, shape = 4)

# Point color
ggplot(data = diamonds, mapping = aes(x = carat, y = color)) +
  geom_point(color = "red")

# Size, shape, and color combined in a single plot. The plot object is stored
# so later steps can add to it, then printed.

single_num_plot1 <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = color)
) +
  geom_point(color = "red", size = 1.75, shape = 25)

single_num_plot1

# Layer 4: axis labels and plot titles.
# The labels are added on top of the existing plot object.

single_num_plot2 <- single_num_plot1 +
  labs(
    title = "diamonds dataset",
    subtitle = "data",
    x = "carat",
    y = "color",
    caption = "diamonds"
  )
single_num_plot2

# Layer 5: text styling (color, size, font face) for the title, subtitle,
# axis titles, and caption.

single_num_plot3 <- single_num_plot2 +
  theme(
    plot.title = element_text(
      face = "bold.italic", size = 14, color = "orange"
    ),
    plot.subtitle = element_text(
      face = "italic", size = 12, color = "green4"
    ),
    axis.title.x = element_text(face = "bold", size = 14, color = "blue"),
    axis.title.y = element_text(face = "bold", size = 14, color = "black"),
    plot.caption = element_text(
      face = "italic", size = 14, color = "steelblue"
    )
  )

single_num_plot3

# All plots in a grid

# gridExtra provides grid.arrange(); the package must already be installed.
# Messages and warnings emitted while attaching it are silenced.
suppressWarnings(suppressMessages(library(gridExtra)))

# Create the individual plots

# Density of carat
p1 <- ggplot(diamonds, aes(x = carat)) +
  geom_density(fill = "skyblue") +
  ggtitle("Density Plot")

# Count of diamonds per cut
p2 <- ggplot(diamonds, aes(x = cut)) +
  geom_bar(fill = "pink") +
  ggtitle("Bar Plot")

# Price distribution per cut
p3 <- ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("Boxplot")

# Histogram of carat with bins of width 0.1
p4 <- ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.1, fill = "orange") +
  ggtitle("Histogram")

# Price against carat with a smoothed trend line.
# method = "loess" was used here before, but on the full dataset it failed
# with "workspace required (4364865305) is too large probably because of
# setting 'se = TRUE'", so the smooth layer was not drawn. A GAM smoother with
# an explicit formula is used instead; spelling out the formula also avoids
# the "`geom_smooth()` using formula = ..." message.
# NOTE(review): the title says "Smoothed Histogram" but this is a scatter
# plot with a smoothing line -- consider renaming.
p5 <- ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  ggtitle("Smoothed Histogram")

# Pie chart of cut: a single stacked bar wrapped into polar coordinates
p6 <- ggplot(diamonds, aes(x = 2, fill = cut)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  ggtitle("Pie Chart") +
  theme_void()

# Arrange the six plots in a grid with three columns
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)