Introduction

This report provides an in-depth analysis of the diamonds dataset using various visualization techniques. The aim is to uncover insights into diamond characteristics and their relationships.

Loading Libraries and Data

# Attach ggplot2: it supplies the plotting functions used in this report
# and ships the `diamonds` dataset.
library("ggplot2")

# Load the packaged dataset into the workspace and keep a working copy under
# its own name; every later step operates on `diamonds_data`.
data(list = "diamonds")
diamonds_data <- diamonds

Convert relevant variables to factors for consistent handling in plots

# Pass the three categorical columns (cut, color, clarity) through
# as.factor() in one step rather than one assignment per column.
# as.factor() returns an existing factor as-is, so columns that are already
# (ordered) factors keep their levels and ordering.
diamonds_data[c("cut", "color", "clarity")] <- lapply(
  diamonds_data[c("cut", "color", "clarity")],
  as.factor
)

Data Exploration

Structure and Summary

# Print the compact structure of the data: dimensions, column types and the
# first few values of each column.
str(diamonds_data)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Print per-column summary statistics (quartiles and mean for numeric columns,
# level counts for factors).
# NOTE(review): the printed output shows minimum values of 0 for x, y and z;
# presumably these are invalid measurements -- confirm before relying on them.
summary(diamonds_data)
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 

Visualizations

Scatter Plot - Carat vs Price

# Scatter plot of price against carat. Points are semi-transparent
# (alpha = 0.3) so that dense regions show up darker.
ggplot(data = diamonds_data, mapping = aes(x = carat, y = price)) +
  geom_point(colour = "blue", alpha = 0.3) +
  ggtitle("Carat vs Price of Diamonds") +
  xlab("Carat") +
  ylab("Price (USD)")

Scatter Plot with Cut Quality as Color

# Scatter plot of price against carat with points colored by cut quality.
ggplot(diamonds_data, aes(x = carat, y = price, color = cut)) +
  geom_point(alpha = 0.3, size = 1.5) +
  # The points are drawn at alpha = 0.3; without this override the legend
  # keys inherit that transparency and are hard to tell apart.
  guides(color = guide_legend(override.aes = list(alpha = 1))) +
  labs(
    title = "Carat vs Price by Cut Quality",
    subtitle = "Influence of Cut on Price vs Carat Relationship",
    x = "Carat", y = "Price (USD)", color = "Cut Quality"
  )

Boxplot - Price Distribution by Cut

# Box plots of price, one box per cut level.
ggplot(diamonds_data) +
  geom_boxplot(
    mapping = aes(x = cut, y = price),
    fill = "lightblue",
    colour = "darkblue"
  ) +
  labs(
    title = "Boxplot of Diamond Prices by Cut Quality",
    x = "Cut Quality",
    y = "Price (USD)"
  )

Histogram - Distribution of Carat Sizes

# Histogram of carat values using 30 bins.
ggplot(data = diamonds_data, mapping = aes(x = carat)) +
  geom_histogram(fill = "coral", colour = "black", bins = 30) +
  ggtitle("Distribution of Diamond Carat Sizes") +
  xlab("Carat") +
  ylab("Count")

Density Plot - Price

# Density curve of price, filled with a translucent purple.
ggplot(diamonds_data) +
  geom_density(mapping = aes(x = price), alpha = 0.4, fill = "purple") +
  labs(
    title = "Density Plot of Diamond Prices",
    x = "Price (USD)",
    y = "Density"
  )

Bar Plot - Frequency of Diamond Colors

# Bar chart counting the number of diamonds in each color grade.
ggplot(data = diamonds_data, mapping = aes(x = color)) +
  geom_bar(colour = "darkgreen", fill = "lightgreen") +
  ggtitle("Frequency of Diamond Colors") +
  xlab("Diamond Color") +
  ylab("Count")

Violin Plot - Price Distribution by Clarity

# Violin plots of price, one violin per clarity grade, filled by that grade.
ggplot(diamonds_data, aes(x = clarity, y = price, fill = clarity)) +
  geom_violin() +
  labs(
    title = "Price Distribution by Diamond Clarity",
    x = "Clarity", y = "Price (USD)",
    # Title the fill legend explicitly; otherwise it shows the raw column
    # name "clarity" next to the capitalised axis label.
    fill = "Clarity"
  )

Faceted Scatter Plot - Carat vs Price by Cut and Color

# Grid of carat-vs-price scatter plots: one row of panels per cut level and
# one column of panels per color grade.
ggplot(data = diamonds_data, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 0.3) +
  facet_grid(rows = vars(cut), cols = vars(color)) +
  ggtitle("Faceted Plot of Carat vs Price by Cut and Color") +
  xlab("Carat") +
  ylab("Price (USD)")

Pie Chart for Cut Distribution

# Pie chart of cut frequencies: a single stacked bar (constant x) is bent
# into a circle by mapping the stacked counts (y) to the angle.
ggplot(data = diamonds_data, mapping = aes(x = factor(1), fill = cut)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  ggtitle("Pie Chart of Cut Distribution") +
  theme_void()

Conclusion

This report has visualized the diamonds dataset using various techniques in ggplot2, providing a comprehensive overview of diamond characteristics. Each visualization offers unique insights, aiding in understanding the data better.

Appendix

Additional Themes: Explore further themes, such as those provided by the ggthemes package, for enhanced visual appeal.

# install.packages("ggthemes")  # uncomment to install ggthemes, which provides theme_economist()
# library(ggthemes)
# ggplot(diamonds_data, aes(x=carat, y=price, color=cut)) +
#   geom_point(alpha=0.3, size=1) +
#   labs(title="Carat vs Price with Economist Theme") +
#   theme_economist()