This report provides an in-depth analysis of the diamonds dataset using various visualization techniques. The aim is to uncover insights into diamond characteristics and their relationships.
library(ggplot2)
# Load the diamonds dataset that ships with ggplot2 and keep a working copy.
data("diamonds")
diamonds_data <- diamonds

# cut, color and clarity are already (ordered) factors in this dataset, and
# as.factor() returns an existing factor unchanged, so re-casting them is a
# no-op. Assert the assumption explicitly instead of silently re-converting.
stopifnot(
  is.factor(diamonds_data$cut),
  is.factor(diamonds_data$color),
  is.factor(diamonds_data$clarity)
)

# Show the type and a preview of every column.
str(diamonds_data)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Per-column summary statistics for the working copy of the dataset.
# NOTE(review): the printed summary reports a minimum of 0 for x, y and z --
# presumably unrecorded dimensions; confirm before relying on these columns.
summary(diamonds_data)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
# Scatter plot of price against carat; points are semi-transparent to
# reduce overplotting.
carat_price_plot <- ggplot(
  data = diamonds_data,
  mapping = aes(x = carat, y = price)
) +
  geom_point(alpha = 0.3, color = "blue") +
  labs(
    title = "Carat vs Price of Diamonds",
    x = "Carat",
    y = "Price (USD)"
  )
carat_price_plot
# Scatter plot of price against carat, coloured by cut quality.
ggplot(diamonds_data, aes(x = carat, y = price, color = cut)) +
  geom_point(alpha = 0.3, size = 1.5) +
  labs(
    title = "Carat vs Price by Cut Quality",
    subtitle = "Influence of Cut on Price vs Carat Relationship",
    x = "Carat",
    y = "Price (USD)",
    color = "Cut Quality"
  ) +
  # The points are drawn semi-transparent, and the legend keys would inherit
  # that alpha; draw the keys fully opaque so the colours stay readable.
  guides(color = guide_legend(override.aes = list(alpha = 1)))
# Box plot of price for each cut quality level.
ggplot(data = diamonds_data, mapping = aes(x = cut, y = price)) +
  geom_boxplot(color = "darkblue", fill = "lightblue") +
  ggtitle("Boxplot of Diamond Prices by Cut Quality") +
  xlab("Cut Quality") +
  ylab("Price (USD)")
# Histogram of carat sizes using 30 bins.
carat_histogram <- ggplot(data = diamonds_data, mapping = aes(x = carat)) +
  geom_histogram(bins = 30, color = "black", fill = "coral") +
  labs(
    title = "Distribution of Diamond Carat Sizes",
    x = "Carat",
    y = "Count"
  )
carat_histogram
# Kernel density estimate of diamond prices, drawn as a translucent area.
ggplot(data = diamonds_data, mapping = aes(x = price)) +
  geom_density(alpha = 0.4, fill = "purple") +
  ggtitle("Density Plot of Diamond Prices") +
  xlab("Price (USD)") +
  ylab("Density")
# Bar chart counting the diamonds in each color grade.
color_bar_plot <- ggplot(data = diamonds_data, mapping = aes(x = color)) +
  geom_bar(color = "darkgreen", fill = "lightgreen") +
  labs(
    title = "Frequency of Diamond Colors",
    x = "Diamond Color",
    y = "Count"
  )
color_bar_plot
# Violin plot of the price distribution for each clarity grade.
ggplot(diamonds_data, aes(x = clarity, y = price, fill = clarity)) +
  # fill mirrors the x axis, so its legend would only repeat the axis labels;
  # hide it to give the panel more room.
  geom_violin(show.legend = FALSE) +
  labs(
    title = "Price Distribution by Diamond Clarity",
    x = "Clarity",
    y = "Price (USD)"
  )
# Price against carat in a grid of panels: one row per cut, one column per
# color.
faceted_scatter <- ggplot(
  data = diamonds_data,
  mapping = aes(x = carat, y = price)
) +
  geom_point(alpha = 0.3) +
  facet_grid(cut ~ color) +
  labs(
    title = "Faceted Plot of Carat vs Price by Cut and Color",
    x = "Carat",
    y = "Price (USD)"
  )
faceted_scatter
# Pie chart of cut shares: a single stacked bar (constant x) wrapped into
# polar coordinates along the count axis, with all axes and grid removed.
cut_pie_chart <- ggplot(
  data = diamonds_data,
  mapping = aes(x = factor(1), fill = cut)
) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Pie Chart of Cut Distribution") +
  theme_void()
cut_pie_chart
This report visualized the diamonds dataset with a range of ggplot2 techniques, giving an overview of diamond characteristics. Each visualization highlights a different aspect of the data, such as how price relates to carat, cut, color, and clarity.
Additional Themes: Explore further ggplot2 themes for enhanced visual appeal.
# install.packages("ggthemes") # uncomment to install ggthemes (provides theme_economist())
# library(ggthemes)
# ggplot(diamonds_data, aes(x=carat, y=price, color=cut)) +
# geom_point(alpha=0.3, size=1) +
# labs(title="Carat vs Price with Economist Theme") +
# theme_economist()