Week_3_Data_Dive.knit

Title : “Week_3_Data_Dive” “Output : html_document”

load the necessary libraries and load the data

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data(diamonds)

View(diamonds)

we do not want the diamond data where its dimensions are 0, so we are removing them

diamonds <- diamonds[diamonds$x != 0 & diamonds$y != 0 & diamonds$z != 0, ]

We will look at how many diamonds are present according to ‘cut’ and their average price

grouped_by_cut <- diamonds %>%
  group_by(cut) %>%
  summarise(count = n(),mean_price = mean(price),.groups = 'drop')
grouped_by_cut

## # A tibble: 5 × 3
##   cut       count mean_price
##   <ord>     <int>      <dbl>
## 1 Fair       1609      4358.
## 2 Good       4902      3926.
## 3 Very Good 12081      3982.
## 4 Premium   13780      4580.
## 5 Ideal     21548      3457.

ggplot(data = grouped_by_cut, aes(x = cut, y = mean_price, fill = cut)) +
  geom_bar(stat = 'identity') +
  labs(title = "Mean Price for Each Cut Type",
       x = "Cut",
       y = "Mean Price") +
  theme_classic()

Similarly, we need to look at data that falls under ‘Clarity’

grouped_by_clarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(count = n(),mean_price = mean(price), .groups = 'drop')
grouped_by_clarity

## # A tibble: 8 × 3
##   clarity count mean_price
##   <ord>   <int>      <dbl>
## 1 I1        738      3926.
## 2 SI2      9185      5060.
## 3 SI1     13063      3994.
## 4 VS2     12254      3923.
## 5 VS1      8170      3839.
## 6 VVS2     5066      3284.
## 7 VVS1     3654      2520.
## 8 IF       1790      2865.

ggplot(data=grouped_by_clarity,aes(x=clarity,y=mean_price,fill=clarity))+geom_bar(stat='identity')+labs(title = "Mean Price for Each Clarity Type",
       x = "Clarity",
       y = "Mean Price") +
  theme_minimal()

let’s also look at how many diamonds that falls under each color and their average price

grouped_by_color <- diamonds %>%
  group_by(color) %>%
  summarise(count = n(), mean_price = mean(price), .groups = 'drop')
grouped_by_color

## # A tibble: 7 × 3
##   color count mean_price
##   <ord> <int>      <dbl>
## 1 D      6774      3168.
## 2 E      9797      3077.
## 3 F      9538      3725.
## 4 G     11284      3997.
## 5 H      8298      4481.
## 6 I      5421      5090.
## 7 J      2808      5324.

Let’s create a column quality for color, where it is divided into 5 bins and each has a separate label based on the average price of the diamond with respect to the color.

result <- diamonds|>
  group_by(color)|>
  summarise(avg_price=mean(price), .groups = 'drop')
result$quality <- cut(result$avg_price,breaks = 5, labels =c('very_low','low','medium','high','very high') )
result

## # A tibble: 7 × 3
##   color avg_price quality  
##   <ord>     <dbl> <fct>    
## 1 D         3168. very_low 
## 2 E         3077. very_low 
## 3 F         3725. low      
## 4 G         3997. medium   
## 5 H         4481. high     
## 6 I         5090. very high
## 7 J         5324. very high

let us visualize which color(D to J) falls under which bin(quality)

ggplot(data=result, aes(x=quality,y=avg_price,fill = color))+geom_bar(stat = 'identity', position='dodge')+labs(title = "quality of the diamond by color and average price ")

Now let us create a column which has the probability that diamond falls under a bin

result_prob <- result |>
  group_by(quality)|>
  summarise(count=n(), .groups = 'drop')|>
  mutate(q_prob = count/sum(count))
result_prob

## # A tibble: 5 × 3
##   quality   count q_prob
##   <fct>     <int>  <dbl>
## 1 very_low      2  0.286
## 2 low           1  0.143
## 3 medium        1  0.143
## 4 high          1  0.143
## 5 very high     2  0.286

the probability of a diamond that has a very low quality is 0.2857143

Creating a tag for the probabilities.

Low_Probability = min(result_prob$q_prob)
High_Probability = max(result_prob$q_prob)
result_prob <-result_prob |>
  mutate(Tag = case_when(q_prob == Low_Probability~ "Smallest Group",q_prob == High_Probability~ "Highest Group", TRUE ~ "Other Groups"))

result_prob

## # A tibble: 5 × 4
##   quality   count q_prob Tag           
##   <fct>     <int>  <dbl> <chr>         
## 1 very_low      2  0.286 Highest Group 
## 2 low           1  0.143 Smallest Group
## 3 medium        1  0.143 Smallest Group
## 4 high          1  0.143 Smallest Group
## 5 very high     2  0.286 Highest Group

merging result and result_prob

result <- result %>%
  left_join(result_prob,by='quality')
result

## # A tibble: 7 × 6
##   color avg_price quality   count q_prob Tag           
##   <ord>     <dbl> <fct>     <int>  <dbl> <chr>         
## 1 D         3168. very_low      2  0.286 Highest Group 
## 2 E         3077. very_low      2  0.286 Highest Group 
## 3 F         3725. low           1  0.143 Smallest Group
## 4 G         3997. medium        1  0.143 Smallest Group
## 5 H         4481. high          1  0.143 Smallest Group
## 6 I         5090. very high     2  0.286 Highest Group 
## 7 J         5324. very high     2  0.286 Highest Group

now let us merge with main dataset

diamonds <- left_join(diamonds,result,by='color')
  
diamonds

## # A tibble: 53,920 × 15
##    carat cut       color clarity depth table price     x     y     z avg_price
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43     3077.
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31     3077.
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31     3077.
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63     5090.
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75     5324.
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48     5324.
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47     5090.
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53     4481.
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49     3077.
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39     4481.
## # ℹ 53,910 more rows
## # ℹ 4 more variables: quality <fct>, count <int>, q_prob <dbl>, Tag <chr>

What a random row is selected, what is the probability that the color is D
0.286

Hypothesis: as the average price of the diamond increases, the diamonds fall under a a small group

we can check that by visualizing the data

diamonds|>
ggplot(aes(x=Tag,y=avg_price,color=Tag))+
  geom_boxplot()

As the diamond average price increases it may fall under the small group.