Week_7_Data_Dive.knit

Title: Data_Dive_Week_7
Output: HTML document

#installing neceasary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(pwr)
library(pwrss)

## 
## Attaching package: 'pwrss'

## The following object is masked from 'package:stats':
## 
##     power.t.test

#load and view the data
head(diamonds)

## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

Null Hypothesis: The mean prices of diamonds with “good” and “ideal” cuts are equal.

diamonds_subset <- subset(diamonds, cut %in% c("Good", "Ideal"))


sigma_pooled <- sqrt(var(diamonds_subset$price))
alpha <- 0.03
beta <- 1 - 0.70
Z_alpha <- qnorm(1 - alpha/2)
Z_beta <- qnorm(1 - beta)
d <- 0.2


t_test_result <- t.test(price ~ cut, data = diamonds_subset, var.equal = TRUE)

p_value <- t_test_result$p.value
if (p_value < alpha) {
  conclusion <- "Reject the null hypothesis"
} else {
  conclusion <- "Fail to reject the null hypothesis"
}


cat("T-test p-value:", p_value, "\n")

## T-test p-value: 3.638743e-15

cat("Conclusion:", conclusion, "\n")

## Conclusion: Reject the null hypothesis

Since the P-value is very low. we can strongly say that there is a significant difference between the prices of diamonds with good and ideal. let’s visualize this

diamonds_subset <- subset(diamonds, cut %in% c("Good", "Ideal"))

# Create a box plot
ggplot(diamonds_subset, aes(x = cut, y = price, fill = cut)) +
  geom_boxplot() +
  labs(title = "Comparison of Diamond Prices: Very Good vs. Ideal Cut",
       x = "Cut",
       y = "Price (USD)") +
  theme_minimal()

let us conduct a two-sample t-test power analysis

diamonds_good <- subset(diamonds, cut == "Good")
diamonds_ideal <- subset(diamonds, cut == "Ideal")


independent_test <- pwr.t.test(n = NULL, 
                               d = (mean(diamonds_good$price) - mean(diamonds_ideal$price)) / sqrt((sd(diamonds_good$price)^2 + sd(diamonds_ideal$price)^2) / 2), 
                               sig.level = 0.05,
                               power = 0.85,
                               type = "two.sample",
                               alternative = "two.sided")


print(independent_test)

## 
##      Two-sample t test power calculation 
## 
##               n = 1134.977
##               d = 0.1258359
##       sig.level = 0.05
##           power = 0.85
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

plot(independent_test)

Interpretation:
This output suggests that a study design employing two independent groups with sample sizes of 1135 each has an 85% chance of detecting a statistically significant difference in their means if one truly exists.
1. The chosen power level (0.85) indicates a moderate to high level of confidence in detecting an effect, but it needs larger sample sizes compared to lower power levels.
2. The Type II error rate (0.15) indicates a 15% chance of missing a true effect.

# Original values
alpha_original <- 0.05
power_original <- 0.85

# Scenario 1: Decreasing alpha to 0.01
alpha_scenario1 <- 0.01
power_scenario1 <- 0.85

# Scenario 2: Decreasing alpha to 0.01 and increasing power to 0.90
alpha_scenario2 <- 0.01
power_scenario2 <- 0.90

# Scenario 3: Increasing power to 0.90
alpha_scenario3 <- 0.05
power_scenario3 <- 0.90

# Perform power analysis for each scenario
independent_test_original <- pwr.t.test(n = NULL, 
                                        d = (mean(diamonds_good$price) - mean(diamonds_ideal$price)) / sqrt((sd(diamonds_good$price)^2 + sd(diamonds_ideal$price)^2) / 2), 
                                        sig.level = alpha_original,
                                        power = power_original,
                                        type = "two.sample",
                                        alternative = "two.sided")

independent_test_scenario1 <- pwr.t.test(n = NULL, 
                                         d = (mean(diamonds_good$price) - mean(diamonds_ideal$price)) / sqrt((sd(diamonds_good$price)^2 + sd(diamonds_ideal$price)^2) / 2), 
                                         sig.level = alpha_scenario1,
                                         power = power_scenario1,
                                         type = "two.sample",
                                         alternative = "two.sided")

independent_test_scenario2 <- pwr.t.test(n = NULL, 
                                         d = (mean(diamonds_good$price) - mean(diamonds_ideal$price)) / sqrt((sd(diamonds_good$price)^2 + sd(diamonds_ideal$price)^2) / 2), 
                                         sig.level = alpha_scenario2,
                                         power = power_scenario2,
                                         type = "two.sample",
                                         alternative = "two.sided")

independent_test_scenario3 <- pwr.t.test(n = NULL, 
                                         d = (mean(diamonds_good$price) - mean(diamonds_ideal$price)) / sqrt((sd(diamonds_good$price)^2 + sd(diamonds_ideal$price)^2) / 2), 
                                         sig.level = alpha_scenario3,
                                         power = power_scenario3,
                                         type = "two.sample",
                                         alternative = "two.sided")


print("Original Values:")

## [1] "Original Values:"

print(independent_test_original)

## 
##      Two-sample t test power calculation 
## 
##               n = 1134.977
##               d = 0.1258359
##       sig.level = 0.05
##           power = 0.85
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

print("Scenario 1:")

## [1] "Scenario 1:"

print(independent_test_scenario1)

## 
##      Two-sample t test power calculation 
## 
##               n = 1649.744
##               d = 0.1258359
##       sig.level = 0.01
##           power = 0.85
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

print("Scenario 2:")

## [1] "Scenario 2:"

print(independent_test_scenario2)

## 
##      Two-sample t test power calculation 
## 
##               n = 1881.001
##               d = 0.1258359
##       sig.level = 0.01
##           power = 0.9
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

print("Scenario 3:")

## [1] "Scenario 3:"

print(independent_test_scenario3)

## 
##      Two-sample t test power calculation 
## 
##               n = 1328.101
##               d = 0.1258359
##       sig.level = 0.05
##           power = 0.9
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Interpretation:
If we decrease the alpha value, it takes a much larger sample size to achieve the same power level.
If we increase the alpha value, it takes a lesser sample size to achieve the same power level.

If we want to increase the likelihood of detecting true effects, it may require larger sample sizes or higher effect sizes. hence we are taking alpha as 0.05 and power level at 0.85 for optimal sample size.

Hypothesis:

Null Hypothesis: The mean prices of diamonds across all levels of the “cut” variable are equal.
Alternative Hypothesis: At least one pair of mean prices of diamonds across different levels of the “cut” variable are not equal.

# Perform ANOVA
anova_result <- aov(price ~ cut, data = diamonds_subset)
summary(anova_result)

##                Df    Sum Sq   Mean Sq F value   Pr(>F)    
## cut             1 8.878e+08 887750007   61.96 3.64e-15 ***
## Residuals   26455 3.790e+11  14327815                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation:

The P-value is very low, it is safe to assume that we can reject the null hypothesis. let us visualize this

cut_colors <- c("Fair" = "#FF5733", "Good" = "#FFC300", "Very Good" = "#C70039", 
                "Premium" = "#900C3F", "Ideal" = "#581845")
ggplot(diamonds, aes(x = cut, y = price, fill = cut)) +
  geom_boxplot() +
  labs(title = "Boxplot of Price by Cut",
       x = "Cut",
       y = "Price") +
  scale_fill_manual(values = cut_colors) +  # Apply the custom colors
  theme_minimal()

let us calculate Effect size

SS_between <- 8.878e+08
SS_within <- 3.790e+11

eta_squared <- SS_between / (SS_between + SS_within)
cat("Effect size (partial eta-squared) for ANOVA:", eta_squared, "\n")

## Effect size (partial eta-squared) for ANOVA: 0.002337006

Interpretation:
The effect size (partial eta-squared) for the ANOVA is approximately 0.002337. This indicates that approximately 0.237% of the variance in the dependent variable diamond prices can be explained by the grouping variable cut.

Hypothesis:
Null Hypothesis (H0): There is no association between the diamond’s cut and its color.
Alternative Hypothesis (HA): There is a significant association between the diamond’s cut and its color. The distribution of colors differs depending on the cut category.

let’s perform a chi-square analysis for this;

chi_test <- chisq.test(table(diamonds$cut, diamonds$color))
chi_test

## 
##  Pearson's Chi-squared test
## 
## data:  table(diamonds$cut, diamonds$color)
## X-squared = 310.32, df = 24, p-value < 2.2e-16

Interpretation
The chi-square statistic (310.32) suggests a strong departure from independence between the two variables (cut and color).
The extremely small p-value (less than 2.2e-16) indicates that this observed departure from independence is statistically highly significant at the chosen significance level.

let us visualize it

# Create contingency table
cont_table <- table(diamonds$cut, diamonds$color)

# Check for empty cells in the contingency table
if (any(cont_table == 0)) {
  cat("Warning: Contingency table contains empty cells. Adjusting...\n")
  
  # Add a small value to empty cells to avoid NaN in chi-square test
  cont_table[cont_table == 0] <- 1e-10
}

# Create mosaic plot
mosaicplot(cont_table, main = "Mosaic Plot of Cut vs. Color")

# Create a data frame from the contingency table
cont_df <- as.data.frame.matrix(cont_table)

# Add row names as a column
cont_df$cut <- rownames(cont_df)

# Reshape the data
cont_df <- pivot_longer(cont_df, -cut, names_to = "variable", values_to = "value")

# Plot
ggplot(cont_df, aes(x = cut, y = variable, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Heatmap of Cut vs. Color",
       x = "Cut", y = "Color") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))