# Load necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(pwr)
library(vcd)

## Loading required package: grid

library(pwr)

# Load the data
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
data_path <- "/Users/revathiyajjavarapu/Documents/statistics(1)/laptop_prices.csv"
laptop_prices <- read.csv(data_path)

# Fail fast with a clear message if a column used by the analyses in this
# script is missing, instead of failing later inside filter()/table().
required_columns <- c("PrimaryStorageType", "Price_euros", "Company", "OS")
missing_columns <- setdiff(required_columns, names(laptop_prices))
if (length(missing_columns) > 0) {
  stop(
    "laptop_prices.csv is missing required column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

Hypothesis 1: Neyman-Pearson Framework

H0: The average price of laptops with SSD storage is equal to the average price of laptops with HDD storage.

H1: The average price of laptops with SSD storage is different from the average price of laptops with HDD storage.

Test Selection: Use a two-sample t-test (if the prices are normally distributed) or a Mann-Whitney U test otherwise. Alpha Level: 0.05. Power Level: 0.80.

Effect Size: A minimum difference of €100 is considered meaningful. This threshold was chosen to ensure a balance between detecting meaningful differences and avoiding Type I errors.

# Build one single-column data frame of prices per storage type,
# dropping any rows whose price is missing.
ssd_prices <- laptop_prices %>%
  filter(PrimaryStorageType == "SSD") %>%
  select(Price_euros) %>%
  na.omit()

hdd_prices <- laptop_prices %>%
  filter(PrimaryStorageType == "HDD") %>%
  select(Price_euros) %>%
  na.omit()

# Group sizes: one row per laptop that survived the filtering above
n_ssd <- nrow(ssd_prices)
n_hdd <- nrow(hdd_prices)

print(paste("Sample size for SSD:", n_ssd))

## [1] "Sample size for SSD: 837"

print(paste("Sample size for HDD:", n_hdd))

## [1] "Sample size for HDD: 359"

# Calculate the standard deviation of both groups
std_dev_ssd <- sd(ssd_prices$Price_euros)
std_dev_hdd <- sd(hdd_prices$Price_euros)

# Pooled standard deviation: the two group variances weighted by their
# degrees of freedom (consistent with the equal-variance t-test used below)
std_dev <- sqrt(
  ((n_ssd - 1) * std_dev_ssd^2 + (n_hdd - 1) * std_dev_hdd^2) /
    (n_ssd + n_hdd - 2)
)

# Standardized effect size (d) for the minimum difference of interest:
# 100, in the units of the Price_euros column
effect_size <- 100 / std_dev

# Parameters for power analysis
alpha_level <- 0.05
power_level <- 0.80

# Per-group sample size a *balanced* design would need to reach the target power
required_sample_size <- pwr.t.test(
  d = effect_size,
  power = power_level,
  sig.level = alpha_level,
  type = "two.sample"
)$n

# The two groups have different sizes, so comparing each group to the
# balanced-design n is not the right adequacy check. Compute the power that
# the actual (unequal) group sizes achieve and compare it to the target.
achieved_power <- pwr.t2n.test(
  n1 = n_ssd,
  n2 = n_hdd,
  d = effect_size,
  sig.level = alpha_level
)$power

# Run the t-test only when the observed design reaches the target power
if (achieved_power >= power_level) {
  t_test_result <- t.test(
    ssd_prices$Price_euros,
    hdd_prices$Price_euros,
    var.equal = TRUE,
    conf.level = 0.95
  )
  print(t_test_result)
} else {
  print("Not enough data to perform the t-test.")
}

## [1] "Not enough data to perform the t-test."

# Report the power achieved with the observed, unbalanced group sizes
cat("Achieved power with observed group sizes:", round(achieved_power, 3), "\n")

# Display required sample size per group
cat("Required sample size per group:", ceiling(required_sample_size), "\n")

## Required sample size per group: 591

This indicates that there are enough SSD samples (837) but not enough HDD samples (359) to reach the required 591 per group, so the t-test was not performed.

We need at least 591 samples in each group to achieve the desired power (0.80) at the chosen significance level (0.05).

An alpha level of 0.05 means we are willing to accept a 5% chance of incorrectly concluding that a significant effect exists when it does not (a Type I error).

A power level of 0.80 indicates an 80% chance of detecting a true effect if it exists. This means there is a 20% chance of a Type II error.

A minimum difference of €100 sets a threshold that is considered not only statistically detectable but also practically meaningful in the context of laptop pricing.

Visualization

Boxplot for SSD vs HDD prices

# Boxplot of price by primary storage type, restricted to the two groups
# being compared (SSD and HDD) so the plot matches its title and the
# hypothesis; other storage types present in the data are excluded.
laptop_prices %>%
  filter(PrimaryStorageType %in% c("SSD", "HDD")) %>%
  ggplot(aes(x = PrimaryStorageType, y = Price_euros, fill = PrimaryStorageType)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, notch = TRUE) +
  labs(
    title = "Price Comparison between SSD and HDD Laptops",
    x = "Storage Type",
    y = "Price (Euros)",
    fill = "Storage Type"
  ) +
  theme_minimal() +
  # The x axis already names each group, so the fill legend is redundant
  theme(legend.position = "none")

## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

Hypothesis 2: Fisher’s Significance Testing Framework

H0: There is no relationship between the laptop brand and its pre-installed OS.

H1 (alternative hypothesis): There is a relationship between the laptop brand and its pre-installed OS.

Test Selection: Fisher's exact test, interpreted under Fisher's significance testing framework.

# Cross-tabulate brand (Company) against operating system (OS), reusing the
# laptop_prices data frame loaded at the top of the script instead of
# re-reading the CSV.
brand_os_table <- table(laptop_prices$Company, laptop_prices$OS)

# The p-value is estimated by Monte Carlo simulation (B replicates), so fix
# the RNG seed to make the reported result reproducible between runs.
set.seed(123)
fisher_test_result <- fisher.test(brand_os_table, simulate.p.value = TRUE, B = 10000)
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  brand_os_table
## p-value = 9.999e-05
## alternative hypothesis: two.sided

# Report the decision at the 0.05 significance level
if (fisher_test_result$p.value < 0.05) {
  cat("Reject the null hypothesis: There is a significant relationship between brand and OS pre-installed.\n")
} else {
  cat("Fail to reject the null hypothesis: There is no significant relationship between brand and OS pre-installed.\n")
}

## Reject the null hypothesis: There is a significant relationship between brand and OS pre-installed.

The p-value (≈ 0.0001) is much less than 0.05, so we reject the null hypothesis. This indicates a significant relationship between the laptop brand and the pre-installed operating system.

Visualization

Bar plot for the relationship between brand and OS

# Calculate the proportion of OS types for each brand
os_distribution <- laptop_prices %>%
  filter(!is.na(OS)) %>%
  group_by(Company, OS) %>%
  # Keep the Company grouping explicitly so the proportions below are
  # computed within each brand (this also silences the regrouping message)
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(proportion = count / sum(count)) %>%
  # Drop the grouping so later operations on os_distribution are not
  # silently applied per brand
  ungroup()

# Stacked bars, one per brand, each filled to a total height of 1
ggplot(os_distribution, aes(x = Company, y = proportion, fill = OS)) +
  geom_col(position = "fill") +
  labs(
    title = "Proportion of Different OS Types by Laptop Brand",
    x = "Laptop Brand",
    y = "Proportion",
    fill = "OS"
  ) +
  theme_minimal() +
  # Rotate the brand labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Windows 10 is the most common pre-installed OS across most brands. Note that the plot shows the share of laptop models per brand, not user preference.

assignment7

2024-10-12

Hypothesis 1: Neyman-Pearson Framework

Visualization

Hypothesis 2: Fisher’s Significance Testing Framework

Visualization