# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(pwr)
library(vcd)
## Loading required package: grid
library(pwr)
# Read the laptop price dataset into a data frame.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
laptop_prices <- read.csv(
  file = "/Users/revathiyajjavarapu/Documents/statistics(1)/laptop_prices.csv"
)
H0: The average price of laptops with SSD storage is equal to the average price of laptops with HDD storage.
H1: The average price of laptops with SSD storage is different from the average price of laptops with HDD storage.
Test Selection: Use a two-sample t-test (if prices are approximately normally distributed) or a Mann-Whitney U test otherwise.
Alpha Level: 0.05. Power Level: 0.80.
Effect Size: A minimum difference of €100 (prices are recorded in euros) is considered practically significant. These settings are chosen to balance detecting meaningful differences against avoiding Type I errors.
# Build one single-column data frame of prices per storage type, dropping
# rows whose price is missing
ssd_prices <- laptop_prices %>%
  filter(PrimaryStorageType == "SSD") %>%
  select(Price_euros) %>%
  na.omit()
hdd_prices <- laptop_prices %>%
  filter(PrimaryStorageType == "HDD") %>%
  select(Price_euros) %>%
  na.omit()

# Group sizes: each remaining row is one laptop with a recorded price
n_ssd <- nrow(ssd_prices)
n_hdd <- nrow(hdd_prices)
print(paste("Sample size for SSD:", n_ssd))
## [1] "Sample size for SSD: 837"
print(paste("Sample size for HDD:", n_hdd))
## [1] "Sample size for HDD: 359"
# Spread of prices within each storage group
std_dev_ssd <- sd(ssd_prices$Price_euros)
std_dev_hdd <- sd(hdd_prices$Price_euros)

# Pooled standard deviation: the two group variances weighted by their
# degrees of freedom
pooled_sum_sq <- (n_ssd - 1) * std_dev_ssd^2 + (n_hdd - 1) * std_dev_hdd^2
pooled_df <- n_ssd + n_hdd - 2
std_dev <- sqrt(pooled_sum_sq / pooled_df)

# Standardised effect size: the minimum price difference of interest (100)
# expressed in units of the pooled standard deviation
effect_size <- 100 / std_dev

# Significance level and target power for the power analysis
alpha_level <- 0.05
power_level <- 0.80

# Solve for the per-group sample size of a two-sample t-test
power_result <- pwr.t.test(
  d = effect_size,
  sig.level = alpha_level,
  power = power_level,
  type = "two.sample"
)
required_sample_size <- power_result$n
# Check whether the observed group sizes give enough power to run the t-test.
# `required_sample_size` assumes two equal-sized groups, so comparing each
# group against it separately misjudges unbalanced designs (a large group can
# partly compensate for a smaller one). pwr.t2n.test() instead returns the
# power actually achieved with n1 = n_ssd and n2 = n_hdd for the same effect
# size and significance level.
achieved_power <- pwr.t2n.test(
  n1 = n_ssd,
  n2 = n_hdd,
  d = effect_size,
  sig.level = alpha_level
)$power

if (achieved_power >= power_level) {
  # Pooled-variance (Student) t-test, matching the pooled standard deviation
  # that the effect size is based on
  t_test_result <- t.test(
    ssd_prices$Price_euros,
    hdd_prices$Price_euros,
    var.equal = TRUE,
    conf.level = 0.95
  )
  print(t_test_result)
} else {
  print("Not enough data to perform the t-test.")
}
## [1] "Not enough data to perform the t-test."
# Report the per-group sample size from the power analysis, rounded up to a
# whole number of laptops
n_required_per_group <- ceiling(required_sample_size)
cat("Required sample size per group:", n_required_per_group, "\n")
## Required sample size per group: 591
This indicates that there are enough SSD samples (837) but not enough HDD samples (359) to perform the t-test.
We need at least 591 samples in each group to achieve the desired power (0.80) at the chosen significance level (0.05).
An alpha level of 0.05 means we are willing to accept a 5% chance of incorrectly concluding that a significant effect exists when it does not (a Type I error).
A power level of 0.80 means there is an 80% chance of detecting a true effect if it exists, i.e. a 20% chance of a Type II error.
An effect size of €100 sets a threshold for a difference that is not only statistically significant but also practically meaningful in the context of laptop pricing.
Boxplot for SSD vs HDD prices
# Notched boxplot of price by storage type, restricted to the two groups
# under comparison so the plot matches its title and the SSD-vs-HDD
# hypothesis. Without the filter, every storage type present in the data
# gets its own box.
# NOTE(review): a "Notch went outside hinges" message usually comes from a
# very small group; re-knit to confirm whether it still appears.
laptop_prices %>%
  filter(PrimaryStorageType %in% c("SSD", "HDD")) %>%
  ggplot(
    aes(x = PrimaryStorageType, y = Price_euros, fill = PrimaryStorageType)
  ) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, notch = TRUE) +
  labs(
    title = "Price Comparison between SSD and HDD Laptops",
    x = "Storage Type",
    y = "Price (Euros)",
    fill = "Storage Type"
  ) +
  theme_minimal() +
  # The x axis already names each group, so the fill legend is redundant
  theme(legend.position = "none")
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
H0: There is no relationship between the laptop brand and the pre-installed operating system.
H1 (alternative hypothesis): There is a relationship between the laptop brand and the pre-installed operating system.
Test Selection: Fisher’s exact test (with a simulated p-value), interpreted within Fisher’s significance-testing framework.
# Contingency table of brand (rows) by operating system (columns).
# `laptop_prices` is already loaded at the top of the script, so the CSV is
# not read a second time here.
brand_os_table <- table(laptop_prices$Company, laptop_prices$OS)

# The p-value is estimated by Monte Carlo simulation, which depends on the
# random number generator; fix the seed so the result is reproducible.
set.seed(123)
# With B replicates the smallest attainable p-value is 1 / (B + 1), so a
# reported 9.999e-05 means "no simulated table was as extreme as the data".
fisher_test_result <- fisher.test(
  brand_os_table,
  simulate.p.value = TRUE,
  B = 10000
)
print(fisher_test_result)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: brand_os_table
## p-value = 9.999e-05
## alternative hypothesis: two.sided
# Decide at the 5% significance level, then report the matching conclusion
reject_null <- fisher_test_result$p.value < 0.05
conclusion <- if (reject_null) {
  "Reject the null hypothesis: There is a significant relationship between brand and OS pre-installed.\n"
} else {
  "Fail to reject the null hypothesis: There is no significant relationship between brand and OS pre-installed.\n"
}
cat(conclusion)
## Reject the null hypothesis: There is a significant relationship between brand and OS pre-installed.
The p-value (about 0.0001) is much less than 0.05, so we reject the null hypothesis. This indicates a significant relationship between the laptop brand and the pre-installed operating system.
Bar plot for the relationship between brand and OS
# Count laptops for each brand/OS pair, then convert the counts into
# within-brand proportions
os_distribution <- laptop_prices %>%
  filter(!is.na(OS)) %>%
  group_by(Company, OS) %>%
  # Keep the Company grouping explicitly so the proportions below are computed
  # within each brand; stating `.groups` also avoids the summarise() message
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(proportion = count / sum(count)) %>%
  # Return an ungrouped table so later steps are not silently grouped
  ungroup()
# Stacked bar chart: one bar per brand, each segment showing an OS's share.
# position = "fill" rescales every bar to a total height of 1.
ggplot(
  data = os_distribution,
  mapping = aes(x = Company, y = proportion, fill = OS)
) +
  geom_col(position = "fill") +
  theme_minimal() +
  # Tilt the brand labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Proportion of Different OS Types by Laptop Brand",
    x = "Laptop Brand",
    y = "Proportion",
    fill = "OS"
  )
Windows 10 is the most common pre-installed OS across the brands.