# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(pwr)
# Read the Udemy course data and store the paid/free flag as a factor so it
# can serve as a grouping variable in the comparisons and plots below.
df <- read.csv("~/Documents/STAT 2024/udemy_courses.csv") %>%
  mutate(is_paid = as.factor(is_paid))
Null Hypothesis (H0): There is no difference in the average number of reviews between paid and free courses.
Alternative Hypothesis (H1): There is a difference in the average number of reviews between paid and free courses.
# A-priori power analysis: how many observations per group a two-sample
# t-test needs in order to detect the assumed effect.
effect_size <- 0.5 # assumed standardized difference (Cohen's d), "medium"
alpha <- 0.05      # significance level (Type I error rate)
power <- 0.8       # desired power (1 - Type II error rate)
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample"
)
# Show the full power-calculation summary, including the required n per group
print(sample_size)
##
## Two-sample t test power calculation
##
## n = 63.76561
## d = 0.5
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Error rates used for the decision rule
alpha <- 0.05 # Significance level
beta <- 0.20 # Type 2 error rate (80% power)

# Review counts for each course type, kept as one-column data frames
paid_courses <- df %>%
  filter(is_paid == "True") %>%
  select(num_reviews)
free_courses <- df %>%
  filter(is_paid == "False") %>%
  select(num_reviews)

# Group sizes
n_paid <- nrow(paid_courses)
n_free <- nrow(free_courses)

# Group means
mean_paid <- mean(paid_courses$num_reviews)
mean_free <- mean(free_courses$num_reviews)

# Group standard deviations
sd_paid <- sd(paid_courses$num_reviews)
sd_free <- sd(free_courses$num_reviews)

# Two-sample t-test on the review counts, using t.test()'s default settings
t_test_result <- t.test(paid_courses$num_reviews, free_courses$num_reviews)
p_value <- t_test_result$p.value

# Cohen's d: mean difference (paid minus free) scaled by the pooled standard
# deviation, so a negative value means paid courses have fewer reviews.
SD_pooled <- sqrt(
  ((n_paid - 1) * sd_paid^2 + (n_free - 1) * sd_free^2) /
    (n_paid + n_free - 2)
)
cohens_d <- (mean_paid - mean_free) / SD_pooled

# Reject H0 only when the p-value falls below the significance level
decision <- if (p_value < alpha) {
  "Reject the null hypothesis: There is a significant difference in the number of reviews."
} else {
  "Fail to reject the null hypothesis: No significant difference in the number of reviews."
}
# Hypothesis 1 results: descriptive statistics, test statistic, p-value,
# critical value, effect size and the final decision.
# Each "##" line is the output rendered by the cat() call directly above it.
cat("Hypothesis 1 Results:\n")
## Hypothesis 1 Results:
cat("Mean (Paid):", mean_paid, "\n")
## Mean (Paid): 131.4379
cat("Mean (Free):", mean_free, "\n")
## Mean (Free): 425.929
cat("Standard Deviation (Paid):", sd_paid, "\n")
## Standard Deviation (Paid): 926.0669
cat("Standard Deviation (Free):", sd_free, "\n")
## Standard Deviation (Free): 994.2826
cat("T-Statistic:", t_test_result$statistic, "\n")
## T-Statistic: -5.018365
cat("P-Value:", p_value, "\n")
## P-Value: 8.206362e-07
# The alternative hypothesis is two-sided and t.test() was run with its default
# two-sided alternative, so the matching critical value places alpha / 2 in
# each tail. The degrees of freedom are read from the test object so that they
# agree with the t-statistic reported above (t.test() defaults to the Welch
# approximation rather than the pooled n_paid + n_free - 2).
cat("Critical Value (two-tailed):", qt(1 - alpha / 2, df = t_test_result$parameter), "\n")
## Critical Value (two-tailed): <not yet regenerated; expected to be roughly 1.96-1.97 -- re-knit to refresh>
cat("Cohen's d:", cohens_d, "\n")
## Cohen's d: -0.3159799
cat(decision, "\n")
## Reject the null hypothesis: There is a significant difference in the number of reviews.
n = 63.77 indicates that we need at least 64 observations per group to achieve a power of 0.8 with an effect size of 0.5 and a significance level of 0.05. Power is the probability of correctly rejecting the null hypothesis when it is false, and it is usually set to 80% (0.8). A power of 0.80 ensures a good balance between avoiding false negatives and not increasing the sample size unnecessarily.
The alpha level is the threshold used to decide whether to reject the null hypothesis: if the p-value is less than alpha, we reject the null hypothesis. Setting alpha to 0.05 means we are willing to accept a 5% chance of rejecting the null hypothesis when it is actually true (Type I error).
Effect size quantifies the magnitude of the difference between the groups. The Cohen’s d value indicates a small-to-moderate effect, where the mean of the free courses group is about 0.316 pooled standard deviations higher than the mean of the paid courses group. This is not a very large difference, but it is still statistically significant because the large sample gives the test high power.
Since the p-value (8.21e-07) is much lower than the alpha level of 0.05, we reject the null hypothesis. This indicates a statistically significant difference in the number of reviews between paid and free courses.
The p-value indicates the probability of observing data at least as extreme as ours if the null hypothesis were true.
This indicates that there is a significant difference in the average number of reviews between paid and free courses.
Cohen’s d of -0.32 suggests a small to moderate effect size. The negative value indicates that the mean number of reviews for paid courses is lower than that for free courses.
# Boxplot of review counts for paid vs. free courses (Hypothesis 1).
# Outliers are drawn in red; the legend is hidden because the x-axis already
# identifies each group.
ggplot(
  data = df,
  mapping = aes(x = is_paid, y = num_reviews, fill = is_paid)
) +
  geom_boxplot(outlier.color = "red", outlier.size = 1.5) +
  scale_fill_manual(
    values = c("True" = "green", "False" = "lightcoral")
  ) +
  labs(
    title = "Number of Reviews by Course Type (Paid vs Free)",
    x = "Course Type",
    y = "Number of Reviews"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
The boxplot visually compares the distribution of the number of reviews across the two course types. It indicates that the number of reviews for free courses (False) is higher than for paid courses (True), which is consistent with the significant t-test result above.
Test used: Fisher’s Significance Testing.
Null Hypothesis (H0): There is no correlation between content duration and the number of reviews.
# Hypothesis 2: correlation test between content duration and review count,
# using cor.test()'s default settings
cor_test_result <- cor.test(df$content_duration, df$num_reviews)

# Pull out the estimated correlation and its p-value
cor_coefficient <- cor_test_result[["estimate"]]
p_value_h2 <- cor_test_result[["p.value"]]

# Reject H0 (no correlation) when the p-value is below 0.05
decision_h2 <- if (p_value_h2 < 0.05) {
  "Reject the null hypothesis: There is a significant correlation."
} else {
  "Fail to reject the null hypothesis: No significant correlation."
}
# Print the Hypothesis 2 results: the estimated correlation coefficient, the
# p-value of the correlation test, and the resulting decision.
# Each "##" line is the output rendered by the cat() call directly above it.
cat("Hypothesis 2 Results:\n")
## Hypothesis 2 Results:
cat("Correlation coefficient:", cor_coefficient, "\n")
## Correlation coefficient: 0.2288893
cat("p-value:", p_value_h2, "\n")
## p-value: 6.345979e-45
cat(decision_h2, "\n")
## Reject the null hypothesis: There is a significant correlation.
A correlation coefficient of 0.2288893 indicates a weak positive correlation between content duration and the number of reviews, suggesting that as one variable increases, the other tends to increase as well. Since the p-value is extremely low (much less than 0.05), we reject the null hypothesis, indicating that there is a statistically significant correlation between content duration and the number of reviews.
# Hypothesis 2 plot: review count against content duration, with a fitted
# linear trend line and its confidence band.
ggplot(
  data = df,
  mapping = aes(x = content_duration, y = num_reviews)
) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Correlation between Content Duration and Number of Reviews",
    x = "Content Duration (hours)",
    y = "Number of Reviews"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The correlation coefficient of approximately 0.229 suggests a positive correlation between content duration and the number of reviews. The scatter plot effectively conveys that while not all long courses have an equally high number of reviews, a general trend exists where longer content durations are associated with higher review counts.