# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(pwr)
# Read the Udemy courses CSV and tabulate how many rows fall under each subject.
df <- read.csv(file = "~/Documents/STAT 2024/udemy_courses.csv")
table(df[["subject"]])
##
## Business Finance Graphic Design Musical Instruments Web Development
## 1195 603 680 1200
# Count the courses in each subject, then relabel every subject that has fewer
# than `threshold` courses as 'Other' so that tiny groups do not enter the
# ANOVA as separate levels.
subject_counts <- summarize(group_by(df, subject), count = n())
threshold <- 50
df$subject_consolidated <- ifelse(
  df$subject %in% with(subject_counts, subject[count >= threshold]),
  df$subject,
  'Other'
)
# ANOVA
# One-way analysis of variance: does the mean number of subscribers differ
# between the consolidated subject groups?
anova_result <- aov(
  formula = num_subscribers ~ subject_consolidated,
  data = df
)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## subject_consolidated 3 2.133e+10 7.11e+09 84.05 <2e-16 ***
## Residuals 3674 3.108e+11 8.46e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Df for subject_consolidated is 3, indicating there are 4 groups (the 4 subjects), since degrees of freedom = number of groups - 1.
Sum of Squares: 2.133e+10 is the variability in subscriber counts explained by the subject categories (between groups); the residual sum of squares, 3.108e+11, is the variability that remains unexplained by the model (within groups).
Mean Squares: 7.11e+09 is the subject sum of squares divided by its degrees of freedom (2.133e+10 / 3); the residual mean square, 8.46e+07, is the residual sum of squares divided by its degrees of freedom (3.108e+11 / 3674).
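The F value is 84.05. This statistic is the ratio of the variance explained by the model to the unexplained (residual) variance, i.e. the subject mean square divided by the residual mean square (7.11e+09 / 8.46e+07, approximately 84). A larger F value means the group (subject) means differ by more than the within-group variation alone would produce; whether that difference is statistically significant is judged from the p-value.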
The p-value is < 2e-16, which is extremely low. This indicates that there is strong statistical evidence against the null hypothesis, suggesting that at least one group mean is significantly different from the others
The significant differences imply that certain subjects attract more subscribers than others.
# Box plot
# Distribution of subscriber counts per subject group. The x axis uses
# subject_consolidated -- the same grouping variable the ANOVA was fitted on --
# so the plot always shows exactly the groups that were tested, even when a
# rare subject has been folded into 'Other'.
ggplot(df, aes(x = subject_consolidated, y = num_subscribers)) +
  geom_boxplot() +
  labs(title = "Number of Subscribers by Subject",
       x = "Subject",
       y = "Number of Subscribers") +
  theme_minimal()
# Simple linear regression of subscriber count on course price.
linear_model <- lm(
  formula = num_subscribers ~ price,
  data = df
)
# Keep the summary object itself: its coefficient table is read again later.
model_summary <- summary(linear_model)
print(model_summary)
##
## Call:
## lm(formula = num_subscribers ~ price, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4257 -2863 -2343 -607 266248
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2674.740 230.720 11.593 < 2e-16 ***
## price 7.909 2.566 3.082 0.00207 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9493 on 3676 degrees of freedom
## Multiple R-squared: 0.002578, Adjusted R-squared: 0.002306
## F-statistic: 9.499 on 1 and 3676 DF, p-value: 0.002071
# Interpret coefficients
# Look the estimates up by row/column name rather than by position, so the
# extraction stays correct if the coefficient table ever gains or reorders rows.
coef_summary <- model_summary$coefficients
intercept <- coef_summary["(Intercept)", "Estimate"] # Intercept
price_coef <- coef_summary["price", "Estimate"] # Price coefficient
cat("Intercept:", intercept, "\n")
## Intercept: 2674.74
cat("Price Coefficient:", price_coef, "\n")
## Price Coefficient: 7.909382
# Print a recommendation for every sign of the slope. Previously only a
# negative slope produced output, so a positive estimate printed nothing.
if (price_coef < 0) {
  cat("Recommendation: To optimize course subscriptions, it may be advisable to keep the price lower, as higher prices appear to negatively impact the number of subscribers.\n")
} else if (price_coef > 0) {
  cat("Recommendation: The fitted price coefficient is positive, so this model gives no evidence that lowering the price would increase subscriptions; check the model's R-squared before basing pricing decisions on it.\n")
} else {
  cat("Recommendation: The fitted price coefficient is zero, so this model shows no linear association between price and the number of subscribers.\n")
}
# Scatter plot of subscribers against price, overlaid with the fitted
# least-squares line and its confidence band.
ggplot(data = df, mapping = aes(x = price, y = num_subscribers)) +
  # Semi-transparent points so overlapping courses remain visible
  geom_point(alpha = 0.5, color = "blue") +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  ggtitle("Linear Regression of Number of Subscribers vs. Price") +
  xlab("Price (in USD)") +
  ylab("Number of Subscribers") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Call: shows the formula used for the linear regression, which is
num_subscribers ~ price. Residuals are the differences between the
observed and predicted values. The residuals show considerable variability
and are strongly right-skewed: the median residual is -2343 while the
maximum is 266248, so a few courses have far more subscribers than predicted.
The intercept (2674.74) represents the expected number of subscribers when the price is zero. Although this is a theoretical value and may not be practically relevant, it provides a baseline for the model.
The price coefficient indicates that for each $1 increase in price, the number of subscribers is expected to increase by approximately 7.909 on average.
The p-value for the price coefficient is 0.00207, which is less than 0.01. This indicates that the price variable is statistically significant in predicting the number of subscribers at the 1% level. The intercept is also highly significant.
Residual Standard Error: 9493 on 3676 degrees of freedom. This is an estimate of the standard deviation of the residuals, indicating the typical distance (in subscribers) that the observed values fall from the regression line.
Multiple R-squared: 0.002578; Adjusted R-squared: 0.002306. Both values are very low, indicating that the model explains only about 0.26% of the variance in the number of subscribers. This suggests that other factors, not included in this model, are likely influencing the number of subscribers.
F-statistic: 9.499 on 1 and 3676 DF, with a p-value of 0.002071
In the visualisation the regression line is almost flat, so there is no strong relationship between price and the number of subscribers.
The F-statistic's p-value (0.002071) indicates that the overall model is statistically significant, meaning at least one of the predictors (in this case the only predictor, price) is related to the response variable.