# Load necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(pwr)
# Location of the Udemy course export on the local machine.
udemy_csv_path <- "~/Documents/STAT 2024/udemy_courses.csv"
# Read the export into `df`, the data frame the analyses below operate on.
df <- read.csv(udemy_csv_path)

Anova

# Count the rows per subject (exact column lookup, no partial matching).
table(df[["subject"]])

## 
##    Business Finance      Graphic Design Musical Instruments     Web Development 
##                1195                 603                 680                1200

# Count how many rows each subject has.
subject_counts <- df %>%
  group_by(subject) %>%
  summarize(count = n())

# Subjects with fewer rows than this are pooled into a single "Other" group.
threshold <- 50

# Subjects large enough to keep as their own group.
kept_subjects <- subject_counts$subject[subject_counts$count >= threshold]

# Work on character labels: if `subject` were a factor (the read.csv default
# before R 4.0), ifelse() would return its integer codes instead of the labels.
subject_labels <- as.character(df$subject)
df$subject_consolidated <- ifelse(
  subject_labels %in% kept_subjects,
  subject_labels,
  "Other"
)

# One-way ANOVA: test whether mean num_subscribers differs across the
# consolidated subject groups.
anova_result <- aov(num_subscribers ~ subject_consolidated, data = df)

# Print the ANOVA table (Df, Sum Sq, Mean Sq, F value, Pr(>F)).
summary(anova_result)

##                        Df    Sum Sq  Mean Sq F value Pr(>F)    
## subject_consolidated    3 2.133e+10 7.11e+09   84.05 <2e-16 ***
## Residuals            3674 3.108e+11 8.46e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

subject_consolidated: Df = 3, indicating that there are 4 groups (the 4 subjects), since degrees of freedom = number of groups - 1.

Sum of Squares: 2.133e+10 represents the variability explained by the subject categories; the residual sum of squares, 3.108e+11, represents the variability that remains unexplained by the model.

Mean Squares: 7.11e+09 is the sum of squares for subjects divided by its degrees of freedom (2.133e+10 / 3); the residual mean square, 8.46e+07, is the residual sum of squares divided by its degrees of freedom (3.108e+11 / 3674).

The F value is 84.05. This statistic is the ratio of the variance explained by the model (between subjects) to the unexplained variance (residuals), i.e. 7.11e+09 / 8.46e+07 ≈ 84. A large F value suggests that the group (subject) means are not all equal.

The p-value is < 2e-16, which is extremely low. This indicates that there is strong statistical evidence against the null hypothesis, suggesting that at least one group mean is significantly different from the others

The significant result implies that certain subjects attract more subscribers than others; a post-hoc test (e.g., Tukey's HSD) would be needed to identify which pairs of subjects differ.

Visualization

# Box plot of subscriber counts for each subject group.
# Uses subject_consolidated, the same grouping the ANOVA is fitted on, so the
# plot and the test describe the same groups even when rare subjects are
# pooled into "Other".
ggplot(df, aes(x = subject_consolidated, y = num_subscribers)) +
  geom_boxplot() +
  labs(title = "Number of Subscribers by Subject",
       x = "Subject",
       y = "Number of Subscribers") +
  theme_minimal()

linear regression model and coefficients

# Fit a simple linear regression of num_subscribers on price.
linear_model <- lm(num_subscribers ~ price, data = df)

# Keep the summary object so its coefficient table can be reused, then print
# the full regression report.
model_summary <- summary(linear_model)
print(model_summary)

## 
## Call:
## lm(formula = num_subscribers ~ price, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4257  -2863  -2343   -607 266248 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2674.740    230.720  11.593  < 2e-16 ***
## price          7.909      2.566   3.082  0.00207 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9493 on 3676 degrees of freedom
## Multiple R-squared:  0.002578,   Adjusted R-squared:  0.002306 
## F-statistic: 9.499 on 1 and 3676 DF,  p-value: 0.002071

# Pull the fitted estimates out of the coefficient table by row/column name.
coef_summary <- model_summary$coefficients
intercept <- coef_summary["(Intercept)", "Estimate"]  # fitted value at price 0
price_coef <- coef_summary["price", "Estimate"]  # slope per unit of price

cat("Intercept:", intercept, "\n")

## Intercept: 2674.74

cat("Price Coefficient:", price_coef, "\n")

## Price Coefficient: 7.909382

# Turn the sign of the fitted price slope into a plain-language recommendation.
# Every sign is handled so the script always reports something; previously
# only a negative slope printed a message, and a positive or zero slope was
# silently ignored.
if (price_coef < 0) {
  cat("Recommendation: To optimize course subscriptions, it may be advisable to keep the price lower, as higher prices appear to negatively impact the number of subscribers.\n")
} else if (price_coef > 0) {
  cat("Recommendation: Higher prices are not associated with fewer subscribers in this model, so lowering the price to gain subscribers is not supported by the data.\n")
} else {
  cat("Recommendation: The model shows no linear association between price and the number of subscribers.\n")
}

Visualization

# Scatter of subscribers against price, overlaid with the least-squares fit.
ggplot(data = df, mapping = aes(x = price, y = num_subscribers)) +
  # Semi-transparent points so overlapping observations stay visible.
  geom_point(alpha = 0.5, color = "blue") +
  # Fitted linear trend with its confidence band.
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  ggtitle("Linear Regression of Number of Subscribers vs. Price") +
  xlab("Price (in USD)") +
  ylab("Number of Subscribers") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Call: indicates the formula used for the linear regression, which is num_subscribers ~ price. Residuals are the differences between the observed and predicted values. They show considerable variability, and the very large maximum positive residual (266248) indicates that some courses have far more subscribers than the model predicts.

The intercept represents the expected number of subscribers when the price is zero. Although this is a theoretical value and may not be practically relevant, it provides a baseline for the model.

Price coefficient indicates that for each $1 increase in price, the number of subscribers is expected to increase by approximately 7.909

The p-value for the price coefficient is 0.00207, which is less than 0.01. This indicates that the price variable is statistically significant in predicting the number of subscribers at the 1% level. The intercept is also highly significant.

Residual Standard Error: 9493 on 3676 degrees of freedom. This is an estimate of the standard deviation of the residuals, i.e. the typical distance between the observed values and the regression line.

Multiple R-squared: 0.002578 Adjusted R-squared: 0.002306 Both values are very low, indicating that the model explains only about 0.26% of the variance in the number of subscribers. This suggests that other factors, not included in this model, are likely influencing the number of subscribers

F-statistic: 9.499 on 1 and 3676 DF, with a p-value of 0.002071

In the visualisation the regression line is nearly flat, so there is no strong relationship between price and the number of subscribers.

The F-statistic's p-value indicates that the overall model is statistically significant, meaning at least one of the predictors (in this case, the price) is related to the response variable.

assignment 8

2024-10-21

Anova

Visualization

linear regression model and coefficients

Visualization