Create 2 variables

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the Udemy courses data from a local CSV file.
df <- read.csv("~/Documents/STAT 2024/udemy_courses.csv")

# Count missing / infinite values in the columns used in this section.
# NOTE(review): review_to_subscriber_ratio and content_efficiency are only
# assigned further down in this section. Unless the CSV itself already ships
# columns with those names, `df$...` is NULL at this point, so the last three
# checks print 0 regardless of the data. Re-run them after the derived columns
# have been created to get a meaningful answer.
sum(is.na(df$num_reviews))
## [1] 0
sum(is.na(df$review_to_subscriber_ratio))
## [1] 0
sum(is.infinite(df$content_efficiency))
## [1] 0
sum(is.na(df$content_efficiency))
## [1] 0
# Course length in minutes. content_duration is a plain number that the rest
# of this report treats as hours (e.g. 1.5), so minutes = hours * 60.
# The earlier gsub()-based parse matched nothing on such values and added the
# raw number a second time, i.e. it returned hours * 61 (1.5 -> 91.5, not 90).
# Figures printed in this report with the old formula shift slightly on re-run.
df$content_duration_minutes <- df$content_duration * 60

# Band courses by subscriber count; cut() intervals are right-closed, so
# "None" holds courses with 0 subscribers.
df$subscriber_category <- cut(
  df$num_subscribers,
  breaks = c(-Inf, 0, 100, 500, 1000, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)

# Reviews per subscriber. Courses without subscribers get NA instead of the
# NaN/Inf that a division by zero would produce.
df$review_to_subscriber_ratio <- ifelse(
  df$num_subscribers > 0,
  df$num_reviews / df$num_subscribers,
  NA_real_
)

# Content efficiency per minute of content; NA when there is no positive
# duration to divide by. (An unguarded assignment that was immediately
# overwritten by this one has been removed.)
# NOTE(review): this section divides num_reviews by the duration, whereas the
# later sections of this report define content efficiency as num_subscribers
# per minute -- confirm which definition is intended here.
df$content_efficiency <- ifelse(
  df$content_duration_minutes > 0,
  df$num_reviews / df$content_duration_minutes,
  NA_real_
)



# Pair 1: Number of Subscribers vs. Subscriber Category
# Keep the two columns of interest, then retain only fully observed rows.
df_pair1 <- df %>%
  select(num_subscribers, subscriber_category) %>%
  filter(complete.cases(.))

# Pair 2: Content Efficiency vs. Content Duration
# Same recipe: two columns, complete rows only.
df_pair2 <- df %>%
  select(content_efficiency, content_duration_minutes) %>%
  filter(complete.cases(.))

# Show a heading followed by the first six rows of each pair.
cat("Pair 1: Number of Subscribers vs. Subscriber Category\n")
## Pair 1: Number of Subscribers vs. Subscriber Category
df_pair1 %>%
  head() %>%
  print()
##   num_subscribers subscriber_category
## 1            2147           Very High
## 2            2792           Very High
## 3            2174           Very High
## 4            2451           Very High
## 5            1276           Very High
## 6            9221           Very High
cat("\nPair 2: Content Efficiency vs. Content Duration\n")
## 
## Pair 2: Content Efficiency vs. Content Duration
df_pair2 %>%
  head() %>%
  print()
##   content_efficiency content_duration_minutes
## 1         0.25136612                     91.5
## 2         0.38797814                   2379.0
## 3         0.48524590                    152.5
## 4         0.06010929                    183.0
## 5         0.36885246                    122.0
## 6         0.75409836                    183.0

In the code above I chose two pairs of variables. The first pair is Number of Subscribers vs. Subscriber Category (a self-created column); it shows how subscriber numbers vary across the different categories. The second pair is Content Efficiency vs. Content Duration; it analyzes how effectively courses convert content into subscribers.

Visualization

Pair 1: Number of Reviews vs. Review-to-Subscriber Ratio

# Attach the packages used in this section. The workspace is deliberately not
# wiped with rm(list = ls()): that call only deletes user objects (it does not
# detach packages or reset options) and would silently destroy anything else
# in the reader's session. `df` is re-read from disk just below anyway.
library(readr)
library(dplyr)
library(ggplot2)

# Load dataset
df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): course_title, url, level, subject
## dbl  (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl  (1): is_paid
## dttm (1): published_timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the parsed columns and their types before deriving new variables.
str(df)
## spc_tbl_ [3,678 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ course_id          : num [1:3678] 1070968 1113822 1006314 1210588 1011058 ...
##  $ course_title       : chr [1:3678] "Ultimate Investment Banking Course" "Complete GST Course & Certification - Grow Your CA Practice" "Financial Modeling for Business Analysts and Consultants" "Beginner to Pro - Financial Analysis in Excel 2017" ...
##  $ url                : chr [1:3678] "https://www.udemy.com/ultimate-investment-banking-course/" "https://www.udemy.com/goods-and-services-tax/" "https://www.udemy.com/financial-modeling-for-business-analysts-and-consultants/" "https://www.udemy.com/complete-excel-finance-course-from-beginner-to-pro/" ...
##  $ is_paid            : logi [1:3678] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ price              : num [1:3678] 200 75 45 95 200 150 65 95 195 200 ...
##  $ num_subscribers    : num [1:3678] 2147 2792 2174 2451 1276 ...
##  $ num_reviews        : num [1:3678] 23 923 74 11 45 138 178 148 34 14 ...
##  $ num_lectures       : num [1:3678] 51 274 51 36 26 25 26 23 38 15 ...
##  $ level              : chr [1:3678] "All Levels" "All Levels" "Intermediate Level" "All Levels" ...
##  $ content_duration   : num [1:3678] 1.5 39 2.5 3 2 3 1 2.5 2.5 1 ...
##  $ published_timestamp: POSIXct[1:3678], format: "2017-01-18 20:58:58" "2017-03-09 16:34:20" ...
##  $ subject            : chr [1:3678] "Business Finance" "Business Finance" "Business Finance" "Business Finance" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   course_id = col_double(),
##   ..   course_title = col_character(),
##   ..   url = col_character(),
##   ..   is_paid = col_logical(),
##   ..   price = col_double(),
##   ..   num_subscribers = col_double(),
##   ..   num_reviews = col_double(),
##   ..   num_lectures = col_double(),
##   ..   level = col_character(),
##   ..   content_duration = col_double(),
##   ..   published_timestamp = col_datetime(format = ""),
##   ..   subject = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Content efficiency: subscribers per minute of content. content_duration is
# treated as hours, hence the factor 60; courses without a positive duration
# get NA rather than an Inf/NaN from dividing by zero. (A per-hour version was
# computed first and immediately overwritten; that dead assignment is dropped.)
df$content_efficiency <- ifelse(
  df$content_duration > 0,
  df$num_subscribers / (df$content_duration * 60),
  NA_real_
)

# Reviews per subscriber. Courses with no subscribers get NA instead of the
# NaN/Inf produced by a division by zero.
df$review_to_subscriber_ratio <- ifelse(
  df$num_subscribers > 0,
  df$num_reviews / df$num_subscribers,
  NA_real_
)

# Band the ratio for colouring. cut() intervals are right-closed, so "None"
# holds ratios <= 0; missing ratios stay NA.
df$rs_ratio_category <- cut(
  df$review_to_subscriber_ratio,
  breaks = c(-Inf, 0, 0.25, 0.5, 0.75, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)

# Visualize the relationship between number of reviews and review-to-subscriber ratio.
# Colour is mapped on the point layer only. When it is mapped in ggplot(),
# geom_smooth() inherits the resulting grouping and fits one regression per
# ratio category instead of the single overall trend line intended here.
ggplot(df, aes(x = num_reviews, y = review_to_subscriber_ratio)) +
  geom_point(aes(color = rs_ratio_category), alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue") +  # one overall linear trend line
  labs(
    title = "Relationship between Number of Reviews and Review-to-Subscriber Ratio",
    x = "Number of Reviews",
    y = "Review-to-Subscriber Ratio",
    color = "review to subscriber ratio category"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values or values outside the scale range
## (`geom_point()`).

Review-to-Subscriber Ratio (rs ratio) is a self-created column, calculated by dividing the number of reviews by the number of subscribers. This ratio provides insight into how actively the subscribed audience is reviewing the course. Most of the courses with a higher rs ratio lie near zero on the number-of-reviews axis, so a high rs ratio does not mean that the number of reviews is high. We can conclude that the rs ratio follows a fairly balanced, flat scatter across all the different numbers of reviews. An rs ratio near zero means the course has far fewer reviews than subscribers; the two counts are nearly equal only when the ratio is close to 1. In this plot the number of reviews is the explanatory variable (x-axis) and the rs ratio is the response variable (y-axis).

Pair 2: Content Efficiency vs. Content Duration

# Attach the packages used in this section. No rm(list = ls()) here: it only
# deletes user objects (packages and options are untouched) and `df` is
# re-read from disk below, so wiping the workspace gains nothing.
library(readr)
library(dplyr)

df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): course_title, url, level, subject
## dbl  (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl  (1): is_paid
## dttm (1): published_timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Course length in minutes. content_duration is read as a plain number that
# this report treats as hours, so minutes = hours * 60. The earlier
# gsub()-based parse matched nothing on numeric input and added the raw value
# a second time, returning hours * 61 (1.5 -> 91.5, 39 -> 2379). Figures
# printed in this report with the old formula shift slightly on re-run.
df$content_duration_minutes <- df$content_duration * 60

# Subscribers per minute of content.
df$content_efficiency <- df$num_subscribers / df$content_duration_minutes

# A zero duration gives Inf (or NaN for 0 / 0); recode every non-finite value,
# including NA, to 0 so these courses land in the lowest efficiency band.
df$content_efficiency[!is.finite(df$content_efficiency)] <- 0

head(df[, c("num_subscribers", "content_duration", "content_duration_minutes", "content_efficiency")])
## # A tibble: 6 × 4
##   num_subscribers content_duration content_duration_minutes content_efficiency
##             <dbl>            <dbl>                    <dbl>              <dbl>
## 1            2147              1.5                     91.5              23.5 
## 2            2792             39                     2379                 1.17
## 3            2174              2.5                    152.               14.3 
## 4            2451              3                      183                13.4 
## 5            1276              2                      122                10.5 
## 6            9221              3                      183                50.4
# Band content efficiency into five ordered categories. cut() intervals are
# right-closed, so "None" holds values <= 0.
df$content_efficiency_category <- with(
  df,
  cut(
    content_efficiency,
    breaks = c(-Inf, 0, 10, 100, 200, Inf),
    labels = c("None", "Low", "Medium", "High", "Very High")
  )
)

# Tabulate and report how many courses fall in each category.
efficiency_counts <- table(df$content_efficiency_category)
cat("Counts of courses in each content efficiency category:\n")
## Counts of courses in each content efficiency category:
print(efficiency_counts)
## 
##      None       Low    Medium      High Very High 
##        70      2244      1191       117        56
# Scatter plot of content duration (x) against content efficiency (y), with
# points coloured by content_efficiency_category.
library(ggplot2)

ggplot(data = df) +
  geom_point(
    mapping = aes(
      x = content_duration_minutes,
      y = content_efficiency,
      color = content_efficiency_category
    ),
    alpha = 0.6
  ) +
  labs(
    title = "Content Efficiency vs. Content Duration",
    x = "Content Duration (Minutes)",
    y = "Content Efficiency (Subscribers per Minute)",
    color = "Content Efficiency Category"
  ) +
  theme_minimal()

The trend shows that shorter courses have greater content efficiency, drawing more subscribers per minute of content. In contrast, longer courses see diminishing returns in terms of subscriber engagement. This means course creators can improve subscriber interest and keep more learners engaged by keeping their courses to the length the material requires. In this visualization content duration is the explanatory variable (x-axis) and content efficiency is the response variable (y-axis).

Calculate Correlation Coefficient for Each Combination:

Pair-1: Number of Reviews vs. Review-to-Subscriber Ratio

# Reviews per subscriber; courses with no subscribers are set to NA so the
# division by zero never happens.
df$review_to_subscriber_ratio <- with(
  df,
  ifelse(num_subscribers > 0, num_reviews / num_subscribers, NA)
)

# Correlation on the rows where both values are present.
cor(
  x = df$num_reviews,
  y = df$review_to_subscriber_ratio,
  use = "complete.obs"
)
## [1] 0.09189805

The correlation is 0.09, which indicates a weak positive relationship. We can say that the number of reviews and the rs ratio are only weakly dependent on each other: as the number of reviews increases, the rs ratio tends to increase only slightly.

Pair-2: Content Duration vs. Content Efficiency

# Correlation between content duration (minutes) and content efficiency,
# computed on the rows where both values are present, then reported.
cor_duration_efficiency <- with(
  df,
  cor(content_duration_minutes, content_efficiency, use = "complete.obs")
)
cat(
  "Correlation between Content Duration and Content Efficiency:",
  cor_duration_efficiency,
  "\n"
)
## Correlation between Content Duration and Content Efficiency: -0.1097195

The correlation is -0.109, which indicates a weak negative relationship. Lower content duration tends to go with higher content efficiency, and vice versa. This suggests that content efficiency and content duration are inversely related, although only weakly.

Confidence interval

# Attach the packages used in this section. No rm(list = ls()) here: it only
# deletes user objects (packages and options are untouched) and `df` is
# re-read from disk below, so wiping the workspace gains nothing.
library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): course_title, url, level, subject
## dbl  (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl  (1): is_paid
## dttm (1): published_timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pair 1: Confidence Interval for Number of Subscribers by Subscriber Category
# Band courses by subscriber count; cut() intervals are right-closed, so
# "None" holds courses with 0 subscribers.
df$subscriber_category <- cut(
  df$num_subscribers,
  breaks = c(-Inf, 0, 15000, 25000, 40000, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)

# Per-category mean with a normal-approximation 95% interval
# (mean +/- 1.96 * standard error). The standard error divides by the number
# of non-missing values so that it matches the na.rm = TRUE used for the mean
# and sd; n() would also count rows whose value is missing.
subscriber_summary <- df %>%
  group_by(subscriber_category) %>%
  summarise(
    mean_subscribers = mean(num_subscribers, na.rm = TRUE),
    std_error = sd(num_subscribers, na.rm = TRUE) /
      sqrt(sum(!is.na(num_subscribers)))
  ) %>%
  mutate(
    lower_ci = mean_subscribers - 1.96 * std_error,
    upper_ci = mean_subscribers + 1.96 * std_error
  )

print(subscriber_summary)
## # A tibble: 5 × 5
##   subscriber_category mean_subscribers std_error lower_ci upper_ci
##   <fct>                          <dbl>     <dbl>    <dbl>    <dbl>
## 1 None                              0        0         0        0 
## 2 Low                            1873.      46.5    1782.    1964.
## 3 Medium                        19122.     336.    18464.   19779.
## 4 High                          30048.     656.    28762.   31334.
## 5 Very High                     73031.    6689.    59921.   86140.

This illustrates that the confidence intervals calculated above give a range of plausible values for the population mean number of subscribers in each subscriber category. If the sampling were repeated many times, about 95% of the intervals built this way would contain the true population mean. We can say that the mean number of subscribers for each category, including the higher ones, is expected to fall within its interval.

# Pair 2: Confidence Interval for Content Efficiency vs. Content Duration
# Course length in minutes. content_duration is read as a plain number that
# this report treats as hours, so minutes = hours * 60. The earlier
# gsub()-based parse matched nothing on numeric input and added the raw value
# a second time, returning hours * 61. The interval printed in this report
# with the old formula shifts slightly on re-run.
df$content_duration_minutes <- df$content_duration * 60

# Subscribers per minute of content. Every non-finite result (NA, NaN from
# 0 / 0, Inf from a zero duration) is recoded to 0, as in the visualization
# section, so a single Inf cannot turn the mean into Inf.
df$content_efficiency <- df$num_subscribers / df$content_duration_minutes
df$content_efficiency[!is.finite(df$content_efficiency)] <- 0

# Normal-approximation 95% interval for the mean: mean +/- 1.96 * SE.
mean_efficiency <- mean(df$content_efficiency, na.rm = TRUE)
std_error_efficiency <- sd(df$content_efficiency, na.rm = TRUE) /
  sqrt(sum(!is.na(df$content_efficiency)))

conf_interval_efficiency <- c(
  mean_efficiency - 1.96 * std_error_efficiency,
  mean_efficiency + 1.96 * std_error_efficiency
)

cat("95% Confidence Interval for Content Efficiency: [", conf_interval_efficiency[1], ", ", conf_interval_efficiency[2], "]\n")
## 95% Confidence Interval for Content Efficiency: [ 20.31935 ,  24.06173 ]

The confidence interval gives a range in which we are 95% confident that the true mean content efficiency of the population lies. The interval is fairly narrow, which means the mean is estimated quite precisely. On its own, the interval describes only the average efficiency and says nothing about duration; it is the negative correlation found earlier that suggests shorter content tends to result in higher content efficiency.