library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the Udemy course data with base read.csv().
df <- read.csv("~/Documents/STAT 2024/udemy_courses.csv")
# Count missing values in the review counts.
sum(is.na(df$num_reviews))
## [1] 0
# NOTE(review): review_to_subscriber_ratio and content_efficiency are only
# assigned further down in this section. At this point `df$...` presumably
# returns NULL for them, so the three checks below would report 0 whatever the
# data contains -- confirm, and re-run the checks after the columns exist.
sum(is.na(df$review_to_subscriber_ratio))
## [1] 0
sum(is.infinite(df$content_efficiency))
## [1] 0
sum(is.na(df$content_efficiency))
## [1] 0
# Course length in minutes. `content_duration` is a numeric number of hours, so
# a plain unit conversion is all that is needed. The earlier gsub()-based
# parsing found no " hours"/" minutes" text to strip and ended up adding the
# hour value itself to hours * 60 (i.e. hours * 61, so 1.5 h became 91.5 min).
# Recorded output that shows such values predates this fix.
df$content_duration_minutes <- df$content_duration * 60

# Bucket courses by subscriber count. cut() intervals are right-closed, so a
# course with exactly 0 subscribers is "None" and (0, 100] is "Low".
df$subscriber_category <- cut(
  df$num_subscribers,
  breaks = c(-Inf, 0, 100, 500, 1000, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)

# Reviews per subscriber. NA when a course has no subscribers, so the ratio
# never becomes NaN (0 / 0) or Inf (n / 0).
df$review_to_subscriber_ratio <- ifelse(
  df$num_subscribers > 0,
  df$num_reviews / df$num_subscribers,
  NA
)

# Reviews per minute of content; NA when the duration is zero. The unguarded
# assignment that used to precede this one was overwritten immediately, so it
# has been dropped.
df$content_efficiency <- ifelse(
  df$content_duration_minutes > 0,
  df$num_reviews / df$content_duration_minutes,
  NA
)
# Pair 1: Number of Subscribers vs. Subscriber Category
# Narrow to the two columns first, then keep only rows where both are present.
df_pair1 <- df %>%
  select(num_subscribers, subscriber_category) %>%
  filter(!is.na(num_subscribers), !is.na(subscriber_category))

# Pair 2: Content Efficiency vs. Content Duration
# Same recipe: two columns, complete rows only.
df_pair2 <- df %>%
  select(content_efficiency, content_duration_minutes) %>%
  filter(!is.na(content_efficiency), !is.na(content_duration_minutes))
# Display the first few rows of each pair
# Each preview is a cat() heading followed by the first six rows from head().
cat("Pair 1: Number of Subscribers vs. Subscriber Category\n")
## Pair 1: Number of Subscribers vs. Subscriber Category
print(head(df_pair1))
## num_subscribers subscriber_category
## 1 2147 Very High
## 2 2792 Very High
## 3 2174 Very High
## 4 2451 Very High
## 5 1276 Very High
## 6 9221 Very High
cat("\nPair 2: Content Efficiency vs. Content Duration\n")
##
## Pair 2: Content Efficiency vs. Content Duration
# NOTE(review): the content_duration_minutes values recorded below (91.5,
# 2379.0, 152.5, ...) look like hours * 61 rather than hours * 60 -- verify the
# minutes computation that feeds this table.
print(head(df_pair2))
## content_efficiency content_duration_minutes
## 1 0.25136612 91.5
## 2 0.38797814 2379.0
## 3 0.48524590 152.5
## 4 0.06010929 183.0
## 5 0.36885246 122.0
## 6 0.75409836 183.0
In the code above I chose two pairs of variables. The first pair represents the relationship between Number of Subscribers and Subscriber Category (a self-created column); it shows how subscriber numbers vary across the different categories. The second pair focuses on the relationship between Content Efficiency (also a self-created column) and Content Duration, which analyzes how effectively a course converts its content into audience engagement; in the code above, content efficiency is computed as reviews per minute of content.
# Start this part of the analysis from an empty workspace. Note that this only
# removes objects from the global environment; packages that are already
# attached stay attached.
rm(list = ls())
# readr provides read_csv(); dplyr and ggplot2 are attached again for this part.
library(readr)
library(dplyr)
library(ggplot2)
# Load dataset
df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): course_title, url, level, subject
## dbl (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl (1): is_paid
## dttm (1): published_timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column names and types of the freshly loaded data.
str(df)
## spc_tbl_ [3,678 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ course_id : num [1:3678] 1070968 1113822 1006314 1210588 1011058 ...
## $ course_title : chr [1:3678] "Ultimate Investment Banking Course" "Complete GST Course & Certification - Grow Your CA Practice" "Financial Modeling for Business Analysts and Consultants" "Beginner to Pro - Financial Analysis in Excel 2017" ...
## $ url : chr [1:3678] "https://www.udemy.com/ultimate-investment-banking-course/" "https://www.udemy.com/goods-and-services-tax/" "https://www.udemy.com/financial-modeling-for-business-analysts-and-consultants/" "https://www.udemy.com/complete-excel-finance-course-from-beginner-to-pro/" ...
## $ is_paid : logi [1:3678] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ price : num [1:3678] 200 75 45 95 200 150 65 95 195 200 ...
## $ num_subscribers : num [1:3678] 2147 2792 2174 2451 1276 ...
## $ num_reviews : num [1:3678] 23 923 74 11 45 138 178 148 34 14 ...
## $ num_lectures : num [1:3678] 51 274 51 36 26 25 26 23 38 15 ...
## $ level : chr [1:3678] "All Levels" "All Levels" "Intermediate Level" "All Levels" ...
## $ content_duration : num [1:3678] 1.5 39 2.5 3 2 3 1 2.5 2.5 1 ...
## $ published_timestamp: POSIXct[1:3678], format: "2017-01-18 20:58:58" "2017-03-09 16:34:20" ...
## $ subject : chr [1:3678] "Business Finance" "Business Finance" "Business Finance" "Business Finance" ...
## - attr(*, "spec")=
## .. cols(
## .. course_id = col_double(),
## .. course_title = col_character(),
## .. url = col_character(),
## .. is_paid = col_logical(),
## .. price = col_double(),
## .. num_subscribers = col_double(),
## .. num_reviews = col_double(),
## .. num_lectures = col_double(),
## .. level = col_character(),
## .. content_duration = col_double(),
## .. published_timestamp = col_datetime(format = ""),
## .. subject = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Subscribers gained per minute of content. `content_duration` is treated as
# hours, hence the factor of 60. Courses with a zero duration get NA instead of
# Inf/NaN. An earlier per-hour version of this column was overwritten straight
# away by the per-minute one, so only the per-minute form is kept.
duration_minutes <- df$content_duration * 60
df$content_efficiency <- ifelse(
  duration_minutes > 0,
  df$num_subscribers / duration_minutes,
  NA
)

# Reviews per subscriber. Courses without subscribers get NA rather than
# NaN (0 / 0) or Inf (n / 0), so they stay out of the categories below.
df$review_to_subscriber_ratio <- ifelse(
  df$num_subscribers > 0,
  df$num_reviews / df$num_subscribers,
  NA
)

# Band the ratio into labelled categories. cut() intervals are right-closed,
# so a ratio of exactly 0 is "None" and (0, 0.25] is "Low".
df$rs_ratio_category <- cut(
  df$review_to_subscriber_ratio,
  breaks = c(-Inf, 0, 0.25, 0.5, 0.75, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)
# Visualize the relationship between number of reviews and review-to-subscriber ratio
# Base layer: review count on x, ratio on y, points coloured by ratio category.
ratio_plot <- ggplot(
  data = df,
  mapping = aes(
    x = num_reviews,
    y = review_to_subscriber_ratio,
    color = rs_ratio_category
  )
)

# Semi-transparent points plus a blue linear-model trend line, then labels.
ratio_plot <- ratio_plot +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue") +
  labs(
    title = "Relationship between Number of Reviews and Review-to-Subscriber Ratio",
    x = "Number of Reviews",
    y = "Review-to-Subscriber Ratio",
    color = "review to subscriber ratio category"
  ) +
  theme_minimal()

# Render the finished plot.
ratio_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values or values outside the scale range
## (`geom_point()`).
The review-to-subscriber (rs) ratio is a self-created column calculated by dividing the number of reviews by the number of subscribers. This ratio provides insight into how actively the subscribed audience reviews the course. Most courses, including those with a higher rs ratio, sit near zero on the number-of-reviews axis, so a high rs ratio does not mean that the number of reviews is high. We can conclude that the rs ratio is scattered fairly evenly across the different numbers of reviews. Courses whose rs ratio is near zero have far fewer reviews than subscribers, while a ratio near one means the two counts are nearly equal. The rs ratio is treated as the explanatory variable in this case.
# Start this part of the analysis from an empty workspace. This only removes
# objects from the global environment; packages attached earlier stay attached.
rm(list = ls())
# readr provides read_csv(); dplyr is attached again for this part.
library(readr)
library(dplyr)
# Reload the course data as a tibble.
df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): course_title, url, level, subject
## dbl (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl (1): is_paid
## dttm (1): published_timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Course length in minutes. read_csv() parses `content_duration` as a double
# holding hours, so a plain unit conversion is all that is needed. The earlier
# gsub()-based parsing found no " hours"/" minutes" text to strip and ended up
# adding the hour value itself to hours * 60 (i.e. hours * 61, so 1.5 h became
# 91.5 min). Recorded output that shows such values predates this fix.
df$content_duration_minutes <- df$content_duration * 60

# Subscribers per minute of content. A zero duration produces Inf or NaN; those
# values (and any NA) are recorded as 0 so that every course keeps a numeric
# efficiency.
efficiency <- df$num_subscribers / df$content_duration_minutes
df$content_efficiency <- ifelse(is.finite(efficiency), efficiency, 0)

# Preview the inputs next to the derived columns.
head(df[, c("num_subscribers", "content_duration", "content_duration_minutes", "content_efficiency")])
## # A tibble: 6 × 4
## num_subscribers content_duration content_duration_minutes content_efficiency
## <dbl> <dbl> <dbl> <dbl>
## 1 2147 1.5 91.5 23.5
## 2 2792 39 2379 1.17
## 3 2174 2.5 152. 14.3
## 4 2451 3 183 13.4
## 5 1276 2 122 10.5
## 6 9221 3 183 50.4
# Band the subscribers-per-minute figure into labelled categories. cut()
# intervals are right-closed, so an efficiency of exactly 0 is "None".
efficiency_breaks <- c(-Inf, 0, 10, 100, 200, Inf)
efficiency_labels <- c("None", "Low", "Medium", "High", "Very High")
df$content_efficiency_category <- cut(
  df$content_efficiency,
  breaks = efficiency_breaks,
  labels = efficiency_labels
)

# Tally the courses in each category and report the counts.
efficiency_counts <- table(df$content_efficiency_category)
cat("Counts of courses in each content efficiency category:\n")
## Counts of courses in each content efficiency category:
print(efficiency_counts)
##
## None Low Medium High Very High
## 70 2244 1191 117 56
# Visualize the comparison of content duration with the new content_efficiency_category
library(ggplot2)

# Base layer: duration on x, efficiency on y, points coloured by category.
efficiency_plot <- ggplot(
  data = df,
  mapping = aes(
    x = content_duration_minutes,
    y = content_efficiency,
    color = content_efficiency_category
  )
)

# Semi-transparent points with descriptive title, axis and legend labels.
efficiency_plot <- efficiency_plot +
  geom_point(alpha = 0.6) +
  labs(
    title = "Content Efficiency vs. Content Duration",
    x = "Content Duration (Minutes)",
    y = "Content Efficiency (Subscribers per Minute)",
    color = "Content Efficiency Category"
  ) +
  theme_minimal()

# Render the finished plot.
efficiency_plot
The trend shows that shorter courses have greater content efficiency, drawing more subscribers per minute of content. In contrast, longer courses see diminishing returns in terms of subscriber engagement. This suggests that course creators can improve subscriber interest and keep more learners engaged by keeping their courses to the length the material requires. In this visualization, content efficiency is treated as the explanatory variable and content duration as the response variable.
# Reviews per subscriber; left as NA for courses that have no subscribers.
df$review_to_subscriber_ratio <- with(
  df,
  ifelse(num_subscribers > 0, num_reviews / num_subscribers, NA)
)

# Correlation between review count and the ratio, using complete rows only.
cor(
  x = df$num_reviews,
  y = df$review_to_subscriber_ratio,
  use = "complete.obs"
)
## [1] 0.09189805
The correlation is 0.09, which indicates a weak positive relationship. We can say that the number of reviews and the rs ratio are only weakly dependent on each other: as the number of reviews increases, the rs ratio tends to increase slightly.
# Correlation between Content Duration and Content Efficiency
# Computed on complete rows only, then reported on a single line.
cor_duration_efficiency <- with(
  df,
  cor(content_duration_minutes, content_efficiency, use = "complete.obs")
)
cat(
  "Correlation between Content Duration and Content Efficiency:",
  cor_duration_efficiency,
  "\n"
)
## Correlation between Content Duration and Content Efficiency: -0.1097195
The correlation is -0.109, which indicates a weak negative relationship. Courses with a low content duration tend to have higher content efficiency, and vice versa. This suggests that content efficiency and content duration are inversely related, although only weakly.
# Start this part of the analysis from an empty workspace. This only removes
# objects from the global environment; packages attached earlier stay attached.
rm(list = ls())
# readr provides read_csv(); dplyr and ggplot2 are attached again for this part.
library(readr)
library(dplyr)
library(ggplot2)
# Reload the course data as a tibble.
df <- read_csv("~/Documents/STAT 2024/udemy_courses.csv")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): course_title, url, level, subject
## dbl (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl (1): is_paid
## dttm (1): published_timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pair 1: Confidence Interval for Number of Subscribers by Subscriber Category
# Bucket courses by subscriber count. cut() intervals are right-closed, so a
# course with exactly 0 subscribers is "None".
df$subscriber_category <- cut(
  df$num_subscribers,
  breaks = c(-Inf, 0, 15000, 25000, 40000, Inf),
  labels = c("None", "Low", "Medium", "High", "Very High")
)

# Per-category mean with a normal-approximation 95% interval
# (mean +/- 1.96 * standard error). The standard error divides by the number of
# non-missing values so it is based on the same observations as mean() and
# sd(); n() would also count rows whose subscriber count is NA.
subscriber_summary <- df %>%
  group_by(subscriber_category) %>%
  summarise(
    mean_subscribers = mean(num_subscribers, na.rm = TRUE),
    std_error = sd(num_subscribers, na.rm = TRUE) /
      sqrt(sum(!is.na(num_subscribers)))
  ) %>%
  mutate(
    lower_ci = mean_subscribers - 1.96 * std_error,
    upper_ci = mean_subscribers + 1.96 * std_error
  )
print(subscriber_summary)
## # A tibble: 5 × 5
## subscriber_category mean_subscribers std_error lower_ci upper_ci
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 None 0 0 0 0
## 2 Low 1873. 46.5 1782. 1964.
## 3 Medium 19122. 336. 18464. 19779.
## 4 High 30048. 656. 28762. 31334.
## 5 Very High 73031. 6689. 59921. 86140.
The confidence intervals calculated above give, for each subscriber category, a range of plausible values for the population mean number of subscribers. If the sampling were repeated many times, about 95% of the intervals built this way would contain the true population mean. We can also see that the higher subscriber categories have higher mean subscriber counts and wider intervals.
# Pair 2: Confidence Interval for Content Efficiency vs. Content Duration
# Course length in minutes. `content_duration` is a numeric number of hours, so
# a plain unit conversion is all that is needed. The earlier gsub()-based
# parsing found no " hours"/" minutes" text to strip and ended up computing
# hours * 61. Recorded output produced with that formula predates this fix.
df$content_duration_minutes <- df$content_duration * 60

# Subscribers per minute of content. Undefined results (NA, NaN from 0 / 0 and
# Inf from n / 0) are recorded as 0. Inf has to be caught as well, because a
# single infinite value would make the mean and the interval infinite.
efficiency <- df$num_subscribers / df$content_duration_minutes
df$content_efficiency <- ifelse(is.finite(efficiency), efficiency, 0)

# Normal-approximation 95% interval for the mean: mean +/- 1.96 * SE.
mean_efficiency <- mean(df$content_efficiency, na.rm = TRUE)
std_error_efficiency <- sd(df$content_efficiency, na.rm = TRUE) /
  sqrt(sum(!is.na(df$content_efficiency)))
conf_interval_efficiency <- c(
  mean_efficiency - 1.96 * std_error_efficiency,
  mean_efficiency + 1.96 * std_error_efficiency
)
cat("95% Confidence Interval for Content Efficiency: [", conf_interval_efficiency[1], ", ", conf_interval_efficiency[2], "]\n")
## 95% Confidence Interval for Content Efficiency: [ 20.31935 , 24.06173 ]
The confidence interval gives a range in which we are 95% confident that the true population mean content efficiency lies. Together with the negative correlation found earlier, this suggests that courses with a shorter content duration tend to have higher content efficiency. Because the interval is narrow and contains fairly low values, the mean is estimated precisely and the typical course gains only a modest number of subscribers per minute of content.