Udemy Project: An R Analysis of Online Course Offerings

2024-05-06

Kliz John Andrei Millares

Source: Kaggle Udemy Dataset

Load the necessary libraries

library(readxl)
library(ggplot2)

Loading packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read the data from the Excel file

# Import the Udemy course catalogue from the Excel workbook.
# NOTE(review): the path is absolute and specific to one machine.
courses_data <- read_excel(
  path = "C:/Users/Guest123/Documents/Excel Data Analysis/Capstone Proj/udemy_courses excel.xlsx"
)
Structure of the Dataset
# Compact overview: every column's name, type, and first few values.
str(courses_data)
## tibble [3,678 × 12] (S3: tbl_df/tbl/data.frame)
##  $ course_id          : num [1:3678] 1070968 1113822 1006314 1210588 1011058 ...
##  $ course_title       : chr [1:3678] "Ultimate Investment Banking Course" "Complete GST Course & Certification - Grow Your CA Practice" "Financial Modeling for Business Analysts and Consultants" "Beginner to Pro - Financial Analysis in Excel 2017" ...
##  $ url                : chr [1:3678] "https://www.udemy.com/ultimate-investment-banking-course/" "https://www.udemy.com/goods-and-services-tax/" "https://www.udemy.com/financial-modeling-for-business-analysts-and-consultants/" "https://www.udemy.com/complete-excel-finance-course-from-beginner-to-pro/" ...
##  $ is_paid            : logi [1:3678] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ price              : num [1:3678] 200 75 45 95 200 150 65 95 195 200 ...
##  $ num_subscribers    : num [1:3678] 2147 2792 2174 2451 1276 ...
##  $ num_reviews        : num [1:3678] 23 923 74 11 45 138 178 148 34 14 ...
##  $ num_lectures       : num [1:3678] 51 274 51 36 26 25 26 23 38 15 ...
##  $ level              : chr [1:3678] "All Levels" "All Levels" "Intermediate Level" "All Levels" ...
##  $ content_duration   : num [1:3678] 1.5 39 2.5 3 2 3 1 2.5 2.5 1 ...
##  $ published_timestamp: chr [1:3678] "2017-01-18T20:58:58Z" "2017-03-09T16:34:20Z" "2016-12-19T19:26:30Z" "2017-05-30T20:07:24Z" ...
##  $ subject            : chr [1:3678] "Business Finance" "Business Finance" "Business Finance" "Business Finance" ...
# Preview the first six rows of the dataset
courses_data %>%
  head() %>%
  print()
## # A tibble: 6 × 12
##   course_id course_title         url   is_paid price num_subscribers num_reviews
##       <dbl> <chr>                <chr> <lgl>   <dbl>           <dbl>       <dbl>
## 1   1070968 Ultimate Investment… http… TRUE      200            2147          23
## 2   1113822 Complete GST Course… http… TRUE       75            2792         923
## 3   1006314 Financial Modeling … http… TRUE       45            2174          74
## 4   1210588 Beginner to Pro - F… http… TRUE       95            2451          11
## 5   1011058 How To Maximize You… http… TRUE      200            1276          45
## 6    192870 Trading Penny Stock… http… TRUE      150            9221         138
## # ℹ 5 more variables: num_lectures <dbl>, level <chr>, content_duration <dbl>,
## #   published_timestamp <chr>, subject <chr>
# Tally how many courses are listed under each subject.
subject_counts <- table(courses_data[["subject"]])

# Flatten the tally into a two-column data frame ready for plotting.
subject_counts_df <- setNames(
  as.data.frame(subject_counts),
  c("Subject", "Count")
)

Create a color palette for the subjects

# One rainbow colour per distinct subject.
subject_colors <- rainbow(n = length(unique(subject_counts_df[["Subject"]])))

# Largest count plus 20% headroom; used below as the upper y-axis limit.
max_count <- max(subject_counts_df[["Count"]])
max_count_with_margin <- 1.2 * max_count

Create the bar chart using ggplot2 with different colors for each subject

# Bar chart of course counts, with one colour-coded bar per subject.
bar_chart <- ggplot(
  data = subject_counts_df,
  mapping = aes(x = Subject, y = Count, fill = Subject)
) +
  # Bar heights are taken directly from the pre-computed Count column.
  geom_col() +
  # Write each count just above its bar.
  geom_text(mapping = aes(label = Count), size = 3, vjust = -0.5) +
  # Apply the per-subject palette built earlier.
  scale_fill_manual(values = subject_colors) +
  labs(
    title = "Number of Courses by Subject in Udemy",
    x = "Subject",
    y = "Number of Courses",
    caption = "Made by Kliz Millares"
  ) +
  theme_minimal() +
  # Slant the subject names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Extend the y-axis to the padded maximum.
  ylim(0, max_count_with_margin)

Print the bar chart

# Render the courses-per-subject bar chart.
print(bar_chart)

Calculate the total number of subscribers for each subject

# Sum the subscribers of all courses within each subject.
total_subscribers <- aggregate(
  num_subscribers ~ subject,
  data = courses_data,
  FUN = sum
)

# Each subject's share of all subscribers, in percent.
total_subscribers$percentage <-
  100 * prop.table(total_subscribers$num_subscribers)

Color for Piechart

# Ten-colour hex palette used for the pie chart fill.
custom_colors <- c(
  "#1f77b4",
  "#ff7f0e",
  "#2ca02c",
  "#d62728",
  "#9467bd",
  "#8c564b",
  "#e377c2",
  "#7f7f7f",
  "#bcbd22",
  "#17becf"
)

Create the pie chart using ggplot2 with custom colors and text appearance

# Pie chart of subscriber share: a single stacked bar drawn in polar
# coordinates, with one slice per subject.
pie_chart <- ggplot(
  data = total_subscribers,
  mapping = aes(x = "", y = num_subscribers, fill = subject)
) +
  geom_col(width = 1) +
  # Map the stacked heights to angles, turning the bar into a pie.
  coord_polar(theta = "y", start = 0) +
  # Percentage label centred inside each slice.
  geom_text(
    mapping = aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 5,
    color = "white",
    fontface = "bold"
  ) +
  labs(
    title = "Percentage of Total Subscribers by Subject",
    fill = "Subject",
    caption = "Made by Kliz Millares",
    x = NULL,
    y = NULL
  ) +
  scale_fill_manual(values = custom_colors) +
  # Drop axes and background, then keep the legend on the right.
  theme_void() +
  theme(legend.position = "right")

print(pie_chart)

Create histograms for numerical variables

# Named collection of histograms; only the price distribution is built here.
histograms <- list(
  price = ggplot(data = courses_data, mapping = aes(x = price)) +
    # Bins are 10 price units wide.
    geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
    labs(
      title = "Distribution of Course Prices",
      x = "Price",
      y = "Frequency",
      caption = "Made by Kliz Millares"
    ) +
    theme_minimal()
)

print(histograms[["price"]])

Aggregate data to calculate total subscribers for each subject

# Rebuild the per-subject subscriber totals, rounded to whole numbers.
# This overwrites the earlier table, so the percentage column is gone.
total_subscribers <- aggregate(
  num_subscribers ~ subject,
  data = courses_data,
  FUN = function(subs) round(sum(subs))
)

Create a bar plot for the distribution of number of subscribers across subjects

# Total subscribers per subject, with bars ordered from largest to smallest.
bar_plot <- ggplot(
  data = total_subscribers,
  mapping = aes(
    x = reorder(subject, -num_subscribers),
    y = num_subscribers,
    fill = subject
  )
) +
  geom_col(color = "black") +
  labs(
    title = "Distribution of Number of Subscribers Across Subjects",
    x = "Subject",
    y = "Total Number of Subscribers",
    caption = "Made by Kliz Millares"
  ) +
  theme_minimal() +
  # Slant the subject names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3") +
  # Show y-axis values with thousands separators instead of scientific notation.
  scale_y_continuous(labels = scales::comma)

print(bar_plot)

Loading package

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

Filter out free courses (keep only courses whose price is greater than zero)

# Keep paid courses only; rows with a missing price are dropped as well.
courses_data_filtered <- courses_data %>%
  dplyr::filter(price > 0)

Create scatter plot for exploring relationship between price and number of subscribers (excluding free courses)

# Price against subscriber count for paid courses, one point per course.
scatter_plot_price_vs_subscribers <- ggplot(
  data = courses_data_filtered,
  mapping = aes(x = price, y = num_subscribers)
) +
  geom_point(color = "blue") +
  labs(
    title = "Price vs. Number of Subscribers (Excluding Free Courses)",
    x = "Price",
    y = "Number of Subscribers",
    caption = "Made by Kliz Millares"
  ) +
  theme_minimal() +
  # Show subscriber counts with thousands separators.
  scale_y_continuous(labels = comma)

# Render the scatter plot
print(scatter_plot_price_vs_subscribers)

Create scatter plots for exploring relationships between numerical variables

# Empty container; scatter plots are stored in it by name below.
scatter_plots <- list()

Scatter plot for number of lectures vs. content duration

# Lecture count against total content duration, one point per course.
scatter_plots[["lectures_vs_duration"]] <- ggplot(
  data = courses_data,
  mapping = aes(x = num_lectures, y = content_duration)
) +
  geom_point(color = "green") +
  labs(
    title = "Number of Lectures vs. Content Duration",
    x = "Number of Lectures",
    y = "Content Duration (Hours)",
    caption = "Made by Kliz Millares"
  ) +
  theme_minimal()

# Render the stored scatter plot
print(scatter_plots[["lectures_vs_duration"]])

Load the necessary libraries

library(readxl)
library(ggplot2)
library(dplyr)
library(lubridate)

Convert the publication timestamp to a date, extract the year, and count the courses published per year

# Turn the timestamp strings into Date values and derive the publication year.
# mutate() evaluates its arguments in order, so `year` sees the parsed dates.
courses_data <- courses_data %>%
  mutate(
    published_timestamp = as.Date(published_timestamp),
    year = year(published_timestamp)
  )

# One row per year holding the number of courses published in that year.
courses_published <- courses_data %>%
  count(year, name = "num_courses")

Create a line plot to visualize the number of courses published over time

# Line chart of the number of courses published per year, with a marker on
# each yearly value.
line_plot <- ggplot(courses_published, aes(x = year, y = num_courses)) +
  # Line thickness is set with `linewidth`; `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_line(color = "#0072B2", linewidth = 1) +
  # `size` is still the correct argument for point markers.
  geom_point(color = "#0072B2", size = 3) +
  labs(title = "Number of Courses Published Over Time",
       x = "Year",
       y = "Number of Courses Published",
       caption = "Source: Udemy Courses Dataset") +
  theme_minimal() +
  # Larger text, with a right-aligned grey caption.
  theme(plot.title = element_text(face = "bold", size = 16),
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        plot.caption = element_text(hjust = 1, size = 12, color = "gray"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the courses-published-per-year line plot
print(line_plot)

Load the necessary libraries

library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Select numerical variables for correlation analysis

# Keep only the numeric columns that enter the correlation analysis.
numerical_vars <- courses_data[
  c("price", "num_subscribers", "num_reviews", "num_lectures", "content_duration")
]

# Pairwise correlations between the selected columns (cor() defaults)
correlation_matrix <- cor(x = numerical_vars)

# Show the matrix in the console
print(correlation_matrix)
##                       price num_subscribers num_reviews num_lectures
## price            1.00000000      0.05076935   0.1136959    0.3301604
## num_subscribers  0.05076935      1.00000000   0.6499455    0.1577456
## num_reviews      0.11369592      0.64994554   1.0000000    0.2430288
## num_lectures     0.33016045      0.15774557   0.2430288    1.0000000
## content_duration 0.29344962      0.16183868   0.2288893    0.8016471
##                  content_duration
## price                   0.2934496
## num_subscribers         0.1618387
## num_reviews             0.2288893
## num_lectures            0.8016471
## content_duration        1.0000000

Plot heatmap

# Correlation matrix reshaped to long format: one row per variable pair,
# with columns Var1, Var2 and value.
correlation_matrix <- cor(numerical_vars)
correlation_df <- melt(correlation_matrix)

# Heatmap of the pairwise correlations. The plot is stored and printed
# explicitly, like every other figure in this script, so that it is also
# drawn when the file is run with source() or Rscript, where top-level
# expressions are not auto-printed.
correlation_heatmap <- ggplot(
  data = correlation_df,
  aes(x = Var1, y = Var2, fill = value)
) +
  # White borders separate the individual cells.
  geom_tile(color = "white") +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Correlation Heatmap",
       caption = "Made by Kliz Millares",
       x = "Variables",
       y = "Variables") +
  theme_minimal() +
  # Slant the variable names and drop the legend title.
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank())

print(correlation_heatmap)

Summary of R Project on Udemy Online Course Data

This project analyzed Udemy online course data to gain insights into course offerings and customer preferences. Here are the key findings:

Overall, the analysis highlights the success of Web Development courses and a positive relationship between the number of reviews a course has and its number of subscribers (correlation ≈ 0.65). These insights can be used to optimize course offerings and marketing strategies to better serve customer needs.

Trademark © Kliz John Millares