library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the Udemy course catalogue into a base data.frame.
# The file is comma-separated text despite its .xls suffix.
data <- read.csv(file = "~/Downloads/udemy_courses.csv.xls")
# Explanation: I calculate the mean, median, and sum of the number of subscribers across all courses in each subject. The smallest group (the subject with the fewest courses) is then tagged as "Least Likely" and all others as "Common".
#Group by ‘subject’ and summarize ‘num_subscribers’
# Load necessary libraries
library(dplyr)
# 1. Subscriber statistics per subject, with the rarest subject flagged.
#    One row per subject: mean, median and total of num_subscribers
#    (NAs ignored) plus the number of courses in that subject.
group_by_subject <- data %>%
  group_by(subject) %>%
  summarise(
    mean_subscribers = mean(num_subscribers, na.rm = TRUE),
    median_subscribers = median(num_subscribers, na.rm = TRUE),
    total_subscribers = sum(num_subscribers, na.rm = TRUE),
    count = n() # courses in this subject
  ) %>%
  # The subject(s) with the fewest courses get "Least Likely"; ties all
  # receive the tag. Every other subject is "Common".
  mutate(
    special_tag = if_else(count == min(count), "Least Likely", "Common")
  )

# Show the per-subject table.
print(group_by_subject)
## # A tibble: 4 × 6
## subject mean_subscribers median_subscribers total_subscribers count
## <chr> <dbl> <dbl> <int> <int>
## 1 Business Finance 1564. 484 1868711 1195
## 2 Graphic Design 1763. 384 1063148 603
## 3 Musical Instrumen… 1245. 138 846689 680
## 4 Web Development 6650. 2416. 7980572 1200
## # ℹ 1 more variable: special_tag <chr>
# Here Web Development and Graphic Design have the highest mean subscribers per course, indicating that they are popular. By total subscribers, Web Development leads, followed by Business Finance.
#Group by ‘level’ and summarize ‘num_lectures’
# 2. Lecture statistics per course level, with the rarest level flagged.
#    One row per level: mean, median and total of num_lectures
#    (NAs ignored) plus the number of courses at that level.
group_by_level <- data %>%
  group_by(level) %>%
  summarise(
    mean_lectures = mean(num_lectures, na.rm = TRUE),
    median_lectures = median(num_lectures, na.rm = TRUE),
    total_lectures = sum(num_lectures, na.rm = TRUE),
    count = n() # courses at this level
  ) %>%
  # The level(s) with the fewest courses get "Least Likely"; the rest "Common".
  mutate(
    special_tag = if_else(count == min(count), "Least Likely", "Common")
  )

# Show the per-level table.
print(group_by_level)
## # A tibble: 4 × 6
## level mean_lectures median_lectures total_lectures count special_tag
## <chr> <dbl> <dbl> <int> <int> <chr>
## 1 All Levels 46.0 27 88698 1929 Common
## 2 Beginner Level 33.0 23 41908 1270 Common
## 3 Expert Level 30.8 21.5 1785 58 Least Like…
## 4 Intermediate L… 35.9 23 15129 421 Common
# Here Expert Level courses have the fewest lectures on average, while Beginner and Intermediate Level courses offer slightly more lectures, and All Levels courses offer the most.
#Group by ‘is_paid’ and summarize ‘content_duration’
# Load necessary libraries
library(dplyr)
# 3. Content-duration statistics for paid versus free courses.
#    One row per is_paid value: mean, median and total of content_duration
#    (NAs ignored) plus the number of courses in that group.
group_by_is_paid <- data %>%
  group_by(is_paid) %>%
  summarise(
    mean_duration = mean(content_duration, na.rm = TRUE),
    median_duration = median(content_duration, na.rm = TRUE),
    total_duration = sum(content_duration, na.rm = TRUE),
    count = n() # courses in this group
  ) %>%
  # The group(s) with the fewest courses get "Least Likely"; the rest "Common".
  mutate(
    special_tag = if_else(count == min(count), "Least Likely", "Common")
  )

# Show the paid/free table.
print(group_by_is_paid)
## # A tibble: 2 × 6
## is_paid mean_duration median_duration total_duration count special_tag
## <chr> <dbl> <dbl> <dbl> <int> <chr>
## 1 False 2.21 1.5 685. 310 Least Likely
## 2 True 4.27 2.5 14374. 3368 Common
# From the group-by on is_paid and content_duration: paid courses tend to have longer content durations, while free courses offer shorter content.
# Load necessary libraries
library(dplyr)
# Re-read the raw file; this resets `data` before the tag column is joined
# onto it below.
data <- read.csv(file = "~/Downloads/udemy_courses.csv.xls")

# Share of all courses that falls in each subject, rarest subject first.
group_counts <- arrange(
  mutate(
    summarise(group_by(data, subject), total = n()),
    probability = total / sum(total)
  ),
  probability
)

# Subjects whose share is at or below the 20th percentile of the per-subject
# shares are tagged "Low Probability"; all others are "Other".
threshold <- quantile(group_counts[["probability"]], probs = 0.2)
group_counts <- group_counts %>%
  mutate(
    special_tag = if_else(probability <= threshold, "Low Probability", "Other")
  )

# Attach each course's subject-level tag to the course-level table.
data <- left_join(
  data,
  select(group_counts, subject, special_tag),
  by = "subject"
)

print(group_counts)
## # A tibble: 4 × 4
## subject total probability special_tag
## <chr> <int> <dbl> <chr>
## 1 Graphic Design 603 0.164 Low Probability
## 2 Musical Instruments 680 0.185 Other
## 3 Business Finance 1195 0.325 Other
## 4 Web Development 1200 0.326 Other
# List the columns of the tagged dataset (special_tag was added by the join).
names(data)
## [1] "course_id" "course_title" "url"
## [4] "is_paid" "price" "num_subscribers"
## [7] "num_reviews" "num_lectures" "level"
## [10] "content_duration" "published_timestamp" "subject"
## [13] "special_tag"
# Subscriber statistics for courses in low-probability subjects versus the
# rest: mean and median of num_subscribers (NAs ignored) and course count.
subscriber_summary <- summarise(
  group_by(data, special_tag),
  mean_subscribers = mean(num_subscribers, na.rm = TRUE),
  median_subscribers = median(num_subscribers, na.rm = TRUE),
  count = n()
)
print(subscriber_summary)
## # A tibble: 2 × 4
## special_tag mean_subscribers median_subscribers count
## <chr> <dbl> <int> <int>
## 1 Low Probability 1763. 384 603
## 2 Other 3478. 1027 3075
# (Optional) Welch two-sample t-test comparing mean num_subscribers between the two special_tag groups
t_test_result <- t.test(num_subscribers ~ special_tag, data = data) # formula interface: response ~ grouping column
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: num_subscribers by special_tag
## t = -6.3212, df = 1779, p-value = 3.273e-10
## alternative hypothesis: true difference in means between group Low Probability and group Other is not equal to 0
## 95 percent confidence interval:
## -2247.469 -1183.065
## sample estimates:
## mean in group Low Probability mean in group Other
## 1763.098 3478.365
# Box plot: distribution of per-course num_subscribers for each special tag.
ggplot(data, aes(x = special_tag, y = num_subscribers, fill = special_tag)) +
  geom_boxplot() +
  labs(
    title = "Total Subscribers by Special Tag",
    x = "Special Tag",
    y = "Number of Subscribers"
  ) +
  theme_minimal()
# Confirm which columns are available before plotting.
names(data)
## [1] "course_id" "course_title" "url"
## [4] "is_paid" "price" "num_subscribers"
## [7] "num_reviews" "num_lectures" "level"
## [10] "content_duration" "published_timestamp" "subject"
## [13] "special_tag"
# Load necessary libraries for plotting
library(ggplot2)
# Visualization 1: horizontal bars of each subject's share of courses,
# ordered by that share and coloured by special tag.
ggplot(
  group_counts,
  aes(x = reorder(subject, probability), y = probability, fill = special_tag)
) +
  geom_col() +
  coord_flip() +
  labs(title = "Probability of Each Subject", x = "Subject", y = "Probability") +
  theme_minimal()
# Visualization 2: box plot of per-course num_subscribers by special tag.
ggplot(data, aes(x = special_tag, y = num_subscribers, fill = special_tag)) +
  geom_boxplot() +
  labs(
    title = "Total Subscribers by Special Tag",
    x = "Special Tag",
    y = "Number of Subscribers"
  ) +
  theme_minimal()
# Visualization 3: box plot of per-course num_lectures by special tag.
ggplot(data, aes(x = special_tag, y = num_lectures, fill = special_tag)) +
  geom_boxplot() +
  labs(
    title = "Number of Lectures by Special Tag",
    x = "Special Tag",
    y = "Number of Lectures"
  ) +
  theme_minimal()
# a. Bar plot for subject vs num_subscribers:
#    one bar per subject showing total_subscribers, coloured by special tag.
ggplot(
  group_by_subject,
  aes(x = subject, y = total_subscribers, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Total Subscribers by Subject",
    x = "Subject",
    y = "Total Subscribers"
  ) +
  theme_minimal()
# This plot visualizes the total number of subscribers for each subject and provides a comparison of subscriber numbers across the subjects.
# Web Development has the most subscribers; maybe it is popular for career advancement.
# b. Box plot for level vs num_lectures:
#    distribution of per-course lecture counts within each course level.
ggplot(data, aes(x = level, y = num_lectures, fill = level)) +
  geom_boxplot() +
  labs(
    title = "Number of Lectures by Course Level",
    x = "Course Level",
    y = "Number of Lectures"
  ) +
  theme_minimal()
#Beginner level and Intermediate level have more lectures than Expert level.
# c. Bar plot for is_paid vs content_duration:
#    one bar per is_paid value showing total_duration, coloured by special tag.
ggplot(
  group_by_is_paid,
  aes(x = is_paid, y = total_duration, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Total Content Duration by Paid/Free Courses",
    x = "Is Paid",
    y = "Total Content Duration"
  ) +
  theme_minimal()
# Paid courses account for most of the total content duration compared with free courses. This suggests that the higher the price, the longer the content duration.
# Load and Inspect Data
library(readr)
library(dplyr)
library(ggplot2)
# Re-read the file with readr. This replaces `data` with a tibble of the
# original 12 columns, so the special_tag column joined on earlier is gone.
data <- read_csv(file = "~/Downloads/udemy_courses.csv.xls")
## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): course_title, url, level, subject
## dbl (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl (1): is_paid
## dttm (1): published_timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the freshly loaded tibble.
head(data, n = 6)
## # A tibble: 6 × 12
## course_id course_title url is_paid price num_subscribers num_reviews
## <dbl> <chr> <chr> <lgl> <dbl> <dbl> <dbl>
## 1 1070968 Ultimate Investment… http… TRUE 200 2147 23
## 2 1113822 Complete GST Course… http… TRUE 75 2792 923
## 3 1006314 Financial Modeling … http… TRUE 45 2174 74
## 4 1210588 Beginner to Pro - F… http… TRUE 95 2451 11
## 5 1011058 How To Maximize You… http… TRUE 200 1276 45
## 6 192870 Trading Penny Stock… http… TRUE 150 9221 138
## # ℹ 5 more variables: num_lectures <dbl>, level <chr>, content_duration <dbl>,
## # published_timestamp <dttm>, subject <chr>
# 4. Create a Data Frame of All Combinations
# Extract the two categorical variables, level and subject, and then generate all possible combinations of their values.
# Distinct values of the two categorical variables.
levels <- unique(data$level)
subjects <- unique(data$subject)
# Cartesian product of every level with every subject.
# expand.grid() converts strings to factors by default; keep them as
# character so the columns have the same type as `level` and `subject` in
# `data` when the two tables are joined.
combinations <- expand.grid(
  level = levels,
  subject = subjects,
  stringsAsFactors = FALSE
)
# Display the first few rows of the combinations
head(combinations)
## level subject
## 1 All Levels Business Finance
## 2 Intermediate Level Business Finance
## 3 Beginner Level Business Finance
## 4 Expert Level Business Finance
## 5 All Levels Graphic Design
## 6 Intermediate Level Graphic Design
# 5. Identify Missing Combinations
# Compare the combinations data frame with the actual data to find which combinations are missing.
# Every (level, subject) pair that actually occurs in the data, once each.
existing_combinations <- distinct(data, level, subject)
# Pairs from the full grid with no matching row in the data are the missing
# combinations.
missing_combinations <- combinations %>%
  anti_join(existing_combinations, by = c("level", "subject"))
# Show the missing pairs (zero rows means every combination occurs).
missing_combinations
## [1] level subject
## <0 rows> (or 0-length row.names)
# 6. Count the Occurrence of Each Combination
# Count the number of times each combination of level and subject appears in the data.
# Group by 'level' and 'subject', and count the occurrences.
# `.groups = "drop"` returns an ungrouped tibble: without it the result stays
# grouped by 'level' and summarise() prints a regrouping message.
combination_counts <- data %>%
  group_by(level, subject) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))
# Display the most and least common combinations: the first and last rows of
# the descending sort. Only one row is taken at each end, so tied
# combinations are not reported.
most_common <- combination_counts[1, ]
least_common <- combination_counts[nrow(combination_counts), ]
most_common
## # A tibble: 1 × 3
## level subject count
## <chr> <chr> <int>
## 1 All Levels Business Finance 696
least_common
## # A tibble: 1 × 3
## level subject count
## <chr> <chr> <int>
## 1 Expert Level Graphic Design 5
# 7. Visualize the Combinations
# Now I visualize the occurrence of each combination with a bar plot.
# Dodged bar plot: number of courses per subject, one bar per course level.
ggplot(combination_counts, aes(x = subject, y = count, fill = level)) +
  geom_col(position = "dodge") +
  labs(
    title = "Course Levels Across Subjects",
    x = "Subject",
    y = "Number of Courses"
  ) +
  # Tilt the subject labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# For Graphic Design and Web Development, the Beginner Level course counts are fairly similar. I guess these are among the most common combinations, along with Business Finance, according to the bar plot.