Load necessary libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Load data
# The path points at the current user's Downloads folder; the file carries an
# .xls suffix but is parsed here as comma-separated text by read.csv().
data <- read.csv("~/Downloads/udemy_courses.csv.xls")

Group1

#Explanation: I am calculating the mean, median, and sum of the number of subscribers for all courses in each subject. The smallest group (the subject with the fewest courses) is then tagged as "Least Likely" and the others as "Common".

#Group by ‘subject’ and summarize ‘num_subscribers’

# Load necessary libraries
library(dplyr)

# 1. Per-subject subscriber statistics.
#    `count` is the number of rows (courses) in each subject; the subject with
#    the smallest count is tagged "Least Likely", every other one "Common".
group_by_subject <- data %>%
  group_by(subject) %>%
  summarise(
    mean_subscribers = mean(num_subscribers, na.rm = TRUE),
    median_subscribers = median(num_subscribers, na.rm = TRUE),
    total_subscribers = sum(num_subscribers, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(
    special_tag = ifelse(count > min(count), "Common", "Least Likely")
  )

# Print the grouped summary
print(group_by_subject)

## # A tibble: 4 × 6
##   subject            mean_subscribers median_subscribers total_subscribers count
##   <chr>                         <dbl>              <dbl>             <int> <int>
## 1 Business Finance              1564.               484            1868711  1195
## 2 Graphic Design                1763.               384            1063148   603
## 3 Musical Instrumen…            1245.               138             846689   680
## 4 Web Development               6650.              2416.           7980572  1200
## # ℹ 1 more variable: special_tag <chr>

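#Here Web Development and Graphic Design have the highest mean subscribers per course compared to the other subjects, indicating that they are popular. (By total subscribers, Web Development is followed by Business Finance.)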

Group 2

#Group by ‘level’ and summarize ‘num_lectures’

# 2. Per-level lecture statistics.
#    `count` is the number of rows (courses) at each level; the level with the
#    smallest count is tagged "Least Likely", every other one "Common".
group_by_level <- data %>%
  group_by(level) %>%
  summarise(
    mean_lectures = mean(num_lectures, na.rm = TRUE),
    median_lectures = median(num_lectures, na.rm = TRUE),
    total_lectures = sum(num_lectures, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(
    special_tag = ifelse(count > min(count), "Common", "Least Likely")
  )

# Print the grouped summary
print(group_by_level)

## # A tibble: 4 × 6
##   level           mean_lectures median_lectures total_lectures count special_tag
##   <chr>                   <dbl>           <dbl>          <int> <int> <chr>      
## 1 All Levels               46.0            27            88698  1929 Common     
## 2 Beginner Level           33.0            23            41908  1270 Common     
## 3 Expert Level             30.8            21.5           1785    58 Least Like…
## 4 Intermediate L…          35.9            23            15129   421 Common

#Here Expert Level courses have the fewest lectures on average, while Beginner and Intermediate level courses offer slightly more lectures.

Group 3

#Group by ‘is_paid’ and summarize ‘content_duration’

# Load necessary libraries
library(dplyr)

# 3. Content-duration statistics for paid vs. free courses.
#    `count` is the number of rows (courses) in each is_paid group; the group
#    with the smallest count is tagged "Least Likely", the other "Common".
group_by_is_paid <- data %>%
  group_by(is_paid) %>%
  summarise(
    mean_duration = mean(content_duration, na.rm = TRUE),
    median_duration = median(content_duration, na.rm = TRUE),
    total_duration = sum(content_duration, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(
    special_tag = ifelse(count > min(count), "Common", "Least Likely")
  )

# Print the grouped summary
print(group_by_is_paid)

## # A tibble: 2 × 6
##   is_paid mean_duration median_duration total_duration count special_tag 
##   <chr>           <dbl>           <dbl>          <dbl> <int> <chr>       
## 1 False            2.21             1.5           685.   310 Least Likely
## 2 True             4.27             2.5         14374.  3368 Common

#From grouping by is_paid and summarizing content_duration: paid courses tend to have longer content durations (mean 4.27 vs. 2.21), while free courses offer shorter content

Assign the Lowest Probability Groups a Special Tag

# Load necessary libraries
library(dplyr)

# Re-import the course data so this section starts from the raw columns
data <- read.csv(file = "~/Downloads/udemy_courses.csv.xls")

# Share of all courses that each subject accounts for, rarest subject first
group_counts <- data %>%
  group_by(subject) %>%
  summarise(total = n()) %>%
  mutate(probability = total / sum(total)) %>%
  arrange(probability)

# Subjects at or below the 20th percentile of those shares get the special tag
threshold <- quantile(group_counts$probability, probs = 0.2)
group_counts <- mutate(
  group_counts,
  special_tag = ifelse(probability <= threshold, "Low Probability", "Other")
)

# Copy each subject's tag onto the matching course rows, then show the table
data <- left_join(
  data,
  select(group_counts, subject, special_tag),
  by = "subject"
)
print(group_counts)

## # A tibble: 4 × 4
##   subject             total probability special_tag    
##   <chr>               <int>       <dbl> <chr>          
## 1 Graphic Design        603       0.164 Low Probability
## 2 Musical Instruments   680       0.185 Other          
## 3 Business Finance     1195       0.325 Other          
## 4 Web Development      1200       0.326 Other

Testable Hypothesis: Average Subscribers Comparison

# List the columns of `data` to confirm that the names used in the summary
# below (`num_subscribers`, `special_tag`) are present.
colnames(data)

##  [1] "course_id"           "course_title"        "url"                
##  [4] "is_paid"             "price"               "num_subscribers"    
##  [7] "num_reviews"         "num_lectures"        "level"              
## [10] "content_duration"    "published_timestamp" "subject"            
## [13] "special_tag"

# Mean, median and row count of num_subscribers within each special_tag group
subscriber_summary <- summarise(
  group_by(data, special_tag),
  mean_subscribers = mean(num_subscribers, na.rm = TRUE),
  median_subscribers = median(num_subscribers, na.rm = TRUE),
  count = n()
)

print(subscriber_summary)

## # A tibble: 2 × 4
##   special_tag     mean_subscribers median_subscribers count
##   <chr>                      <dbl>              <int> <int>
## 1 Low Probability            1763.                384   603
## 2 Other                      3478.               1027  3075

# (Optional) Two-sample t-test comparing mean num_subscribers between the two
# special_tag groups ("Low Probability" vs "Other"). No var.equal argument is
# passed, so t.test() uses its default unequal-variance (Welch) form.
t_test_result <- t.test(num_subscribers ~ special_tag, data = data)  # formula: response ~ grouping column
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  num_subscribers by special_tag
## t = -6.3212, df = 1779, p-value = 3.273e-10
## alternative hypothesis: true difference in means between group Low Probability and group Other is not equal to 0
## 95 percent confidence interval:
##  -2247.469 -1183.065
## sample estimates:
## mean in group Low Probability           mean in group Other 
##                      1763.098                      3478.365

# Box plot: distribution of num_subscribers across rows of `data`,
# split and coloured by special_tag
ggplot(
  data = data,
  mapping = aes(x = special_tag, y = num_subscribers, fill = special_tag)
) +
  geom_boxplot() +
  labs(
    title = "Total Subscribers by Special Tag",
    x = "Special Tag",
    y = "Number of Subscribers"
  ) +
  theme_minimal()

Visualizations for Each Grouping

# List the columns of `data` to confirm that the variables plotted below
# (`special_tag`, `num_subscribers`, `num_lectures`) are available.
colnames(data)

##  [1] "course_id"           "course_title"        "url"                
##  [4] "is_paid"             "price"               "num_subscribers"    
##  [7] "num_reviews"         "num_lectures"        "level"              
## [10] "content_duration"    "published_timestamp" "subject"            
## [13] "special_tag"

# Load necessary libraries for plotting
library(ggplot2)

# Visualization 1: horizontal bars of each subject's share of courses,
# ordered by that share and coloured by the low-probability tag
ggplot(
  group_counts,
  aes(x = reorder(subject, probability), y = probability, fill = special_tag)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Probability of Each Subject",
    x = "Subject",
    y = "Probability"
  ) +
  theme_minimal()

# Visualization 2: box plot of num_subscribers for each special_tag group
ggplot(
  data = data,
  mapping = aes(x = special_tag, y = num_subscribers, fill = special_tag)
) +
  geom_boxplot() +
  labs(
    title = "Total Subscribers by Special Tag",
    x = "Special Tag",
    y = "Number of Subscribers"
  ) +
  theme_minimal()

# Visualization 3: box plot of num_lectures for each special_tag group
ggplot(
  data = data,
  mapping = aes(x = special_tag, y = num_lectures, fill = special_tag)
) +
  geom_boxplot() +
  labs(
    title = "Number of Lectures by Special Tag",
    x = "Special Tag",
    y = "Number of Lectures"
  ) +
  theme_minimal()

Data Visualization

#Bar plot for subject vs num_subscribers

#a. Bar plot for subject vs num_subscribers:

# Bar heights come straight from the pre-computed total_subscribers column
ggplot(
  group_by_subject,
  aes(x = subject, y = total_subscribers, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Total Subscribers by Subject",
    x = "Subject",
    y = "Total Subscribers"
  ) +
  theme_minimal()

#This plot visualizes the total number of subscribers for each subject and provides a comparison of subscriber numbers across the various subjects #Web Development has more subscribers; maybe it’s popular for career advancement

Box plot for level vs num_lectures

#b. Box plot for level vs num_lectures

# One box per course level, showing the spread of num_lectures at that level
ggplot(
  data = data,
  mapping = aes(x = level, y = num_lectures, fill = level)
) +
  geom_boxplot() +
  labs(
    title = "Number of Lectures by Course Level",
    x = "Course Level",
    y = "Number of Lectures"
  ) +
  theme_minimal()

#Beginner level and Intermediate level have more lectures than Expert level.

Bar plot for is_paid vs content_duration

#c. Bar plot for is_paid vs content_duration

# Bar heights come straight from the pre-computed total_duration column
ggplot(
  group_by_is_paid,
  aes(x = is_paid, y = total_duration, fill = special_tag)
) +
  geom_col() +
  labs(
    title = "Total Content Duration by Paid/Free Courses",
    x = "Is Paid",
    y = "Total Content Duration"
  ) +
  theme_minimal()

#Paid courses account for most of the total content duration compared with free courses. The higher the price, the longer the content duration # Load and Inspect Data

Load necessary libraries

library(readr)
library(dplyr)
library(ggplot2)

Load the dataset

# Assuming the dataset is saved as a CSV file.
# This re-import assigns to `data` again, replacing the earlier object, so
# columns added to it before this point (e.g. `special_tag`) are not carried over.
data <- read_csv("~/Downloads/udemy_courses.csv.xls")

## Rows: 3678 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): course_title, url, level, subject
## dbl  (6): course_id, price, num_subscribers, num_reviews, num_lectures, cont...
## lgl  (1): is_paid
## dttm (1): published_timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first rows of the freshly imported data (head() with its
# default row count) as a quick sanity check of the column types.
head(data)

## # A tibble: 6 × 12
##   course_id course_title         url   is_paid price num_subscribers num_reviews
##       <dbl> <chr>                <chr> <lgl>   <dbl>           <dbl>       <dbl>
## 1   1070968 Ultimate Investment… http… TRUE      200            2147          23
## 2   1113822 Complete GST Course… http… TRUE       75            2792         923
## 3   1006314 Financial Modeling … http… TRUE       45            2174          74
## 4   1210588 Beginner to Pro - F… http… TRUE       95            2451          11
## 5   1011058 How To Maximize You… http… TRUE      200            1276          45
## 6    192870 Trading Penny Stock… http… TRUE      150            9221         138
## # ℹ 5 more variables: num_lectures <dbl>, level <chr>, content_duration <dbl>,
## #   published_timestamp <dttm>, subject <chr>

#4. Create a Data Frame of All Combinations #After extracting two categorical variables level and subject, and then generate all possible combinations

# Extract the distinct values of 'level' and 'subject'
levels <- unique(data$level)
subjects <- unique(data$subject)

# Build every level x subject pairing.
# expand.grid() converts character inputs to factors by default;
# stringsAsFactors = FALSE keeps both columns character so they have the same
# type as data$level / data$subject when the two tables are joined on them.
combinations <- expand.grid(
  level = levels,
  subject = subjects,
  stringsAsFactors = FALSE
)

# Display the first few rows of the combinations
head(combinations)

##                level          subject
## 1         All Levels Business Finance
## 2 Intermediate Level Business Finance
## 3     Beginner Level Business Finance
## 4       Expert Level Business Finance
## 5         All Levels   Graphic Design
## 6 Intermediate Level   Graphic Design

#5. Identify Missing Combinations #merging the combinations DataFrame with the actual data to find which combinations are missing

# Level/subject pairs that actually occur in the data (one row per pair)
existing_combinations <- distinct(data, level, subject)

# Pairs from the full grid that have no match among the observed pairs
missing_combinations <- combinations %>%
  anti_join(existing_combinations, by = c("level", "subject"))

# Display missing combinations
missing_combinations

## [1] level   subject
## <0 rows> (or 0-length row.names)

#6. Count the Occurrence of Each Combination #we can count the number of times the combination of level and subject is appearing in the data

# Count the rows in each (level, subject) pair, most common first.
# `.groups = "drop"` returns an ungrouped tibble: it silences the
# "summarise() has grouped output by 'level'" message and prevents the leftover
# grouping by level from leaking into anything derived from this table.
combination_counts <- data %>%
  group_by(level, subject) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))

## `summarise()` has grouped output by 'level'. You can override using the
## `.groups` argument.

# Pick the most and least common combinations.
# combination_counts is sorted with arrange(desc(count)), so its first row
# holds the largest count and its last row the smallest. Only one row is taken
# at each end, so combinations tied on count are not reported.
most_common <- combination_counts[1, ]
least_common <- combination_counts[nrow(combination_counts), ]

# Auto-print the most common combination
most_common

## # A tibble: 1 × 3
## # Groups:   level [1]
##   level      subject          count
##   <chr>      <chr>            <int>
## 1 All Levels Business Finance   696

# Auto-print the least common combination (last row of the sorted counts)
least_common

## # A tibble: 1 × 3
## # Groups:   level [1]
##   level        subject        count
##   <chr>        <chr>          <int>
## 1 Expert Level Graphic Design     5

#7. Visualize the Combinations #Now I am visualizing the occurrence of combinations through a bar plot

# Dodged bars: within each subject, one bar per level with height = count
ggplot(
  combination_counts,
  aes(x = subject, y = count, fill = level)
) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Course Levels Across Subjects",
    x = "Subject",
    y = "Number of Courses"
  )

#For Graphic Design and Web Development, the Beginner Level course counts are fairly similar. I guess these might be among the most common combinations, and Business Finance too, according to the bar plot

assignment3

2024-09-18

Load necessary libraries

Group1

Group 2

Group 3

Assign the Lowest Probability Groups a Special Tag

Testable Hypothesis: Average Subscribers Comparison

Visualizations for Each Grouping

Data Visualization

Box plot for level vs num_lectures

Bar plot for is_paid vs content_duration

Load necessary libraries

Load the dataset