The dataset is a compilation of Udemy online courses created and uploaded on YouTube between 2011 and 2017. It contains 3678 observations and 12 attributes: course_id, course_title, url, is_paid (indicates whether a course is paid or free), price, num_subscribers, num_reviews, num_lectures, level, content_duration, published_timestamp, and subject.
library(tidyverse)
# Load the course catalogue directly from the CSV hosted on GitHub,
# then show per-column summary statistics.
udemy_course <- read.csv(
  file = "https://raw.githubusercontent.com/nnaemeka-git/payment/main/udemy_courses.csv"
)
summary(udemy_course)
## course_id course_title url is_paid
## Min. : 8324 Length:3678 Length:3678 Mode :logical
## 1st Qu.: 407693 Class :character Class :character FALSE:310
## Median : 687917 Mode :character Mode :character TRUE :3368
## Mean : 675972
## 3rd Qu.: 961356
## Max. :1282064
## price num_subscribers num_reviews num_lectures
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 20.00 1st Qu.: 111.0 1st Qu.: 4.0 1st Qu.: 15.00
## Median : 45.00 Median : 911.5 Median : 18.0 Median : 25.00
## Mean : 66.05 Mean : 3197.2 Mean : 156.3 Mean : 40.11
## 3rd Qu.: 95.00 3rd Qu.: 2546.0 3rd Qu.: 67.0 3rd Qu.: 45.75
## Max. :200.00 Max. :268923.0 Max. :27445.0 Max. :779.00
## level content_duration published_timestamp subject
## Length:3678 Min. : 0.000 Length:3678 Length:3678
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 2.000 Mode :character Mode :character
## Mean : 4.095
## 3rd Qu.: 4.500
## Max. :78.500
# Count the courses that belong to each difficulty level.
level_course <- udemy_course %>%
  group_by(level) %>%
  summarise(Number_of_courses = n())

# Show the counts as a plain data frame.
print(as.data.frame(level_course))
## level Number_of_courses
## 1 All Levels 1929
## 2 Beginner Level 1270
## 3 Expert Level 58
## 4 Intermediate Level 421
# Bar chart of course counts per level, with the count printed on each bar.
ggplot(level_course, aes(x = level, y = Number_of_courses)) +
  geom_col(fill = "#FF5733") +
  geom_text(aes(label = Number_of_courses)) +
  labs(
    x = "Level",
    y = "Number of Courses",
    title = "The number of courses/videos for each level"
  ) +
  theme_classic()
The All-levels category has the highest number of courses uploaded, followed by the Beginner level. The Expert level has only 58 courses uploaded, which is the smallest number among all course categories.
# Add up the subscriber counts of all courses within each level.
total_sub <- udemy_course %>%
  group_by(level) %>%
  summarise(sum_of_subscribers = sum(num_subscribers))

# Show the totals as a plain data frame.
print(as.data.frame(total_sub))
## level sum_of_subscribers
## 1 All Levels 6915076
## 2 Beginner Level 4051843
## 3 Expert Level 50196
## 4 Intermediate Level 742005
# Bar chart of total subscribers per level, labelled with the totals.
ggplot(total_sub, aes(x = level, y = sum_of_subscribers)) +
  geom_col(fill = "#C6B80A") +
  geom_text(aes(label = sum_of_subscribers)) +
  labs(
    x = "Level",
    y = "Total Subscribers",
    title = "The number of Subscribers in each course level"
  ) +
  theme_classic()
A total of 6915076 people subscribed to All-level courses, and 4051843 subscribed to Beginner-level courses. The Expert level has the fewest subscriptions (50196), followed by the Intermediate level, which has a total of 742005 subscriptions.
# Count courses for every subject/level combination (result is ungrouped).
subj_video <- udemy_course %>%
  group_by(subject, level) %>%
  summarise(Number_of_courses = n(), .groups = "drop")

# Display the table as a plain data frame.
data.frame(subj_video)
## subject level Number_of_courses
## 1 Business Finance All Levels 696
## 2 Business Finance Beginner Level 340
## 3 Business Finance Expert Level 31
## 4 Business Finance Intermediate Level 128
## 5 Graphic Design All Levels 298
## 6 Graphic Design Beginner Level 243
## 7 Graphic Design Expert Level 5
## 8 Graphic Design Intermediate Level 57
## 9 Musical Instruments All Levels 276
## 10 Musical Instruments Beginner Level 296
## 11 Musical Instruments Expert Level 7
## 12 Musical Instruments Intermediate Level 101
## 13 Web Development All Levels 659
## 14 Web Development Beginner Level 391
## 15 Web Development Expert Level 15
## 16 Web Development Intermediate Level 135
# Recompute the subject/level counts and attach a combined
# "subject - level" label to use as the bar category.
subj_video <- udemy_course %>%
  group_by(subject, level) %>%
  summarise(Number_of_courses = n(), .groups = "drop") %>%
  mutate(SubjectLevel = paste(subject, level, sep = " - "))

# Plain data frame copy used for plotting.
subj_df <- data.frame(subj_video)
# Horizontal bar chart of course counts for each subject/level combination,
# with bars ordered by count and labelled with the count.
#
# labs() names aesthetics, not screen positions: `x` is the SubjectLevel
# category and `y` is the count, and that stays true after coord_flip()
# swaps where the two axes are drawn. The titles are therefore assigned to
# the aesthetic they describe.
ggplot(
  subj_df,
  aes(x = reorder(SubjectLevel, Number_of_courses), y = Number_of_courses)
) +
  geom_col(fill = "#BA25AE") +
  geom_text(aes(label = Number_of_courses)) +
  coord_flip() +
  labs(
    x = "Subject and Level",
    y = "Number of courses",
    title = "The number of courses in each subject level"
  ) +
  theme_classic()
# Mean content duration per level, ignoring missing durations.
course_duratn <- udemy_course %>%
  group_by(level) %>%
  summarise(course_duration = mean(content_duration, na.rm = TRUE))

# Display the averages as a plain data frame.
data.frame(course_duratn)
## level course_duration
## 1 All Levels 4.869656
## 2 Beginner Level 3.091194
## 3 Expert Level 2.905460
## 4 Intermediate Level 3.733333
# Bar chart of the mean content duration per level.
# Bar labels are rounded to two decimals so they stay readable instead of
# printing the full floating-point mean.
# NOTE(review): the axis says "(mins)", but the unit of content_duration is
# not established anywhere in this file -- confirm against the data source.
ggplot(course_duratn, aes(x = level, y = course_duration)) +
  geom_col(fill = "#BCC60A") +
  geom_text(aes(label = round(course_duration, 2))) +
  labs(
    x = "Level",
    y = "Average Course Duration (mins)",
    title = "The Average Course Duration"
  ) +
  theme_classic()
All-levels courses have the highest average content duration, at approximately 5 minutes.
# Jittered scatter of subscriber count against price, all courses included.
ggplot(udemy_course, aes(x = price, y = num_subscribers)) +
  geom_jitter(size = 4) +
  labs(
    x = "Price ($)",
    y = "Number of Subscribers",
    title = "The Relationship between Price and Number of Subscribers"
  ) +
  theme_classic()
The relationship was affected by free courses (i.e., courses with a $0 price), as shown by the presence of outliers. Even so, there was no clear relationship between the price of a course and the number of subscribers to the course.
# Same scatter as before, restricted to courses whose price is not zero.
udemy_course %>%
  filter(price != 0) %>%
  ggplot(mapping = aes(x = price, y = num_subscribers)) +
  geom_jitter(size = 4) +
  labs(
    x = "Price ($)",
    y = "Number of Subscribers",
    title = "The Relationship between Price and Number of Subscribers after removing free courses"
  ) +
  theme_classic()
The relationship between the price of a course and the number of subscribers still seems hidden even after free courses are removed. That is, there is no defined relationship based on the data available.
# Histogram of course prices in $10-wide bins, all courses included.
ggplot(udemy_course, aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "#30B3E5") +
  labs(
    x = "Price ($)",
    y = "Frequency",
    title = "The Price distribution"
  ) +
  theme_classic()
# Histogram of course prices in $10-wide bins, excluding zero-priced courses.
udemy_course %>%
  filter(price != 0) %>%
  ggplot(mapping = aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "#30B3E5") +
  labs(
    x = "Price ($)",
    y = "Frequency",
    title = "The Price distribution after removing free courses"
  ) +
  theme_bw()
The distribution of Udemy course prices is skewed to the right and centered at $66.05, with most of the data between $20 and $55 and a range of roughly $180 ($20 to $200).
# Horizontal boxplots of content duration, one box per level.
ggplot(udemy_course, aes(x = content_duration, y = level)) +
  geom_boxplot() +
  labs(
    x = "Content Duration",
    y = "Level",
    title = "The Content duration distribution"
  ) +
  theme_bw()
# Boxplots of course price, one box per level.
ggplot(udemy_course, aes(x = level, y = price)) +
  geom_boxplot() +
  labs(
    x = "Level",
    y = "Price ($)",
    title = "Price distribution"
  ) +
  theme_bw()
The mean prices of All-level, Expert-level and Intermediate-level courses are above $50, while that of Beginner-level courses is below $50. The price of Expert-level courses ranges between $20 and $200. The price of All-level courses ranges between $0 and $200, while Intermediate and Beginner courses range from $0 to $150, with noticeable outliers above $150.