This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Absolute path to the MoocsFinal.csv data file; it is machine-specific, so
# adjust it before running the notebook anywhere else.
filepath <- "/Users/jkwan/Documents/INST314/MoocsFinal.csv"
# Read the CSV into a data frame with read.csv()'s default settings.
moocs_data <- read.csv(file = filepath)
# Quick inspection: first rows, then every column with its storage type.
head(x = moocs_data)
str(object = moocs_data)
'data.frame': 5300 obs. of 18 variables:
$ course_id : chr "HarvardX/CS50x/2012" "HarvardX/CS50x/2012" "MITx/6.00x/2012_Fall" "HarvardX/CS50x/2012" ...
$ userid_DI : chr "MHxPC130428093" "MHxPC130299882" "MHxPC130417083" "MHxPC130120970" ...
$ registered : int 1 1 1 1 1 1 1 1 1 1 ...
$ viewed : int 1 1 0 1 1 1 0 0 1 1 ...
$ explored : int 0 0 0 0 0 0 0 0 0 0 ...
$ certified : int 0 0 0 0 0 0 0 0 0 0 ...
$ final_cc_cname_DI: chr "United States" "United States" "Brazil" "United States" ...
$ LoE_DI : chr "Master's" "Secondary" "Secondary" "Master's" ...
$ YoB : int 1989 1956 1992 1979 1986 1973 1989 1995 1991 1993 ...
$ gender : chr "f" "m" "m" "m" ...
$ grade : num 0 0 0 0 0 0 0 NA 0 0 ...
$ start_time_DI : chr "2013-03-04" "2012-10-15" "2012-08-24" "2013-04-20" ...
$ last_event_DI : chr "" "" "2012-08-24" "2013-04-23" ...
$ nevents : int 0 0 2 14 0 13 0 1 0 16 ...
$ ndays_act : int 0 0 1 2 0 1 0 1 0 1 ...
$ nplay_video : int 0 0 0 0 0 0 0 0 0 0 ...
$ nchapters : int 1 2 0 1 1 1 0 0 2 2 ...
$ nforum_posts : int 0 0 0 0 0 0 0 0 0 0 ...
# **Question 2:**
# `viewed` only takes the values 0 and 1, so its mean is the share of 1s.
summary(moocs_data[["viewed"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 1.0000 0.6149 1.0000 1.0000
# `explored` only takes the values 0 and 1, so its mean is the share of 1s.
summary(moocs_data[["explored"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.05943 0.00000 1.00000
# `certified` only takes the values 0 and 1, so its mean is the share of 1s.
summary(moocs_data[["certified"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.02585 0.00000 1.00000
# Quartiles, mean and range of the forum-post counts.
summary(moocs_data[["nforum_posts"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01132 0.00000 4.00000
# Quartiles, mean and range of the event counts.
summary(moocs_data[["nevents"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 3.0 266.7 61.0 17828.0
# Quartiles, mean and range of the `ndays_act` column.
summary(moocs_data[["ndays_act"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 4.048 3.000 162.000
# Quartiles, mean and range of the `nplay_video` column.
summary(moocs_data[["nplay_video"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 0.00 29.93 1.00 6209.00
# Question 3:
# Cross-tabulate education level (rows) against certification status (columns).
# The column is spelled out in full as LoE_DI: the abbreviated `$LoE` resolves
# only through `$` partial matching on data frames, which stops working on a
# tibble or as soon as a second column whose name starts with "LoE" exists.
table(moocs_data$LoE_DI, moocs_data$certified)
0 1
Bachelor's 2021 53
Doctorate 122 3
Less than Secondary 140 6
Master's 1189 34
Secondary 1691 41
# Row-wise proportions (margin = 1): within each education level, the share of
# learners who were not certified (0) versus certified (1).
# Uses the full column name LoE_DI rather than the abbreviated `$LoE`, which
# depends on `$` partial matching and fails on tibbles or ambiguous prefixes.
prop.table(table(moocs_data$LoE_DI, moocs_data$certified), margin = 1)
0 1
Bachelor's 0.97444552 0.02555448
Doctorate 0.97600000 0.02400000
Less than Secondary 0.95890411 0.04109589
Master's 0.97219951 0.02780049
Secondary 0.97632794 0.02367206
# Question 3 Visualization (Bar Chart)
library(ggplot2)
# One bar per education level; position = "fill" rescales every bar to height
# 1, so the coloured segments show the certified / not-certified proportions.
ggplot(data = moocs_data) +
  geom_bar(
    mapping = aes(x = LoE_DI, fill = factor(certified)),
    position = "fill"
  ) +
  labs(
    title = "Certification Rate by Education Level",
    x = "Level of Education",
    y = "Proportion",
    fill = "Certified (1=Yes, 0=No)"
  ) +
  theme_minimal()
# Question 4
# Overall distribution of forum-post counts across all learners.
summary(moocs_data[["nforum_posts"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01132 0.00000 4.00000
# Same summary of forum-post counts, computed separately for the
# certified == 0 and certified == 1 groups.
tapply(
  X = moocs_data[["nforum_posts"]],
  INDEX = moocs_data[["certified"]],
  FUN = summary
)
$`0`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01026 0.00000 4.00000
$`1`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.05109 0.00000 3.00000
# Question 4 Visualization (Bar Chart)
library(ggplot2)
# Collapse the post count into two groups: at least one post vs. none.
moocs_data[["forum_activity"]] <- ifelse(
  test = moocs_data[["nforum_posts"]] > 0,
  yes = "Posted",
  no = "Did Not Post"
)
# Proportional bars: each forum-activity group is scaled to height 1 and split
# by certification status.
ggplot(data = moocs_data) +
  geom_bar(
    mapping = aes(x = forum_activity, fill = factor(certified)),
    position = "fill"
  ) +
  labs(
    title = "Forum Participation vs. Certification",
    x = "Forum Activity",
    y = "Proportion",
    fill = "Certified (1=Yes, 0=No)"
  ) +
  theme_minimal()
# Question 5
# Keep only learners whose education level is "Less than Secondary" or
# "Secondary"; %in% never yields NA, so the logical index is safe to use as is.
keep_levels <- c("Less than Secondary", "Secondary")
subset_data <- moocs_data[moocs_data[["LoE_DI"]] %in% keep_levels, ]
# Distribution of forum-post counts within that subset.
summary(subset_data[["nforum_posts"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01384 0.00000 3.00000
#Certification Rates
# Counts of not certified (0) and certified (1) learners in the subset.
table(subset_data[["certified"]])
0 1
1831 47
# The same counts expressed as shares of the whole subset.
prop.table(table(subset_data[["certified"]]))
0 1
0.97497338 0.02502662
# Forum Posts vs. Certification Breakdown
# Summary of forum-post counts in the subset, split by certification status.
tapply(
  X = subset_data[["nforum_posts"]],
  INDEX = subset_data[["certified"]],
  FUN = summary
)
$`0`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01256 0.00000 3.00000
$`1`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.06383 0.00000 3.00000
# Question 5 Visualization (Bar Chart)
library(ggplot2)
# Flag whether each learner in the subset wrote at least one forum post.
# (subset_data holds only the "Less than Secondary" and "Secondary" levels.)
subset_data[["forum_activity"]] <- ifelse(
  test = subset_data[["nforum_posts"]] > 0,
  yes = "Posted",
  no = "Did Not Post"
)
# Proportional bars: each forum-activity group is scaled to height 1 and split
# by certification status.
ggplot(data = subset_data) +
  geom_bar(
    mapping = aes(x = forum_activity, fill = factor(certified)),
    position = "fill"
  ) +
  labs(
    title = "Forum Participation vs. Certification (Non-Bachelor's)",
    x = "Forum Activity",
    y = "Proportion",
    fill = "Certified (1=Yes, 0=No)"
  ) +
  theme_minimal()
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.