Coursera Course completion EDA

This is the on-site portion of the interview, in which I conduct an exploratory data analysis (EDA) of the course data to identify what we could explore to improve course completion.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(skimr)
library(knitr)

# Load the per-user course dataset from the local CSV export.
data <- read_csv(file = '~/Downloads/onsite.csv')

## Rows: 85607 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): user_id
## dbl (38): registration_time, grade, passed, age, education, employment, gend...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Summarise every column (missingness, quantiles, histogram) and render the
# summary as a table.
knitr::kable(skim(data))

skim_type	skim_variable	n_missing	complete_rate	character.min	character.max	character.empty	character.n_unique	character.whitespace	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
character	user_id	0	1.0000000	40	40	0	85607	0	NA	NA	NA	NA	NA	NA	NA	NA
numeric	registration_time	0	1.0000000	NA	NA	NA	NA	NA	-5.592963e+06	5.589680e+06	-33403322	-10228307.50	-3543499.00	-845833.500	1214678	▁▁▂▃▇
numeric	grade	0	1.0000000	NA	NA	NA	NA	NA	1.246811e+01	2.961924e+01	0	0.00	0.00	1.998	100	▇▁▁▁▁
numeric	passed	0	1.0000000	NA	NA	NA	NA	NA	9.598510e-02	2.945726e-01	0	0.00	0.00	0.000	1	▇▁▁▁▁
numeric	age	75630	0.1165442	NA	NA	NA	NA	NA	3.204801e+01	1.028923e+01	13	25.00	29.00	37.000	150	▇▂▁▁▁
numeric	education	75792	0.1146518	NA	NA	NA	NA	NA	7.138869e+00	1.860033e+00	0	7.00	7.00	8.000	10	▁▂▁▇▂
numeric	employment	76076	0.1113344	NA	NA	NA	NA	NA	1.584723e+00	2.359023e+00	0	0.00	0.00	3.000	8	▇▁▁▂▁
numeric	gender	75531	0.1177007	NA	NA	NA	NA	NA	8.691941e-01	3.462092e-01	0	1.00	1.00	1.000	2	▁▁▇▁▁
numeric	reading	75936	0.1129697	NA	NA	NA	NA	NA	3.506876e+00	5.664977e-01	0	3.00	4.00	4.000	4	▁▁▁▆▇
numeric	speaking	75946	0.1128529	NA	NA	NA	NA	NA	3.271193e+00	7.609432e-01	0	3.00	3.00	4.000	4	▁▁▂▇▇
numeric	writing	75923	0.1131216	NA	NA	NA	NA	NA	3.343556e+00	6.995069e-01	0	3.00	3.00	4.000	4	▁▁▂▇▇
numeric	num_lectures_watched	0	1.0000000	NA	NA	NA	NA	NA	8.953508e+00	1.856616e+01	0	0.00	0.00	11.000	602	▇▁▁▁▁
numeric	num_unique_lectures_watched	0	1.0000000	NA	NA	NA	NA	NA	5.823905e+00	9.632940e+00	0	0.00	0.00	8.000	27	▇▁▁▁▂
numeric	num_lectures_streamed	0	1.0000000	NA	NA	NA	NA	NA	5.968344e+00	1.233228e+01	0	0.00	0.00	4.000	168	▇▁▁▁▁
numeric	num_unique_lectures_streamed	0	1.0000000	NA	NA	NA	NA	NA	4.131590e+00	8.015672e+00	0	0.00	0.00	3.000	27	▇▁▁▁▁
numeric	num_lectures_downloaded	0	1.0000000	NA	NA	NA	NA	NA	2.985165e+00	1.336439e+01	0	0.00	0.00	0.000	602	▇▁▁▁▁
numeric	num_unique_lectures_downloaded	0	1.0000000	NA	NA	NA	NA	NA	2.179658e+00	6.744922e+00	0	0.00	0.00	0.000	27	▇▁▁▁▁
numeric	num_quizzes_submitted	0	1.0000000	NA	NA	NA	NA	NA	1.140982e+00	2.652004e+00	0	0.00	0.00	0.000	54	▇▁▁▁▁
numeric	num_unique_quizzes_submitted	0	1.0000000	NA	NA	NA	NA	NA	6.584742e-01	1.355048e+00	0	0.00	0.00	0.000	4	▇▁▁▁▁
numeric	total_quiz_first_score	66411	0.2242340	NA	NA	NA	NA	NA	1.253699e+01	5.667405e+00	0	7.75	14.00	17.750	20	▂▃▃▃▇
numeric	total_quiz_last_score	66411	0.2242340	NA	NA	NA	NA	NA	1.386138e+01	6.229622e+00	0	9.00	15.25	20.000	20	▁▂▂▂▇
numeric	total_quiz_min_score	66411	0.2242340	NA	NA	NA	NA	NA	1.236008e+01	5.644016e+00	0	7.50	13.75	17.500	20	▂▃▃▅▇
numeric	total_quiz_max_score	66411	0.2242340	NA	NA	NA	NA	NA	1.390936e+01	6.218716e+00	0	9.00	15.50	20.000	20	▁▂▂▂▇
numeric	num_assignment_parts_submitted	0	1.0000000	NA	NA	NA	NA	NA	9.082318e-01	2.833249e+00	0	0.00	0.00	0.000	102	▇▁▁▁▁
numeric	num_unique_assignment_parts_submitted	0	1.0000000	NA	NA	NA	NA	NA	6.742556e-01	1.931242e+00	0	0.00	0.00	0.000	7	▇▁▁▁▁
numeric	total_assignment_parts_first_score	74653	0.1279568	NA	NA	NA	NA	NA	9.969509e+01	4.826193e+01	0	60.00	100.00	150.000	150	▂▂▁▅▇
numeric	total_assignment_parts_last_score	74653	0.1279568	NA	NA	NA	NA	NA	1.186233e+02	4.545989e+01	0	100.00	150.00	150.000	150	▁▁▁▃▇
numeric	total_assignment_parts_min_score	74653	0.1279568	NA	NA	NA	NA	NA	9.950886e+01	4.827799e+01	0	60.00	100.00	150.000	150	▂▂▁▅▇
numeric	total_assignment_parts_max_score	74653	0.1279568	NA	NA	NA	NA	NA	1.187105e+02	4.545821e+01	0	100.00	150.00	150.000	150	▁▁▁▃▇
numeric	num_threads	0	1.0000000	NA	NA	NA	NA	NA	1.620190e-02	1.781373e-01	0	0.00	0.00	0.000	10	▇▁▁▁▁
numeric	num_posts	0	1.0000000	NA	NA	NA	NA	NA	1.363323e-01	1.334242e+00	0	0.00	0.00	0.000	131	▇▁▁▁▁
numeric	num_up_voted_posts	0	1.0000000	NA	NA	NA	NA	NA	6.382660e-02	7.203496e-01	0	0.00	0.00	0.000	76	▇▁▁▁▁
numeric	num_down_voted_posts	0	1.0000000	NA	NA	NA	NA	NA	1.133100e-03	3.819550e-02	0	0.00	0.00	0.000	3	▇▁▁▁▁
numeric	num_votes_given	0	1.0000000	NA	NA	NA	NA	NA	4.778231e-01	8.937769e+00	-9	0.00	0.00	0.000	1413	▇▁▁▁▁
numeric	num_votes_received	0	1.0000000	NA	NA	NA	NA	NA	4.744355e-01	9.248649e+00	-27	0.00	0.00	0.000	1752	▇▁▁▁▁
numeric	num_up_votes_given	0	1.0000000	NA	NA	NA	NA	NA	4.948778e-01	9.002486e+00	0	0.00	0.00	0.000	1417	▇▁▁▁▁
numeric	num_up_votes_received	0	1.0000000	NA	NA	NA	NA	NA	4.912098e-01	9.370208e+00	0	0.00	0.00	0.000	1755	▇▁▁▁▁
numeric	num_down_votes_given	0	1.0000000	NA	NA	NA	NA	NA	-1.705470e-02	4.201418e-01	-78	0.00	0.00	0.000	0	▁▁▁▁▇
numeric	num_down_votes_received	0	1.0000000	NA	NA	NA	NA	NA	-1.677430e-02	4.226480e-01	-75	0.00	0.00	0.000	0	▁▁▁▁▇

Overall completion rate: ~10%

# Overall completion rate: the mean of the `passed` column.
mean(data[["passed"]])

## [1] 0.09598514

Question 1: Demographics: do certain demographic groups have a higher course completion rate?

# Bar chart of how many users fall under each employment code, restricted to
# users who reported an employment value.
ggplot(
  data = filter(data, !is.na(employment)),
  mapping = aes(x = factor(employment))
) +
  geom_bar() +
  labs(
    x = 'Employment code',
    y = 'num users',
    title = 'User employment distribution'
  )

# Histogram of age among users who reported one.
# Rows with a missing age are dropped explicitly (as the employment chart does
# for its column) rather than being discarded by stat_bin() with a warning,
# and the bin width is set explicitly instead of relying on the default of
# 30 bins.
data %>%
  filter(!is.na(age)) %>%
  ggplot(aes(x = age)) +
  geom_histogram(binwidth = 5) +
  labs(x = 'Age', y = 'num users', title = 'User age distribution')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 75630 rows containing non-finite values (stat_bin).

# Completion rate (mean of `passed`) for each reported age, with a smoothed
# trend line.
# Users without an age are removed before grouping; otherwise they form a
# single NA group that both geoms drop with a warning.
data %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%
  summarise(completion_rate = mean(passed)) %>%
  ggplot(aes(x = age, y = completion_rate)) +
  geom_point() +
  geom_smooth() +
  labs(x = 'Age', y = 'Completion rate', title = 'User age vs completion rate')

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

Question 2: Do users who watch more videos tend to have a higher completion rate?

# Completion rate (mean of `passed`) by quartile bin of unique lectures watched.
# The bins are built from the quartile breakpoints of the values themselves
# rather than with ntile(): ntile() splits tied values across buckets by row
# order, which would scatter users with the same lecture count over several
# bars. Duplicate breakpoints are collapsed with unique(), so heavily tied
# data yields fewer than four bins.
data %>%
  mutate(
    num_unique_lectures_watched_quantile = cut(
      num_unique_lectures_watched,
      breaks = unique(
        quantile(num_unique_lectures_watched, probs = seq(0, 1, 0.25))
      ),
      include.lowest = TRUE
    )
  ) %>%
  group_by(num_unique_lectures_watched_quantile) %>%
  summarise(completion_rate = mean(passed)) %>%
  ggplot(aes(x = num_unique_lectures_watched_quantile, y = completion_rate)) +
  geom_col() +
  labs(x = 'Num unique lectures watched quantile',
       y = 'Completion rate',
       title = 'Num unique lecture watched quantile vs completion rate')

# Completion rate (mean of `passed`) against the average number of times a
# user watched each lecture, rounded to one decimal, with a linear trend line.
# Only users who watched at least one unique lecture are kept: for everyone
# else the ratio is 0 / 0 = NaN, which would otherwise form a group that the
# geoms drop with a warning.
data %>%
  filter(num_unique_lectures_watched > 0) %>%
  mutate(
    num_times_watched_per_lecture = round(
      num_lectures_watched / num_unique_lectures_watched,
      1
    )
  ) %>%
  group_by(num_times_watched_per_lecture) %>%
  summarise(completion_rate = mean(passed)) %>%
  ggplot(aes(x = num_times_watched_per_lecture, y = completion_rate)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  labs(x = 'Num watches per lecture',
       y = 'Completion rate',
       title = 'Num watches per lecture vs completion rate')

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

Question 3: Do users who submit more assignments tend to have a higher completion rate?

# Completion rate (mean of `passed`) for each count of unique assignment parts
# submitted, drawn as one bar per count.
data %>%
  group_by(parts_submitted = factor(num_unique_assignment_parts_submitted)) %>%
  summarise(completion_rate = mean(passed)) %>%
  ggplot(mapping = aes(x = parts_submitted, y = completion_rate)) +
  geom_bar(stat = 'identity') +
  labs(
    x = 'Num unique assignment submissions',
    y = 'Completion rate',
    title = 'Num unique assignment submissions vs completion rate'
  )

# Completion rate (mean of `passed`) against the average number of submissions
# per unique assignment part, rounded to one decimal, with a linear trend line.
# Only users with at least one unique submission are kept: an is.na() check
# does not protect the division, and 0 / 0 = NaN would otherwise form a group
# that the geoms drop with a warning. The `> 0` filter also excludes any
# missing values.
data %>%
  filter(num_unique_assignment_parts_submitted > 0) %>%
  mutate(
    num_submission_per_assignment = round(
      num_assignment_parts_submitted / num_unique_assignment_parts_submitted,
      1
    )
  ) %>%
  group_by(num_submission_per_assignment) %>%
  # summarise() on a single grouping key already returns an ungrouped tibble,
  # so no ungroup() is needed afterwards.
  summarise(completion_rate = mean(passed)) %>%
  ggplot(aes(x = num_submission_per_assignment, y = completion_rate)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = 'Num submissions per assignment',
       y = 'Completion rate',
       title = 'Num assignment submissions vs completion rate')

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

Conclusion:

With our current data, demographic information is of limited use because roughly 90% of it is missing. → Recommendation: We should collect more demographic information and verify its validity before conducting analysis, so that we can arrive at meaningful results.
However, the behavioral information does yield some interesting insights: those who watch more unique lectures tend to have a higher completion rate, though watching each video more often is not associated with a higher completion rate. → Recommendation: We should explore the hypothesis that “Watching more unique lectures in the course could improve completion rate”. We can experiment with encouraging students to watch more lectures, then measure both the lecture watch rate and the completion rate.
Assignment submission also has a positive relationship with completion rate: those who have more unique submissions tend to have a higher completion rate. We also see some signs that students who make more attempts at each assignment have a higher completion rate; however, this data is quite noisy and should be verified further. → Recommendation: We should explore the hypothesis that “Completing more assignments in the course could improve completion rate”. We can experiment with encouraging students to attempt assignments, then measure both the assignment submission rate and the completion rate.